//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//
/// About the Cost Model numbers used below it's necessary to say the following:
/// the numbers correspond to some "generic" X86 CPU rather than to any
/// specific CPU model. Usually the numbers correspond to the CPU where the
/// feature first appeared. For example, if we do Subtarget.hasSSE42() in
/// the lookups below the cost is based on Nehalem as that was the first CPU
/// to support that feature level and thus most likely has the worst case cost,
/// although we may discard an outlying worst cost from one CPU (e.g. Atom).
///
/// Some examples of other technologies/CPUs:
///   SSE 3   - Pentium4 / Athlon64
///   SSE 4.1 - Penryn
///   SSE 4.2 - Nehalem / Silvermont
///   AVX     - Sandy Bridge / Jaguar / Bulldozer
///   AVX2    - Haswell / Ryzen
///   AVX-512 - Xeon Phi / Skylake
///
/// And some examples of instruction target-dependent costs (latency):
///                     divss     sqrtss     rsqrtss
///   AMD K7            11-16     19         3
///   Piledriver        9-24      13-15      5
///   Jaguar            14        16         2
///   Pentium II,III    18        30         2
///   Nehalem           7-14      7-18       3
///   Haswell           10-13     11         5
///
/// Interpreting the 4 TargetCostKind types:
/// TCK_RecipThroughput and TCK_Latency should try to match the worst case
/// values reported by the CPU scheduler models (and llvm-mca).
/// TCK_CodeSize should match the instruction count (e.g. divss = 1), NOT the
/// actual encoding size of the instruction.
/// TCK_SizeAndLatency should match the worst case micro-op counts reported by
/// the CPU scheduler models (and llvm-mca), to ensure that they are compatible
/// with the MicroOpBufferSize and LoopMicroOpBufferSize values which are often
/// used as the cost thresholds where TCK_SizeAndLatency is requested.
//===----------------------------------------------------------------------===//
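
// The numbers modeled here can be cross-checked from the command line with the
// cost-model printer pass (illustrative invocation; exact flag spellings may
// vary between releases):
//
//   opt -passes="print<cost-model>" -cost-kind=throughput -disable-output t.ll
//
// where -cost-kind selects which of the four TargetCostKind values is printed.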

#include "X86TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "x86tti"

//===----------------------------------------------------------------------===//
//
// X86 cost model.
//
//===----------------------------------------------------------------------===//

// Helper struct to store/access costs for each cost kind.
// TODO: Move this to allow other targets to use it?
struct CostKindCosts {
  unsigned RecipThroughputCost = ~0U;
  unsigned LatencyCost = ~0U;
  unsigned CodeSizeCost = ~0U;
  unsigned SizeAndLatencyCost = ~0U;

  std::optional<unsigned>
  operator[](TargetTransformInfo::TargetCostKind Kind) const {
    unsigned Cost = ~0U;
    switch (Kind) {
    case TargetTransformInfo::TCK_RecipThroughput:
      Cost = RecipThroughputCost;
      break;
    case TargetTransformInfo::TCK_Latency:
      Cost = LatencyCost;
      break;
    case TargetTransformInfo::TCK_CodeSize:
      Cost = CodeSizeCost;
      break;
    case TargetTransformInfo::TCK_SizeAndLatency:
      Cost = SizeAndLatencyCost;
      break;
    }
    if (Cost == ~0U)
      return std::nullopt;
    return Cost;
  }
};
using CostKindTblEntry = CostTblEntryT<CostKindCosts>;
using CostKindCostTblEntry = TypeConversionCostTblEntryT<CostKindCosts>;
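
// A table entry below bundles all four cost kinds for one (ISD, MVT) pair,
// e.g. { ISD::MUL, MVT::v4i32, { 2, 10, 1, 2 } } reads as: throughput 2,
// latency 10, code-size 1, size-and-latency 2. Entry->Cost[CostKind] returns
// the value matching the query, or std::nullopt if that kind is unspecified.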

TargetTransformInfo::PopcntSupportKind
X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  // TODO: Currently the __builtin_popcount() implementation using SSE3
  //   instructions is inefficient. Once the problem is fixed, we should
  //   call ST->hasSSE3() instead of ST->hasPOPCNT().
  return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
}

std::optional<unsigned> X86TTIImpl::getCacheSize(
    TargetTransformInfo::CacheLevel Level) const {
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 32 * 1024; // 32 KByte
  case TargetTransformInfo::CacheLevel::L2D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 256 * 1024; // 256 KByte
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

std::optional<unsigned> X86TTIImpl::getCacheAssociativity(
    TargetTransformInfo::CacheLevel Level) const {
  //   - Penryn
  //   - Nehalem
  //   - Westmere
  //   - Sandy Bridge
  //   - Ivy Bridge
  //   - Haswell
  //   - Broadwell
  //   - Skylake
  //   - Kabylake
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    [[fallthrough]];
  case TargetTransformInfo::CacheLevel::L2D:
    return 8;
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
  bool Vector = (ClassID == 1);
  if (Vector && !ST->hasSSE1())
    return 0;

  if (ST->is64Bit()) {
    if (Vector && ST->hasAVX512())
      return 32;
    if (!Vector && ST->hasEGPR())
      return 32;
    return 16;
  }
  return 8;
}
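
// For example: 64-bit mode with AVX-512 reports 32 vector registers
// (zmm0-zmm31), plain x86-64 reports 16 (xmm0-xmm15 / rax-r15), and 32-bit
// mode reports 8.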

bool X86TTIImpl::hasConditionalLoadStoreForType(Type *Ty) const {
  if (!ST->hasCF())
    return false;
  if (!Ty)
    return true;
  // Conditional faulting is supported by CFCMOV, which only accepts
  // 16/32/64-bit operands.
  // TODO: Support f32/f64 with VMOVSS/VMOVSD with zero mask when it's
  // profitable.
  if (!Ty->isIntegerTy())
    return false;
  switch (cast<IntegerType>(Ty)->getBitWidth()) {
  default:
    return false;
  case 16:
  case 32:
  case 64:
    return true;
  }
}

TypeSize
X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  unsigned PreferVectorWidth = ST->getPreferVectorWidth();
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
  case TargetTransformInfo::RGK_FixedWidthVector:
    if (ST->hasAVX512() && ST->hasEVEX512() && PreferVectorWidth >= 512)
      return TypeSize::getFixed(512);
    if (ST->hasAVX() && PreferVectorWidth >= 256)
      return TypeSize::getFixed(256);
    if (ST->hasSSE1() && PreferVectorWidth >= 128)
      return TypeSize::getFixed(128);
    return TypeSize::getFixed(0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }

  llvm_unreachable("Unsupported register kind");
}
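
// E.g. with AVX2 (and the default prefer-vector-width) fixed-width vectors
// are reported as 256 bits; scalable vectors are unsupported on X86, hence
// size 0.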

unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
  return getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
      .getFixedValue();
}

unsigned X86TTIImpl::getMaxInterleaveFactor(ElementCount VF) {
  // If the loop will not be vectorized, don't interleave the loop.
  // Let the regular unroller unroll the loop, which saves the overflow
  // check and memory check cost.
  if (VF.isScalar())
    return 1;

  if (ST->isAtom())
    return 1;

  // Sandybridge and Haswell have multiple execution ports and pipelined
  // vector units.
  if (ST->hasAVX())
    return 4;

  return 2;
}

InstructionCost X86TTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args,
    const Instruction *CxtI) {

  // vXi8 multiplications are always promoted to vXi16.
  // Sub-128-bit types can be extended/packed more efficiently.
  if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
      Ty->getPrimitiveSizeInBits() <= 64 && Ty->getScalarSizeInBits() == 8) {
    Type *WideVecTy =
        VectorType::getExtendedElementVectorType(cast<VectorType>(Ty));
    return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
                            TargetTransformInfo::CastContextHint::None,
                            CostKind) +
           getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
                            TargetTransformInfo::CastContextHint::None,
                            CostKind) +
           getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info);
  }
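
  // E.g. a v8i8 multiply is costed as zext(v8i8 -> v8i16) + v8i16 mul +
  // trunc(v8i16 -> v8i8), mirroring the promoted sequence codegen emits.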

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() &&
      (LT.second.getScalarType() == MVT::i32 ||
       LT.second.getScalarType() == MVT::i64)) {
    // Check if the operands can be represented as a smaller datatype.
    bool Op1Signed = false, Op2Signed = false;
    unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
    unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
    unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
    bool SignedMode = Op1Signed || Op2Signed;

    // If both vXi32 are representable as i15 and at least one is constant,
    // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we
    // can treat this as PMADDWD which has the same costs as a vXi16 multiply.
    if (OpMinSize <= 15 && !ST->isPMADDWDSlow() &&
        LT.second.getScalarType() == MVT::i32) {
      bool Op1Constant =
          isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]);
      bool Op2Constant =
          isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]);
      bool Op1Sext = isa<SExtInst>(Args[0]) &&
                     (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41()));
      bool Op2Sext = isa<SExtInst>(Args[1]) &&
                     (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41()));

      bool IsZeroExtended = !Op1Signed || !Op2Signed;
      bool IsConstant = Op1Constant || Op2Constant;
      bool IsSext = Op1Sext || Op2Sext;
      if (IsConstant || IsZeroExtended || IsSext)
        LT.second =
            MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements());
    }
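
    // E.g. mul <4 x i32> (zext <4 x i16> %a), <i32 7, ...> fits this pattern:
    // both inputs fit in i15 and one is constant, so it is costed as the
    // legalized v8i16 multiply (the PMADDWD form the backend selects).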

    // Check if the vXi32 operands can be shrunk into a smaller datatype.
    // This should match the codegen from reduceVMULWidth.
    // TODO: Make this generic (!ST->SSE41 || ST->isPMULLDSlow()).
    if (ST->useSLMArithCosts() && LT.second == MVT::v4i32) {
      if (OpMinSize <= 7)
        return LT.first * 3; // pmullw/sext
      if (!SignedMode && OpMinSize <= 8)
        return LT.first * 3; // pmullw/zext
      if (OpMinSize <= 15)
        return LT.first * 5; // pmullw/pmulhw/pshuf
      if (!SignedMode && OpMinSize <= 16)
        return LT.first * 5; // pmullw/pmulhw/pshuf
    }

    // If both vXi64 are representable as (unsigned) i32, then we can perform
    // the multiply with a single PMULUDQ instruction.
    // TODO: Add (SSE41+) PMULDQ handling for signed extensions.
    if (!SignedMode && OpMinSize <= 32 && LT.second.getScalarType() == MVT::i64)
      ISD = X86ISD::PMULUDQ;
  }

  // Vector multiply by pow2 will be simplified to shifts.
  // Vector multiply by -pow2 will be simplified to shifts/negates.
  if (ISD == ISD::MUL && Op2Info.isConstant() &&
      (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2())) {
    InstructionCost Cost =
        getArithmeticInstrCost(Instruction::Shl, Ty, CostKind,
                               Op1Info.getNoProps(), Op2Info.getNoProps());
    if (Op2Info.isNegatedPowerOf2())
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
    return Cost;
  }
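
  // E.g. mul %x, 8 is costed as shl %x, 3, and mul %x, -8 as shl %x, 3 plus
  // a subtract from zero.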

  // On X86, vector signed division by a constant power-of-two is
  // normally expanded to the sequence SRA + SRL + ADD + SRA.
  // The OperandValue properties may not be the same as that of the previous
  // operation; conservatively assume OP_None.
  if ((ISD == ISD::SDIV || ISD == ISD::SREM) &&
      Op2Info.isConstant() && Op2Info.isPowerOf2()) {
    InstructionCost Cost =
        2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());
    Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());
    Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());

    if (ISD == ISD::SREM) {
      // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
      Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                     Op1Info.getNoProps(),
                                     Op2Info.getNoProps());
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
                                     Op1Info.getNoProps(),
                                     Op2Info.getNoProps());
    }

    return Cost;
  }
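
  // E.g. for sdiv %x, 4 the expansion is: ashr 31 (sign mask), lshr 30
  // (rounding bias of divisor-1), add, ashr 2; hence the 2*AShr + LShr + Add
  // costing above.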

  // Vector unsigned division/remainder will be simplified to shifts/masks.
  if ((ISD == ISD::UDIV || ISD == ISD::UREM) &&
      Op2Info.isConstant() && Op2Info.isPowerOf2()) {
    if (ISD == ISD::UDIV)
      return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());
    // UREM
    return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
                                  Op1Info.getNoProps(), Op2Info.getNoProps());
  }
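
  // E.g. udiv %x, 16 is costed as lshr %x, 4, and urem %x, 16 as and %x, 15.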

  static const CostKindTblEntry GFNIUniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRA, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SHL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRA, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SHL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRA, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
  };
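
  // GFNI can perform any uniform per-byte shift with a single GF2P8AFFINEQB
  // using a precomputed 8x8 bit-matrix operand, so all three shift kinds
  // cost the same.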

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasGFNI())
    if (const auto *Entry =
            CostTableLookup(GFNIUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWUniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 1, 8, 4, 5 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 1, 8, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8, { 1, 8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8, { 1, 9, 4, 5 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v64i8, { 1, 8, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v64i8, { 1, 8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8, { 1, 9, 4, 6 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // psllw
    { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw
    { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // psraw
    { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // psllw
    { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw
    { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // psraw
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512UniformConstCostTable[] = {
    { ISD::SHL, MVT::v64i8, { 2, 12, 5, 6 } }, // psllw + pand.
    { ISD::SRL, MVT::v64i8, { 2, 12, 5, 6 } }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8, { 3, 10, 12, 12 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v16i16, { 2, 7, 4, 4 } }, // psllw + split.
    { ISD::SRL, MVT::v16i16, { 2, 7, 4, 4 } }, // psrlw + split.
    { ISD::SRA, MVT::v16i16, { 2, 7, 4, 4 } }, // psraw + split.

    { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } }, // psrld
    { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } }, // psrad
    { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } }, // psrld
    { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } }, // psrad

    { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } }, // psraq
    { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } }, // psllq
    { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } }, // psrlq
    { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } }, // psraq
    { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } }, // psllq
    { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } }, // psrlq
    { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } }, // psraq

    { ISD::SDIV, MVT::v16i32, { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v16i32, { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v16i32, { 7 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX512())
    if (const auto *Entry =
            CostTableLookup(AVX512UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2UniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 1, 8, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 1, 8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 2, 10, 5, 6 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 2, 8, 2, 4 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8, { 2, 8, 2, 4 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8, { 3, 10, 5, 9 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw
    { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw
    { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw
    { ISD::SHL, MVT::v16i16, { 2, 2, 1, 2 } }, // psllw
    { ISD::SRL, MVT::v16i16, { 2, 2, 1, 2 } }, // psrlw
    { ISD::SRA, MVT::v16i16, { 2, 2, 1, 2 } }, // psraw

    { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld
    { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad
    { ISD::SHL, MVT::v8i32, { 2, 2, 1, 2 } }, // pslld
    { ISD::SRL, MVT::v8i32, { 2, 2, 1, 2 } }, // psrld
    { ISD::SRA, MVT::v8i32, { 2, 2, 1, 2 } }, // psrad

    { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq
    { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq
    { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle.
    { ISD::SHL, MVT::v4i64, { 2, 2, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v4i64, { 2, 2, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v4i64, { 4, 4, 3, 6 } }, // psrad + shuffle + split.

    { ISD::SDIV, MVT::v8i32, { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v8i32, { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32, { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v8i32, { 7 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX2())
    if (const auto *Entry =
            CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVXUniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 2, 7, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 2, 7, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psllw + pand) + split.
    { ISD::SRL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psrlw + pand) + split.
    { ISD::SRA, MVT::v32i8, { 7, 7, 12, 13 } }, // 2*(psrlw, pand, pxor, psubb) + split.

    { ISD::SHL, MVT::v8i16, { 1, 2, 1, 1 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 1, 2, 1, 1 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 1, 2, 1, 1 } }, // psraw.
    { ISD::SHL, MVT::v16i16, { 3, 6, 4, 5 } }, // psllw + split.
    { ISD::SRL, MVT::v16i16, { 3, 6, 4, 5 } }, // psrlw + split.
    { ISD::SRA, MVT::v16i16, { 3, 6, 4, 5 } }, // psraw + split.

    { ISD::SHL, MVT::v4i32, { 1, 2, 1, 1 } }, // pslld.
    { ISD::SRL, MVT::v4i32, { 1, 2, 1, 1 } }, // psrld.
    { ISD::SRA, MVT::v4i32, { 1, 2, 1, 1 } }, // psrad.
    { ISD::SHL, MVT::v8i32, { 3, 6, 4, 5 } }, // pslld + split.
    { ISD::SRL, MVT::v8i32, { 3, 6, 4, 5 } }, // psrld + split.
    { ISD::SRA, MVT::v8i32, { 3, 6, 4, 5 } }, // psrad + split.

    { ISD::SHL, MVT::v2i64, { 1, 2, 1, 1 } }, // psllq.
    { ISD::SRL, MVT::v2i64, { 1, 2, 1, 1 } }, // psrlq.
    { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle.
    { ISD::SHL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psllq + split.
    { ISD::SRL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psrlq + split.
    { ISD::SRA, MVT::v4i64, { 5, 7, 8, 9 } }, // 2 x psrad + shuffle + split.

    { ISD::SDIV, MVT::v8i32, { 14 } }, // 2*pmuludq sequence + split.
    { ISD::SREM, MVT::v8i32, { 18 } }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::UDIV, MVT::v8i32, { 12 } }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32, { 16 } }, // 2*pmuludq+mul+sub sequence + split.
  };

  // XOP has faster vXi8 shifts.
  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(AVXUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE2UniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw.

    { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld.
    { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad.

    { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq.
    { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq.
    { ISD::SRA, MVT::v2i64, { 3, 5, 6, 6 } }, // 2 x psrad + shuffle.

    { ISD::SDIV, MVT::v4i32, { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32, { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v4i32, { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32, { 7 } }, // pmuludq+mul+sub sequence
  };

  // XOP has faster vXi8 shifts.
  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasSSE2() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWConstCostTable[] = {
    { ISD::SDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v32i16, { 6 } }, // vpmulhw sequence
    { ISD::SREM, MVT::v32i16, { 8 } }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16, { 6 } }, // vpmulhuw sequence
    { ISD::UREM, MVT::v32i16, { 8 } }, // vpmulhuw+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512ConstCostTable[] = {
    { ISD::SDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence
    { ISD::SREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence
    { ISD::UREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v32i16, { 12 } }, // 2*vpmulhw sequence
    { ISD::SREM, MVT::v32i16, { 16 } }, // 2*vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16, { 12 } }, // 2*vpmulhuw sequence
    { ISD::UREM, MVT::v32i16, { 16 } }, // 2*vpmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v16i32, { 15 } }, // vpmuldq sequence
    { ISD::SREM, MVT::v16i32, { 17 } }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, { 15 } }, // vpmuludq sequence
    { ISD::UREM, MVT::v16i32, { 17 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasAVX512())
    if (const auto *Entry =
            CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2ConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v16i16, { 6 } }, // vpmulhw sequence
    { ISD::SREM, MVT::v16i16, { 8 } }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i16, { 6 } }, // vpmulhuw sequence
    { ISD::UREM, MVT::v16i16, { 8 } }, // vpmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v8i32, { 15 } }, // vpmuldq sequence
    { ISD::SREM, MVT::v8i32, { 19 } }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32, { 15 } }, // vpmuludq sequence
    { ISD::UREM, MVT::v8i32, { 19 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVXConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split.
    { ISD::SREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split.
    { ISD::UREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.

    { ISD::SDIV, MVT::v16i16, { 14 } }, // 2*pmulhw sequence + split.
    { ISD::SREM, MVT::v16i16, { 18 } }, // 2*pmulhw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v16i16, { 14 } }, // 2*pmulhuw sequence + split.
    { ISD::UREM, MVT::v16i16, { 18 } }, // 2*pmulhuw+mul+sub sequence + split.

    { ISD::SDIV, MVT::v8i32, { 32 } }, // vpmuludq sequence
    { ISD::SREM, MVT::v8i32, { 38 } }, // vpmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32, { 32 } }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32, { 42 } }, // 2*pmuludq+mul+sub sequence + split.
  };

  if (Op2Info.isConstant() && ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVXConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE41ConstCostTable[] = {
    { ISD::SDIV, MVT::v4i32, { 15 } }, // vpmuludq sequence
    { ISD::SREM, MVT::v4i32, { 20 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasSSE41())
    if (const auto *Entry =
            CostTableLookup(SSE41ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE2ConstCostTable[] = {
    { ISD::SDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v8i16, { 6 } }, // pmulhw sequence
    { ISD::SREM, MVT::v8i16, { 8 } }, // pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v8i16, { 6 } }, // pmulhuw sequence
    { ISD::UREM, MVT::v8i16, { 8 } }, // pmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v4i32, { 19 } }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32, { 24 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v4i32, { 15 } }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32, { 20 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWUniformCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 3,10, 5, 8 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 4,12, 8,12 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8, { 5,10,10,13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v64i8, { 4, 7, 6, 8 } }, // psllw + pand.
    { ISD::SRL, MVT::v64i8, { 4, 8, 7,10 } }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8, { 5,10,10,15 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v32i16, { 2, 4, 2, 3 } }, // psllw
    { ISD::SRL, MVT::v32i16, { 2, 4, 2, 3 } }, // psrlw
    { ISD::SRA, MVT::v32i16, { 2, 4, 2, 3 } }, // psraw
  };

  if (ST->hasBWI() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX512BWUniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512UniformCostTable[] = {
    { ISD::SHL, MVT::v32i16, { 5,10, 5, 7 } }, // psllw + split.
    { ISD::SRL, MVT::v32i16, { 5,10, 5, 7 } }, // psrlw + split.
    { ISD::SRA, MVT::v32i16, { 5,10, 5, 7 } }, // psraw + split.

    { ISD::SHL, MVT::v16i32, { 2, 4, 2, 3 } }, // pslld
    { ISD::SRL, MVT::v16i32, { 2, 4, 2, 3 } }, // psrld
    { ISD::SRA, MVT::v16i32, { 2, 4, 2, 3 } }, // psrad

    { ISD::SRA, MVT::v2i64, { 1, 2, 1, 2 } }, // psraq
    { ISD::SHL, MVT::v4i64, { 1, 4, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v4i64, { 1, 4, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v4i64, { 1, 4, 1, 2 } }, // psraq
    { ISD::SHL, MVT::v8i64, { 1, 4, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v8i64, { 1, 4, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v8i64, { 1, 4, 1, 2 } }, // psraq
  };

  if (ST->hasAVX512() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX512UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 3, 9, 5, 8 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 4, 5, 9,13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8, { 6, 9,11,16 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v8i16, { 1, 2, 1, 2 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 1, 2, 1, 2 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 1, 2, 1, 2 } }, // psraw.
    { ISD::SHL, MVT::v16i16, { 2, 4, 2, 3 } }, // psllw.
    { ISD::SRL, MVT::v16i16, { 2, 4, 2, 3 } }, // psrlw.
    { ISD::SRA, MVT::v16i16, { 2, 4, 2, 3 } }, // psraw.

    { ISD::SHL, MVT::v4i32, { 1, 2, 1, 2 } }, // pslld
    { ISD::SRL, MVT::v4i32, { 1, 2, 1, 2 } }, // psrld
    { ISD::SRA, MVT::v4i32, { 1, 2, 1, 2 } }, // psrad
    { ISD::SHL, MVT::v8i32, { 2, 4, 2, 3 } }, // pslld
    { ISD::SRL, MVT::v8i32, { 2, 4, 2, 3 } }, // psrld
    { ISD::SRA, MVT::v8i32, { 2, 4, 2, 3 } }, // psrad

    { ISD::SHL, MVT::v2i64, { 1, 2, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v2i64, { 1, 2, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v2i64, { 2, 4, 5, 7 } }, // 2 x psrad + shuffle.
    { ISD::SHL, MVT::v4i64, { 2, 4, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v4i64, { 2, 4, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v4i64, { 4, 6, 5, 9 } }, // 2 x psrad + shuffle.
  };

  if (ST->hasAVX2() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVXUniformCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 4, 4, 6, 8 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 4, 8, 5, 8 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 6, 6, 9,13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 7, 8,11,14 } }, // psllw + pand + split.
    { ISD::SRL, MVT::v32i8, { 7, 9,10,14 } }, // psrlw + pand + split.
    { ISD::SRA, MVT::v32i8, { 10,11,16,21 } }, // psrlw, pand, pxor, psubb + split.

    { ISD::SHL, MVT::v8i16, { 1, 3, 1, 2 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 1, 3, 1, 2 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 1, 3, 1, 2 } }, // psraw.
    { ISD::SHL, MVT::v16i16, { 3, 7, 5, 7 } }, // psllw + split.
    { ISD::SRL, MVT::v16i16, { 3, 7, 5, 7 } }, // psrlw + split.
    { ISD::SRA, MVT::v16i16, { 3, 7, 5, 7 } }, // psraw + split.

    { ISD::SHL, MVT::v4i32, { 1, 3, 1, 2 } }, // pslld.
    { ISD::SRL, MVT::v4i32, { 1, 3, 1, 2 } }, // psrld.
    { ISD::SRA, MVT::v4i32, { 1, 3, 1, 2 } }, // psrad.
    { ISD::SHL, MVT::v8i32, { 3, 7, 5, 7 } }, // pslld + split.
    { ISD::SRL, MVT::v8i32, { 3, 7, 5, 7 } }, // psrld + split.
    { ISD::SRA, MVT::v8i32, { 3, 7, 5, 7 } }, // psrad + split.

    { ISD::SHL, MVT::v2i64, { 1, 3, 1, 2 } }, // psllq.
    { ISD::SRL, MVT::v2i64, { 1, 3, 1, 2 } }, // psrlq.
    { ISD::SRA, MVT::v2i64, { 3, 4, 5, 7 } }, // 2 x psrad + shuffle.
    { ISD::SHL, MVT::v4i64, { 3, 7, 4, 6 } }, // psllq + split.
    { ISD::SRL, MVT::v4i64, { 3, 7, 4, 6 } }, // psrlq + split.
    { ISD::SRA, MVT::v4i64, { 6, 7,10,13 } }, // 2 x (2 x psrad + shuffle) + split.
  };

  // XOP has faster vXi8 shifts.
  if (ST->hasAVX() && Op2Info.isUniform() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(AVXUniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i8, { 9, 10, 6, 9 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 9, 13, 5, 9 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 11, 15, 9,13 } }, // pcmpgtb sequence.

    { ISD::SHL, MVT::v8i16, { 2, 2, 1, 2 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 2, 2, 1, 2 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 2, 2, 1, 2 } }, // psraw.

    { ISD::SHL, MVT::v4i32, { 2, 2, 1, 2 } }, // pslld
    { ISD::SRL, MVT::v4i32, { 2, 2, 1, 2 } }, // psrld.
    { ISD::SRA, MVT::v4i32, { 2, 2, 1, 2 } }, // psrad.

    { ISD::SHL, MVT::v2i64, { 2, 2, 1, 2 } }, // psllq.
    { ISD::SRL, MVT::v2i64, { 2, 2, 1, 2 } }, // psrlq.
    { ISD::SRA, MVT::v2i64, { 5, 9, 5, 7 } }, // 2*psrlq + xor + sub.
  };

  if (ST->hasSSE2() && Op2Info.isUniform() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512DQCostTable[] = {
    { ISD::MUL, MVT::v2i64, { 2, 15, 1, 3 } }, // pmullq
    { ISD::MUL, MVT::v4i64, { 2, 15, 1, 3 } }, // pmullq
    { ISD::MUL, MVT::v8i64, { 3, 15, 1, 3 } }  // pmullq
  };

  // Look for AVX512DQ lowering tricks for custom cases.
  if (ST->hasDQI())
    if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsravw/pack sequence.
    { ISD::SHL, MVT::v32i8, { 4, 23,11,16 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v32i8, { 4, 30,12,18 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v32i8, { 6, 13,24,30 } }, // extend/vpsravw/pack sequence.
    { ISD::SHL, MVT::v64i8, { 6, 19,13,15 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v64i8, { 7, 27,15,18 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v64i8, { 15, 15,30,30 } }, // extend/vpsravw/pack sequence.

    { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsravw
    { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsravw
    { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsravw

    { ISD::ADD, MVT::v64i8, { 1, 1, 1, 1 } }, // paddb
    { ISD::ADD, MVT::v32i16, { 1, 1, 1, 1 } }, // paddw

    { ISD::ADD, MVT::v32i8, { 1, 1, 1, 1 } }, // paddb
    { ISD::ADD, MVT::v16i16, { 1, 1, 1, 1 } }, // paddw
    { ISD::ADD, MVT::v8i32, { 1, 1, 1, 1 } }, // paddd
    { ISD::ADD, MVT::v4i64, { 1, 1, 1, 1 } }, // paddq

    { ISD::SUB, MVT::v64i8, { 1, 1, 1, 1 } }, // psubb
    { ISD::SUB, MVT::v32i16, { 1, 1, 1, 1 } }, // psubw

    { ISD::MUL, MVT::v16i8, { 4, 12, 4, 5 } }, // extend/pmullw/trunc
    { ISD::MUL, MVT::v32i8, { 3, 10, 7,10 } }, // pmaddubsw
    { ISD::MUL, MVT::v64i8, { 3, 11, 7,10 } }, // pmaddubsw
    { ISD::MUL, MVT::v32i16, { 1, 5, 1, 1 } }, // pmullw

    { ISD::SUB, MVT::v32i8, { 1, 1, 1, 1 } }, // psubb
    { ISD::SUB, MVT::v16i16, { 1, 1, 1, 1 } }, // psubw
    { ISD::SUB, MVT::v8i32, { 1, 1, 1, 1 } }, // psubd
    { ISD::SUB, MVT::v4i64, { 1, 1, 1, 1 } }, // psubq
  };

  // Look for AVX512BW lowering tricks for custom cases.
  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512CostTable[] = {
    { ISD::SHL, MVT::v64i8, { 15, 19,27,33 } }, // vpblendv+split sequence.
    { ISD::SRL, MVT::v64i8, { 15, 19,30,36 } }, // vpblendv+split sequence.
    { ISD::SRA, MVT::v64i8, { 37, 37,51,63 } }, // vpblendv+split sequence.

    { ISD::SHL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsllvd/pack sequence.
    { ISD::SRL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence.
    { ISD::SRA, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsravd/pack sequence.

    { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } },

    { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } },

    { ISD::ADD, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*paddb + split
    { ISD::ADD, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*paddw + split

    { ISD::SUB, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*psubb + split
    { ISD::SUB, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*psubw + split

    { ISD::AND, MVT::v32i8, { 1, 1, 1, 1 } },
    { ISD::AND, MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::AND, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::AND, MVT::v4i64, { 1, 1, 1, 1 } },

    { ISD::OR, MVT::v32i8, { 1, 1, 1, 1 } },
    { ISD::OR, MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::OR, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::OR, MVT::v4i64, { 1, 1, 1, 1 } },

    { ISD::XOR, MVT::v32i8, { 1, 1, 1, 1 } },
    { ISD::XOR, MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::XOR, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::XOR, MVT::v4i64, { 1, 1, 1, 1 } },

    { ISD::MUL, MVT::v16i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL, MVT::v8i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL, MVT::v4i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL, MVT::v8i64, { 6, 9, 8, 8 } }, // 3*pmuludq/3*shift/2*add
    { ISD::MUL, MVT::i64, { 1 } }, // Skylake from http://www.agner.org/

    { X86ISD::PMULUDQ, MVT::v8i64, { 1, 5, 1, 1 } },

    { ISD::FNEG, MVT::v8f64, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/

    { ISD::FDIV, MVT::f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64, { 8, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v8f64, { 16, 23, 1, 3 } }, // Skylake from http://www.agner.org/

    { ISD::FNEG, MVT::v16f32, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v4f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/

    { ISD::FDIV, MVT::f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32, { 5, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v16f32, { 10, 18, 1, 3 } }, // Skylake from http://www.agner.org/
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2ShiftCostTable[] = {
    // Shifts on vXi64/vXi32 on AVX2 are legal even though we declare them as
    // custom, in order to detect the cases where the shift amount is a scalar.
    { ISD::SHL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsravd (Haswell from agner.org)
    { ISD::SHL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsravd (Haswell from agner.org)
    { ISD::SHL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsrlvq (Haswell from agner.org)
    { ISD::SHL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsrlvq (Haswell from agner.org)
  };

  if (ST->hasAVX512()) {
    if (ISD == ISD::SHL && LT.second == MVT::v32i16 && Op2Info.isConstant())
      // On AVX512, a packed v32i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());
  }

  // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
  if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
    if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
        Op2Info.isConstant())
      // On AVX2, a packed v16i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());

    if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
  }
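
  // E.g. shl <16 x i16> %x, <i16 1, i16 2, ...> is costed (and lowered) as
  // mul <16 x i16> %x, <i16 2, i16 4, ...>.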

  static const CostKindTblEntry XOPShiftCostTable[] = {
    // 128-bit shifts take 1cy, but right shifts require negation beforehand.
    { ISD::SHL, MVT::v16i8, { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v16i8, { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v16i8, { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v8i16, { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v8i16, { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v8i16, { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v4i32, { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v4i32, { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v4i32, { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v2i64, { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v2i64, { 2, 3, 1, 1 } },
    // 256-bit shifts require splitting if AVX2 didn't catch them above.
    { ISD::SHL, MVT::v32i8, { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v32i8, { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v32i8, { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v16i16, { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v16i16, { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v16i16, { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v8i32, { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v8i32, { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v8i32, { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v4i64, { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v4i64, { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v4i64, { 6, 7, 5, 6 } },
  };

  // Look for XOP lowering tricks.
  if (ST->hasXOP()) {
    // If the right shift is constant then we'll fold the negation so
    // it's as cheap as a left shift.
    int ShiftISD = ISD;
    if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) && Op2Info.isConstant())
      ShiftISD = ISD::SHL;
    if (const auto *Entry =
            CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
  }
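
  // XOP's variable shifts (VPSHA*/VPSHL*) shift left for positive amounts and
  // right for negative ones, so a right shift by a constant build_vector folds
  // the negation into the constant and is costed like a left shift.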

  if (ISD == ISD::SHL && !Op2Info.isUniform() && Op2Info.isConstant()) {
    MVT VT = LT.second;
    // Vector shift left by a non-uniform constant can be lowered
    // into a vector multiply.
    if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
        ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
      ISD = ISD::MUL;
  }
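
  // E.g. with SSE2, shl <4 x i32> %x, <i32 1, i32 2, i32 3, i32 4> is costed
  // as mul <4 x i32> %x, <i32 2, i32 4, i32 8, i32 16>.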

  static const CostKindTblEntry GLMCostTable[] = {
    { ISD::FDIV, MVT::f32, { 18, 19, 1, 1 } }, // divss
    { ISD::FDIV, MVT::v4f32, { 35, 36, 1, 1 } }, // divps
    { ISD::FDIV, MVT::f64, { 33, 34, 1, 1 } }, // divsd
    { ISD::FDIV, MVT::v2f64, { 65, 66, 1, 1 } }, // divpd
  };

  if (ST->useGLMDivSqrtCosts())
    if (const auto *Entry = CostTableLookup(GLMCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SLMCostTable[] = {
    { ISD::MUL, MVT::v4i32, { 11, 11, 1, 7 } }, // pmulld
    { ISD::MUL, MVT::v8i16, { 2, 5, 1, 1 } }, // pmullw
    { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // mulsd
    { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // mulss
    { ISD::FMUL, MVT::v2f64, { 4, 7, 1, 1 } }, // mulpd
    { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // mulps
    { ISD::FDIV, MVT::f32, { 17, 19, 1, 1 } }, // divss
    { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 6 } }, // divps
    { ISD::FDIV, MVT::f64, { 32, 34, 1, 1 } }, // divsd
    { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 6 } }, // divpd
    { ISD::FADD, MVT::v2f64, { 2, 4, 1, 1 } }, // addpd
    { ISD::FSUB, MVT::v2f64, { 2, 4, 1, 1 } }, // subpd
    // v2i64/v4i64 mul is custom lowered as a series of long
    // multiplies(3), shifts(3) and adds(2);
    // SLM pmuludq throughput is 2 and addq throughput is 4, thus:
    // 3*2 (pmuludq throughput) + 3*1 (shift throughput) +
    // 2*4 (addq throughput) = 17
    { ISD::MUL, MVT::v2i64, { 17, 22, 9, 9 } },
    // SLM addq/subq throughput is 4.
    { ISD::ADD, MVT::v2i64, { 4, 2, 1, 2 } },
    { ISD::SUB, MVT::v2i64, { 4, 2, 1, 2 } },
  };

  if (ST->useSLMArithCosts())
    if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2CostTable[] = {
    { ISD::SHL, MVT::v16i8, { 6, 21,11,16 } }, // vpblendvb sequence.
    { ISD::SHL, MVT::v32i8, { 6, 23,11,22 } }, // vpblendvb sequence.
    { ISD::SHL, MVT::v8i16, { 5, 18, 5,10 } }, // extend/vpsllvd/pack sequence.
    { ISD::SHL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsllvd/pack sequence.

    { ISD::SRL, MVT::v16i8, { 6, 27,12,18 } }, // vpblendvb sequence.
    { ISD::SRL, MVT::v32i8, { 8, 30,12,24 } }, // vpblendvb sequence.
    { ISD::SRL, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsrlvd/pack sequence.
    { ISD::SRL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsrlvd/pack sequence.

    { ISD::SRA, MVT::v16i8, { 17, 17,24,30 } }, // vpblendvb sequence.
    { ISD::SRA, MVT::v32i8, { 18, 20,24,43 } }, // vpblendvb sequence.
    { ISD::SRA, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsravd/pack sequence.
    { ISD::SRA, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsravd/pack sequence.
    { ISD::SRA, MVT::v2i64, { 4, 5, 5, 5 } }, // srl/xor/sub sequence.
    { ISD::SRA, MVT::v4i64, { 8, 8, 5, 9 } }, // srl/xor/sub sequence.

    { ISD::SUB, MVT::v32i8, { 1, 1, 1, 2 } }, // psubb
    { ISD::ADD, MVT::v32i8, { 1, 1, 1, 2 } }, // paddb
    { ISD::SUB, MVT::v16i16, { 1, 1, 1, 2 } }, // psubw
    { ISD::ADD, MVT::v16i16, { 1, 1, 1, 2 } }, // paddw
    { ISD::SUB, MVT::v8i32, { 1, 1, 1, 2 } }, // psubd
    { ISD::ADD, MVT::v8i32, { 1, 1, 1, 2 } }, // paddd
    { ISD::SUB, MVT::v4i64, { 1, 1, 1, 2 } }, // psubq
    { ISD::ADD, MVT::v4i64, { 1, 1, 1, 2 } }, // paddq

    { ISD::MUL, MVT::v16i8, { 5, 18, 6,12 } }, // extend/pmullw/pack
    { ISD::MUL, MVT::v32i8, { 4, 8, 8,16 } }, // pmaddubsw
    { ISD::MUL, MVT::v16i16, { 2, 5, 1, 2 } }, // pmullw
    { ISD::MUL, MVT::v8i32, { 4, 10, 1, 2 } }, // pmulld
    { ISD::MUL, MVT::v4i32, { 2, 10, 1, 2 } }, // pmulld
    { ISD::MUL, MVT::v4i64, { 6, 10, 8,13 } }, // 3*pmuludq/3*shift/2*add
    { ISD::MUL, MVT::v2i64, { 6, 10, 8, 8 } }, // 3*pmuludq/3*shift/2*add

    { X86ISD::PMULUDQ, MVT::v4i64, { 1, 5, 1, 1 } },

    { ISD::FNEG, MVT::v4f64, { 1, 1, 1, 2 } }, // vxorpd
    { ISD::FNEG, MVT::v8f32, { 1, 1, 1, 2 } }, // vxorps

    { ISD::FADD, MVT::f64, { 1, 4, 1, 1 } }, // vaddsd
    { ISD::FADD, MVT::f32, { 1, 4, 1, 1 } }, // vaddss
    { ISD::FADD, MVT::v2f64, { 1, 4, 1, 1 } }, // vaddpd
    { ISD::FADD, MVT::v4f32, { 1, 4, 1, 1 } }, // vaddps
    { ISD::FADD, MVT::v4f64, { 1, 4, 1, 2 } }, // vaddpd
    { ISD::FADD, MVT::v8f32, { 1, 4, 1, 2 } }, // vaddps

    { ISD::FSUB, MVT::f64, { 1, 4, 1, 1 } }, // vsubsd
    { ISD::FSUB, MVT::f32, { 1, 4, 1, 1 } }, // vsubss
    { ISD::FSUB, MVT::v2f64, { 1, 4, 1, 1 } }, // vsubpd
    { ISD::FSUB, MVT::v4f32, { 1, 4, 1, 1 } }, // vsubps
    { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 2 } }, // vsubpd
    { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 2 } }, // vsubps

    { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // vmulsd
    { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // vmulss
    { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // vmulpd
    { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // vmulps
    { ISD::FMUL, MVT::v4f64, { 1, 5, 1, 2 } }, // vmulpd
    { ISD::FMUL, MVT::v8f32, { 1, 5, 1, 2 } }, // vmulps

    { ISD::FDIV, MVT::f32, { 7, 13, 1, 1 } }, // vdivss
    { ISD::FDIV, MVT::v4f32, { 7, 13, 1, 1 } }, // vdivps
    { ISD::FDIV, MVT::v8f32, { 14, 21, 1, 3 } }, // vdivps
    { ISD::FDIV, MVT::f64, { 14, 20, 1, 1 } }, // vdivsd
    { ISD::FDIV, MVT::v2f64, { 14, 20, 1, 1 } }, // vdivpd
    { ISD::FDIV, MVT::v4f64, { 28, 35, 1, 3 } }, // vdivpd
  };

  // Look for AVX2 lowering tricks for custom cases.
  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
1189
1190 static const CostKindTblEntry AVX1CostTable[] = {
1191 // We don't have to scalarize unsupported ops. We can issue two half-sized
1192 // operations and we only need to extract the upper YMM half.
1193 // Two ops + 1 extract + 1 insert = 4.
1194 { ISD::MUL, MVT::v32i8, { 10, 11, 18, 19 } }, // pmaddubsw + split
1195 { ISD::MUL, MVT::v16i8, { 5, 6, 8, 12 } }, // 2*pmaddubsw/3*and/psllw/or
1196 { ISD::MUL, MVT::v16i16, { 4, 8, 5, 6 } }, // pmullw + split
1197 { ISD::MUL, MVT::v8i32, { 5, 8, 5, 10 } }, // pmulld + split
1198 { ISD::MUL, MVT::v4i32, { 2, 5, 1, 3 } }, // pmulld
1199 { ISD::MUL, MVT::v4i64, { 12, 15, 19, 20 } },
1200
1201 { ISD::AND, MVT::v32i8, { 1, 1, 1, 2 } }, // vandps
1202 { ISD::AND, MVT::v16i16, { 1, 1, 1, 2 } }, // vandps
1203 { ISD::AND, MVT::v8i32, { 1, 1, 1, 2 } }, // vandps
1204 { ISD::AND, MVT::v4i64, { 1, 1, 1, 2 } }, // vandps
1205
1206 { ISD::OR, MVT::v32i8, { 1, 1, 1, 2 } }, // vorps
1207 { ISD::OR, MVT::v16i16, { 1, 1, 1, 2 } }, // vorps
1208 { ISD::OR, MVT::v8i32, { 1, 1, 1, 2 } }, // vorps
1209 { ISD::OR, MVT::v4i64, { 1, 1, 1, 2 } }, // vorps
1210
1211 { ISD::XOR, MVT::v32i8, { 1, 1, 1, 2 } }, // vxorps
1212 { ISD::XOR, MVT::v16i16, { 1, 1, 1, 2 } }, // vxorps
1213 { ISD::XOR, MVT::v8i32, { 1, 1, 1, 2 } }, // vxorps
1214 { ISD::XOR, MVT::v4i64, { 1, 1, 1, 2 } }, // vxorps
1215
1216 { ISD::SUB, MVT::v32i8, { 4, 2, 5, 6 } }, // psubb + split
1217 { ISD::ADD, MVT::v32i8, { 4, 2, 5, 6 } }, // paddb + split
1218 { ISD::SUB, MVT::v16i16, { 4, 2, 5, 6 } }, // psubw + split
1219 { ISD::ADD, MVT::v16i16, { 4, 2, 5, 6 } }, // paddw + split
1220 { ISD::SUB, MVT::v8i32, { 4, 2, 5, 6 } }, // psubd + split
1221 { ISD::ADD, MVT::v8i32, { 4, 2, 5, 6 } }, // paddd + split
1222 { ISD::SUB, MVT::v4i64, { 4, 2, 5, 6 } }, // psubq + split
1223 { ISD::ADD, MVT::v4i64, { 4, 2, 5, 6 } }, // paddq + split
1224 { ISD::SUB, MVT::v2i64, { 1, 1, 1, 1 } }, // psubq
1225 { ISD::ADD, MVT::v2i64, { 1, 1, 1, 1 } }, // paddq
1226
1227 { ISD::SHL, MVT::v16i8, { 10, 21,11,17 } }, // pblendvb sequence.
1228 { ISD::SHL, MVT::v32i8, { 22, 22,27,40 } }, // pblendvb sequence + split.
1229 { ISD::SHL, MVT::v8i16, { 6, 9,11,11 } }, // pblendvb sequence.
1230 { ISD::SHL, MVT::v16i16, { 13, 16,24,25 } }, // pblendvb sequence + split.
1231 { ISD::SHL, MVT::v4i32, { 3, 11, 4, 6 } }, // pslld/paddd/cvttps2dq/pmulld
1232 { ISD::SHL, MVT::v8i32, { 9, 11,12,17 } }, // pslld/paddd/cvttps2dq/pmulld + split
1233 { ISD::SHL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
1234 { ISD::SHL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.
1235
1236 { ISD::SRL, MVT::v16i8, { 11, 27,12,18 } }, // pblendvb sequence.
1237 { ISD::SRL, MVT::v32i8, { 23, 23,30,43 } }, // pblendvb sequence + split.
1238 { ISD::SRL, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
1239 { ISD::SRL, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
1240 { ISD::SRL, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
1241 { ISD::SRL, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
1242 { ISD::SRL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
1243 { ISD::SRL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.
1244
1245 { ISD::SRA, MVT::v16i8, { 21, 22,24,36 } }, // pblendvb sequence.
1246 { ISD::SRA, MVT::v32i8, { 44, 45,51,76 } }, // pblendvb sequence + split.
1247 { ISD::SRA, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
1248 { ISD::SRA, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
1249 { ISD::SRA, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
1250 { ISD::SRA, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
1251 { ISD::SRA, MVT::v2i64, { 5, 6,10,14 } }, // Shift each lane + blend.
1252 { ISD::SRA, MVT::v4i64, { 12, 12,22,30 } }, // Shift each lane + blend + split.
1253
1254 { ISD::FNEG, MVT::v4f64, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
1255 { ISD::FNEG, MVT::v8f32, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
1256
1257 { ISD::FADD, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1258 { ISD::FADD, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1259 { ISD::FADD, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1260 { ISD::FADD, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1261 { ISD::FADD, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1262 { ISD::FADD, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1263
1264 { ISD::FSUB, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1265 { ISD::FSUB, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1266 { ISD::FSUB, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1267 { ISD::FSUB, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1268 { ISD::FSUB, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1269 { ISD::FSUB, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1270
1271 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1272 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1273 { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1274 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1275 { ISD::FMUL, MVT::v4f64, { 4, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
1276 { ISD::FMUL, MVT::v8f32, { 2, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
1277
1278 { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
1279 { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
1280 { ISD::FDIV, MVT::v8f32, { 28, 29, 1, 3 } }, // SNB from http://www.agner.org/
1281 { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
1282 { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
1283 { ISD::FDIV, MVT::v4f64, { 44, 45, 1, 3 } }, // SNB from http://www.agner.org/
1284 };
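// A worked reading of the "+ split" rows above: AVX1 has no 256-bit integer
// ALU, so e.g. the { ISD::ADD, MVT::v8i32, { 4, 2, 5, 6 } } entry prices
// roughly two 128-bit paddd halves plus vextractf128/vinsertf128 glue,
// against { 1, 1, 1, 1 } for the single-instruction 128-bit forms.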
1285
1286 if (ST->hasAVX())
1287 if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
1288 if (auto KindCost = Entry->Cost[CostKind])
1289 return LT.first * *KindCost;
1290
1291 static const CostKindTblEntry SSE42CostTable[] = {
1292 { ISD::FADD, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1293 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1294 { ISD::FADD, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1295 { ISD::FADD, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1296
1297 { ISD::FSUB, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1298 { ISD::FSUB, MVT::f32 , { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1299 { ISD::FSUB, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1300 { ISD::FSUB, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1301
1302 { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1303 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1304 { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1305 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1306
1307 { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1308 { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1309 { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1310 { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1311
1312 { ISD::MUL, MVT::v2i64, { 6, 10,10,10 } } // 3*pmuludq/3*shift/2*add
1313 };
1314
1315 if (ST->hasSSE42())
1316 if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
1317 if (auto KindCost = Entry->Cost[CostKind])
1318 return LT.first * *KindCost;
1319
1320 static const CostKindTblEntry SSE41CostTable[] = {
1321 { ISD::SHL, MVT::v16i8, { 15, 24,17,22 } }, // pblendvb sequence.
1322 { ISD::SHL, MVT::v8i16, { 11, 14,11,11 } }, // pblendvb sequence.
1323 { ISD::SHL, MVT::v4i32, { 14, 20, 4,10 } }, // pslld/paddd/cvttps2dq/pmulld
1324
1325 { ISD::SRL, MVT::v16i8, { 16, 27,18,24 } }, // pblendvb sequence.
1326 { ISD::SRL, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1327 { ISD::SRL, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1328 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1329
1330 { ISD::SRA, MVT::v16i8, { 38, 41,30,36 } }, // pblendvb sequence.
1331 { ISD::SRA, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1332 { ISD::SRA, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1333 { ISD::SRA, MVT::v2i64, { 8, 17, 5, 7 } }, // splat+shuffle sequence.
1334
1335 { ISD::MUL, MVT::v4i32, { 2, 11, 1, 1 } } // pmulld (Nehalem from agner.org)
1336 };
1337
1338 if (ST->hasSSE41())
1339 if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
1340 if (auto KindCost = Entry->Cost[CostKind])
1341 return LT.first * *KindCost;
1342
1343 static const CostKindTblEntry SSSE3CostTable[] = {
1344 { ISD::MUL, MVT::v16i8, { 5, 18,10,12 } }, // 2*pmaddubsw/3*and/psllw/or
1345 };
1346
1347 if (ST->hasSSSE3())
1348 if (const auto *Entry = CostTableLookup(SSSE3CostTable, ISD, LT.second))
1349 if (auto KindCost = Entry->Cost[CostKind])
1350 return LT.first * *KindCost;
1351
1352 static const CostKindTblEntry SSE2CostTable[] = {
1353 // We don't correctly identify costs of casts because they are marked as
1354 // custom.
1355 { ISD::SHL, MVT::v16i8, { 13, 21,26,28 } }, // cmpgtb sequence.
1356 { ISD::SHL, MVT::v8i16, { 24, 27,16,20 } }, // cmpgtw sequence.
1357 { ISD::SHL, MVT::v4i32, { 17, 19,10,12 } }, // pslld/paddd/cvttps2dq/pmuludq.
1358 { ISD::SHL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1359
1360 { ISD::SRL, MVT::v16i8, { 14, 28,27,30 } }, // cmpgtb sequence.
1361 { ISD::SRL, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1362 { ISD::SRL, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1363 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1364
1365 { ISD::SRA, MVT::v16i8, { 27, 30,54,54 } }, // unpacked cmpgtb sequence.
1366 { ISD::SRA, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1367 { ISD::SRA, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1368 { ISD::SRA, MVT::v2i64, { 8, 11,12,16 } }, // srl/xor/sub splat+shuffle sequence.
1369
1370 { ISD::AND, MVT::v16i8, { 1, 1, 1, 1 } }, // pand
1371 { ISD::AND, MVT::v8i16, { 1, 1, 1, 1 } }, // pand
1372 { ISD::AND, MVT::v4i32, { 1, 1, 1, 1 } }, // pand
1373 { ISD::AND, MVT::v2i64, { 1, 1, 1, 1 } }, // pand
1374
1375 { ISD::OR, MVT::v16i8, { 1, 1, 1, 1 } }, // por
1376 { ISD::OR, MVT::v8i16, { 1, 1, 1, 1 } }, // por
1377 { ISD::OR, MVT::v4i32, { 1, 1, 1, 1 } }, // por
1378 { ISD::OR, MVT::v2i64, { 1, 1, 1, 1 } }, // por
1379
1380 { ISD::XOR, MVT::v16i8, { 1, 1, 1, 1 } }, // pxor
1381 { ISD::XOR, MVT::v8i16, { 1, 1, 1, 1 } }, // pxor
1382 { ISD::XOR, MVT::v4i32, { 1, 1, 1, 1 } }, // pxor
1383 { ISD::XOR, MVT::v2i64, { 1, 1, 1, 1 } }, // pxor
1384
1385 { ISD::ADD, MVT::v2i64, { 1, 2, 1, 2 } }, // paddq
1386 { ISD::SUB, MVT::v2i64, { 1, 2, 1, 2 } }, // psubq
1387
1388 { ISD::MUL, MVT::v16i8, { 6, 18,12,12 } }, // 2*unpack/2*pmullw/2*and/pack
1389 { ISD::MUL, MVT::v8i16, { 1, 5, 1, 1 } }, // pmullw
1390 { ISD::MUL, MVT::v4i32, { 6, 8, 7, 7 } }, // 3*pmuludq/4*shuffle
1391 { ISD::MUL, MVT::v2i64, { 7, 10,10,10 } }, // 3*pmuludq/3*shift/2*add
1392
1393 { X86ISD::PMULUDQ, MVT::v2i64, { 1, 5, 1, 1 } },
1394
1395 { ISD::FDIV, MVT::f32, { 23, 23, 1, 1 } }, // Pentium IV from http://www.agner.org/
1396 { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 1 } }, // Pentium IV from http://www.agner.org/
1397 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // Pentium IV from http://www.agner.org/
1398 { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 1 } }, // Pentium IV from http://www.agner.org/
1399
1400 { ISD::FNEG, MVT::f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1401 { ISD::FNEG, MVT::f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1402 { ISD::FNEG, MVT::v4f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1403 { ISD::FNEG, MVT::v2f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1404
1405 { ISD::FADD, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1406 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1407 { ISD::FADD, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1408
1409 { ISD::FSUB, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1410 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1411 { ISD::FSUB, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1412
1413 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1414 { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1415 };
1416
1417 if (ST->hasSSE2())
1418 if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
1419 if (auto KindCost = Entry->Cost[CostKind])
1420 return LT.first * *KindCost;
1421
1422 static const CostKindTblEntry SSE1CostTable[] = {
1423 { ISD::FDIV, MVT::f32, { 17, 18, 1, 1 } }, // Pentium III from http://www.agner.org/
1424 { ISD::FDIV, MVT::v4f32, { 34, 48, 1, 1 } }, // Pentium III from http://www.agner.org/
1425
1426 { ISD::FNEG, MVT::f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1427 { ISD::FNEG, MVT::v4f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1428
1429 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1430 { ISD::FADD, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1431
1432 { ISD::FSUB, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1433 { ISD::FSUB, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1434
1435 { ISD::FMUL, MVT::f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1436 { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1437 };
1438
1439 if (ST->hasSSE1())
1440 if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
1441 if (auto KindCost = Entry->Cost[CostKind])
1442 return LT.first * *KindCost;
1443
1444 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
1445 { ISD::ADD, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1446 { ISD::SUB, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1447 { ISD::MUL, MVT::i64, { 2, 6, 1, 2 } },
1448 };
1449
1450 if (ST->is64Bit())
1451 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
1452 if (auto KindCost = Entry->Cost[CostKind])
1453 return LT.first * *KindCost;
1454
1455 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
1456 { ISD::ADD, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1457 { ISD::ADD, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1458 { ISD::ADD, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1459
1460 { ISD::SUB, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1461 { ISD::SUB, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1462 { ISD::SUB, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1463
1464 { ISD::MUL, MVT::i8, { 3, 4, 1, 1 } },
1465 { ISD::MUL, MVT::i16, { 2, 4, 1, 1 } },
1466 { ISD::MUL, MVT::i32, { 1, 4, 1, 1 } },
1467
1468 { ISD::FNEG, MVT::f64, { 2, 2, 1, 3 } }, // (x87)
1469 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1470 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1471 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // (x87)
1472 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // (x87)
1473 };
1474
1475 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
1476 if (auto KindCost = Entry->Cost[CostKind])
1477 return LT.first * *KindCost;
1478
1479 // It is not a good idea to vectorize division. We have to scalarize it and
1480 // in the process we will often end up having to spill regular
1481 // registers. The overhead of division is going to dominate most kernels
1482 // anyway, so try hard to prevent vectorization of division - it is
1483 // generally a bad idea. Assume somewhat arbitrarily that we have to be able
1484 // to hide "20 cycles" for each lane.
1485 if (CostKind == TTI::TCK_RecipThroughput && LT.second.isVector() &&
1486 (ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
1487 ISD == ISD::UREM)) {
1488 InstructionCost ScalarCost =
1489 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
1490 Op1Info.getNoProps(), Op2Info.getNoProps());
1491 return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
1492 }
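// Worked example of the heuristic above: for sdiv on <4 x i32>, when v4i32
// is legal (LT.first == 1) and the scalar i32 sdiv cost is S, the returned
// reciprocal-throughput cost is 20 * 1 * 4 * S - large enough that the
// vectorizers will almost always keep division scalar.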
1493
1494 // Handle some basic single instruction code size cases.
1495 if (CostKind == TTI::TCK_CodeSize) {
1496 switch (ISD) {
1497 case ISD::FADD:
1498 case ISD::FSUB:
1499 case ISD::FMUL:
1500 case ISD::FDIV:
1501 case ISD::FNEG:
1502 case ISD::AND:
1503 case ISD::OR:
1504 case ISD::XOR:
1505 return LT.first;
1506 break;
1507 }
1508 }
1509
1510 // Fallback to the default implementation.
1511 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1512 Args, CxtI);
1513}
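// A minimal caller-side sketch of reaching the hook above through the
// generic TargetTransformInfo wrapper (costMul and its parameters are
// illustrative assumptions, not names from this file):
//
//   InstructionCost costMul(const TargetTransformInfo &TTI, Type *VecTy) {
//     // Dispatches to X86TTIImpl::getArithmeticInstrCost on x86 targets.
//     return TTI.getArithmeticInstrCost(
//         Instruction::Mul, VecTy, TargetTransformInfo::TCK_RecipThroughput);
//   }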
1514
1515InstructionCost
1516X86TTIImpl::getAltInstrCost(VectorType *VecTy, unsigned Opcode0,
1517 unsigned Opcode1, const SmallBitVector &OpcodeMask,
1518 TTI::TargetCostKind CostKind) const {
1519 if (isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask))
1520 return TTI::TCC_Basic;
1521 return InstructionCost::getInvalid();
1522}
1523
1524InstructionCost X86TTIImpl::getShuffleCost(
1525 TTI::ShuffleKind Kind, VectorType *BaseTp, ArrayRef<int> Mask,
1526 TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
1527 ArrayRef<const Value *> Args, const Instruction *CxtI) {
1528 // 64-bit packed float vectors (v2f32) are widened to type v4f32.
1529 // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
1530 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(BaseTp);
1531
1532 Kind = improveShuffleKindFromMask(Kind, Mask, BaseTp, Index, SubTp);
1533
1534 // Recognize a basic concat_vector shuffle.
1535 if (Kind == TTI::SK_PermuteTwoSrc &&
1536 Mask.size() == (2 * BaseTp->getElementCount().getKnownMinValue()) &&
1537 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
1538 return getShuffleCost(TTI::SK_InsertSubvector,
1539 VectorType::getDoubleElementsVectorType(BaseTp), Mask,
1540 CostKind, Mask.size() / 2, BaseTp);
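// For example, a two-source shuffle whose identity mask is twice as wide as
// each input is a pure concatenation and takes the path above:
//   %c = shufflevector <4 x i32> %a, <4 x i32> %b,
//        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>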
1541
1542 // Treat Transpose as 2-op shuffles - there's no difference in lowering.
1543 if (Kind == TTI::SK_Transpose)
1544 Kind = TTI::SK_PermuteTwoSrc;
1545
1546 if (Kind == TTI::SK_Broadcast) {
1547 // For Broadcasts we are splatting the first element from the first input
1548 // register, so we only need to reference that input, and all the output
1549 // registers are the same.
1550 LT.first = 1;
1551
1552 // If we're broadcasting a load then AVX/AVX2 can do this for free.
1553 using namespace PatternMatch;
1554 if (!Args.empty() && match(Args[0], m_OneUse(m_Load(m_Value()))) &&
1555 (ST->hasAVX2() ||
1556 (ST->hasAVX() && LT.second.getScalarSizeInBits() >= 32)))
1557 return TTI::TCC_Free;
1558 }
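// e.g. splatting a single-use scalar load: with AVX2 (or AVX for elements of
// 32 bits or wider) the load folds into the broadcast itself, as in
// "vbroadcastss (%rdi), %ymm0", so the shuffle is reported as TCC_Free.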
1559
1560 // Treat <X x bfloat> shuffles as <X x half>.
1561 if (LT.second.isVector() && LT.second.getScalarType() == MVT::bf16)
1562 LT.second = LT.second.changeVectorElementType(MVT::f16);
1563
1564 // Subvector extractions are free if they start at the beginning of a
1565 // vector and cheap if the subvectors are aligned.
1566 if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
1567 int NumElts = LT.second.getVectorNumElements();
1568 if ((Index % NumElts) == 0)
1569 return 0;
1570 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1571 if (SubLT.second.isVector()) {
1572 int NumSubElts = SubLT.second.getVectorNumElements();
1573 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1574 return SubLT.first;
1575 // Handle some cases for widening legalization. For now we only handle
1576 // cases where the original subvector was naturally aligned and evenly
1577 // fit in its legalized subvector type.
1578 // FIXME: Remove some of the alignment restrictions.
1579 // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
1580 // vectors.
1581 int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
1582 if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
1583 (NumSubElts % OrigSubElts) == 0 &&
1584 LT.second.getVectorElementType() ==
1585 SubLT.second.getVectorElementType() &&
1586 LT.second.getVectorElementType().getSizeInBits() ==
1587 BaseTp->getElementType()->getPrimitiveSizeInBits()) {
1588 assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
1589 "Unexpected number of elements!");
1590 auto *VecTy = FixedVectorType::get(BaseTp->getElementType(),
1591 LT.second.getVectorNumElements());
1592 auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
1593 SubLT.second.getVectorNumElements());
1594 int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
1595 InstructionCost ExtractCost =
1596 getShuffleCost(TTI::SK_ExtractSubvector, VecTy, std::nullopt,
1597 CostKind, ExtractIndex, SubTy);
1598
1599 // If the original size is 32-bits or more, we can use pshufd. Otherwise
1600 // if we have SSSE3 we can use pshufb.
1601 if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
1602 return ExtractCost + 1; // pshufd or pshufb
1603
1604 assert(SubTp->getPrimitiveSizeInBits() == 16 &&
1605 "Unexpected vector size");
1606
1607 return ExtractCost + 2; // worst case pshufhw + pshufd
1608 }
1609 }
1610 // If the extract subvector is not optimal, treat it as single op shuffle.
1611 Kind = TTI::SK_PermuteSingleSrc;
1612 }
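// Worked example: extracting <4 x i32> from a legal <8 x i32> at index 0 is
// free (the subvector is just the low half of the register), while an
// extract at index 4 is lane-aligned and costs SubLT.first, i.e. a single
// vextractf128-style instruction.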
1613
1614 // Subvector insertions are cheap if the subvectors are aligned.
1615 // Note that in general, the insertion starting at the beginning of a vector
1616 // isn't free, because we need to preserve the rest of the wide vector.
1617 if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
1618 int NumElts = LT.second.getVectorNumElements();
1619 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1620 if (SubLT.second.isVector()) {
1621 int NumSubElts = SubLT.second.getVectorNumElements();
1622 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1623 return SubLT.first;
1624 }
1625
1626 // If the insertion isn't aligned, treat it like a 2-op shuffle.
1627 Kind = TTI::SK_PermuteTwoSrc;
1628 }
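// e.g. inserting <4 x float> into <8 x float> at index 4 writes exactly the
// upper 128-bit lane (vinsertf128) and costs SubLT.first, while an insert at
// index 2 is unaligned and falls through to the 2-op shuffle handling.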
1629
1630 // Handle some common (illegal) sub-vector types as they are often very cheap
1631 // to shuffle even on targets without PSHUFB.
1632 EVT VT = TLI->getValueType(DL, BaseTp);
1633 if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
1634 !ST->hasSSSE3()) {
1635 static const CostTblEntry SSE2SubVectorShuffleTbl[] = {
1636 {TTI::SK_Broadcast, MVT::v4i16, 1}, // pshuflw
1637 {TTI::SK_Broadcast, MVT::v2i16, 1}, // pshuflw
1638 {TTI::SK_Broadcast, MVT::v8i8, 2}, // punpck/pshuflw
1639 {TTI::SK_Broadcast, MVT::v4i8, 2}, // punpck/pshuflw
1640 {TTI::SK_Broadcast, MVT::v2i8, 1}, // punpck
1641
1642 {TTI::SK_Reverse, MVT::v4i16, 1}, // pshuflw
1643 {TTI::SK_Reverse, MVT::v2i16, 1}, // pshuflw
1644 {TTI::SK_Reverse, MVT::v4i8, 3}, // punpck/pshuflw/packus
1645 {TTI::SK_Reverse, MVT::v2i8, 1}, // punpck
1646
1647 {TTI::SK_Splice, MVT::v4i16, 2}, // punpck+psrldq
1648 {TTI::SK_Splice, MVT::v2i16, 2}, // punpck+psrldq
1649 {TTI::SK_Splice, MVT::v4i8, 2}, // punpck+psrldq
1650 {TTI::SK_Splice, MVT::v2i8, 2}, // punpck+psrldq
1651
1652 {TTI::SK_PermuteTwoSrc, MVT::v4i16, 2}, // punpck/pshuflw
1653 {TTI::SK_PermuteTwoSrc, MVT::v2i16, 2}, // punpck/pshuflw
1654 {TTI::SK_PermuteTwoSrc, MVT::v8i8, 7}, // punpck/pshuflw
1655 {TTI::SK_PermuteTwoSrc, MVT::v4i8, 4}, // punpck/pshuflw
1656 {TTI::SK_PermuteTwoSrc, MVT::v2i8, 2}, // punpck
1657
1658 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw
1659 {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw
1660 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 5}, // punpck/pshuflw
1661 {TTI::SK_PermuteSingleSrc, MVT::v4i8, 3}, // punpck/pshuflw
1662 {TTI::SK_PermuteSingleSrc, MVT::v2i8, 1}, // punpck
1663 };
1664
1665 if (ST->hasSSE2())
1666 if (const auto *Entry =
1667 CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
1668 return Entry->Cost;
1669 }
1670
1671 // We are going to permute multiple sources and the result will be in multiple
1672 // destinations. We provide an accurate cost only for splits where the
1673 // element type remains the same.
1674 if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
1675 MVT LegalVT = LT.second;
1676 if (LegalVT.isVector() &&
1677 LegalVT.getVectorElementType().getSizeInBits() ==
1678 BaseTp->getElementType()->getPrimitiveSizeInBits() &&
1679 LegalVT.getVectorNumElements() <
1680 cast<FixedVectorType>(BaseTp)->getNumElements()) {
1681 unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
1682 unsigned LegalVTSize = LegalVT.getStoreSize();
1683 // Number of source vectors after legalization:
1684 unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
1685 // Number of destination vectors after legalization:
1686 InstructionCost NumOfDests = LT.first;
1687
1688 auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
1689 LegalVT.getVectorNumElements());
1690
1691 if (!Mask.empty() && NumOfDests.isValid()) {
1692 // Try to perform better estimation of the permutation.
1693 // 1. Split the source/destination vectors into real registers.
1694 // 2. Do the mask analysis to identify which real registers are
1695 // permuted. If more than one source register is used to build a
1696 // destination register, the cost for this destination register
1697 // is (number_of_source_registers - 1) * Cost_PermuteTwoSrc. If only one
1698 // source register is used, build the mask and calculate the cost as the
1699 // cost of PermuteSingleSrc.
1700 // Also, for the single register permute we try to identify if the
1701 // destination register is just a copy of the source register or a copy
1702 // of the previous destination register (the cost is
1703 // TTI::TCC_Basic). If the source register is just reused, the cost for
1704 // this operation is 0.
1705 NumOfDests =
1706 getTypeLegalizationCost(
1707 FixedVectorType::get(BaseTp->getElementType(), Mask.size()))
1708 .first;
1709 unsigned E = *NumOfDests.getValue();
1710 unsigned NormalizedVF =
1711 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
1712 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
1713 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
1714 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
1715 copy(Mask, NormalizedMask.begin());
1716 unsigned PrevSrcReg = 0;
1717 ArrayRef<int> PrevRegMask;
1718 InstructionCost Cost = 0;
1719 processShuffleMasks(
1720 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
1721 [this, SingleOpTy, CostKind, &PrevSrcReg, &PrevRegMask,
1722 &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
1723 if (!ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size())) {
1724 // Check if the previous register can be just copied to the next
1725 // one.
1726 if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
1727 PrevRegMask != RegMask)
1728 Cost += getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
1729 RegMask, CostKind, 0, nullptr);
1730 else
1731 // Just a copy of previous destination register.
1732 Cost += TTI::TCC_Basic;
1733 return;
1734 }
1735 if (SrcReg != DestReg &&
1736 any_of(RegMask, [](int I) { return I != PoisonMaskElem; })) {
1737 // Just a copy of the source register.
1738 Cost += TTI::TCC_Basic;
1739 }
1740 PrevSrcReg = SrcReg;
1741 PrevRegMask = RegMask;
1742 },
1743 [this, SingleOpTy, CostKind, &Cost](ArrayRef<int> RegMask,
1744 unsigned /*Unused*/,
1745 unsigned /*Unused*/) {
1746 Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask,
1747 CostKind, 0, nullptr);
1748 });
1749 return Cost;
1750 }
1751
1752 InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
1753 return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
1754 std::nullopt, CostKind, 0, nullptr);
1755 }
1756
1757 return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
1758 }
1759
1760 // For 2-input shuffles, we must account for splitting the 2 inputs into many.
1761 if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
1762 // We assume that source and destination have the same vector type.
1763 InstructionCost NumOfDests = LT.first;
1764 InstructionCost NumOfShufflesPerDest = LT.first * 2 - 1;
1765 LT.first = NumOfDests * NumOfShufflesPerDest;
1766 }
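// Worked example of the scaling above: a two-source shuffle of <16 x i32> on
// AVX2 legalizes to two v8i32 registers (LT.first == 2), so each of the 2
// destinations may need up to 2 * 2 - 1 = 3 legal-type shuffles, and the
// matching table cost below is multiplied by 2 * 3 = 6.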
1767
1768 static const CostTblEntry AVX512VBMIShuffleTbl[] = {
1769 {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
1770 {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb
1771
1772 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
1773 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb
1774
1775 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b
1776 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b
1777 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 2} // vpermt2b
1778 };
1779
1780 if (ST->hasVBMI())
1781 if (const auto *Entry =
1782 CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
1783 return LT.first * Entry->Cost;
1784
1785 static const CostTblEntry AVX512BWShuffleTbl[] = {
1786 {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
1787 {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw
1788 {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
1789
1790 {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw
1791 {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw
1792 {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw
1793 {TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2
1794
1795 {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw
1796 {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw
1797 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw
1798 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw
1799 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16
1800
1801 {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w
1802 {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w
1803 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w
1804 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w
1805 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
1806
1807 {TTI::SK_Select, MVT::v32i16, 1}, // vblendmw
1808 {TTI::SK_Select, MVT::v64i8, 1}, // vblendmb
1809
1810 {TTI::SK_Splice, MVT::v32i16, 2}, // vshufi64x2 + palignr
1811 {TTI::SK_Splice, MVT::v32f16, 2}, // vshufi64x2 + palignr
1812 {TTI::SK_Splice, MVT::v64i8, 2}, // vshufi64x2 + palignr
1813 };
1814
1815 if (ST->hasBWI())
1816 if (const auto *Entry =
1817 CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
1818 return LT.first * Entry->Cost;
1819
1820 static const CostKindTblEntry AVX512ShuffleTbl[] = {
1821 {TTI::SK_Broadcast, MVT::v8f64, { 1, 1, 1, 1 } }, // vbroadcastsd
1822 {TTI::SK_Broadcast, MVT::v16f32, { 1, 1, 1, 1 } }, // vbroadcastss
1823 {TTI::SK_Broadcast, MVT::v8i64, { 1, 1, 1, 1 } }, // vpbroadcastq
1824 {TTI::SK_Broadcast, MVT::v16i32, { 1, 1, 1, 1 } }, // vpbroadcastd
1825 {TTI::SK_Broadcast, MVT::v32i16, { 1, 1, 1, 1 } }, // vpbroadcastw
1826 {TTI::SK_Broadcast, MVT::v32f16, { 1, 1, 1, 1 } }, // vpbroadcastw
1827 {TTI::SK_Broadcast, MVT::v64i8, { 1, 1, 1, 1 } }, // vpbroadcastb
1828
1829 {TTI::SK_Reverse, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
1830 {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1831 {TTI::SK_Reverse, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
1832 {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1833 {TTI::SK_Reverse, MVT::v32i16, { 7, 7, 7, 7 } }, // per mca
1834 {TTI::SK_Reverse, MVT::v32f16, { 7, 7, 7, 7 } }, // per mca
1835 {TTI::SK_Reverse, MVT::v64i8, { 7, 7, 7, 7 } }, // per mca
1836
1837 {TTI::SK_Splice, MVT::v8f64, { 1, 1, 1, 1 } }, // vpalignd
1838 {TTI::SK_Splice, MVT::v4f64, { 1, 1, 1, 1 } }, // vpalignd
1839 {TTI::SK_Splice, MVT::v16f32, { 1, 1, 1, 1 } }, // vpalignd
1840 {TTI::SK_Splice, MVT::v8f32, { 1, 1, 1, 1 } }, // vpalignd
1841 {TTI::SK_Splice, MVT::v8i64, { 1, 1, 1, 1 } }, // vpalignd
1842 {TTI::SK_Splice, MVT::v4i64, { 1, 1, 1, 1 } }, // vpalignd
1843 {TTI::SK_Splice, MVT::v16i32, { 1, 1, 1, 1 } }, // vpalignd
1844 {TTI::SK_Splice, MVT::v8i32, { 1, 1, 1, 1 } }, // vpalignd
1845 {TTI::SK_Splice, MVT::v32i16, { 4, 4, 4, 4 } }, // split + palignr
1846 {TTI::SK_Splice, MVT::v32f16, { 4, 4, 4, 4 } }, // split + palignr
1847 {TTI::SK_Splice, MVT::v64i8, { 4, 4, 4, 4 } }, // split + palignr
1848
1849 {TTI::SK_PermuteSingleSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
1850 {TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermpd
1851 {TTI::SK_PermuteSingleSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermpd
1852 {TTI::SK_PermuteSingleSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1853 {TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermps
1854 {TTI::SK_PermuteSingleSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermps
1855 {TTI::SK_PermuteSingleSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
1856 {TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermq
1857 {TTI::SK_PermuteSingleSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermq
1858 {TTI::SK_PermuteSingleSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1859 {TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermd
1860 {TTI::SK_PermuteSingleSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermd
1861 {TTI::SK_PermuteSingleSrc, MVT::v16i8, { 1, 3, 1, 1 } }, // pshufb
1862
1863 {TTI::SK_PermuteTwoSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermt2pd
1864 {TTI::SK_PermuteTwoSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermt2ps
1865 {TTI::SK_PermuteTwoSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermt2q
1866 {TTI::SK_PermuteTwoSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermt2d
1867 {TTI::SK_PermuteTwoSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermt2pd
1868 {TTI::SK_PermuteTwoSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermt2ps
1869 {TTI::SK_PermuteTwoSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermt2q
1870 {TTI::SK_PermuteTwoSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermt2d
1871 {TTI::SK_PermuteTwoSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermt2pd
1872 {TTI::SK_PermuteTwoSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermt2ps
1873 {TTI::SK_PermuteTwoSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermt2q
1874 {TTI::SK_PermuteTwoSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermt2d
1875
1876 // FIXME: This just applies the type legalization cost rules above
1877 // assuming these completely split.
1878 {TTI::SK_PermuteSingleSrc, MVT::v32i16, { 14, 14, 14, 14 } },
1879 {TTI::SK_PermuteSingleSrc, MVT::v32f16, { 14, 14, 14, 14 } },
1880 {TTI::SK_PermuteSingleSrc, MVT::v64i8, { 14, 14, 14, 14 } },
1881 {TTI::SK_PermuteTwoSrc, MVT::v32i16, { 42, 42, 42, 42 } },
1882 {TTI::SK_PermuteTwoSrc, MVT::v32f16, { 42, 42, 42, 42 } },
1883 {TTI::SK_PermuteTwoSrc, MVT::v64i8, { 42, 42, 42, 42 } },
1884
1885 {TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vpternlogq
1886 {TTI::SK_Select, MVT::v32f16, { 1, 1, 1, 1 } }, // vpternlogq
1887 {TTI::SK_Select, MVT::v64i8, { 1, 1, 1, 1 } }, // vpternlogq
1888 {TTI::SK_Select, MVT::v8f64, { 1, 1, 1, 1 } }, // vblendmpd
1889 {TTI::SK_Select, MVT::v16f32, { 1, 1, 1, 1 } }, // vblendmps
1890 {TTI::SK_Select, MVT::v8i64, { 1, 1, 1, 1 } }, // vblendmq
1891 {TTI::SK_Select, MVT::v16i32, { 1, 1, 1, 1 } }, // vblendmd
1892 };
1893
1894 if (ST->hasAVX512())
1895 if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
1896 if (auto KindCost = Entry->Cost[CostKind])
1897 return LT.first * *KindCost;
1898
1899 static const CostTblEntry AVX2ShuffleTbl[] = {
1900 {TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastpd
1901 {TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastps
1902 {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq
1903 {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd
1904 {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
1905 {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw
1906 {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb
1907
1908 {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd
1909 {TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps
1910 {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq
1911 {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd
1912 {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
1913 {TTI::SK_Reverse, MVT::v16f16, 2}, // vperm2i128 + pshufb
1914 {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb
1915
1916 {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
1917 {TTI::SK_Select, MVT::v16f16, 1}, // vpblendvb
1918 {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb
1919
1920 {TTI::SK_Splice, MVT::v8i32, 2}, // vperm2i128 + vpalignr
1921 {TTI::SK_Splice, MVT::v8f32, 2}, // vperm2i128 + vpalignr
1922 {TTI::SK_Splice, MVT::v16i16, 2}, // vperm2i128 + vpalignr
1923 {TTI::SK_Splice, MVT::v16f16, 2}, // vperm2i128 + vpalignr
1924 {TTI::SK_Splice, MVT::v32i8, 2}, // vperm2i128 + vpalignr
1925
1926 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
1927 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
1928 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
1929 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
1930 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
1931 // + vpblendvb
1932 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 4}, // vperm2i128 + 2*vpshufb
1933 // + vpblendvb
1934 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb
1935 // + vpblendvb
1936
1937 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd
1938 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3}, // 2*vpermps + vblendps
1939 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd
1940 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd
1941 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
1942 // + vpblendvb
1943 {TTI::SK_PermuteTwoSrc, MVT::v16f16, 7}, // 2*vperm2i128 + 4*vpshufb
1944 // + vpblendvb
1945 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb
1946 // + vpblendvb
1947 };
1948
1949 if (ST->hasAVX2())
1950 if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
1951 return LT.first * Entry->Cost;
1952
1953 static const CostTblEntry XOPShuffleTbl[] = {
1954 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd
1955 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps
1956 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd
1957 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps
1958 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
1959 // + vinsertf128
1960 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm
1961 // + vinsertf128
1962
1963 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
1964 // + vinsertf128
1965 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm
1966 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm
1967 // + vinsertf128
1968 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm
1969 };
1970
1971 if (ST->hasXOP())
1972 if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
1973 return LT.first * Entry->Cost;
1974
1975 static const CostTblEntry AVX1ShuffleTbl[] = {
1976 {TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1977 {TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1978 {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1979 {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1980 {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
1981 {TTI::SK_Broadcast, MVT::v16f16, 3}, // vpshuflw + vpshufd + vinsertf128
1982 {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128
1983
1984 {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1985 {TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1986 {TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1987 {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1988 {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
1989 // + vinsertf128
1990 {TTI::SK_Reverse, MVT::v16f16, 4}, // vextractf128 + 2*pshufb
1991 // + vinsertf128
1992 {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb
1993 // + vinsertf128
1994
1995 {TTI::SK_Select, MVT::v4i64, 1}, // vblendpd
1996 {TTI::SK_Select, MVT::v4f64, 1}, // vblendpd
1997 {TTI::SK_Select, MVT::v8i32, 1}, // vblendps
1998 {TTI::SK_Select, MVT::v8f32, 1}, // vblendps
1999 {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
2000 {TTI::SK_Select, MVT::v16f16, 3}, // vpand + vpandn + vpor
2001 {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor
2002
2003 {TTI::SK_Splice, MVT::v4i64, 2}, // vperm2f128 + shufpd
2004 {TTI::SK_Splice, MVT::v4f64, 2}, // vperm2f128 + shufpd
2005 {TTI::SK_Splice, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
2006 {TTI::SK_Splice, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
2007 {TTI::SK_Splice, MVT::v16i16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2008 {TTI::SK_Splice, MVT::v16f16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2009 {TTI::SK_Splice, MVT::v32i8, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2010
2011 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd
2012 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd
2013 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
2014 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
2015 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
2016 // + 2*por + vinsertf128
2017 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 8}, // vextractf128 + 4*pshufb
2018 // + 2*por + vinsertf128
2019 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb
2020 // + 2*por + vinsertf128
2021
2022 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd
2023 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd
2024 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
2025 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
2026 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
2027 // + 4*por + vinsertf128
2028 {TTI::SK_PermuteTwoSrc, MVT::v16f16, 15}, // 2*vextractf128 + 8*pshufb
2029 // + 4*por + vinsertf128
2030 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb
2031 // + 4*por + vinsertf128
2032 };
2033
2034 if (ST->hasAVX())
2035 if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
2036 return LT.first * Entry->Cost;
2037
2038 static const CostTblEntry SSE41ShuffleTbl[] = {
2039 {TTI::SK_Select, MVT::v2i64, 1}, // pblendw
2040 {TTI::SK_Select, MVT::v2f64, 1}, // movsd
2041 {TTI::SK_Select, MVT::v4i32, 1}, // pblendw
2042 {TTI::SK_Select, MVT::v4f32, 1}, // blendps
2043 {TTI::SK_Select, MVT::v8i16, 1}, // pblendw
2044 {TTI::SK_Select, MVT::v8f16, 1}, // pblendw
2045 {TTI::SK_Select, MVT::v16i8, 1} // pblendvb
2046 };
2047
2048 if (ST->hasSSE41())
2049 if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
2050 return LT.first * Entry->Cost;
2051
2052 static const CostTblEntry SSSE3ShuffleTbl[] = {
2053 {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
2054 {TTI::SK_Broadcast, MVT::v8f16, 1}, // pshufb
2055 {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb
2056
2057 {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
2058 {TTI::SK_Reverse, MVT::v8f16, 1}, // pshufb
2059 {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb
2060
2061 {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
2062 {TTI::SK_Select, MVT::v8f16, 3}, // 2*pshufb + por
2063 {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por
2064
2065 {TTI::SK_Splice, MVT::v4i32, 1}, // palignr
2066 {TTI::SK_Splice, MVT::v4f32, 1}, // palignr
2067 {TTI::SK_Splice, MVT::v8i16, 1}, // palignr
2068 {TTI::SK_Splice, MVT::v8f16, 1}, // palignr
2069 {TTI::SK_Splice, MVT::v16i8, 1}, // palignr
2070
2071 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
2072 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // pshufb
2073 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
2074
2075 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
2076 {TTI::SK_PermuteTwoSrc, MVT::v8f16, 3}, // 2*pshufb + por
2077 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
2078 };
2079
2080 if (ST->hasSSSE3())
2081 if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
2082 return LT.first * Entry->Cost;
2083
2084 static const CostTblEntry SSE2ShuffleTbl[] = {
2085 {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd
2086 {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
2087 {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
2088 {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
2089 {TTI::SK_Broadcast, MVT::v8f16, 2}, // pshuflw + pshufd
2090 {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd
2091
2092 {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
2093 {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
2094 {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
2095 {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
2096 {TTI::SK_Reverse, MVT::v8f16, 3}, // pshuflw + pshufhw + pshufd
2097 {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
2098 // + 2*pshufd + 2*unpck + packus
2099
2100 {TTI::SK_Select, MVT::v2i64, 1}, // movsd
2101 {TTI::SK_Select, MVT::v2f64, 1}, // movsd
2102 {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
2103 {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
2104 {TTI::SK_Select, MVT::v8f16, 3}, // pand + pandn + por
2105 {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por
2106
2107 {TTI::SK_Splice, MVT::v2i64, 1}, // shufpd
2108 {TTI::SK_Splice, MVT::v2f64, 1}, // shufpd
2109 {TTI::SK_Splice, MVT::v4i32, 2}, // 2*{unpck,movsd,pshufd}
2110 {TTI::SK_Splice, MVT::v8i16, 3}, // psrldq + pslldq + por
2111 {TTI::SK_Splice, MVT::v8f16, 3}, // psrldq + pslldq + por
2112 {TTI::SK_Splice, MVT::v16i8, 3}, // psrldq + pslldq + por
2113
2114 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
2115 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd
2116 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
2117 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
2118 // + pshufd/unpck
2119 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 5}, // 2*pshuflw + 2*pshufhw
2120 // + pshufd/unpck
2121 { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
2122 // + 2*pshufd + 2*unpck + 2*packus
2123
2124 { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd
2125 { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd
2126 { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd}
2127 { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute
2128 { TTI::SK_PermuteTwoSrc, MVT::v8f16, 8 }, // blend+permute
2129 { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
2130 };
2131
2132 static const CostTblEntry SSE3BroadcastLoadTbl[] = {
2133 {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup
2134 };
2135
2136 if (ST->hasSSE2()) {
2137 bool IsLoad =
2138 llvm::any_of(Args, [](const auto &V) { return isa<LoadInst>(V); });
2139 if (ST->hasSSE3() && IsLoad)
2140 if (const auto *Entry =
2141 CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) {
2142 assert(isLegalBroadcastLoad(BaseTp->getElementType(),
2143 LT.second.getVectorElementCount()) &&
2144 "Table entry missing from isLegalBroadcastLoad()");
2145 return LT.first * Entry->Cost;
2146 }
2147
2148 if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
2149 return LT.first * Entry->Cost;
2150 }
2151
2152 static const CostTblEntry SSE1ShuffleTbl[] = {
2153 { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
2154 { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
2155 { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps
2156 { TTI::SK_Splice, MVT::v4f32, 2 }, // 2*shufps
2157 { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
2158 { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
2159 };
2160
2161 if (ST->hasSSE1())
2162 if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
2163 return LT.first * Entry->Cost;
2164
2165 return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
2166}
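// A minimal caller-side sketch of costing a reverse shuffle of <8 x float>
// through the generic TTI interface (TTI and Ctx are assumed to be an
// in-scope TargetTransformInfo and LLVMContext, not names from this file):
//
//   SmallVector<int, 8> Mask = {7, 6, 5, 4, 3, 2, 1, 0};
//   InstructionCost C = TTI.getShuffleCost(
//       TargetTransformInfo::SK_Reverse,
//       FixedVectorType::get(Type::getFloatTy(Ctx), 8), Mask);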
2167
2168InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
2169 Type *Src,
2170 TTI::CastContextHint CCH,
2171 TTI::TargetCostKind CostKind,
2172 const Instruction *I) {
2173 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2174 assert(ISD && "Invalid opcode");
2175
2176 // The cost tables include both specific, custom (non-legal) src/dst type
2177 // conversions and generic, legalized types. We test for custom conversions
2178 // first, before falling back to legalization.
2179 // FIXME: Need a better design of the cost table to handle non-simple types of
2180 // potential massive combinations (elem_num x src_type x dst_type).
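// For instance, the first rows of the AVX512BW table below price sign/zero
// extension of <32 x i8> to <32 x i16> at { 1, 1, 1, 1 }: a single
// vpmovsxbw/vpmovzxbw from ymm to zmm once the feature level is available.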
2181 static const TypeConversionCostKindTblEntry AVX512BWConversionTbl[]{
2182 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, { 1, 1, 1, 1 } },
2183 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, { 1, 1, 1, 1 } },
2184
2185 // Mask sign extend has an instruction.
2186 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 1, 1, 1, 1 } },
2187 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, { 1, 1, 1, 1 } },
2188 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 1, 1, 1, 1 } },
2189 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, { 1, 1, 1, 1 } },
2190 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 1, 1, 1, 1 } },
2191 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, { 1, 1, 1, 1 } },
2192 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 1, 1, 1, 1 } },
2193 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, { 1, 1, 1, 1 } },
2194 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 1, 1, 1, 1 } },
2195 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, { 1, 1, 1, 1 } },
2196 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 1, 1, 1, 1 } },
2197 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 1, 1, 1, 1 } },
2198 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2199 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, { 1, 1, 1, 1 } },
2200 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, { 1, 1, 1, 1 } },
2201 { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, { 1, 1, 1, 1 } },
2202 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, { 1, 1, 1, 1 } },
2203
2204 // Mask zero extend is a sext + shift.
2205 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 2, 1, 1, 1 } },
2206 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, { 2, 1, 1, 1 } },
2207 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 2, 1, 1, 1 } },
2208 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, { 2, 1, 1, 1 } },
2209 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 2, 1, 1, 1 } },
2210 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, { 2, 1, 1, 1 } },
2211 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 2, 1, 1, 1 } },
2212 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, { 2, 1, 1, 1 } },
2213 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 2, 1, 1, 1 } },
2214 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, { 2, 1, 1, 1 } },
2215 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 2, 1, 1, 1 } },
2216 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 2, 1, 1, 1 } },
2217 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 2, 1, 1, 1 } },
2218 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, { 2, 1, 1, 1 } },
2219 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, { 2, 1, 1, 1 } },
2220 { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, { 2, 1, 1, 1 } },
2221 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, { 2, 1, 1, 1 } },
2222
2223 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 2, 1, 1, 1 } },
2224 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, { 2, 1, 1, 1 } },
2225 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } },
2226 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, { 2, 1, 1, 1 } },
2227 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } },
2228 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, { 2, 1, 1, 1 } },
2229 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 2, 1, 1, 1 } },
2230 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, { 2, 1, 1, 1 } },
2231 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 2, 1, 1, 1 } },
2232 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, { 2, 1, 1, 1 } },
2233 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 2, 1, 1, 1 } },
2234 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 2, 1, 1, 1 } },
2235 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 2, 1, 1, 1 } },
2236 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, { 2, 1, 1, 1 } },
2237 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, { 2, 1, 1, 1 } },
2238 { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, { 2, 1, 1, 1 } },
2239 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, { 2, 1, 1, 1 } },
2240
2241 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, { 2, 1, 1, 1 } },
2242 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } }, // widen to zmm
2243 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, { 2, 1, 1, 1 } }, // vpmovwb
2244 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, { 2, 1, 1, 1 } }, // vpmovwb
2245 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, { 2, 1, 1, 1 } }, // vpmovwb
2246 };
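// Reading the groups above: a mask (k-register) sign extend is a single
// vpmovm2b/vpmovm2w, the zero-extend rows add one shift on top of that
// sext (hence { 2, ... }), and truncation to a mask is likewise a shift
// plus a vpmovb2m-style test.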
2247
2248 static const TypeConversionCostKindTblEntry AVX512DQConversionTbl[] = {
2249 // Mask sign extend has an instruction.
2250 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } },
2251 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, { 1, 1, 1, 1 } },
2252 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } },
2253 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } },
2254 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } },
2255 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i1, { 1, 1, 1, 1 } },
2256 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, { 1, 1, 1, 1 } },
2257 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, { 1, 1, 1, 1 } },
2258
2259 // Mask zero extend is a sext + shift.
2260 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1, } },
2261 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, { 2, 1, 1, 1, } },
2262 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1, } },
2263 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1, } },
2264 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1, } },
2265 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i1, { 2, 1, 1, 1, } },
2266 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, { 2, 1, 1, 1, } },
2267 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, { 2, 1, 1, 1, } },
2268
2269 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } },
2270 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, { 2, 1, 1, 1 } },
2271 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } },
2272 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } },
2273 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2274 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 2, 1, 1, 1 } },
2275 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, { 2, 1, 1, 1 } },
2276 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i64, { 2, 1, 1, 1 } },
2277
2278 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, { 1, 1, 1, 1 } },
2279 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, { 1, 1, 1, 1 } },
2280
2281 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, { 1, 1, 1, 1 } },
2282 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, { 1, 1, 1, 1 } },
2283
2284 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, { 1, 1, 1, 1 } },
2285 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, { 1, 1, 1, 1 } },
2286
2287 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, { 1, 1, 1, 1 } },
2288 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, { 1, 1, 1, 1 } },
2289 };
2290
2291 // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
2292 // 256-bit wide vectors.
2293
2294 static const TypeConversionCostKindTblEntry AVX512FConversionTbl[] = {
2295 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, { 1, 1, 1, 1 } },
2296 { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, { 3, 1, 1, 1 } },
2297 { ISD::FP_EXTEND, MVT::v16f64, MVT::v16f32, { 4, 1, 1, 1 } }, // 2*vcvtps2pd+vextractf64x4
2298 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, { 1, 1, 1, 1 } },
2299
2300 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2301 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2302 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2303 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2304 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2305 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2306 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2307 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2308 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2309 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2310 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2311 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2312 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } }, // zmm vpsllq+vptestmq
2313 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } }, // zmm vpsllq+vptestmq
2314 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2315 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, { 2, 1, 1, 1 } }, // vpmovdb
2316 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, { 2, 1, 1, 1 } }, // vpmovdb
2317 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2318 { ISD::TRUNCATE, MVT::v32i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2319 { ISD::TRUNCATE, MVT::v64i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2320 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdw
2321 { ISD::TRUNCATE, MVT::v32i16, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdw
2322 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, { 2, 1, 1, 1 } }, // vpmovqb
2323 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, { 1, 1, 1, 1 } }, // vpshufb
2324 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2325 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2326 { ISD::TRUNCATE, MVT::v32i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2327 { ISD::TRUNCATE, MVT::v64i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2328 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2329 { ISD::TRUNCATE, MVT::v16i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2330 { ISD::TRUNCATE, MVT::v32i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2331 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, { 1, 1, 1, 1 } }, // vpmovqd
2332 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } }, // zmm vpmovqd
2333 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, { 5, 1, 1, 1 } },// 2*vpmovqd+concat+vpmovdb
2334
2335 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 3, 1, 1, 1 } }, // extend to v16i32
2336 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, { 8, 1, 1, 1 } },
2337 { ISD::TRUNCATE, MVT::v64i8, MVT::v32i16, { 8, 1, 1, 1 } },
2338
2339 // Sign extend is zmm vpternlogd+vptruncdb.
2340 // Zero extend is zmm broadcast load+vptruncdw.
2341 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 3, 1, 1, 1 } },
2342 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 4, 1, 1, 1 } },
2343 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 3, 1, 1, 1 } },
2344 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 4, 1, 1, 1 } },
2345 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 3, 1, 1, 1 } },
2346 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 4, 1, 1, 1 } },
2347 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 3, 1, 1, 1 } },
2348 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 4, 1, 1, 1 } },
2349
2350 // Sign extend is zmm vpternlogd+vptruncdw.
2351 // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
2352 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 3, 1, 1, 1 } },
2353 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 4, 1, 1, 1 } },
2354 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 3, 1, 1, 1 } },
2355 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 4, 1, 1, 1 } },
2356 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 3, 1, 1, 1 } },
2357 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 4, 1, 1, 1 } },
2358 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 3, 1, 1, 1 } },
2359 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2360
2361 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2362 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2363 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2364 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2365 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2366 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2367 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } }, // zmm vpternlogq
2368 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } }, // zmm vpternlogq+psrlq
2369 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } }, // zmm vpternlogq
2370 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } }, // zmm vpternlogq+psrlq
2371
2372 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, { 1, 1, 1, 1 } }, // vpternlogd
2373 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2374 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, { 1, 1, 1, 1 } }, // vpternlogq
2375 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2376
2377 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, { 1, 1, 1, 1 } },
2378 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, { 1, 1, 1, 1 } },
2379 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, { 1, 1, 1, 1 } },
2380 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, { 1, 1, 1, 1 } },
2381 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, { 1, 1, 1, 1 } },
2382 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, { 1, 1, 1, 1 } },
2383 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, { 1, 1, 1, 1 } },
2384 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, { 1, 1, 1, 1 } },
2385 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, { 1, 1, 1, 1 } },
2386 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, { 1, 1, 1, 1 } },
2387
2388 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, { 3, 1, 1, 1 } }, // FIXME: May not be right
2389 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, { 3, 1, 1, 1 } }, // FIXME: May not be right
2390
2391 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, { 4, 1, 1, 1 } },
2392 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, { 3, 1, 1, 1 } },
2393 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v16i8, { 2, 1, 1, 1 } },
2394 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, { 1, 1, 1, 1 } },
2395 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, { 2, 1, 1, 1 } },
2396 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, { 1, 1, 1, 1 } },
2397 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 1, 1, 1, 1 } },
2398 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, { 1, 1, 1, 1 } },
2399
2400 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, { 4, 1, 1, 1 } },
2401 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, { 3, 1, 1, 1 } },
2402 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v16i8, { 2, 1, 1, 1 } },
2403 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, { 1, 1, 1, 1 } },
2404 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, { 2, 1, 1, 1 } },
2405 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, { 1, 1, 1, 1 } },
2406 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, { 1, 1, 1, 1 } },
2407 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, { 1, 1, 1, 1 } },
2408 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, {26, 1, 1, 1 } },
2409 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, { 5, 1, 1, 1 } },
2410
2411 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, { 2, 1, 1, 1 } },
2412 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f64, { 7, 1, 1, 1 } },
2413 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f64, {15, 1, 1, 1 } },
2414 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f32, {11, 1, 1, 1 } },
2415 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f64, {31, 1, 1, 1 } },
2416 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, { 3, 1, 1, 1 } },
2417 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f64, { 7, 1, 1, 1 } },
2418 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f32, { 5, 1, 1, 1 } },
2419 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f64, {15, 1, 1, 1 } },
2420 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2421 { ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f64, { 3, 1, 1, 1 } },
2422
2423 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2424 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, { 3, 1, 1, 1 } },
2425 { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, { 3, 1, 1, 1 } },
2426 { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, { 1, 1, 1, 1 } },
2427 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, { 3, 1, 1, 1 } },
2428 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, { 3, 1, 1, 1 } },
2429 };
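// Reading these rows (illustrative note): the four per-entry values follow
// the CostKindCosts order declared at the top of this file, so an entry such
// as { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, { 3, 1, 1, 1 } } models a
// v16f32 -> v16i8 fptoui as roughly 3 instructions of reciprocal throughput
// and 1 for each of the latency, code-size and size-and-latency kinds.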
2430
2431 static const TypeConversionCostKindTblEntry AVX512BWVLConversionTbl[] = {
2432 // Mask sign extend has an instruction.
2433 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 1, 1, 1, 1 } },
2434 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, { 1, 1, 1, 1 } },
2435 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 1, 1, 1, 1 } },
2436 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, { 1, 1, 1, 1 } },
2437 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 1, 1, 1, 1 } },
2438 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, { 1, 1, 1, 1 } },
2439 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 1, 1, 1, 1 } },
2440 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, { 1, 1, 1, 1 } },
2441 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 1, 1, 1, 1 } },
2442 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, { 1, 1, 1, 1 } },
2443 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 1, 1, 1, 1 } },
2444 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 1, 1, 1, 1 } },
2445 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2446 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, { 1, 1, 1, 1 } },
2447 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1, { 1, 1, 1, 1 } },
2448 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v64i1, { 1, 1, 1, 1 } },
2449 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1, { 1, 1, 1, 1 } },
2450
2451 // Mask zero extend is a sext + shift.
2452 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 2, 1, 1, 1 } },
2453 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, { 2, 1, 1, 1 } },
2454 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 2, 1, 1, 1 } },
2455 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, { 2, 1, 1, 1 } },
2456 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 2, 1, 1, 1 } },
2457 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, { 2, 1, 1, 1 } },
2458 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 2, 1, 1, 1 } },
2459 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, { 2, 1, 1, 1 } },
2460 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 2, 1, 1, 1 } },
2461 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, { 2, 1, 1, 1 } },
2462 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 2, 1, 1, 1 } },
2463 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 2, 1, 1, 1 } },
2464 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 2, 1, 1, 1 } },
2465 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, { 2, 1, 1, 1 } },
2466 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1, { 2, 1, 1, 1 } },
2467 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v64i1, { 2, 1, 1, 1 } },
2468 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1, { 2, 1, 1, 1 } },
2469
2470 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 2, 1, 1, 1 } },
2471 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, { 2, 1, 1, 1 } },
2472 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } },
2473 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, { 2, 1, 1, 1 } },
2474 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } },
2475 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, { 2, 1, 1, 1 } },
2476 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 2, 1, 1, 1 } },
2477 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, { 2, 1, 1, 1 } },
2478 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 2, 1, 1, 1 } },
2479 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, { 2, 1, 1, 1 } },
2480 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 2, 1, 1, 1 } },
2481 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 2, 1, 1, 1 } },
2482 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 2, 1, 1, 1 } },
2483 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, { 2, 1, 1, 1 } },
2484 { ISD::TRUNCATE, MVT::v32i1, MVT::v16i16, { 2, 1, 1, 1 } },
2485 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i8, { 2, 1, 1, 1 } },
2486 { ISD::TRUNCATE, MVT::v64i1, MVT::v16i16, { 2, 1, 1, 1 } },
2487
2488 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } },
2489 };
2490
2491 static const TypeConversionCostKindTblEntry AVX512DQVLConversionTbl[] = {
2492 // Mask sign extend has an instruction.
2493 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } },
2494 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, { 1, 1, 1, 1 } },
2495 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } },
2496 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i1, { 1, 1, 1, 1 } },
2497 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } },
2498 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i1, { 1, 1, 1, 1 } },
2499 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, { 1, 1, 1, 1 } },
2500 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } },
2501
2502 // Mask zero extend is a sext + shift.
2503 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } },
2504 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, { 2, 1, 1, 1 } },
2505 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } },
2506 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i1, { 2, 1, 1, 1 } },
2507 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } },
2508 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i1, { 2, 1, 1, 1 } },
2509 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, { 2, 1, 1, 1 } },
2510 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } },
2511
2512 { ISD::TRUNCATE, MVT::v16i1, MVT::v4i64, { 2, 1, 1, 1 } },
2513 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, { 2, 1, 1, 1 } },
2514 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } },
2515 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, { 2, 1, 1, 1 } },
2516 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } },
2517 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } },
2518 { ISD::TRUNCATE, MVT::v8i1, MVT::v4i64, { 2, 1, 1, 1 } },
2519 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2520
2521 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, { 1, 1, 1, 1 } },
2522 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, { 1, 1, 1, 1 } },
2523 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, { 1, 1, 1, 1 } },
2524 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, { 1, 1, 1, 1 } },
2525
2526 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, { 1, 1, 1, 1 } },
2527 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 1, 1, 1, 1 } },
2528 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, { 1, 1, 1, 1 } },
2529 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, { 1, 1, 1, 1 } },
2530
2531 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v4f32, { 1, 1, 1, 1 } },
2532 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, { 1, 1, 1, 1 } },
2533 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, { 1, 1, 1, 1 } },
2534 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, { 1, 1, 1, 1 } },
2535
2536 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v4f32, { 1, 1, 1, 1 } },
2537 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, { 1, 1, 1, 1 } },
2538 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, { 1, 1, 1, 1 } },
2539 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, { 1, 1, 1, 1 } },
2540 };
2541
2542 static const TypeConversionCostKindTblEntry AVX512VLConversionTbl[] = {
2543 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2544 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2545 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2546 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 8, 1, 1, 1 } }, // split+2*v8i8
2547 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2548 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2549 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2550 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 8, 1, 1, 1 } }, // split+2*v8i16
2551 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2552 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2553 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2554 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2555 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2556 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2557 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } }, // vpmovqd
2558 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, { 2, 1, 1, 1 } }, // vpmovqb
2559 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, { 2, 1, 1, 1 } }, // vpmovqw
2560 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, { 2, 1, 1, 1 } }, // vpmovwb
2561
2562 // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
2563 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
2564 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 5, 1, 1, 1 } },
2565 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 6, 1, 1, 1 } },
2566 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 5, 1, 1, 1 } },
2567 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 6, 1, 1, 1 } },
2568 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 5, 1, 1, 1 } },
2569 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 6, 1, 1, 1 } },
2570 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, {10, 1, 1, 1 } },
2571 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, {12, 1, 1, 1 } },
2572
2573 // sign extend is vpcmpeq+maskedmove+vpmovdw
2574 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
2575 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 4, 1, 1, 1 } },
2576 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 5, 1, 1, 1 } },
2577 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 4, 1, 1, 1 } },
2578 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 5, 1, 1, 1 } },
2579 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 4, 1, 1, 1 } },
2580 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 5, 1, 1, 1 } },
2581 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, {10, 1, 1, 1 } },
2582 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, {12, 1, 1, 1 } },
2583
2584 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, { 1, 1, 1, 1 } }, // vpternlogd
2585 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2586 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } }, // vpternlogd
2587 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2588 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } }, // vpternlogd
2589 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2590 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, { 1, 1, 1, 1 } }, // vpternlogd
2591 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2592
2593 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } }, // vpternlogq
2594 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2595 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } }, // vpternlogq
2596 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2597
2598 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 1, 1, 1, 1 } },
2599 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 1, 1, 1, 1 } },
2600 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 1, 1, 1, 1 } },
2601 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 1, 1, 1, 1 } },
2602 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 1, 1, 1, 1 } },
2603 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 1, 1, 1, 1 } },
2604 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 1, 1, 1, 1 } },
2605 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 1, 1, 1, 1 } },
2606 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 1, 1, 1, 1 } },
2607 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 1, 1, 1, 1 } },
2608 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 1, 1, 1, 1 } },
2609 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 1, 1, 1, 1 } },
2610
2611 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2612 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 1, 1, 1, 1 } },
2613 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2614 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 1, 1, 1, 1 } },
2615
2616 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 1, 1, 1, 1 } },
2617 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 1, 1, 1, 1 } },
2618 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2619 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 1, 1, 1, 1 } },
2620 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2621 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 1, 1, 1, 1 } },
2622 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 1, 1, 1, 1 } },
2623 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 1, 1, 1, 1 } },
2624 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 1, 1, 1, 1 } },
2625 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 1, 1, 1, 1 } },
2626 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, { 5, 1, 1, 1 } },
2627 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 5, 1, 1, 1 } },
2628 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, { 5, 1, 1, 1 } },
2629
2630 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2631 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, { 2, 1, 1, 1 } },
2632 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f32, { 5, 1, 1, 1 } },
2633
2634 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 1, 1, 1, 1 } },
2635 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 1, 1, 1, 1 } },
2636 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 1, 1, 1, 1 } },
2637 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 1, 1, 1, 1 } },
2638 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 1, 1, 1, 1 } },
2639 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 1, 1, 1, 1 } },
2640 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2641 };
2642
2643 static const TypeConversionCostKindTblEntry AVX2ConversionTbl[] = {
2644 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 3, 1, 1, 1 } },
2645 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 3, 1, 1, 1 } },
2646 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 3, 1, 1, 1 } },
2647 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 3, 1, 1, 1 } },
2648 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2649 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2650
2651 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 2, 1, 1, 1 } },
2652 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 2, 1, 1, 1 } },
2653 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 2, 1, 1, 1 } },
2654 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 2, 1, 1, 1 } },
2655 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 2, 1, 1, 1 } },
2656 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 2, 1, 1, 1 } },
2657 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 2, 1, 1, 1 } },
2658 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 2, 1, 1, 1 } },
2659 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 2, 1, 1, 1 } },
2660 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 2, 1, 1, 1 } },
2661 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, { 3, 1, 1, 1 } },
2662 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, { 3, 1, 1, 1 } },
2663 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 2, 1, 1, 1 } },
2664 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 2, 1, 1, 1 } },
2665
2666 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2667
2668 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 4, 1, 1, 1 } },
2669 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 4, 1, 1, 1 } },
2670 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, { 1, 1, 1, 1 } },
2671 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 1, 1, 1, 1 } },
2672 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 1, 1, 1, 1 } },
2673 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, { 4, 1, 1, 1 } },
2674 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, { 4, 1, 1, 1 } },
2675 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 1, 1, 1, 1 } },
2676 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, { 1, 1, 1, 1 } },
2677 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, { 5, 1, 1, 1 } },
2678 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } },
2679 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 2, 1, 1, 1 } },
2680
2681 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, { 3, 1, 1, 1 } },
2682 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, { 3, 1, 1, 1 } },
2683
2684 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, { 1, 1, 1, 1 } },
2685 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, { 1, 1, 1, 1 } },
2686 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, { 1, 1, 1, 1 } },
2687 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 3, 1, 1, 1 } },
2688
2689 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 3, 1, 1, 1 } },
2690 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 3, 1, 1, 1 } },
2691 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, { 1, 1, 1, 1 } },
2692 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 3, 1, 1, 1 } },
2693 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2694 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 4, 1, 1, 1 } },
2695 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 3, 1, 1, 1 } },
2696 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, { 4, 1, 1, 1 } },
2697
2698 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 2, 1, 1, 1 } },
2699 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 2, 1, 1, 1 } },
2700 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 2, 1, 1, 1 } },
2701 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 2, 1, 1, 1 } },
2702 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 1, 1, 1, 1 } },
2703 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, { 1, 1, 1, 1 } },
2704 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 3, 1, 1, 1 } },
2705
2706 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 2, 1, 1, 1 } },
2707 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 2, 1, 1, 1 } },
2708 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 2, 1, 1, 1 } },
2709 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 2, 1, 1, 1 } },
2710 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 2, 1, 1, 1 } },
2711 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, { 1, 1, 1, 1 } },
2712 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 2, 1, 1, 1 } },
2713 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2714 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 2, 1, 1, 1 } },
2715 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, { 4, 1, 1, 1 } },
2716 };
2717
2718 static const TypeConversionCostKindTblEntry AVXConversionTbl[] = {
2719 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 4, 1, 1, 1 } },
2720 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 4, 1, 1, 1 } },
2721 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 4, 1, 1, 1 } },
2722 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 4, 1, 1, 1 } },
2723 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2724 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2725
2726 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 3, 1, 1, 1 } },
2727 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 3, 1, 1, 1 } },
2728 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 3, 1, 1, 1 } },
2729 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 3, 1, 1, 1 } },
2730 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 3, 1, 1, 1 } },
2731 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 3, 1, 1, 1 } },
2732 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 3, 1, 1, 1 } },
2733 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 3, 1, 1, 1 } },
2734 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 3, 1, 1, 1 } },
2735 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 3, 1, 1, 1 } },
2736 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 3, 1, 1, 1 } },
2737 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 3, 1, 1, 1 } },
2738
2739 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 4, 1, 1, 1 } },
2740 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 5, 1, 1, 1 } },
2741 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 4, 1, 1, 1 } },
2742 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 9, 1, 1, 1 } },
2743 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, {11, 1, 1, 1 } },
2744
2745 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 6, 1, 1, 1 } },
2746 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 6, 1, 1, 1 } },
2747 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } }, // and+extract+packuswb
2748 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, { 5, 1, 1, 1 } },
2749 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 5, 1, 1, 1 } },
2750 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, { 5, 1, 1, 1 } },
2751 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, { 3, 1, 1, 1 } }, // and+extract+2*packusdw
2752 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 2, 1, 1, 1 } },
2753
2754 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, { 3, 1, 1, 1 } },
2755 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, { 3, 1, 1, 1 } },
2756 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, { 8, 1, 1, 1 } },
2757 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 4, 1, 1, 1 } },
2758 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v16i8, { 2, 1, 1, 1 } },
2759 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 4, 1, 1, 1 } },
2760 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v8i16, { 2, 1, 1, 1 } },
2761 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2762 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, { 2, 1, 1, 1 } },
2763 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 4, 1, 1, 1 } },
2764 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, { 5, 1, 1, 1 } },
2765 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, { 8, 1, 1, 1 } },
2766
2767 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, { 7, 1, 1, 1 } },
2768 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, { 7, 1, 1, 1 } },
2769 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, { 6, 1, 1, 1 } },
2770 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 4, 1, 1, 1 } },
2771 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v16i8, { 2, 1, 1, 1 } },
2772 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 4, 1, 1, 1 } },
2773 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v8i16, { 2, 1, 1, 1 } },
2774 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 4, 1, 1, 1 } },
2775 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, { 4, 1, 1, 1 } },
2776 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 5, 1, 1, 1 } },
2777 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 6, 1, 1, 1 } },
2778 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 8, 1, 1, 1 } },
2779 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, {10, 1, 1, 1 } },
2780 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, {10, 1, 1, 1 } },
2781 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, {18, 1, 1, 1 } },
2782 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 5, 1, 1, 1 } },
2783 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, {10, 1, 1, 1 } },
2784
2785 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2786 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f64, { 2, 1, 1, 1 } },
2787 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v8f32, { 2, 1, 1, 1 } },
2788 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v4f64, { 2, 1, 1, 1 } },
2789 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, { 2, 1, 1, 1 } },
2790 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f64, { 2, 1, 1, 1 } },
2791 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, { 2, 1, 1, 1 } },
2792 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v4f64, { 2, 1, 1, 1 } },
2793 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, { 2, 1, 1, 1 } },
2794 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, { 2, 1, 1, 1 } },
2795 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 5, 1, 1, 1 } },
2796
2797 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2798 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f64, { 2, 1, 1, 1 } },
2799 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v8f32, { 2, 1, 1, 1 } },
2800 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v4f64, { 2, 1, 1, 1 } },
2801 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, { 2, 1, 1, 1 } },
2802 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f64, { 2, 1, 1, 1 } },
2803 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, { 2, 1, 1, 1 } },
2804 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v4f64, { 2, 1, 1, 1 } },
2805 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 3, 1, 1, 1 } },
2806 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2807 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 6, 1, 1, 1 } },
2808 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 7, 1, 1, 1 } },
2809 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, { 7, 1, 1, 1 } },
2810
2811 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, { 1, 1, 1, 1 } },
2812 { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, { 1, 1, 1, 1 } },
2813 };
2814
2815 static const TypeConversionCostKindTblEntry SSE41ConversionTbl[] = {
2816 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, { 1, 1, 1, 1 } },
2817 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, { 1, 1, 1, 1 } },
2818 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, { 1, 1, 1, 1 } },
2819 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, { 1, 1, 1, 1 } },
2820 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
2821 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
2822 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, { 1, 1, 1, 1 } },
2823 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, { 1, 1, 1, 1 } },
2824 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
2825 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
2826 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
2827 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
2828
2829 // These truncates end up widening elements.
2830 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 1, 1, 1, 1 } }, // PMOVXZBQ
2831 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 1, 1, 1, 1 } }, // PMOVXZWQ
2832 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 1, 1, 1, 1 } }, // PMOVXZBD
2833
2834 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 2, 1, 1, 1 } },
2835 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 2, 1, 1, 1 } },
2836 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 2, 1, 1, 1 } },
2837
2838 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, { 1, 1, 1, 1 } },
2839 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, { 1, 1, 1, 1 } },
2840 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, { 1, 1, 1, 1 } },
2841 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, { 1, 1, 1, 1 } },
2842 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, { 1, 1, 1, 1 } },
2843 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2844 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, { 1, 1, 1, 1 } },
2845 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2846 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, { 1, 1, 1, 1 } },
2847 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, { 1, 1, 1, 1 } },
2848 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2849
2850 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, { 1, 1, 1, 1 } },
2851 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, { 1, 1, 1, 1 } },
2852 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 4, 1, 1, 1 } },
2853 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 4, 1, 1, 1 } },
2854 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, { 1, 1, 1, 1 } },
2855 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2856 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, { 1, 1, 1, 1 } },
2857 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2858 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 3, 1, 1, 1 } },
2859 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 3, 1, 1, 1 } },
2860 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, { 2, 1, 1, 1 } },
2861 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, {12, 1, 1, 1 } },
2862 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, {22, 1, 1, 1 } },
2863 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 4, 1, 1, 1 } },
2864
2865 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, { 1, 1, 1, 1 } },
2866 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, { 1, 1, 1, 1 } },
2867 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, { 1, 1, 1, 1 } },
2868 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, { 1, 1, 1, 1 } },
2869 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, { 2, 1, 1, 1 } },
2870 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, { 2, 1, 1, 1 } },
2871 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, { 1, 1, 1, 1 } },
2872 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, { 1, 1, 1, 1 } },
2873 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, { 1, 1, 1, 1 } },
2874 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, { 1, 1, 1, 1 } },
2875
2876 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, { 1, 1, 1, 1 } },
2877 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
2878 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, { 1, 1, 1, 1 } },
2879 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 4, 1, 1, 1 } },
2880 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, { 2, 1, 1, 1 } },
2881 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, { 2, 1, 1, 1 } },
2882 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, { 1, 1, 1, 1 } },
2883 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, { 1, 1, 1, 1 } },
2884 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 4, 1, 1, 1 } },
2885 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2886 };
2887
2888 static const TypeConversionCostKindTblEntry SSE2ConversionTbl[] = {
2889 // These are somewhat magic numbers justified by comparing the
2890 // output of llvm-mca for our various supported scheduler models
2891 // and taking the worst case scenario.
2892 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, { 3, 1, 1, 1 } },
2893 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, { 3, 1, 1, 1 } },
2894 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, { 3, 1, 1, 1 } },
2895 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, { 3, 1, 1, 1 } },
2896 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, { 3, 1, 1, 1 } },
2897 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 4, 1, 1, 1 } },
2898 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, { 3, 1, 1, 1 } },
2899 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 4, 1, 1, 1 } },
2900 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, { 3, 1, 1, 1 } },
2901 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, { 4, 1, 1, 1 } },
2902 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, { 8, 1, 1, 1 } },
2903 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, { 8, 1, 1, 1 } },
2904
2905 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, { 3, 1, 1, 1 } },
2906 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, { 3, 1, 1, 1 } },
2907 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 8, 1, 1, 1 } },
2908 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 9, 1, 1, 1 } },
2909 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 4, 1, 1, 1 } },
2910 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, { 4, 1, 1, 1 } },
2911 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, { 4, 1, 1, 1 } },
2912 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 4, 1, 1, 1 } },
2913 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 7, 1, 1, 1 } },
2914 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, { 7, 1, 1, 1 } },
2915 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 5, 1, 1, 1 } },
2916 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, {15, 1, 1, 1 } },
2917 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, {18, 1, 1, 1 } },
2918
2919 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, { 4, 1, 1, 1 } },
2920 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
2921 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, { 4, 1, 1, 1 } },
2922 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, { 4, 1, 1, 1 } },
2923 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, { 6, 1, 1, 1 } },
2924 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, { 6, 1, 1, 1 } },
2925 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, { 5, 1, 1, 1 } },
2926 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, { 5, 1, 1, 1 } },
2927 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, { 4, 1, 1, 1 } },
2928 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2929
2930 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, { 4, 1, 1, 1 } },
2931 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
2932 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, { 4, 1, 1, 1 } },
2933 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, {15, 1, 1, 1 } },
2934 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, { 6, 1, 1, 1 } },
2935 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, { 6, 1, 1, 1 } },
2936 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, { 5, 1, 1, 1 } },
2937 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, { 5, 1, 1, 1 } },
2938 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 8, 1, 1, 1 } },
2939 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 8, 1, 1, 1 } },
2940
2941 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, { 4, 1, 1, 1 } },
2942 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, { 4, 1, 1, 1 } },
2943 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, { 2, 1, 1, 1 } },
2944 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, { 3, 1, 1, 1 } },
2945 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
2946 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, { 2, 1, 1, 1 } },
2947 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, { 2, 1, 1, 1 } },
2948 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, { 3, 1, 1, 1 } },
2949 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
2950 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, { 2, 1, 1, 1 } },
2951 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
2952 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, { 2, 1, 1, 1 } },
2953
2954 // These truncates are really widening elements.
2955 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 1, 1, 1, 1 } }, // PSHUFD
2956 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } }, // PUNPCKLWD+DQ
2957 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // PUNPCKLBW+WD+PSHUFD
2958 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 1, 1, 1, 1 } }, // PUNPCKLWD
2959 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } }, // PUNPCKLBW+WD
2960 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 1, 1, 1, 1 } }, // PUNPCKLBW
2961
2962 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, { 2, 1, 1, 1 } }, // PAND+PACKUSWB
2963 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 3, 1, 1, 1 } },
2964 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 3, 1, 1, 1 } }, // PAND+2*PACKUSWB
2965 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 7, 1, 1, 1 } },
2966 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, { 1, 1, 1, 1 } },
2967 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 3, 1, 1, 1 } },
2968 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 5, 1, 1, 1 } },
2969 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, {10, 1, 1, 1 } },
2970 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 4, 1, 1, 1 } }, // PAND+3*PACKUSWB
2971 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, { 2, 1, 1, 1 } }, // PSHUFD+PSHUFLW
2972 { ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, { 1, 1, 1, 1 } }, // PSHUFD
2973 };
2974
2975 // Attempt to map directly to (simple) MVT types to let us match custom entries.
2976 EVT SrcTy = TLI->getValueType(DL, Src);
2977 EVT DstTy = TLI->getValueType(DL, Dst);
2978
2979 // The function getSimpleVT only handles simple value types.
2980 if (SrcTy.isSimple() && DstTy.isSimple()) {
2981 MVT SimpleSrcTy = SrcTy.getSimpleVT();
2982 MVT SimpleDstTy = DstTy.getSimpleVT();
2983
2984 if (ST->useAVX512Regs()) {
2985 if (ST->hasBWI())
2986 if (const auto *Entry = ConvertCostTableLookup(
2987 AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2988 if (auto KindCost = Entry->Cost[CostKind])
2989 return *KindCost;
2990
2991 if (ST->hasDQI())
2992 if (const auto *Entry = ConvertCostTableLookup(
2993 AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2994 if (auto KindCost = Entry->Cost[CostKind])
2995 return *KindCost;
2996
2997 if (ST->hasAVX512())
2998 if (const auto *Entry = ConvertCostTableLookup(
2999 AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3000 if (auto KindCost = Entry->Cost[CostKind])
3001 return *KindCost;
3002 }
3003
3004 if (ST->hasBWI())
3005 if (const auto *Entry = ConvertCostTableLookup(
3006 AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3007 if (auto KindCost = Entry->Cost[CostKind])
3008 return *KindCost;
3009
3010 if (ST->hasDQI())
3011 if (const auto *Entry = ConvertCostTableLookup(
3012 AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3013 if (auto KindCost = Entry->Cost[CostKind])
3014 return *KindCost;
3015
3016 if (ST->hasAVX512())
3017 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3018 SimpleDstTy, SimpleSrcTy))
3019 if (auto KindCost = Entry->Cost[CostKind])
3020 return *KindCost;
3021
3022 if (ST->hasAVX2()) {
3023 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3024 SimpleDstTy, SimpleSrcTy))
3025 if (auto KindCost = Entry->Cost[CostKind])
3026 return *KindCost;
3027 }
3028
3029 if (ST->hasAVX()) {
3030 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3031 SimpleDstTy, SimpleSrcTy))
3032 if (auto KindCost = Entry->Cost[CostKind])
3033 return *KindCost;
3034 }
3035
3036 if (ST->hasSSE41()) {
3037 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3038 SimpleDstTy, SimpleSrcTy))
3039 if (auto KindCost = Entry->Cost[CostKind])
3040 return *KindCost;
3041 }
3042
3043 if (ST->hasSSE2()) {
3044 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3045 SimpleDstTy, SimpleSrcTy))
3046 if (auto KindCost = Entry->Cost[CostKind])
3047 return *KindCost;
3048 }
3049 }
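// The lookups above run from the most specific feature set to the most
// generic (BWI/DQI before plain AVX-512, then AVX2, AVX, SSE4.1, SSE2), so
// the first table containing a matching entry supplies the cost.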
3050
3051 // Fall back to legalized types.
3052 std::pair<InstructionCost, MVT> LTSrc = getTypeLegalizationCost(Src);
3053 std::pair<InstructionCost, MVT> LTDest = getTypeLegalizationCost(Dst);
3054
3055 // If we're truncating to the same legalized type - just assume it's free.
3056 if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second)
3057 return TTI::TCC_Free;
3058
3059 if (ST->useAVX512Regs()) {
3060 if (ST->hasBWI())
3061 if (const auto *Entry = ConvertCostTableLookup(
3062 AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second))
3063 if (auto KindCost = Entry->Cost[CostKind])
3064 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3065
3066 if (ST->hasDQI())
3067 if (const auto *Entry = ConvertCostTableLookup(
3068 AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second))
3069 if (auto KindCost = Entry->Cost[CostKind])
3070 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3071
3072 if (ST->hasAVX512())
3073 if (const auto *Entry = ConvertCostTableLookup(
3074 AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second))
3075 if (auto KindCost = Entry->Cost[CostKind])
3076 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3077 }
3078
3079 if (ST->hasBWI())
3080 if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
3081 LTDest.second, LTSrc.second))
3082 if (auto KindCost = Entry->Cost[CostKind])
3083 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3084
3085 if (ST->hasDQI())
3086 if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
3087 LTDest.second, LTSrc.second))
3088 if (auto KindCost = Entry->Cost[CostKind])
3089 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3090
3091 if (ST->hasAVX512())
3092 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3093 LTDest.second, LTSrc.second))
3094 if (auto KindCost = Entry->Cost[CostKind])
3095 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3096
3097 if (ST->hasAVX2())
3098 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3099 LTDest.second, LTSrc.second))
3100 if (auto KindCost = Entry->Cost[CostKind])
3101 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3102
3103 if (ST->hasAVX())
3104 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3105 LTDest.second, LTSrc.second))
3106 if (auto KindCost = Entry->Cost[CostKind])
3107 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3108
3109 if (ST->hasSSE41())
3110 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3111 LTDest.second, LTSrc.second))
3112 if (auto KindCost = Entry->Cost[CostKind])
3113 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3114
3115 if (ST->hasSSE2())
3116 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3117 LTDest.second, LTSrc.second))
3118 if (auto KindCost = Entry->Cost[CostKind])
3119 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3120
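// Illustrative worked example (hypothetical query): a trunc of v16i64 to
// v16i16 has no direct table entry on a plain SSE2 target, so it falls
// through to here. v16i64 legalizes to 8 x v2i64 (LTSrc.first == 8) and
// v16i16 to 2 x v8i16 (LTDest.first == 2); the SSE2 entry
// { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, { 2, 1, 1, 1 } } then yields a
// reciprocal-throughput cost of max(8, 2) * 2 == 16.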
3121 // Fallback: for i8/i16 sitofp/uitofp cases we need to extend the source
3122 // to i32 and then convert with sitofp.
3123 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3124 1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) {
3125 Type *ExtSrc = Src->getWithNewBitWidth(32);
3126 unsigned ExtOpc =
3127 (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt;
3128
3129 // For scalar loads the extend would be free.
3130 InstructionCost ExtCost = 0;
3131 if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0))))
3132 ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind);
3133
3134 return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc,
3135 TTI::CastContextHint::None, CostKind);
3136 }
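// Note that Instruction::SIToFP is used even for the unsigned case: once an
// i8/i16 value has been zero-extended to i32 it is guaranteed non-negative,
// so sitofp and uitofp produce the same result.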
3137
3138 // Fallback: for fptosi/fptoui to i8/i16 we convert with fptosi to i32 and
3139 // then truncate the result.
3140 if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) &&
3141 1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) {
3142 Type *TruncDst = Dst->getWithNewBitWidth(32);
3143 return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) +
3144 getCastInstrCost(Instruction::Trunc, Dst, TruncDst,
3145 TTI::CastContextHint::None, CostKind);
3146 }
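// As above, the signed conversion stands in for fptoui here: any value that
// fits the narrow unsigned destination is exactly representable in the i32
// fptosi result, and the truncate discards the unused upper bits.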
3147
3148 // TODO: Allow non-throughput costs that aren't binary.
3149 auto AdjustCost = [&CostKind](InstructionCost Cost,
3150 InstructionCost N = 1) -> InstructionCost {
3151 if (CostKind != TTI::TCK_RecipThroughput)
3152 return Cost == 0 ? 0 : N;
3153 return Cost * N;
3154 };
3155 return AdjustCost(
3156 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3157}
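// Illustrative use of this hook (hypothetical caller; FloatTy/Int32Ty are
// assumed to be the usual scalar types from the LLVMContext):
//   InstructionCost C = TTI.getCastInstrCost(
//       Instruction::SIToFP, FixedVectorType::get(FloatTy, 4),
//       FixedVectorType::get(Int32Ty, 4), TTI::CastContextHint::None,
//       TTI::TCK_RecipThroughput);
//   // On an SSE4.1 target this is answered by the table entry
//   // { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, { 1, 1, 1, 1 } }.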
3158
3159InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
3160 Type *CondTy,
3161 CmpInst::Predicate VecPred,
3162 TTI::TargetCostKind CostKind,
3163 const Instruction *I) {
3164 // Early out if this type isn't scalar/vector integer/float.
3165 if (!(ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy()))
3166 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3167 I);
3168
3169 // Legalize the type.
3170 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3171
3172 MVT MTy = LT.second;
3173
3174 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3175 assert(ISD && "Invalid opcode");
3176
3177 InstructionCost ExtraCost = 0;
3178 if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
3179 // Some vector comparison predicates cost extra instructions.
3180 // TODO: Adjust ExtraCost based on CostKind?
3181 // TODO: Should we invert this and assume worst case cmp costs
3182 // and reduce for particular predicates?
3183 if (MTy.isVector() &&
3184 !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
3185 (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
3186 ST->hasBWI())) {
3187 // Fall back to I's predicate if a specific one wasn't specified.
3188 CmpInst::Predicate Pred = VecPred;
3189 if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE ||
3190 Pred == CmpInst::BAD_FCMP_PREDICATE))
3191 Pred = cast<CmpInst>(I)->getPredicate();
3192
3193 bool CmpWithConstant = false;
3194 if (auto *CmpInstr = dyn_cast_or_null<CmpInst>(I))
3195 CmpWithConstant = isa<Constant>(CmpInstr->getOperand(1));
3196
3197 switch (Pred) {
3198 case CmpInst::Predicate::ICMP_NE:
3199 // xor(cmpeq(x,y),-1)
3200 ExtraCost = CmpWithConstant ? 0 : 1;
3201 break;
3202 case CmpInst::Predicate::ICMP_SGE:
3203 case CmpInst::Predicate::ICMP_SLE:
3204 // xor(cmpgt(x,y),-1)
3205 ExtraCost = CmpWithConstant ? 0 : 1;
3206 break;
3207 case CmpInst::Predicate::ICMP_ULT:
3208 case CmpInst::Predicate::ICMP_UGT:
3209 // cmpgt(xor(x,signbit),xor(y,signbit))
3210 // xor(cmpeq(pmaxu(x,y),x),-1)
3211 ExtraCost = CmpWithConstant ? 1 : 2;
3212 break;
3213 case CmpInst::Predicate::ICMP_ULE:
3214 case CmpInst::Predicate::ICMP_UGE:
3215 if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
3216 (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
3217 // cmpeq(psubus(x,y),0)
3218 // cmpeq(pminu(x,y),x)
3219 ExtraCost = 1;
3220 } else {
3221 // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
3222 ExtraCost = CmpWithConstant ? 2 : 3;
3223 }
3224 break;
3225 case CmpInst::Predicate::FCMP_ONE:
3226 case CmpInst::Predicate::FCMP_UEQ:
3227 // Without AVX we need to expand FCMP_ONE/FCMP_UEQ cases.
3228 // Use FCMP_UEQ expansion - FCMP_ONE should be the same.
3229 if (CondTy && !ST->hasAVX())
3230 return getCmpSelInstrCost(Opcode, ValTy, CondTy,
3231 CmpInst::Predicate::FCMP_UNO, CostKind) +
3232 getCmpSelInstrCost(Opcode, ValTy, CondTy,
3233 CmpInst::Predicate::FCMP_OEQ, CostKind) +
3234 getArithmeticInstrCost(Instruction::Or, CondTy, CostKind);
3235
3236 break;
3237 case CmpInst::Predicate::BAD_ICMP_PREDICATE:
3238 case CmpInst::Predicate::BAD_FCMP_PREDICATE:
3239 // Assume worst case scenario and add the maximum extra cost.
3240 ExtraCost = 3;
3241 break;
3242 default:
3243 break;
3244 }
3245 }
3246 }
3247
3248 static const CostKindTblEntry SLMCostTbl[] = {
3249 // slm pcmpeq/pcmpgt throughput is 2
3250 { ISD::SETCC, MVT::v2i64, { 2, 5, 1, 2 } },
3251 // slm pblendvb/blendvpd/blendvps throughput is 4
3252 { ISD::SELECT, MVT::v2f64, { 4, 4, 1, 3 } }, // vblendvpd
3253 { ISD::SELECT, MVT::v4f32, { 4, 4, 1, 3 } }, // vblendvps
3254 { ISD::SELECT, MVT::v2i64, { 4, 4, 1, 3 } }, // pblendvb
3255 { ISD::SELECT, MVT::v8i32, { 4, 4, 1, 3 } }, // pblendvb
3256 { ISD::SELECT, MVT::v8i16, { 4, 4, 1, 3 } }, // pblendvb
3257 { ISD::SELECT, MVT::v16i8, { 4, 4, 1, 3 } }, // pblendvb
3258 };
3259
3260 static const CostKindTblEntry AVX512BWCostTbl[] = {
3261 { ISD::SETCC, MVT::v32i16, { 1, 1, 1, 1 } },
3262 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 1 } },
3263 { ISD::SETCC, MVT::v64i8, { 1, 1, 1, 1 } },
3264 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 1 } },
3265
3266 { ISD::SELECT, MVT::v32i16, { 1, 1, 1, 1 } },
3267 { ISD::SELECT, MVT::v64i8, { 1, 1, 1, 1 } },
3268 };
3269
3270 static const CostKindTblEntry AVX512CostTbl[] = {
3271 { ISD::SETCC, MVT::v8f64, { 1, 4, 1, 1 } },
3272 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 1 } },
3273 { ISD::SETCC, MVT::v16f32, { 1, 4, 1, 1 } },
3274 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 1 } },
3275
3276 { ISD::SETCC, MVT::v8i64, { 1, 1, 1, 1 } },
3277 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 1 } },
3278 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3279 { ISD::SETCC, MVT::v16i32, { 1, 1, 1, 1 } },
3280 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 1 } },
3281 { ISD::SETCC, MVT::v32i16, { 3, 7, 5, 5 } },
3282 { ISD::SETCC, MVT::v64i8, { 3, 7, 5, 5 } },
3283
3284 { ISD::SELECT, MVT::v8i64, { 1, 1, 1, 1 } },
3285 { ISD::SELECT, MVT::v4i64, { 1, 1, 1, 1 } },
3286 { ISD::SELECT, MVT::v2i64, { 1, 1, 1, 1 } },
3287 { ISD::SELECT, MVT::v16i32, { 1, 1, 1, 1 } },
3288 { ISD::SELECT, MVT::v8i32, { 1, 1, 1, 1 } },
3289 { ISD::SELECT, MVT::v4i32, { 1, 1, 1, 1 } },
3290 { ISD::SELECT, MVT::v8f64, { 1, 1, 1, 1 } },
3291 { ISD::SELECT, MVT::v4f64, { 1, 1, 1, 1 } },
3292 { ISD::SELECT, MVT::v2f64, { 1, 1, 1, 1 } },
3293 { ISD::SELECT, MVT::f64, { 1, 1, 1, 1 } },
3294 { ISD::SELECT, MVT::v16f32, { 1, 1, 1, 1 } },
3295 { ISD::SELECT, MVT::v8f32 , { 1, 1, 1, 1 } },
3296 { ISD::SELECT, MVT::v4f32, { 1, 1, 1, 1 } },
3297 { ISD::SELECT, MVT::f32 , { 1, 1, 1, 1 } },
3298
3299 { ISD::SELECT, MVT::v32i16, { 2, 2, 4, 4 } },
3300 { ISD::SELECT, MVT::v16i16, { 1, 1, 1, 1 } },
3301 { ISD::SELECT, MVT::v8i16, { 1, 1, 1, 1 } },
3302 { ISD::SELECT, MVT::v64i8, { 2, 2, 4, 4 } },
3303 { ISD::SELECT, MVT::v32i8, { 1, 1, 1, 1 } },
3304 { ISD::SELECT, MVT::v16i8, { 1, 1, 1, 1 } },
3305 };
3306
3307 static const CostKindTblEntry AVX2CostTbl[] = {
3308 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 2 } },
3309 { ISD::SETCC, MVT::v2f64, { 1, 4, 1, 1 } },
3310 { ISD::SETCC, MVT::f64, { 1, 4, 1, 1 } },
3311 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 2 } },
3312 { ISD::SETCC, MVT::v4f32, { 1, 4, 1, 1 } },
3313 { ISD::SETCC, MVT::f32, { 1, 4, 1, 1 } },
3314
3315 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 2 } },
3316 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 2 } },
3317 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 2 } },
3318 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 2 } },
3319
3320 { ISD::SELECT, MVT::v4f64, { 2, 2, 1, 2 } }, // vblendvpd
3321 { ISD::SELECT, MVT::v8f32, { 2, 2, 1, 2 } }, // vblendvps
3322 { ISD::SELECT, MVT::v4i64, { 2, 2, 1, 2 } }, // pblendvb
3323 { ISD::SELECT, MVT::v8i32, { 2, 2, 1, 2 } }, // pblendvb
3324 { ISD::SELECT, MVT::v16i16, { 2, 2, 1, 2 } }, // pblendvb
3325 { ISD::SELECT, MVT::v32i8, { 2, 2, 1, 2 } }, // pblendvb
3326 };
3327
3328 static const CostKindTblEntry XOPCostTbl[] = {
3329 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3330 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3331 };
3332
3333 static const CostKindTblEntry AVX1CostTbl[] = {
3334 { ISD::SETCC, MVT::v4f64, { 2, 3, 1, 2 } },
3335 { ISD::SETCC, MVT::v2f64, { 1, 3, 1, 1 } },
3336 { ISD::SETCC, MVT::f64, { 1, 3, 1, 1 } },
3337 { ISD::SETCC, MVT::v8f32, { 2, 3, 1, 2 } },
3338 { ISD::SETCC, MVT::v4f32, { 1, 3, 1, 1 } },
3339 { ISD::SETCC, MVT::f32, { 1, 3, 1, 1 } },
3340
3341 // AVX1 does not support 8-wide integer compare.
3342 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3343 { ISD::SETCC, MVT::v8i32, { 4, 2, 5, 6 } },
3344 { ISD::SETCC, MVT::v16i16, { 4, 2, 5, 6 } },
3345 { ISD::SETCC, MVT::v32i8, { 4, 2, 5, 6 } },
3346
3347 { ISD::SELECT, MVT::v4f64, { 3, 3, 1, 2 } }, // vblendvpd
3348 { ISD::SELECT, MVT::v8f32, { 3, 3, 1, 2 } }, // vblendvps
3349 { ISD::SELECT, MVT::v4i64, { 3, 3, 1, 2 } }, // vblendvpd
3350 { ISD::SELECT, MVT::v8i32, { 3, 3, 1, 2 } }, // vblendvps
3351 { ISD::SELECT, MVT::v16i16, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3352 { ISD::SELECT, MVT::v32i8, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3353 };
3354
3355 static const CostKindTblEntry SSE42CostTbl[] = {
3356 { ISD::SETCC, MVT::v2i64, { 1, 2, 1, 2 } },
3357 };
3358
3359 static const CostKindTblEntry SSE41CostTbl[] = {
3360 { ISD::SETCC, MVT::v2f64, { 1, 5, 1, 1 } },
3361 { ISD::SETCC, MVT::v4f32, { 1, 5, 1, 1 } },
3362
3363 { ISD::SELECT, MVT::v2f64, { 2, 2, 1, 2 } }, // blendvpd
3364 { ISD::SELECT, MVT::f64, { 2, 2, 1, 2 } }, // blendvpd
3365 { ISD::SELECT, MVT::v4f32, { 2, 2, 1, 2 } }, // blendvps
3366 { ISD::SELECT, MVT::f32 , { 2, 2, 1, 2 } }, // blendvps
3367 { ISD::SELECT, MVT::v2i64, { 2, 2, 1, 2 } }, // pblendvb
3368 { ISD::SELECT, MVT::v4i32, { 2, 2, 1, 2 } }, // pblendvb
3369 { ISD::SELECT, MVT::v8i16, { 2, 2, 1, 2 } }, // pblendvb
3370 { ISD::SELECT, MVT::v16i8, { 2, 2, 1, 2 } }, // pblendvb
3371 };
3372
3373 static const CostKindTblEntry SSE2CostTbl[] = {
3374 { ISD::SETCC, MVT::v2f64, { 2, 5, 1, 1 } },
3375 { ISD::SETCC, MVT::f64, { 1, 5, 1, 1 } },
3376
3377 { ISD::SETCC, MVT::v2i64, { 5, 4, 5, 5 } }, // pcmpeqd/pcmpgtd expansion
3378 { ISD::SETCC, MVT::v4i32, { 1, 1, 1, 1 } },
3379 { ISD::SETCC, MVT::v8i16, { 1, 1, 1, 1 } },
3380 { ISD::SETCC, MVT::v16i8, { 1, 1, 1, 1 } },
3381
3382 { ISD::SELECT, MVT::v2f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3383 { ISD::SELECT, MVT::f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3384 { ISD::SELECT, MVT::v2i64, { 2, 2, 3, 3 } }, // pand + pandn + por
3385 { ISD::SELECT, MVT::v4i32, { 2, 2, 3, 3 } }, // pand + pandn + por
3386 { ISD::SELECT, MVT::v8i16, { 2, 2, 3, 3 } }, // pand + pandn + por
3387 { ISD::SELECT, MVT::v16i8, { 2, 2, 3, 3 } }, // pand + pandn + por
3388 };
3389
3390 static const CostKindTblEntry SSE1CostTbl[] = {
3391 { ISD::SETCC, MVT::v4f32, { 2, 5, 1, 1 } },
3392 { ISD::SETCC, MVT::f32, { 1, 5, 1, 1 } },
3393
3394 { ISD::SELECT, MVT::v4f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3395 { ISD::SELECT, MVT::f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3396 };
3397
3398 if (ST->useSLMArithCosts())
3399 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
3400 if (auto KindCost = Entry->Cost[CostKind])
3401 return LT.first * (ExtraCost + *KindCost);
3402
3403 if (ST->hasBWI())
3404 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
3405 if (auto KindCost = Entry->Cost[CostKind])
3406 return LT.first * (ExtraCost + *KindCost);
3407
3408 if (ST->hasAVX512())
3409 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
3410 if (auto KindCost = Entry->Cost[CostKind])
3411 return LT.first * (ExtraCost + *KindCost);
3412
3413 if (ST->hasAVX2())
3414 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
3415 if (auto KindCost = Entry->Cost[CostKind])
3416 return LT.first * (ExtraCost + *KindCost);
3417
3418 if (ST->hasXOP())
3419 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
3420 if (auto KindCost = Entry->Cost[CostKind])
3421 return LT.first * (ExtraCost + *KindCost);
3422
3423 if (ST->hasAVX())
3424 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
3425 if (auto KindCost = Entry->Cost[CostKind])
3426 return LT.first * (ExtraCost + *KindCost);
3427
3428 if (ST->hasSSE42())
3429 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
3430 if (auto KindCost = Entry->Cost[CostKind])
3431 return LT.first * (ExtraCost + *KindCost);
3432
3433 if (ST->hasSSE41())
3434 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
3435 if (auto KindCost = Entry->Cost[CostKind])
3436 return LT.first * (ExtraCost + *KindCost);
3437
3438 if (ST->hasSSE2())
3439 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
3440 if (auto KindCost = Entry->Cost[CostKind])
3441 return LT.first * (ExtraCost + *KindCost);
3442
3443 if (ST->hasSSE1())
3444 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
3445 if (auto KindCost = Entry->Cost[CostKind])
3446 return LT.first * (ExtraCost + *KindCost);
3447
3448 // Assume a 3cy latency for fp select ops.
3449 if (CostKind == TTI::TCK_Latency && Opcode == Instruction::Select)
3450 if (ValTy->getScalarType()->isFloatingPointTy())
3451 return 3;
3452
3453 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
3454}
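// Illustrative worked example (hypothetical query): an icmp ugt on v16i8
// with a non-constant RHS on a plain SSE2 target takes the ICMP_UGT branch
// above (ExtraCost = 2 for the sign-bit bias of both operands), and the SSE2
// SETCC v16i8 entry contributes 1, so the reciprocal-throughput cost is
// LT.first * (2 + 1) == 3.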
3455
3456unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; }
3457
3458InstructionCost
3459X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
3460 TTI::TargetCostKind CostKind) {
3461 // Costs should match the codegen from:
3462 // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
3463 // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
3464 // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
3465 // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
3466 // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
3467
3468 // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
3469 // specialized in these tables yet.
3470 static const CostKindTblEntry AVX512VBMI2CostTbl[] = {
3471 { ISD::FSHL, MVT::v8i64, { 1, 1, 1, 1 } },
3472 { ISD::FSHL, MVT::v4i64, { 1, 1, 1, 1 } },
3473 { ISD::FSHL, MVT::v2i64, { 1, 1, 1, 1 } },
3474 { ISD::FSHL, MVT::v16i32, { 1, 1, 1, 1 } },
3475 { ISD::FSHL, MVT::v8i32, { 1, 1, 1, 1 } },
3476 { ISD::FSHL, MVT::v4i32, { 1, 1, 1, 1 } },
3477 { ISD::FSHL, MVT::v32i16, { 1, 1, 1, 1 } },
3478 { ISD::FSHL, MVT::v16i16, { 1, 1, 1, 1 } },
3479 { ISD::FSHL, MVT::v8i16, { 1, 1, 1, 1 } },
3480 { ISD::ROTL, MVT::v32i16, { 1, 1, 1, 1 } },
3481 { ISD::ROTL, MVT::v16i16, { 1, 1, 1, 1 } },
3482 { ISD::ROTL, MVT::v8i16, { 1, 1, 1, 1 } },
3483 { ISD::ROTR, MVT::v32i16, { 1, 1, 1, 1 } },
3484 { ISD::ROTR, MVT::v16i16, { 1, 1, 1, 1 } },
3485 { ISD::ROTR, MVT::v8i16, { 1, 1, 1, 1 } },
3486 { X86ISD::VROTLI, MVT::v32i16, { 1, 1, 1, 1 } },
3487 { X86ISD::VROTLI, MVT::v16i16, { 1, 1, 1, 1 } },
3488 { X86ISD::VROTLI, MVT::v8i16, { 1, 1, 1, 1 } },
3489 };
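// With VBMI2 the funnel shifts and 16-bit rotates above lower to single
// vpshld/vpshrd-family instructions, which is why every row is
// { 1, 1, 1, 1 }.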
3490 static const CostKindTblEntry AVX512BITALGCostTbl[] = {
3491 { ISD::CTPOP, MVT::v32i16, { 1, 1, 1, 1 } },
3492 { ISD::CTPOP, MVT::v64i8, { 1, 1, 1, 1 } },
3493 { ISD::CTPOP, MVT::v16i16, { 1, 1, 1, 1 } },
3494 { ISD::CTPOP, MVT::v32i8, { 1, 1, 1, 1 } },
3495 { ISD::CTPOP, MVT::v8i16, { 1, 1, 1, 1 } },
3496 { ISD::CTPOP, MVT::v16i8, { 1, 1, 1, 1 } },
3497 };
3498 static const CostKindTblEntry AVX512VPOPCNTDQCostTbl[] = {
3499 { ISD::CTPOP, MVT::v8i64, { 1, 1, 1, 1 } },
3500 { ISD::CTPOP, MVT::v16i32, { 1, 1, 1, 1 } },
3501 { ISD::CTPOP, MVT::v4i64, { 1, 1, 1, 1 } },
3502 { ISD::CTPOP, MVT::v8i32, { 1, 1, 1, 1 } },
3503 { ISD::CTPOP, MVT::v2i64, { 1, 1, 1, 1 } },
3504 { ISD::CTPOP, MVT::v4i32, { 1, 1, 1, 1 } },
3505 };
3506 static const CostKindTblEntry AVX512CDCostTbl[] = {
3507 { ISD::CTLZ, MVT::v8i64, { 1, 5, 1, 1 } },
3508 { ISD::CTLZ, MVT::v16i32, { 1, 5, 1, 1 } },
3509 { ISD::CTLZ, MVT::v32i16, { 18, 27, 23, 27 } },
3510 { ISD::CTLZ, MVT::v64i8, { 3, 16, 9, 11 } },
3511 { ISD::CTLZ, MVT::v4i64, { 1, 5, 1, 1 } },
3512 { ISD::CTLZ, MVT::v8i32, { 1, 5, 1, 1 } },
3513 { ISD::CTLZ, MVT::v16i16, { 8, 19, 11, 13 } },
3514 { ISD::CTLZ, MVT::v32i8, { 2, 11, 9, 10 } },
3515 { ISD::CTLZ, MVT::v2i64, { 1, 5, 1, 1 } },
3516 { ISD::CTLZ, MVT::v4i32, { 1, 5, 1, 1 } },
3517 { ISD::CTLZ, MVT::v8i16, { 3, 15, 4, 6 } },
3518 { ISD::CTLZ, MVT::v16i8, { 2, 10, 9, 10 } },
3519
3520 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3521 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3522 { ISD::CTTZ, MVT::v4i64, { 1, 8, 6, 6 } },
3523 { ISD::CTTZ, MVT::v8i32, { 1, 8, 6, 6 } },
3524 { ISD::CTTZ, MVT::v2i64, { 1, 8, 6, 6 } },
3525 { ISD::CTTZ, MVT::v4i32, { 1, 8, 6, 6 } },
3526 };
3527 static const CostKindTblEntry AVX512BWCostTbl[] = {
3528 { ISD::ABS, MVT::v32i16, { 1, 1, 1, 1 } },
3529 { ISD::ABS, MVT::v64i8, { 1, 1, 1, 1 } },
3530 { ISD::BITREVERSE, MVT::v2i64, { 3, 10, 10, 11 } },
3531 { ISD::BITREVERSE, MVT::v4i64, { 3, 11, 10, 11 } },
3532 { ISD::BITREVERSE, MVT::v8i64, { 3, 12, 10, 14 } },
3533 { ISD::BITREVERSE, MVT::v4i32, { 3, 10, 10, 11 } },
3534 { ISD::BITREVERSE, MVT::v8i32, { 3, 11, 10, 11 } },
3535 { ISD::BITREVERSE, MVT::v16i32, { 3, 12, 10, 14 } },
3536 { ISD::BITREVERSE, MVT::v8i16, { 3, 10, 10, 11 } },
3537 { ISD::BITREVERSE, MVT::v16i16, { 3, 11, 10, 11 } },
3538 { ISD::BITREVERSE, MVT::v32i16, { 3, 12, 10, 14 } },
3539 { ISD::BITREVERSE, MVT::v16i8, { 2, 5, 9, 9 } },
3540 { ISD::BITREVERSE, MVT::v32i8, { 2, 5, 9, 9 } },
3541 { ISD::BITREVERSE, MVT::v64i8, { 2, 5, 9, 12 } },
3542 { ISD::BSWAP, MVT::v2i64, { 1, 1, 1, 2 } },
3543 { ISD::BSWAP, MVT::v4i64, { 1, 1, 1, 2 } },
3544 { ISD::BSWAP, MVT::v8i64, { 1, 1, 1, 2 } },
3545 { ISD::BSWAP, MVT::v4i32, { 1, 1, 1, 2 } },
3546 { ISD::BSWAP, MVT::v8i32, { 1, 1, 1, 2 } },
3547 { ISD::BSWAP, MVT::v16i32, { 1, 1, 1, 2 } },
3548 { ISD::BSWAP, MVT::v8i16, { 1, 1, 1, 2 } },
3549 { ISD::BSWAP, MVT::v16i16, { 1, 1, 1, 2 } },
3550 { ISD::BSWAP, MVT::v32i16, { 1, 1, 1, 2 } },
3551 { ISD::CTLZ, MVT::v8i64, { 8, 22, 23, 23 } },
3552 { ISD::CTLZ, MVT::v16i32, { 8, 23, 25, 25 } },
3553 { ISD::CTLZ, MVT::v32i16, { 4, 15, 15, 16 } },
3554 { ISD::CTLZ, MVT::v64i8, { 3, 12, 10, 9 } },
3555 { ISD::CTPOP, MVT::v2i64, { 3, 7, 10, 10 } },
3556 { ISD::CTPOP, MVT::v4i64, { 3, 7, 10, 10 } },
3557 { ISD::CTPOP, MVT::v8i64, { 3, 8, 10, 12 } },
3558 { ISD::CTPOP, MVT::v4i32, { 7, 11, 14, 14 } },
3559 { ISD::CTPOP, MVT::v8i32, { 7, 11, 14, 14 } },
3560 { ISD::CTPOP, MVT::v16i32, { 7, 12, 14, 16 } },
3561 { ISD::CTPOP, MVT::v8i16, { 2, 7, 11, 11 } },
3562 { ISD::CTPOP, MVT::v16i16, { 2, 7, 11, 11 } },
3563 { ISD::CTPOP, MVT::v32i16, { 3, 7, 11, 13 } },
3564 { ISD::CTPOP, MVT::v16i8, { 2, 4, 8, 8 } },
3565 { ISD::CTPOP, MVT::v32i8, { 2, 4, 8, 8 } },
3566 { ISD::CTPOP, MVT::v64i8, { 2, 5, 8, 10 } },
3567 { ISD::CTTZ, MVT::v8i16, { 3, 9, 14, 14 } },
3568 { ISD::CTTZ, MVT::v16i16, { 3, 9, 14, 14 } },
3569 { ISD::CTTZ, MVT::v32i16, { 3, 10, 14, 16 } },
3570 { ISD::CTTZ, MVT::v16i8, { 2, 6, 11, 11 } },
3571 { ISD::CTTZ, MVT::v32i8, { 2, 6, 11, 11 } },
3572 { ISD::CTTZ, MVT::v64i8, { 3, 7, 11, 13 } },
3573 { ISD::ROTL, MVT::v32i16, { 2, 8, 6, 8 } },
3574 { ISD::ROTL, MVT::v16i16, { 2, 8, 6, 7 } },
3575 { ISD::ROTL, MVT::v8i16, { 2, 7, 6, 7 } },
3576 { ISD::ROTL, MVT::v64i8, { 5, 6, 11, 12 } },
3577 { ISD::ROTL, MVT::v32i8, { 5, 15, 7, 10 } },
3578 { ISD::ROTL, MVT::v16i8, { 5, 15, 7, 10 } },
3579 { ISD::ROTR, MVT::v32i16, { 2, 8, 6, 8 } },
3580 { ISD::ROTR, MVT::v16i16, { 2, 8, 6, 7 } },
3581 { ISD::ROTR, MVT::v8i16, { 2, 7, 6, 7 } },
3582 { ISD::ROTR, MVT::v64i8, { 5, 6, 12, 14 } },
3583 { ISD::ROTR, MVT::v32i8, { 5, 14, 6, 9 } },
3584 { ISD::ROTR, MVT::v16i8, { 5, 14, 6, 9 } },
3585 { X86ISD::VROTLI, MVT::v32i16, { 2, 5, 3, 3 } },
3586 { X86ISD::VROTLI, MVT::v16i16, { 1, 5, 3, 3 } },
3587 { X86ISD::VROTLI, MVT::v8i16, { 1, 5, 3, 3 } },
3588 { X86ISD::VROTLI, MVT::v64i8, { 2, 9, 3, 4 } },
3589 { X86ISD::VROTLI, MVT::v32i8, { 1, 9, 3, 4 } },
3590 { X86ISD::VROTLI, MVT::v16i8, { 1, 8, 3, 4 } },
3591 { ISD::SADDSAT, MVT::v32i16, { 1 } },
3592 { ISD::SADDSAT, MVT::v64i8, { 1 } },
3593 { ISD::SMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3594 { ISD::SMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3595 { ISD::SMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3596 { ISD::SMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3597 { ISD::SSUBSAT, MVT::v32i16, { 1 } },
3598 { ISD::SSUBSAT, MVT::v64i8, { 1 } },
3599 { ISD::UADDSAT, MVT::v32i16, { 1 } },
3600 { ISD::UADDSAT, MVT::v64i8, { 1 } },
3601 { ISD::UMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3602 { ISD::UMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3603 { ISD::UMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3604 { ISD::UMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3605 { ISD::USUBSAT, MVT::v32i16, { 1 } },
3606 { ISD::USUBSAT, MVT::v64i8, { 1 } },
3607 };
3608 static const CostKindTblEntry AVX512CostTbl[] = {
3609 { ISD::ABS, MVT::v8i64, { 1, 1, 1, 1 } },
3610 { ISD::ABS, MVT::v4i64, { 1, 1, 1, 1 } },
3611 { ISD::ABS, MVT::v2i64, { 1, 1, 1, 1 } },
3612 { ISD::ABS, MVT::v16i32, { 1, 1, 1, 1 } },
3613 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 1 } },
3614 { ISD::ABS, MVT::v32i16, { 2, 7, 4, 4 } },
3615 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 1 } },
3616 { ISD::ABS, MVT::v64i8, { 2, 7, 4, 4 } },
3617 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 1 } },
3618 { ISD::BITREVERSE, MVT::v8i64, { 9, 13, 20, 20 } },
3619 { ISD::BITREVERSE, MVT::v16i32, { 9, 13, 20, 20 } },
3620 { ISD::BITREVERSE, MVT::v32i16, { 9, 13, 20, 20 } },
3621 { ISD::BITREVERSE, MVT::v64i8, { 6, 11, 17, 17 } },
3622 { ISD::BSWAP, MVT::v8i64, { 4, 7, 5, 5 } },
3623 { ISD::BSWAP, MVT::v16i32, { 4, 7, 5, 5 } },
3624 { ISD::BSWAP, MVT::v32i16, { 4, 7, 5, 5 } },
3625 { ISD::CTLZ, MVT::v8i64, { 10, 28, 32, 32 } },
3626 { ISD::CTLZ, MVT::v16i32, { 12, 30, 38, 38 } },
3627 { ISD::CTLZ, MVT::v32i16, { 8, 15, 29, 29 } },
3628 { ISD::CTLZ, MVT::v64i8, { 6, 11, 19, 19 } },
3629 { ISD::CTPOP, MVT::v8i64, { 16, 16, 19, 19 } },
3630 { ISD::CTPOP, MVT::v16i32, { 24, 19, 27, 27 } },
3631 { ISD::CTPOP, MVT::v32i16, { 18, 15, 22, 22 } },
3632 { ISD::CTPOP, MVT::v64i8, { 12, 11, 16, 16 } },
3633 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3634 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3635 { ISD::CTTZ, MVT::v32i16, { 7, 17, 27, 27 } },
3636 { ISD::CTTZ, MVT::v64i8, { 6, 13, 21, 21 } },
3637 { ISD::ROTL, MVT::v8i64, { 1, 1, 1, 1 } },
3638 { ISD::ROTL, MVT::v4i64, { 1, 1, 1, 1 } },
3639 { ISD::ROTL, MVT::v2i64, { 1, 1, 1, 1 } },
3640 { ISD::ROTL, MVT::v16i32, { 1, 1, 1, 1 } },
3641 { ISD::ROTL, MVT::v8i32, { 1, 1, 1, 1 } },
3642 { ISD::ROTL, MVT::v4i32, { 1, 1, 1, 1 } },
3643 { ISD::ROTR, MVT::v8i64, { 1, 1, 1, 1 } },
3644 { ISD::ROTR, MVT::v4i64, { 1, 1, 1, 1 } },
3645 { ISD::ROTR, MVT::v2i64, { 1, 1, 1, 1 } },
3646 { ISD::ROTR, MVT::v16i32, { 1, 1, 1, 1 } },
3647 { ISD::ROTR, MVT::v8i32, { 1, 1, 1, 1 } },
3648 { ISD::ROTR, MVT::v4i32, { 1, 1, 1, 1 } },
3649 { X86ISD::VROTLI, MVT::v8i64, { 1, 1, 1, 1 } },
3650 { X86ISD::VROTLI, MVT::v4i64, { 1, 1, 1, 1 } },
3651 { X86ISD::VROTLI, MVT::v2i64, { 1, 1, 1, 1 } },
3652 { X86ISD::VROTLI, MVT::v16i32, { 1, 1, 1, 1 } },
3653 { X86ISD::VROTLI, MVT::v8i32, { 1, 1, 1, 1 } },
3654 { X86ISD::VROTLI, MVT::v4i32, { 1, 1, 1, 1 } },
3655 { ISD::SMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3656 { ISD::SMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3657 { ISD::SMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3658 { ISD::SMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3659 { ISD::SMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3660 { ISD::SMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3661 { ISD::SMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3662 { ISD::SMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3663 { ISD::SMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3664 { ISD::SMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3665 { ISD::SMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3666 { ISD::SMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3667 { ISD::UMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3668 { ISD::UMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3669 { ISD::UMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3670 { ISD::UMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3671 { ISD::UMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3672 { ISD::UMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3673 { ISD::UMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3674 { ISD::UMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3675 { ISD::UMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3676 { ISD::UMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3677 { ISD::UMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3678 { ISD::UMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3679 { ISD::USUBSAT, MVT::v16i32, { 2 } }, // pmaxud + psubd
3680 { ISD::USUBSAT, MVT::v2i64, { 2 } }, // pmaxuq + psubq
3681 { ISD::USUBSAT, MVT::v4i64, { 2 } }, // pmaxuq + psubq
3682 { ISD::USUBSAT, MVT::v8i64, { 2 } }, // pmaxuq + psubq
3683 { ISD::UADDSAT, MVT::v16i32, { 3 } }, // not + pminud + paddd
3684 { ISD::UADDSAT, MVT::v2i64, { 3 } }, // not + pminuq + paddq
3685 { ISD::UADDSAT, MVT::v4i64, { 3 } }, // not + pminuq + paddq
3686 { ISD::UADDSAT, MVT::v8i64, { 3 } }, // not + pminuq + paddq
3687 { ISD::SADDSAT, MVT::v32i16, { 2 } },
3688 { ISD::SADDSAT, MVT::v64i8, { 2 } },
3689 { ISD::SSUBSAT, MVT::v32i16, { 2 } },
3690 { ISD::SSUBSAT, MVT::v64i8, { 2 } },
3691 { ISD::UADDSAT, MVT::v32i16, { 2 } },
3692 { ISD::UADDSAT, MVT::v64i8, { 2 } },
3693 { ISD::USUBSAT, MVT::v32i16, { 2 } },
3694 { ISD::USUBSAT, MVT::v64i8, { 2 } },
3695 { ISD::FMAXNUM, MVT::f32, { 2, 2, 3, 3 } },
3696 { ISD::FMAXNUM, MVT::v4f32, { 1, 1, 3, 3 } },
3697 { ISD::FMAXNUM, MVT::v8f32, { 2, 2, 3, 3 } },
3698 { ISD::FMAXNUM, MVT::v16f32, { 4, 4, 3, 3 } },
3699 { ISD::FMAXNUM, MVT::f64, { 2, 2, 3, 3 } },
3700 { ISD::FMAXNUM, MVT::v2f64, { 1, 1, 3, 3 } },
3701 { ISD::FMAXNUM, MVT::v4f64, { 2, 2, 3, 3 } },
3702 { ISD::FMAXNUM, MVT::v8f64, { 3, 3, 3, 3 } },
3703 { ISD::FSQRT, MVT::f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3704 { ISD::FSQRT, MVT::v4f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3705 { ISD::FSQRT, MVT::v8f32, { 6, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3706 { ISD::FSQRT, MVT::v16f32, { 12, 20, 1, 3 } }, // Skylake from http://www.agner.org/
3707 { ISD::FSQRT, MVT::f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3708 { ISD::FSQRT, MVT::v2f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3709 { ISD::FSQRT, MVT::v4f64, { 12, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3710 { ISD::FSQRT, MVT::v8f64, { 24, 32, 1, 3 } }, // Skylake from http://www.agner.org/
3711 };
3712 static const CostKindTblEntry XOPCostTbl[] = {
3713 { ISD::BITREVERSE, MVT::v4i64, { 3, 6, 5, 6 } },
3714 { ISD::BITREVERSE, MVT::v8i32, { 3, 6, 5, 6 } },
3715 { ISD::BITREVERSE, MVT::v16i16, { 3, 6, 5, 6 } },
3716 { ISD::BITREVERSE, MVT::v32i8, { 3, 6, 5, 6 } },
3717 { ISD::BITREVERSE, MVT::v2i64, { 2, 7, 1, 1 } },
3718 { ISD::BITREVERSE, MVT::v4i32, { 2, 7, 1, 1 } },
3719 { ISD::BITREVERSE, MVT::v8i16, { 2, 7, 1, 1 } },
3720 { ISD::BITREVERSE, MVT::v16i8, { 2, 7, 1, 1 } },
3721 { ISD::BITREVERSE, MVT::i64, { 2, 2, 3, 4 } },
3722 { ISD::BITREVERSE, MVT::i32, { 2, 2, 3, 4 } },
3723 { ISD::BITREVERSE, MVT::i16, { 2, 2, 3, 4 } },
3724 { ISD::BITREVERSE, MVT::i8, { 2, 2, 3, 4 } },
3725 // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
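// Illustrative sketch only (not taken from a scheduler model): a variable
// v4i32 rotate-right might lower to roughly
//   vpsubd xmm2, xmm_zero, xmm1   ; negate the rotate amount
//   vprotd xmm0, xmm0, xmm2       ; VPROT rotates left by the result
// which is why the ROTR entries below are costed above the ROTL ones.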
3726 { ISD::ROTL, MVT::v4i64, { 4, 7, 5, 6 } },
3727 { ISD::ROTL, MVT::v8i32, { 4, 7, 5, 6 } },
3728 { ISD::ROTL, MVT::v16i16, { 4, 7, 5, 6 } },
3729 { ISD::ROTL, MVT::v32i8, { 4, 7, 5, 6 } },
3730 { ISD::ROTL, MVT::v2i64, { 1, 3, 1, 1 } },
3731 { ISD::ROTL, MVT::v4i32, { 1, 3, 1, 1 } },
3732 { ISD::ROTL, MVT::v8i16, { 1, 3, 1, 1 } },
3733 { ISD::ROTL, MVT::v16i8, { 1, 3, 1, 1 } },
3734 { ISD::ROTR, MVT::v4i64, { 4, 7, 8, 9 } },
3735 { ISD::ROTR, MVT::v8i32, { 4, 7, 8, 9 } },
3736 { ISD::ROTR, MVT::v16i16, { 4, 7, 8, 9 } },
3737 { ISD::ROTR, MVT::v32i8, { 4, 7, 8, 9 } },
3738 { ISD::ROTR, MVT::v2i64, { 1, 3, 3, 3 } },
3739 { ISD::ROTR, MVT::v4i32, { 1, 3, 3, 3 } },
3740 { ISD::ROTR, MVT::v8i16, { 1, 3, 3, 3 } },
3741 { ISD::ROTR, MVT::v16i8, { 1, 3, 3, 3 } },
3742 { X86ISD::VROTLI, MVT::v4i64, { 4, 7, 5, 6 } },
3743 { X86ISD::VROTLI, MVT::v8i32, { 4, 7, 5, 6 } },
3744 { X86ISD::VROTLI, MVT::v16i16, { 4, 7, 5, 6 } },
3745 { X86ISD::VROTLI, MVT::v32i8, { 4, 7, 5, 6 } },
3746 { X86ISD::VROTLI, MVT::v2i64, { 1, 3, 1, 1 } },
3747 { X86ISD::VROTLI, MVT::v4i32, { 1, 3, 1, 1 } },
3748 { X86ISD::VROTLI, MVT::v8i16, { 1, 3, 1, 1 } },
3749 { X86ISD::VROTLI, MVT::v16i8, { 1, 3, 1, 1 } },
3750 };
3751 static const CostKindTblEntry AVX2CostTbl[] = {
3752 { ISD::ABS, MVT::v2i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3753 { ISD::ABS, MVT::v4i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3754 { ISD::ABS, MVT::v4i32, { 1, 1, 1, 1 } },
3755 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 2 } },
3756 { ISD::ABS, MVT::v8i16, { 1, 1, 1, 1 } },
3757 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 2 } },
3758 { ISD::ABS, MVT::v16i8, { 1, 1, 1, 1 } },
3759 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 2 } },
3760 { ISD::BITREVERSE, MVT::v2i64, { 3, 11, 10, 11 } },
3761 { ISD::BITREVERSE, MVT::v4i64, { 5, 11, 10, 17 } },
3762 { ISD::BITREVERSE, MVT::v4i32, { 3, 11, 10, 11 } },
3763 { ISD::BITREVERSE, MVT::v8i32, { 5, 11, 10, 17 } },
3764 { ISD::BITREVERSE, MVT::v8i16, { 3, 11, 10, 11 } },
3765 { ISD::BITREVERSE, MVT::v16i16, { 5, 11, 10, 17 } },
3766 { ISD::BITREVERSE, MVT::v16i8, { 3, 6, 9, 9 } },
3767 { ISD::BITREVERSE, MVT::v32i8, { 4, 5, 9, 15 } },
3768 { ISD::BSWAP, MVT::v2i64, { 1, 2, 1, 2 } },
3769 { ISD::BSWAP, MVT::v4i64, { 1, 3, 1, 2 } },
3770 { ISD::BSWAP, MVT::v4i32, { 1, 2, 1, 2 } },
3771 { ISD::BSWAP, MVT::v8i32, { 1, 3, 1, 2 } },
3772 { ISD::BSWAP, MVT::v8i16, { 1, 2, 1, 2 } },
3773 { ISD::BSWAP, MVT::v16i16, { 1, 3, 1, 2 } },
3774 { ISD::CTLZ, MVT::v2i64, { 7, 18, 24, 25 } },
3775 { ISD::CTLZ, MVT::v4i64, { 14, 18, 24, 44 } },
3776 { ISD::CTLZ, MVT::v4i32, { 5, 16, 19, 20 } },
3777 { ISD::CTLZ, MVT::v8i32, { 10, 16, 19, 34 } },
3778 { ISD::CTLZ, MVT::v8i16, { 4, 13, 14, 15 } },
3779 { ISD::CTLZ, MVT::v16i16, { 6, 14, 14, 24 } },
3780 { ISD::CTLZ, MVT::v16i8, { 3, 12, 9, 10 } },
3781 { ISD::CTLZ, MVT::v32i8, { 4, 12, 9, 14 } },
3782 { ISD::CTPOP, MVT::v2i64, { 3, 9, 10, 10 } },
3783 { ISD::CTPOP, MVT::v4i64, { 4, 9, 10, 14 } },
3784 { ISD::CTPOP, MVT::v4i32, { 7, 12, 14, 14 } },
3785 { ISD::CTPOP, MVT::v8i32, { 7, 12, 14, 18 } },
3786 { ISD::CTPOP, MVT::v8i16, { 3, 7, 11, 11 } },
3787 { ISD::CTPOP, MVT::v16i16, { 6, 8, 11, 18 } },
3788 { ISD::CTPOP, MVT::v16i8, { 2, 5, 8, 8 } },
3789 { ISD::CTPOP, MVT::v32i8, { 3, 5, 8, 12 } },
3790 { ISD::CTTZ, MVT::v2i64, { 4, 11, 13, 13 } },
3791 { ISD::CTTZ, MVT::v4i64, { 5, 11, 13, 20 } },
3792 { ISD::CTTZ, MVT::v4i32, { 7, 14, 17, 17 } },
3793 { ISD::CTTZ, MVT::v8i32, { 7, 15, 17, 24 } },
3794 { ISD::CTTZ, MVT::v8i16, { 4, 9, 14, 14 } },
3795 { ISD::CTTZ, MVT::v16i16, { 6, 9, 14, 24 } },
3796 { ISD::CTTZ, MVT::v16i8, { 3, 7, 11, 11 } },
3797 { ISD::CTTZ, MVT::v32i8, { 5, 7, 11, 18 } },
3798 { ISD::SADDSAT, MVT::v16i16, { 1 } },
3799 { ISD::SADDSAT, MVT::v32i8, { 1 } },
3800 { ISD::SMAX, MVT::v2i64, { 2, 7, 2, 3 } },
3801 { ISD::SMAX, MVT::v4i64, { 2, 7, 2, 3 } },
3802 { ISD::SMAX, MVT::v8i32, { 1, 1, 1, 2 } },
3803 { ISD::SMAX, MVT::v16i16, { 1, 1, 1, 2 } },
3804 { ISD::SMAX, MVT::v32i8, { 1, 1, 1, 2 } },
3805 { ISD::SMIN, MVT::v2i64, { 2, 7, 2, 3 } },
3806 { ISD::SMIN, MVT::v4i64, { 2, 7, 2, 3 } },
3807 { ISD::SMIN, MVT::v8i32, { 1, 1, 1, 2 } },
3808 { ISD::SMIN, MVT::v16i16, { 1, 1, 1, 2 } },
3809 { ISD::SMIN, MVT::v32i8, { 1, 1, 1, 2 } },
3810 { ISD::SSUBSAT, MVT::v16i16, { 1 } },
3811 { ISD::SSUBSAT, MVT::v32i8, { 1 } },
3812 { ISD::UADDSAT, MVT::v16i16, { 1 } },
3813 { ISD::UADDSAT, MVT::v32i8, { 1 } },
3814 { ISD::UADDSAT, MVT::v8i32, { 3 } }, // not + pminud + paddd
3815 { ISD::UMAX, MVT::v2i64, { 2, 8, 5, 6 } },
3816 { ISD::UMAX, MVT::v4i64, { 2, 8, 5, 8 } },
3817 { ISD::UMAX, MVT::v8i32, { 1, 1, 1, 2 } },
3818 { ISD::UMAX, MVT::v16i16, { 1, 1, 1, 2 } },
3819 { ISD::UMAX, MVT::v32i8, { 1, 1, 1, 2 } },
3820 { ISD::UMIN, MVT::v2i64, { 2, 8, 5, 6 } },
3821 { ISD::UMIN, MVT::v4i64, { 2, 8, 5, 8 } },
3822 { ISD::UMIN, MVT::v8i32, { 1, 1, 1, 2 } },
3823 { ISD::UMIN, MVT::v16i16, { 1, 1, 1, 2 } },
3824 { ISD::UMIN, MVT::v32i8, { 1, 1, 1, 2 } },
3825 { ISD::USUBSAT, MVT::v16i16, { 1 } },
3826 { ISD::USUBSAT, MVT::v32i8, { 1 } },
3827 { ISD::USUBSAT, MVT::v8i32, { 2 } }, // pmaxud + psubd
3828 { ISD::FMAXNUM, MVT::f32, { 2, 7, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
3829 { ISD::FMAXNUM, MVT::v4f32, { 2, 7, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3830 { ISD::FMAXNUM, MVT::v8f32, { 3, 7, 3, 6 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3831 { ISD::FMAXNUM, MVT::f64, { 2, 7, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
3832 { ISD::FMAXNUM, MVT::v2f64, { 2, 7, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3833 { ISD::FMAXNUM, MVT::v4f64, { 3, 7, 3, 6 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3834 { ISD::FSQRT, MVT::f32, { 7, 15, 1, 1 } }, // vsqrtss
3835 { ISD::FSQRT, MVT::v4f32, { 7, 15, 1, 1 } }, // vsqrtps
3836 { ISD::FSQRT, MVT::v8f32, { 14, 21, 1, 3 } }, // vsqrtps
3837 { ISD::FSQRT, MVT::f64, { 14, 21, 1, 1 } }, // vsqrtsd
3838 { ISD::FSQRT, MVT::v2f64, { 14, 21, 1, 1 } }, // vsqrtpd
3839 { ISD::FSQRT, MVT::v4f64, { 28, 35, 1, 3 } }, // vsqrtpd
3840 };
3841 static const CostKindTblEntry AVX1CostTbl[] = {
3842 { ISD::ABS, MVT::v4i64, { 6, 8, 6, 12 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3843 { ISD::ABS, MVT::v8i32, { 3, 6, 4, 5 } },
3844 { ISD::ABS, MVT::v16i16, { 3, 6, 4, 5 } },
3845 { ISD::ABS, MVT::v32i8, { 3, 6, 4, 5 } },
3846 { ISD::BITREVERSE, MVT::v4i64, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
3847 { ISD::BITREVERSE, MVT::v2i64, { 8, 13, 10, 16 } },
3848 { ISD::BITREVERSE, MVT::v8i32, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
3849 { ISD::BITREVERSE, MVT::v4i32, { 8, 13, 10, 16 } },
3850 { ISD::BITREVERSE, MVT::v16i16, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
3851 { ISD::BITREVERSE, MVT::v8i16, { 8, 13, 10, 16 } },
3852 { ISD::BITREVERSE, MVT::v32i8, { 13, 15, 17, 26 } }, // 2 x 128-bit Op + extract/insert
3853 { ISD::BITREVERSE, MVT::v16i8, { 7, 7, 9, 13 } },
3854 { ISD::BSWAP, MVT::v4i64, { 5, 6, 5, 10 } },
3855 { ISD::BSWAP, MVT::v2i64, { 2, 2, 1, 3 } },
3856 { ISD::BSWAP, MVT::v8i32, { 5, 6, 5, 10 } },
3857 { ISD::BSWAP, MVT::v4i32, { 2, 2, 1, 3 } },
3858 { ISD::BSWAP, MVT::v16i16, { 5, 6, 5, 10 } },
3859 { ISD::BSWAP, MVT::v8i16, { 2, 2, 1, 3 } },
3860 { ISD::CTLZ, MVT::v4i64, { 29, 33, 49, 58 } }, // 2 x 128-bit Op + extract/insert
3861 { ISD::CTLZ, MVT::v2i64, { 14, 24, 24, 28 } },
3862 { ISD::CTLZ, MVT::v8i32, { 24, 28, 39, 48 } }, // 2 x 128-bit Op + extract/insert
3863 { ISD::CTLZ, MVT::v4i32, { 12, 20, 19, 23 } },
3864 { ISD::CTLZ, MVT::v16i16, { 19, 22, 29, 38 } }, // 2 x 128-bit Op + extract/insert
3865 { ISD::CTLZ, MVT::v8i16, { 9, 16, 14, 18 } },
3866 { ISD::CTLZ, MVT::v32i8, { 14, 15, 19, 28 } }, // 2 x 128-bit Op + extract/insert
3867 { ISD::CTLZ, MVT::v16i8, { 7, 12, 9, 13 } },
3868 { ISD::CTPOP, MVT::v4i64, { 14, 18, 19, 28 } }, // 2 x 128-bit Op + extract/insert
3869 { ISD::CTPOP, MVT::v2i64, { 7, 14, 10, 14 } },
3870 { ISD::CTPOP, MVT::v8i32, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
3871 { ISD::CTPOP, MVT::v4i32, { 9, 20, 14, 18 } },
3872 { ISD::CTPOP, MVT::v16i16, { 16, 21, 22, 31 } }, // 2 x 128-bit Op + extract/insert
3873 { ISD::CTPOP, MVT::v8i16, { 8, 18, 11, 15 } },
3874 { ISD::CTPOP, MVT::v32i8, { 13, 15, 16, 25 } }, // 2 x 128-bit Op + extract/insert
3875 { ISD::CTPOP, MVT::v16i8, { 6, 12, 8, 12 } },
3876 { ISD::CTTZ, MVT::v4i64, { 17, 22, 24, 33 } }, // 2 x 128-bit Op + extract/insert
3877 { ISD::CTTZ, MVT::v2i64, { 9, 19, 13, 17 } },
3878 { ISD::CTTZ, MVT::v8i32, { 21, 27, 32, 41 } }, // 2 x 128-bit Op + extract/insert
3879 { ISD::CTTZ, MVT::v4i32, { 11, 24, 17, 21 } },
3880 { ISD::CTTZ, MVT::v16i16, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
3881 { ISD::CTTZ, MVT::v8i16, { 9, 21, 14, 18 } },
3882 { ISD::CTTZ, MVT::v32i8, { 15, 18, 21, 30 } }, // 2 x 128-bit Op + extract/insert
3883 { ISD::CTTZ, MVT::v16i8, { 8, 16, 11, 15 } },
3884 { ISD::SADDSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
3885 { ISD::SADDSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
3886 { ISD::SMAX, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
3887 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 4 } },
3888 { ISD::SMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3889 { ISD::SMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3890 { ISD::SMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3891 { ISD::SMIN, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
3892 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
3893 { ISD::SMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3894 { ISD::SMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3895 { ISD::SMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3896 { ISD::SSUBSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
3897 { ISD::SSUBSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
3898 { ISD::UADDSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
3899 { ISD::UADDSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
3900 { ISD::UADDSAT, MVT::v8i32, { 8 } }, // 2 x 128-bit Op + extract/insert
3901 { ISD::UMAX, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
3902 { ISD::UMAX, MVT::v2i64, { 4, 8, 5, 7 } },
3903 { ISD::UMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3904 { ISD::UMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3905 { ISD::UMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3906 { ISD::UMIN, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
3907 { ISD::UMIN, MVT::v2i64, { 4, 8, 5, 7 } },
3908 { ISD::UMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3909 { ISD::UMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3910 { ISD::UMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3911 { ISD::USUBSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
3912 { ISD::USUBSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
3913 { ISD::USUBSAT, MVT::v8i32, { 6 } }, // 2 x 128-bit Op + extract/insert
3914 { ISD::FMAXNUM, MVT::f32, { 3, 6, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
3915 { ISD::FMAXNUM, MVT::v4f32, { 3, 6, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3916 { ISD::FMAXNUM, MVT::v8f32, { 5, 7, 3, 10 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3917 { ISD::FMAXNUM, MVT::f64, { 3, 6, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
3918 { ISD::FMAXNUM, MVT::v2f64, { 3, 6, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3919 { ISD::FMAXNUM, MVT::v4f64, { 5, 7, 3, 10 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3920 { ISD::FSQRT, MVT::f32, { 21, 21, 1, 1 } }, // vsqrtss
3921 { ISD::FSQRT, MVT::v4f32, { 21, 21, 1, 1 } }, // vsqrtps
3922 { ISD::FSQRT, MVT::v8f32, { 42, 42, 1, 3 } }, // vsqrtps
3923 { ISD::FSQRT, MVT::f64, { 27, 27, 1, 1 } }, // vsqrtsd
3924 { ISD::FSQRT, MVT::v2f64, { 27, 27, 1, 1 } }, // vsqrtpd
3925 { ISD::FSQRT, MVT::v4f64, { 54, 54, 1, 3 } }, // vsqrtpd
3926 };
3927 static const CostKindTblEntry GFNICostTbl[] = {
3928 { ISD::BITREVERSE, MVT::i8, { 3, 3, 3, 4 } }, // gf2p8affineqb
3929 { ISD::BITREVERSE, MVT::i16, { 3, 3, 4, 6 } }, // gf2p8affineqb
3930 { ISD::BITREVERSE, MVT::i32, { 3, 3, 4, 5 } }, // gf2p8affineqb
3931 { ISD::BITREVERSE, MVT::i64, { 3, 3, 4, 6 } }, // gf2p8affineqb
3932 { ISD::BITREVERSE, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
3933 { ISD::BITREVERSE, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
3934 { ISD::BITREVERSE, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
3935 { ISD::BITREVERSE, MVT::v8i16, { 1, 8, 2, 4 } }, // gf2p8affineqb
3936 { ISD::BITREVERSE, MVT::v16i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
3937 { ISD::BITREVERSE, MVT::v32i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
3938 { ISD::BITREVERSE, MVT::v4i32, { 1, 8, 2, 4 } }, // gf2p8affineqb
3939 { ISD::BITREVERSE, MVT::v8i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
3940 { ISD::BITREVERSE, MVT::v16i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
3941 { ISD::BITREVERSE, MVT::v2i64, { 1, 8, 2, 4 } }, // gf2p8affineqb
3942 { ISD::BITREVERSE, MVT::v4i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
3943 { ISD::BITREVERSE, MVT::v8i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
3944 { X86ISD::VROTLI, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
3945 { X86ISD::VROTLI, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
3946 { X86ISD::VROTLI, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
3947 };
3948 static const CostKindTblEntry GLMCostTbl[] = {
3949 { ISD::FSQRT, MVT::f32, { 19, 20, 1, 1 } }, // sqrtss
3950 { ISD::FSQRT, MVT::v4f32, { 37, 41, 1, 5 } }, // sqrtps
3951 { ISD::FSQRT, MVT::f64, { 34, 35, 1, 1 } }, // sqrtsd
3952 { ISD::FSQRT, MVT::v2f64, { 67, 71, 1, 5 } }, // sqrtpd
3953 };
3954 static const CostKindTblEntry SLMCostTbl[] = {
3955 { ISD::BSWAP, MVT::v2i64, { 5, 5, 1, 5 } },
3956 { ISD::BSWAP, MVT::v4i32, { 5, 5, 1, 5 } },
3957 { ISD::BSWAP, MVT::v8i16, { 5, 5, 1, 5 } },
3958 { ISD::FSQRT, MVT::f32, { 20, 20, 1, 1 } }, // sqrtss
3959 { ISD::FSQRT, MVT::v4f32, { 40, 41, 1, 5 } }, // sqrtps
3960 { ISD::FSQRT, MVT::f64, { 35, 35, 1, 1 } }, // sqrtsd
3961 { ISD::FSQRT, MVT::v2f64, { 70, 71, 1, 5 } }, // sqrtpd
3962 };
3963 static const CostKindTblEntry SSE42CostTbl[] = {
3964 { ISD::USUBSAT, MVT::v4i32, { 2 } }, // pmaxud + psubd
3965 { ISD::UADDSAT, MVT::v4i32, { 3 } }, // not + pminud + paddd
3966 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } }, // MAXSS + CMPUNORDSS + BLENDVPS
3967 { ISD::FMAXNUM, MVT::v4f32, { 4, 4, 4, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3968 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } }, // MAXSD + CMPUNORDSD + BLENDVPD
3969 { ISD::FMAXNUM, MVT::v2f64, { 4, 4, 4, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3970 { ISD::FSQRT, MVT::f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
3971 { ISD::FSQRT, MVT::v4f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
3972 };
3973 static const CostKindTblEntry SSE41CostTbl[] = {
3974 { ISD::ABS, MVT::v2i64, { 3, 4, 3, 5 } }, // BLENDVPD(X,PSUBQ(0,X),X)
3975 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 3 } },
3976 { ISD::SMAX, MVT::v4i32, { 1, 1, 1, 1 } },
3977 { ISD::SMAX, MVT::v16i8, { 1, 1, 1, 1 } },
3978 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
3979 { ISD::SMIN, MVT::v4i32, { 1, 1, 1, 1 } },
3980 { ISD::SMIN, MVT::v16i8, { 1, 1, 1, 1 } },
3981 { ISD::UMAX, MVT::v2i64, { 2, 11, 6, 7 } },
3982 { ISD::UMAX, MVT::v4i32, { 1, 1, 1, 1 } },
3983 { ISD::UMAX, MVT::v8i16, { 1, 1, 1, 1 } },
3984 { ISD::UMIN, MVT::v2i64, { 2, 11, 6, 7 } },
3985 { ISD::UMIN, MVT::v4i32, { 1, 1, 1, 1 } },
3986 { ISD::UMIN, MVT::v8i16, { 1, 1, 1, 1 } },
3987 };
3988 static const CostKindTblEntry SSSE3CostTbl[] = {
3989 { ISD::ABS, MVT::v4i32, { 1, 2, 1, 1 } },
3990 { ISD::ABS, MVT::v8i16, { 1, 2, 1, 1 } },
3991 { ISD::ABS, MVT::v16i8, { 1, 2, 1, 1 } },
3992 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 11, 21 } },
3993 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 11, 21 } },
3994 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 11, 21 } },
3995 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 10, 16 } },
3996 { ISD::BSWAP, MVT::v2i64, { 2, 3, 1, 5 } },
3997 { ISD::BSWAP, MVT::v4i32, { 2, 3, 1, 5 } },
3998 { ISD::BSWAP, MVT::v8i16, { 2, 3, 1, 5 } },
3999 { ISD::CTLZ, MVT::v2i64, { 18, 28, 28, 35 } },
4000 { ISD::CTLZ, MVT::v4i32, { 15, 20, 22, 28 } },
4001 { ISD::CTLZ, MVT::v8i16, { 13, 17, 16, 22 } },
4002 { ISD::CTLZ, MVT::v16i8, { 11, 15, 10, 16 } },
4003 { ISD::CTPOP, MVT::v2i64, { 13, 19, 12, 18 } },
4004 { ISD::CTPOP, MVT::v4i32, { 18, 24, 16, 22 } },
4005 { ISD::CTPOP, MVT::v8i16, { 13, 18, 14, 20 } },
4006 { ISD::CTPOP, MVT::v16i8, { 11, 12, 10, 16 } },
4007 { ISD::CTTZ, MVT::v2i64, { 13, 25, 15, 22 } },
4008 { ISD::CTTZ, MVT::v4i32, { 18, 26, 19, 25 } },
4009 { ISD::CTTZ, MVT::v8i16, { 13, 20, 17, 23 } },
4010 { ISD::CTTZ, MVT::v16i8, { 11, 16, 13, 19 } }
4011 };
4012 static const CostKindTblEntry SSE2CostTbl[] = {
4013 { ISD::ABS, MVT::v2i64, { 3, 6, 5, 5 } },
4014 { ISD::ABS, MVT::v4i32, { 1, 4, 4, 4 } },
4015 { ISD::ABS, MVT::v8i16, { 1, 2, 3, 3 } },
4016 { ISD::ABS, MVT::v16i8, { 1, 2, 3, 3 } },
4017 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 32, 32 } },
4018 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 30, 30 } },
4019 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 25, 25 } },
4020 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 21, 21 } },
4021 { ISD::BSWAP, MVT::v2i64, { 5, 6, 11, 11 } },
4022 { ISD::BSWAP, MVT::v4i32, { 5, 5, 9, 9 } },
4023 { ISD::BSWAP, MVT::v8i16, { 5, 5, 4, 5 } },
4024 { ISD::CTLZ, MVT::v2i64, { 10, 45, 36, 38 } },
4025 { ISD::CTLZ, MVT::v4i32, { 10, 45, 38, 40 } },
4026 { ISD::CTLZ, MVT::v8i16, { 9, 38, 32, 34 } },
4027 { ISD::CTLZ, MVT::v16i8, { 8, 39, 29, 32 } },
4028 { ISD::CTPOP, MVT::v2i64, { 12, 26, 16, 18 } },
4029 { ISD::CTPOP, MVT::v4i32, { 15, 29, 21, 23 } },
4030 { ISD::CTPOP, MVT::v8i16, { 13, 25, 18, 20 } },
4031 { ISD::CTPOP, MVT::v16i8, { 10, 21, 14, 16 } },
4032 { ISD::CTTZ, MVT::v2i64, { 14, 28, 19, 21 } },
4033 { ISD::CTTZ, MVT::v4i32, { 18, 31, 24, 26 } },
4034 { ISD::CTTZ, MVT::v8i16, { 16, 27, 21, 23 } },
4035 { ISD::CTTZ, MVT::v16i8, { 13, 23, 17, 19 } },
4036 { ISD::SADDSAT, MVT::v8i16, { 1 } },
4037 { ISD::SADDSAT, MVT::v16i8, { 1 } },
4038 { ISD::SMAX, MVT::v2i64, { 4, 8, 15, 15 } },
4039 { ISD::SMAX, MVT::v4i32, { 2, 4, 5, 5 } },
4040 { ISD::SMAX, MVT::v8i16, { 1, 1, 1, 1 } },
4041 { ISD::SMAX, MVT::v16i8, { 2, 4, 5, 5 } },
4042 { ISD::SMIN, MVT::v2i64, { 4, 8, 15, 15 } },
4043 { ISD::SMIN, MVT::v4i32, { 2, 4, 5, 5 } },
4044 { ISD::SMIN, MVT::v8i16, { 1, 1, 1, 1 } },
4045 { ISD::SMIN, MVT::v16i8, { 2, 4, 5, 5 } },
4046 { ISD::SSUBSAT, MVT::v8i16, { 1 } },
4047 { ISD::SSUBSAT, MVT::v16i8, { 1 } },
4048 { ISD::UADDSAT, MVT::v8i16, { 1 } },
4049 { ISD::UADDSAT, MVT::v16i8, { 1 } },
4050 { ISD::UMAX, MVT::v2i64, { 4, 8, 15, 15 } },
4051 { ISD::UMAX, MVT::v4i32, { 2, 5, 8, 8 } },
4052 { ISD::UMAX, MVT::v8i16, { 1, 3, 3, 3 } },
4053 { ISD::UMAX, MVT::v16i8, { 1, 1, 1, 1 } },
4054 { ISD::UMIN, MVT::v2i64, { 4, 8, 15, 15 } },
4055 { ISD::UMIN, MVT::v4i32, { 2, 5, 8, 8 } },
4056 { ISD::UMIN, MVT::v8i16, { 1, 3, 3, 3 } },
4057 { ISD::UMIN, MVT::v16i8, { 1, 1, 1, 1 } },
4058 { ISD::USUBSAT, MVT::v8i16, { 1 } },
4059 { ISD::USUBSAT, MVT::v16i8, { 1 } },
4060 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } },
4061 { ISD::FMAXNUM, MVT::v2f64, { 4, 6, 6, 6 } },
4062 { ISD::FSQRT, MVT::f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
4063 { ISD::FSQRT, MVT::v2f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
4064 };
4065 static const CostKindTblEntry SSE1CostTbl[] = {
4066 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } },
4067 { ISD::FMAXNUM, MVT::v4f32, { 4, 6, 6, 6 } },
4068 { ISD::FSQRT, MVT::f32, { 28, 30, 1, 2 } }, // Pentium III from http://www.agner.org/
4069 { ISD::FSQRT, MVT::v4f32, { 56, 56, 1, 2 } }, // Pentium III from http://www.agner.org/
4070 };
4071 static const CostKindTblEntry BMI64CostTbl[] = { // 64-bit targets
4072 { ISD::CTTZ, MVT::i64, { 1 } },
4073 };
4074 static const CostKindTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
4075 { ISD::CTTZ, MVT::i32, { 1 } },
4076 { ISD::CTTZ, MVT::i16, { 1 } },
4077 { ISD::CTTZ, MVT::i8, { 1 } },
4078 };
4079 static const CostKindTblEntry LZCNT64CostTbl[] = { // 64-bit targets
4080 { ISD::CTLZ, MVT::i64, { 1 } },
4081 };
4082 static const CostKindTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
4083 { ISD::CTLZ, MVT::i32, { 1 } },
4084 { ISD::CTLZ, MVT::i16, { 2 } },
4085 { ISD::CTLZ, MVT::i8, { 2 } },
4086 };
4087 static const CostKindTblEntry POPCNT64CostTbl[] = { // 64-bit targets
4088 { ISD::CTPOP, MVT::i64, { 1, 1, 1, 1 } }, // popcnt
4089 };
4090 static const CostKindTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
4091 { ISD::CTPOP, MVT::i32, { 1, 1, 1, 1 } }, // popcnt
4092 { ISD::CTPOP, MVT::i16, { 1, 1, 2, 2 } }, // popcnt(zext())
4093 { ISD::CTPOP, MVT::i8, { 1, 1, 2, 2 } }, // popcnt(zext())
4094 };
4095 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
4096 { ISD::ABS, MVT::i64, { 1, 2, 3, 3 } }, // SUB+CMOV
4097 { ISD::BITREVERSE, MVT::i64, { 10, 12, 20, 22 } },
4098 { ISD::BSWAP, MVT::i64, { 1, 2, 1, 2 } },
4099 { ISD::CTLZ, MVT::i64, { 4 } }, // BSR+XOR or BSR+XOR+CMOV
4100 { ISD::CTLZ_ZERO_UNDEF, MVT::i64,{ 1, 1, 1, 1 } }, // BSR+XOR
4101 { ISD::CTTZ, MVT::i64, { 3 } }, // TEST+BSF+CMOV/BRANCH
4102 { ISD::CTTZ_ZERO_UNDEF, MVT::i64,{ 1, 1, 1, 1 } }, // BSF
4103 { ISD::CTPOP, MVT::i64, { 10, 6, 19, 19 } },
4104 { ISD::ROTL, MVT::i64, { 2, 3, 1, 3 } },
4105 { ISD::ROTR, MVT::i64, { 2, 3, 1, 3 } },
4106 { X86ISD::VROTLI, MVT::i64, { 1, 1, 1, 1 } },
4107 { ISD::FSHL, MVT::i64, { 4, 4, 1, 4 } },
4108 { ISD::SMAX, MVT::i64, { 1, 3, 2, 3 } },
4109 { ISD::SMIN, MVT::i64, { 1, 3, 2, 3 } },
4110 { ISD::UMAX, MVT::i64, { 1, 3, 2, 3 } },
4111 { ISD::UMIN, MVT::i64, { 1, 3, 2, 3 } },
4112 { ISD::SADDO, MVT::i64, { 1 } },
4113 { ISD::UADDO, MVT::i64, { 1 } },
4114 { ISD::UMULO, MVT::i64, { 2 } }, // mulq + seto
4115 };
4116 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
4117 { ISD::ABS, MVT::i32, { 1, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV
4118 { ISD::ABS, MVT::i16, { 2, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV
4119 { ISD::ABS, MVT::i8, { 2, 4, 4, 3 } }, // SUB+XOR+SRA
4120 { ISD::BITREVERSE, MVT::i32, { 9, 12, 17, 19 } },
4121 { ISD::BITREVERSE, MVT::i16, { 9, 12, 17, 19 } },
4122 { ISD::BITREVERSE, MVT::i8, { 7, 9, 13, 14 } },
4123 { ISD::BSWAP, MVT::i32, { 1, 1, 1, 1 } },
4124 { ISD::BSWAP, MVT::i16, { 1, 2, 1, 2 } }, // ROL
4125 { ISD::CTLZ, MVT::i32, { 4 } }, // BSR+XOR or BSR+XOR+CMOV
4126 { ISD::CTLZ, MVT::i16, { 4 } }, // BSR+XOR or BSR+XOR+CMOV
4127 { ISD::CTLZ, MVT::i8, { 4 } }, // BSR+XOR or BSR+XOR+CMOV
4128 { ISD::CTLZ_ZERO_UNDEF, MVT::i32,{ 1, 1, 1, 1 } }, // BSR+XOR
4129 { ISD::CTLZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 3, 3 } }, // BSR+XOR
4130 { ISD::CTLZ_ZERO_UNDEF, MVT::i8, { 2, 2, 3, 3 } }, // BSR+XOR
4131 { ISD::CTTZ, MVT::i32, { 3 } }, // TEST+BSF+CMOV/BRANCH
4132 { ISD::CTTZ, MVT::i16, { 3 } }, // TEST+BSF+CMOV/BRANCH
4133 { ISD::CTTZ, MVT::i8, { 3 } }, // TEST+BSF+CMOV/BRANCH
4134 { ISD::CTTZ_ZERO_UNDEF, MVT::i32,{ 1, 1, 1, 1 } }, // BSF
4135 { ISD::CTTZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 1, 1 } }, // BSF
4136 { ISD::CTTZ_ZERO_UNDEF, MVT::i8, { 2, 2, 1, 1 } }, // BSF
4137 { ISD::CTPOP, MVT::i32, { 8, 7, 15, 15 } },
4138 { ISD::CTPOP, MVT::i16, { 9, 8, 17, 17 } },
4139 { ISD::CTPOP, MVT::i8, { 7, 6, 6, 6 } },
4140 { ISD::ROTL, MVT::i32, { 2, 3, 1, 3 } },
4141 { ISD::ROTL, MVT::i16, { 2, 3, 1, 3 } },
4142 { ISD::ROTL, MVT::i8, { 2, 3, 1, 3 } },
4143 { ISD::ROTR, MVT::i32, { 2, 3, 1, 3 } },
4144 { ISD::ROTR, MVT::i16, { 2, 3, 1, 3 } },
4145 { ISD::ROTR, MVT::i8, { 2, 3, 1, 3 } },
4146 { X86ISD::VROTLI, MVT::i32, { 1, 1, 1, 1 } },
4147 { X86ISD::VROTLI, MVT::i16, { 1, 1, 1, 1 } },
4148 { X86ISD::VROTLI, MVT::i8, { 1, 1, 1, 1 } },
4149 { ISD::FSHL, MVT::i32, { 4, 4, 1, 4 } },
4150 { ISD::FSHL, MVT::i16, { 4, 4, 2, 5 } },
4151 { ISD::FSHL, MVT::i8, { 4, 4, 2, 5 } },
4152 { ISD::SMAX, MVT::i32, { 1, 2, 2, 3 } },
4153 { ISD::SMAX, MVT::i16, { 1, 4, 2, 4 } },
4154 { ISD::SMAX, MVT::i8, { 1, 4, 2, 4 } },
4155 { ISD::SMIN, MVT::i32, { 1, 2, 2, 3 } },
4156 { ISD::SMIN, MVT::i16, { 1, 4, 2, 4 } },
4157 { ISD::SMIN, MVT::i8, { 1, 4, 2, 4 } },
4158 { ISD::UMAX, MVT::i32, { 1, 2, 2, 3 } },
4159 { ISD::UMAX, MVT::i16, { 1, 4, 2, 4 } },
4160 { ISD::UMAX, MVT::i8, { 1, 4, 2, 4 } },
4161 { ISD::UMIN, MVT::i32, { 1, 2, 2, 3 } },
4162 { ISD::UMIN, MVT::i16, { 1, 4, 2, 4 } },
4163 { ISD::UMIN, MVT::i8, { 1, 4, 2, 4 } },
4164 { ISD::SADDO, MVT::i32, { 1 } },
4165 { ISD::SADDO, MVT::i16, { 1 } },
4166 { ISD::SADDO, MVT::i8, { 1 } },
4167 { ISD::UADDO, MVT::i32, { 1 } },
4168 { ISD::UADDO, MVT::i16, { 1 } },
4169 { ISD::UADDO, MVT::i8, { 1 } },
4170 { ISD::UMULO, MVT::i32, { 2 } }, // mul + seto
4171 { ISD::UMULO, MVT::i16, { 2 } },
4172 { ISD::UMULO, MVT::i8, { 2 } },
4173 };
4174
4175 Type *RetTy = ICA.getReturnType();
4176 Type *OpTy = RetTy;
4177 Intrinsic::ID IID = ICA.getID();
4178 unsigned ISD = ISD::DELETED_NODE;
4179 switch (IID) {
4180 default:
4181 break;
4182 case Intrinsic::abs:
4183 ISD = ISD::ABS;
4184 break;
4185 case Intrinsic::bitreverse:
4186 ISD = ISD::BITREVERSE;
4187 break;
4188 case Intrinsic::bswap:
4189 ISD = ISD::BSWAP;
4190 break;
4191 case Intrinsic::ctlz:
4192 ISD = ISD::CTLZ;
4193 break;
4194 case Intrinsic::ctpop:
4195 ISD = ISD::CTPOP;
4196 break;
4197 case Intrinsic::cttz:
4198 ISD = ISD::CTTZ;
4199 break;
4200 case Intrinsic::fshl:
4201 ISD = ISD::FSHL;
4202 if (!ICA.isTypeBasedOnly()) {
4203 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4204 if (Args[0] == Args[1]) {
4205 ISD = ISD::ROTL;
4206 // Handle uniform constant rotation amounts.
4207 // TODO: Handle funnel-shift cases.
4208 const APInt *Amt;
4209 if (Args[2] &&
4210 PatternMatch::match(Args[2], PatternMatch::m_APIntAllowUndef(Amt)))
4211 ISD = X86ISD::VROTLI;
4212 }
4213 }
4214 break;
4215 case Intrinsic::fshr:
4216 // FSHR has the same costs, so don't duplicate.
4217 ISD = ISD::FSHL;
4218 if (!ICA.isTypeBasedOnly()) {
4219 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4220 if (Args[0] == Args[1]) {
4221 ISD = ISD::ROTR;
4222 // Handle uniform constant rotation amount.
4223 // TODO: Handle funnel-shift cases.
4224 const APInt *Amt;
4225 if (Args[2] &&
4226 PatternMatch::match(Args[2], PatternMatch::m_APIntAllowUndef(Amt)))
4227 ISD = X86ISD::VROTLI;
4228 }
4229 }
4230 break;
4231 case Intrinsic::lrint:
4232 case Intrinsic::llrint:
4233 // X86 can use the CVTP2SI instructions to lower lrint/llrint calls, which
4234 // have the same costs as the CVTTP2SI (fptosi) instructions.
4235 if (!ICA.isTypeBasedOnly()) {
4236 const SmallVectorImpl<Type *> &ArgTys = ICA.getArgTypes();
4237 return getCastInstrCost(Instruction::FPToSI, RetTy, ArgTys[0],
4238 TTI::CastContextHint::None, CostKind);
4239 }
4240 break;
4241 case Intrinsic::maxnum:
4242 case Intrinsic::minnum:
4243 // FMINNUM has the same costs, so don't duplicate.
4244 ISD = ISD::FMAXNUM;
4245 break;
4246 case Intrinsic::sadd_sat:
4247 ISD = ISD::SADDSAT;
4248 break;
4249 case Intrinsic::smax:
4250 ISD = ISD::SMAX;
4251 break;
4252 case Intrinsic::smin:
4253 ISD = ISD::SMIN;
4254 break;
4255 case Intrinsic::ssub_sat:
4256 ISD = ISD::SSUBSAT;
4257 break;
4258 case Intrinsic::uadd_sat:
4259 ISD = ISD::UADDSAT;
4260 break;
4261 case Intrinsic::umax:
4262 ISD = ISD::UMAX;
4263 break;
4264 case Intrinsic::umin:
4265 ISD = ISD::UMIN;
4266 break;
4267 case Intrinsic::usub_sat:
4268 ISD = ISD::USUBSAT;
4269 break;
4270 case Intrinsic::sqrt:
4271 ISD = ISD::FSQRT;
4272 break;
4273 case Intrinsic::sadd_with_overflow:
4274 case Intrinsic::ssub_with_overflow:
4275 // SSUBO has the same costs, so don't duplicate.
4276 ISD = ISD::SADDO;
4277 OpTy = RetTy->getContainedType(0);
4278 break;
4279 case Intrinsic::uadd_with_overflow:
4280 case Intrinsic::usub_with_overflow:
4281 // USUBO has the same costs, so don't duplicate.
4282 ISD = ISD::UADDO;
4283 OpTy = RetTy->getContainedType(0);
4284 break;
4285 case Intrinsic::umul_with_overflow:
4286 case Intrinsic::smul_with_overflow:
4287 // SMULO has the same costs, so don't duplicate.
4288 ISD = ISD::UMULO;
4289 OpTy = RetTy->getContainedType(0);
4290 break;
4291 }
4292
4293 if (ISD != ISD::DELETED_NODE) {
4294 auto adjustTableCost = [&](int ISD, unsigned Cost,
4295 std::pair<InstructionCost, MVT> LT,
4296 FastMathFlags FMF) {
4297 InstructionCost LegalizationCost = LT.first;
4298 MVT MTy = LT.second;
4299
4300 // If there are no NaNs to deal with, then these are reduced to a
4301 // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we
4302 // assume is used in the non-fast case.
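// Rough illustration of the two shapes being costed here: with no-NaNs,
// fmax(a, b) can lower to a single MAXPS, while the NaN-aware lowering
// is the MAXPS + CMPUNORDPS + BLENDVPS pattern noted in the FMAXNUM
// table entries above.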
4303 if (ISD == ISD::FMAXNUM || ISD == ISD::FMINNUM) {
4304 if (FMF.noNaNs())
4305 return LegalizationCost * 1;
4306 }
4307
4308 // For cases where some ops can be folded into a load/store, assume free.
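// e.g. on targets with fast MOVBE, "bswap(load p)" with a single use can
// lower to one "movbe reg, [p]" (and a bswap feeding a store to
// "movbe [p], reg"), which is why the one-use load/store cases below
// return TCC_Free.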
4309 if (MTy.isScalarInteger()) {
4310 if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) {
4311 if (const Instruction *II = ICA.getInst()) {
4312 if (II->hasOneUse() && isa<StoreInst>(II->user_back()))
4313 return TTI::TCC_Free;
4314 if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) {
4315 if (LI->hasOneUse())
4316 return TTI::TCC_Free;
4317 }
4318 }
4319 }
4320 }
4321
4322 return LegalizationCost * (int)Cost;
4323 };
4324
4325 // Legalize the type.
4326 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(OpTy);
4327 MVT MTy = LT.second;
4328
4329 // Without BMI/LZCNT see if we're only looking for a *_ZERO_UNDEF cost.
4330 if (((ISD == ISD::CTTZ && !ST->hasBMI()) ||
4331 (ISD == ISD::CTLZ && !ST->hasLZCNT())) &&
4332 !MTy.isVector() && !ICA.isTypeBasedOnly()) {
4333 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4334 if (auto *Cst = dyn_cast<ConstantInt>(Args[1]))
4335 if (Cst->isAllOnesValue())
4336 ISD = ISD == ISD::CTTZ ? ISD::CTTZ_ZERO_UNDEF : ISD::CTLZ_ZERO_UNDEF;
4337 }
4338
4339 // FSQRT is a single instruction.
4340 if (ISD == ISD::FSQRT && CostKind == TTI::TCK_CodeSize)
4341 return LT.first;
4342
4343 if (ST->useGLMDivSqrtCosts())
4344 if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
4345 if (auto KindCost = Entry->Cost[CostKind])
4346 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4347
4348 if (ST->useSLMArithCosts())
4349 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
4350 if (auto KindCost = Entry->Cost[CostKind])
4351 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4352
4353 if (ST->hasVBMI2())
4354 if (const auto *Entry = CostTableLookup(AVX512VBMI2CostTbl, ISD, MTy))
4355 if (auto KindCost = Entry->Cost[CostKind])
4356 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4357
4358 if (ST->hasBITALG())
4359 if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy))
4360 if (auto KindCost = Entry->Cost[CostKind])
4361 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4362
4363 if (ST->hasVPOPCNTDQ())
4364 if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy))
4365 if (auto KindCost = Entry->Cost[CostKind])
4366 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4367
4368 if (ST->hasGFNI())
4369 if (const auto *Entry = CostTableLookup(GFNICostTbl, ISD, MTy))
4370 if (auto KindCost = Entry->Cost[CostKind])
4371 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4372
4373 if (ST->hasCDI())
4374 if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
4375 if (auto KindCost = Entry->Cost[CostKind])
4376 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4377
4378 if (ST->hasBWI())
4379 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
4380 if (auto KindCost = Entry->Cost[CostKind])
4381 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4382
4383 if (ST->hasAVX512())
4384 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
4385 if (auto KindCost = Entry->Cost[CostKind])
4386 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4387
4388 if (ST->hasXOP())
4389 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
4390 if (auto KindCost = Entry->Cost[CostKind])
4391 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4392
4393 if (ST->hasAVX2())
4394 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
4395 if (auto KindCost = Entry->Cost[CostKind])
4396 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4397
4398 if (ST->hasAVX())
4399 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
4400 if (auto KindCost = Entry->Cost[CostKind])
4401 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4402
4403 if (ST->hasSSE42())
4404 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
4405 if (auto KindCost = Entry->Cost[CostKind])
4406 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4407
4408 if (ST->hasSSE41())
4409 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
4410 if (auto KindCost = Entry->Cost[CostKind])
4411 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4412
4413 if (ST->hasSSSE3())
4414 if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
4415 if (auto KindCost = Entry->Cost[CostKind])
4416 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4417
4418 if (ST->hasSSE2())
4419 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
4420 if (auto KindCost = Entry->Cost[CostKind])
4421 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4422
4423 if (ST->hasSSE1())
4424 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
4425 if (auto KindCost = Entry->Cost[CostKind])
4426 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4427
4428 if (ST->hasBMI()) {
4429 if (ST->is64Bit())
4430 if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
4431 if (auto KindCost = Entry->Cost[CostKind])
4432 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4433
4434 if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
4435 if (auto KindCost = Entry->Cost[CostKind])
4436 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4437 }
4438
4439 if (ST->hasLZCNT()) {
4440 if (ST->is64Bit())
4441 if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
4442 if (auto KindCost = Entry->Cost[CostKind])
4443 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4444
4445 if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
4446 if (auto KindCost = Entry->Cost[CostKind])
4447 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4448 }
4449
4450 if (ST->hasPOPCNT()) {
4451 if (ST->is64Bit())
4452 if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
4453 if (auto KindCost = Entry->Cost[CostKind])
4454 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4455
4456 if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
4457 if (auto KindCost = Entry->Cost[CostKind])
4458 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4459 }
4460
4461 if (ST->is64Bit())
4462 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
4463 if (auto KindCost = Entry->Cost[CostKind])
4464 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4465
4466 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
4467 if (auto KindCost = Entry->Cost[CostKind])
4468 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4469 }
4470
4471 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
4472}
4473
4474 InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
4475 TTI::TargetCostKind CostKind,
4476 unsigned Index, Value *Op0,
4477 Value *Op1) {
4478 static const CostTblEntry SLMCostTbl[] = {
4479 { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 },
4480 { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 },
4481 { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 },
4482 { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 7 }
4483 };
4484
4485 assert(Val->isVectorTy() && "This must be a vector type");
4486 Type *ScalarType = Val->getScalarType();
4487 InstructionCost RegisterFileMoveCost = 0;
4488
4489 // Non-immediate extraction/insertion can be handled as a sequence of
4490 // aliased loads+stores via the stack.
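// Illustrative expansion for a variable index (a sketch, not the exact
// SelectionDAG output): "extractelement <4 x float> %v, i64 %idx" becomes
// a store of %v to a stack slot followed by a scalar float load from
// slot + 4 * %idx; the cost below is modeled as exactly that store +
// load pair.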
4491 if (Index == -1U && (Opcode == Instruction::ExtractElement ||
4492 Opcode == Instruction::InsertElement)) {
4493 // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns:
4494 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
4495
4496 // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling.
4497 assert(isa<FixedVectorType>(Val) && "Fixed vector type expected");
4498 Align VecAlign = DL.getPrefTypeAlign(Val);
4499 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
4500
4501 // Extract - store vector to stack, load scalar.
4502 if (Opcode == Instruction::ExtractElement) {
4503 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4504 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
4505 CostKind);
4506 }
4507 // Insert - store vector to stack, store scalar, load vector.
4508 if (Opcode == Instruction::InsertElement) {
4509 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4510 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
4511 CostKind) +
4512 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind);
4513 }
4514 }
4515
4516 if (Index != -1U && (Opcode == Instruction::ExtractElement ||
4517 Opcode == Instruction::InsertElement)) {
4518 // Extraction of vXi1 elements is now efficiently handled by MOVMSK.
4519 if (Opcode == Instruction::ExtractElement &&
4520 ScalarType->getScalarSizeInBits() == 1 &&
4521 cast<FixedVectorType>(Val)->getNumElements() > 1)
4522 return 1;
4523
4524 // Legalize the type.
4525 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
4526
4527 // This type is legalized to a scalar type.
4528 if (!LT.second.isVector())
4529 return 0;
4530
4531 // The type may be split. Normalize the index to the new type.
4532 unsigned SizeInBits = LT.second.getSizeInBits();
4533 unsigned NumElts = LT.second.getVectorNumElements();
4534 unsigned SubNumElts = NumElts;
4535 Index = Index % NumElts;
4536
4537 // For >128-bit vectors, we need to extract higher 128-bit subvectors.
4538 // For inserts, we also need to insert the subvector back.
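// e.g. (sketch) inserting into element 5 of a v8i32: extract the upper
// 128-bit subvector (elements 4-7), insert within it at index 1, then
// insert the subvector back; those extra subvector moves are the
// RegisterFileMoveCost accumulated below.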
4539 if (SizeInBits > 128) {
4540 assert((SizeInBits % 128) == 0 && "Illegal vector");
4541 unsigned NumSubVecs = SizeInBits / 128;
4542 SubNumElts = NumElts / NumSubVecs;
4543 if (SubNumElts <= Index) {
4544 RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
4545 Index %= SubNumElts;
4546 }
4547 }
4548
4549 MVT MScalarTy = LT.second.getScalarType();
4550 auto IsCheapPInsrPExtrInsertPS = [&]() {
4551 // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
4552 // Also, assume insertps is relatively cheap on all >= SSE41 targets.
4553 return (MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4554 (MScalarTy.isInteger() && ST->hasSSE41()) ||
4555 (MScalarTy == MVT::f32 && ST->hasSSE41() &&
4556 Opcode == Instruction::InsertElement);
4557 };
4558
4559 if (Index == 0) {
4560 // Floating point scalars are already located in index #0.
4561 // Many insertions to #0 can fold away for scalar fp-ops, so let's assume
4562 // true for all.
4563 if (ScalarType->isFloatingPointTy() &&
4564 (Opcode != Instruction::InsertElement || !Op0 ||
4565 isa<UndefValue>(Op0)))
4566 return RegisterFileMoveCost;
4567
4568 if (Opcode == Instruction::InsertElement &&
4569 isa_and_nonnull<UndefValue>(Op0)) {
4570 // Consider the gather cost to be cheap.
4571 if (isa_and_nonnull<LoadInst>(Op1))
4572 return RegisterFileMoveCost;
4573 if (!IsCheapPInsrPExtrInsertPS()) {
4574 // mov constant-to-GPR + movd/movq GPR -> XMM.
4575 if (isa_and_nonnull<Constant>(Op1) && Op1->getType()->isIntegerTy())
4576 return 2 + RegisterFileMoveCost;
4577 // Assume movd/movq GPR -> XMM is relatively cheap on all targets.
4578 return 1 + RegisterFileMoveCost;
4579 }
4580 }
4581
4582 // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
4583 if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
4584 return 1 + RegisterFileMoveCost;
4585 }
4586
4587 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4588 assert(ISD && "Unexpected vector opcode");
4589 if (ST->useSLMArithCosts())
4590 if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
4591 return Entry->Cost + RegisterFileMoveCost;
4592
4593 // Consider cheap cases.
4594 if (IsCheapPInsrPExtrInsertPS())
4595 return 1 + RegisterFileMoveCost;
4596
4597 // For extractions we just need to shuffle the element to index 0, which
4598 // should be very cheap (assume cost = 1). For insertions we need to shuffle
4599 // the elements to their destination. In both cases we must handle the
4600 // subvector move(s).
4601 // If the vector type is already less than 128 bits then don't reduce it.
4602 // TODO: Under what circumstances should we shuffle using the full width?
4603 InstructionCost ShuffleCost = 1;
4604 if (Opcode == Instruction::InsertElement) {
4605 auto *SubTy = cast<VectorType>(Val);
4606 EVT VT = TLI->getValueType(DL, Val);
4607 if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
4608 SubTy = FixedVectorType::get(ScalarType, SubNumElts);
4609 ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, std::nullopt,
4610 CostKind, 0, SubTy);
4611 }
4612 int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
4613 return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
4614 }
4615
4616 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1) +
4617 RegisterFileMoveCost;
4618}
4619
4620 InstructionCost
4621 X86TTIImpl::getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts,
4622 bool Insert, bool Extract,
4623 TTI::TargetCostKind CostKind) {
4624 assert(DemandedElts.getBitWidth() ==
4625 cast<FixedVectorType>(Ty)->getNumElements() &&
4626 "Vector size mismatch");
4627
4628 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4629 MVT MScalarTy = LT.second.getScalarType();
4630 unsigned LegalVectorBitWidth = LT.second.getSizeInBits();
4631 InstructionCost Cost = 0;
4632
4633 constexpr unsigned LaneBitWidth = 128;
4634 assert((LegalVectorBitWidth < LaneBitWidth ||
4635 (LegalVectorBitWidth % LaneBitWidth) == 0) &&
4636 "Illegal vector");
4637
4638 const int NumLegalVectors = *LT.first.getValue();
4639 assert(NumLegalVectors >= 0 && "Negative cost!");
4640
4641 // For insertions, an ISD::BUILD_VECTOR style vector initialization can be
4642 // much cheaper than an accumulation of ISD::INSERT_VECTOR_ELT.
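// e.g. building a <4 x i32> from four scalars as one BUILD_VECTOR (movd
// per element plus a short unpack tree) is usually cheaper than a serial
// chain of four pinsrd-style insertions, which is what the Insert
// handling below tries to model.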
4643 if (Insert) {
4644 if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4645 (MScalarTy.isInteger() && ST->hasSSE41()) ||
4646 (MScalarTy == MVT::f32 && ST->hasSSE41())) {
4647 // For types we can insert directly, insertion into 128-bit sub vectors is
4648 // cheap, followed by a cheap chain of concatenations.
4649 if (LegalVectorBitWidth <= LaneBitWidth) {
4650 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert,
4651 /*Extract*/ false, CostKind);
4652 } else {
4653 // For each 128-bit lane: if at least one index is demanded but not all
4654 // indices are demanded, and this lane is not the first 128-bit lane of
4655 // the legalized vector, then this lane needs an extracti128; if a
4656 // 128-bit lane has at least one demanded index, that lane needs an
4657 // inserti128.
4658
4659 // The following cases will help you build a better understanding:
4660 // Assume we insert several elements into a v8i32 vector with AVX2:
4661 // Case#1: inserting into index 1 needs vpinsrd + inserti128.
4662 // Case#2: inserting into index 5 needs extracti128 + vpinsrd +
4663 // inserti128.
4664 // Case#3: inserting into indices 4,5,6,7 needs 4*vpinsrd + inserti128.
4665 assert((LegalVectorBitWidth % LaneBitWidth) == 0 && "Illegal vector");
4666 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
4667 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
4668 unsigned NumLegalElts =
4669 LT.second.getVectorNumElements() * NumLegalVectors;
4670 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
4671 "Vector has been legalized to smaller element count");
4672 assert((NumLegalElts % NumLanesTotal) == 0 &&
4673 "Unexpected elts per lane");
4674 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
4675
4676 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
4677 auto *LaneTy =
4678 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
4679
4680 for (unsigned I = 0; I != NumLanesTotal; ++I) {
4681 APInt LaneEltMask = WidenedDemandedElts.extractBits(
4682 NumEltsPerLane, NumEltsPerLane * I);
4683 if (LaneEltMask.isZero())
4684 continue;
4685 // FIXME: we don't need to extract if all non-demanded elements
4686 // are legalization-inserted padding.
4687 if (!LaneEltMask.isAllOnes())
4688 Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
4689 CostKind, I * NumEltsPerLane, LaneTy);
4690 Cost += BaseT::getScalarizationOverhead(LaneTy, LaneEltMask, Insert,
4691 /*Extract*/ false, CostKind);
4692 }
4693
4694 APInt AffectedLanes =
4695 APIntOps::ScaleBitMask(WidenedDemandedElts, NumLanesTotal);
4696 APInt FullyAffectedLegalVectors = APIntOps::ScaleBitMask(
4697 AffectedLanes, NumLegalVectors, /*MatchAllBits=*/true);
4698 for (int LegalVec = 0; LegalVec != NumLegalVectors; ++LegalVec) {
4699 for (unsigned Lane = 0; Lane != NumLegalLanes; ++Lane) {
4700 unsigned I = NumLegalLanes * LegalVec + Lane;
4701 // No need to insert an unaffected lane, nor lane 0 of a legal vector
4702 // iff ALL lanes of that vector were affected and will be inserted.
4703 if (!AffectedLanes[I] ||
4704 (Lane == 0 && FullyAffectedLegalVectors[LegalVec]))
4705 continue;
4706 Cost += getShuffleCost(TTI::SK_InsertSubvector, Ty, std::nullopt,
4707 CostKind, I * NumEltsPerLane, LaneTy);
4708 }
4709 }
4710 }
4711 } else if (LT.second.isVector()) {
4712 // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
4713 // integer element as a SCALAR_TO_VECTOR, then we build the vector as a
4714 // series of UNPCK followed by CONCAT_VECTORS - all of these can be
4715 // considered cheap.
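// e.g. (sketch) a v4i32 build: 4 x movd (counted via the popcount below)
// combined by 3 unpack/concat steps, which the
// (min(NumElts, Pow2Elts) - 1) term approximates.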
4716 if (Ty->isIntOrIntVectorTy())
4717 Cost += DemandedElts.popcount();
4718
4719 // Get the smaller of the legalized or original pow2-extended number of
4720 // vector elements, which represents the number of unpacks we'll end up
4721 // performing.
4722 unsigned NumElts = LT.second.getVectorNumElements();
4723 unsigned Pow2Elts =
4724 PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements());
4725 Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
4726 }
4727 }
4728
4729 if (Extract) {
4730 // vXi1 can be efficiently extracted with MOVMSK.
4731 // TODO: AVX512 predicate mask handling.
4732 // NOTE: This doesn't work well for roundtrip scalarization.
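// e.g. all 16 mask bits of a v16i1 (held as sign bits in a vector
// register) can be extracted with a single pmovmskb; with AVX2 a
// 32-element mask still needs only one vpmovmskb, hence the MaxElts
// division below.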
4733 if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) {
4734 unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
4735 unsigned MaxElts = ST->hasAVX2() ? 32 : 16;
4736 unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts;
4737 return MOVMSKCost;
4738 }
4739
4740 if (LT.second.isVector()) {
4741 unsigned NumLegalElts =
4742 LT.second.getVectorNumElements() * NumLegalVectors;
4743 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
4744 "Vector has been legalized to smaller element count");
4745
4746 // If we're extracting elements from a 128-bit subvector lane,
4747 // we only need to extract each lane once, not for every element.
4748 if (LegalVectorBitWidth > LaneBitWidth) {
4749 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
4750 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
4751 assert((NumLegalElts % NumLanesTotal) == 0 &&
4752 "Unexpected elts per lane");
4753 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
4754
4755 // Add cost for each demanded 128-bit subvector extraction.
4756 // Luckily this is a lot easier than for insertion.
4757 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
4758 auto *LaneTy =
4759 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
4760
4761 for (unsigned I = 0; I != NumLanesTotal; ++I) {
4762 APInt LaneEltMask = WidenedDemandedElts.extractBits(
4763 NumEltsPerLane, I * NumEltsPerLane);
4764 if (LaneEltMask.isZero())
4765 continue;
4766 Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
4767 CostKind, I * NumEltsPerLane, LaneTy);
4768          Cost += BaseT::getScalarizationOverhead(
4769              LaneTy, LaneEltMask, /*Insert*/ false, Extract, CostKind);
4770 }
4771
4772 return Cost;
4773 }
4774 }
4775
4776 // Fallback to default extraction.
4777 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ false,
4778 Extract, CostKind);
4779 }
4780
4781 return Cost;
4782}
4783
4784InstructionCost
4785X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
4786                                      int VF, const APInt &DemandedDstElts,
4787                                      TTI::TargetCostKind CostKind) {
4788  const unsigned EltTyBits = DL.getTypeSizeInBits(EltTy);
4789 // We don't differentiate element types here, only element bit width.
4790 EltTy = IntegerType::getIntNTy(EltTy->getContext(), EltTyBits);
4791
4792 auto bailout = [&]() {
4793 return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF,
4794 DemandedDstElts, CostKind);
4795 };
4796
4797 // For now, only deal with AVX512 cases.
4798 if (!ST->hasAVX512())
4799 return bailout();
4800
4801 // Do we have a native shuffle for this element type, or should we promote?
4802 unsigned PromEltTyBits = EltTyBits;
4803 switch (EltTyBits) {
4804 case 32:
4805 case 64:
4806 break; // AVX512F.
4807 case 16:
4808 if (!ST->hasBWI())
4809 PromEltTyBits = 32; // promote to i32, AVX512F.
4810 break; // AVX512BW
4811 case 8:
4812 if (!ST->hasVBMI())
4813 PromEltTyBits = 32; // promote to i32, AVX512F.
4814 break; // AVX512VBMI
4815 case 1:
4816 // There is no support for shuffling i1 elements. We *must* promote.
4817 if (ST->hasBWI()) {
4818 if (ST->hasVBMI())
4819 PromEltTyBits = 8; // promote to i8, AVX512VBMI.
4820 else
4821 PromEltTyBits = 16; // promote to i16, AVX512BW.
4822 break;
4823 }
4824 PromEltTyBits = 32; // promote to i32, AVX512F.
4825 break;
4826 default:
4827 return bailout();
4828 }
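  // e.g. with AVX512BW but no VBMI, i1 elements are replicated as i16
  // (PromEltTyBits = 16); with plain AVX512F they are widened to i32.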
4829 auto *PromEltTy = IntegerType::getIntNTy(EltTy->getContext(), PromEltTyBits);
4830
4831 auto *SrcVecTy = FixedVectorType::get(EltTy, VF);
4832 auto *PromSrcVecTy = FixedVectorType::get(PromEltTy, VF);
4833
4834 int NumDstElements = VF * ReplicationFactor;
4835 auto *PromDstVecTy = FixedVectorType::get(PromEltTy, NumDstElements);
4836 auto *DstVecTy = FixedVectorType::get(EltTy, NumDstElements);
4837
4838 // Legalize the types.
4839 MVT LegalSrcVecTy = getTypeLegalizationCost(SrcVecTy).second;
4840 MVT LegalPromSrcVecTy = getTypeLegalizationCost(PromSrcVecTy).second;
4841 MVT LegalPromDstVecTy = getTypeLegalizationCost(PromDstVecTy).second;
4842 MVT LegalDstVecTy = getTypeLegalizationCost(DstVecTy).second;
4843 // They should have legalized into vector types.
4844 if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() ||
4845 !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector())
4846 return bailout();
4847
4848 if (PromEltTyBits != EltTyBits) {
4849 // If we have to perform the shuffle with wider elt type than our data type,
4850 // then we will first need to anyext (we don't care about the new bits)
4851 // the source elements, and then truncate Dst elements.
4852 InstructionCost PromotionCost;
4853 PromotionCost += getCastInstrCost(
4854 Instruction::SExt, /*Dst=*/PromSrcVecTy, /*Src=*/SrcVecTy,
4855        TargetTransformInfo::CastContextHint::None, CostKind);
4856    PromotionCost +=
4857 getCastInstrCost(Instruction::Trunc, /*Dst=*/DstVecTy,
4858 /*Src=*/PromDstVecTy,
4859                     TargetTransformInfo::CastContextHint::None, CostKind);
4860    return PromotionCost + getReplicationShuffleCost(PromEltTy,
4861 ReplicationFactor, VF,
4862 DemandedDstElts, CostKind);
4863 }
4864
4865 assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits &&
4866 LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() &&
4867 "We expect that the legalization doesn't affect the element width, "
4868 "doesn't coalesce/split elements.");
4869
4870 unsigned NumEltsPerDstVec = LegalDstVecTy.getVectorNumElements();
4871 unsigned NumDstVectors =
4872 divideCeil(DstVecTy->getNumElements(), NumEltsPerDstVec);
4873
4874 auto *SingleDstVecTy = FixedVectorType::get(EltTy, NumEltsPerDstVec);
4875
4876 // Not all the produced Dst elements may be demanded. In our case,
4877 // given that a single Dst vector is formed by a single shuffle,
4878 // if all elements that will form a single Dst vector aren't demanded,
4879 // then we won't need to do that shuffle, so adjust the cost accordingly.
4880 APInt DemandedDstVectors = APIntOps::ScaleBitMask(
4881 DemandedDstElts.zext(NumDstVectors * NumEltsPerDstVec), NumDstVectors);
4882 unsigned NumDstVectorsDemanded = DemandedDstVectors.popcount();
4883
4884 InstructionCost SingleShuffleCost = getShuffleCost(
4885 TTI::SK_PermuteSingleSrc, SingleDstVecTy, /*Mask=*/std::nullopt, CostKind,
4886 /*Index=*/0, /*SubTp=*/nullptr);
4887 return NumDstVectorsDemanded * SingleShuffleCost;
4888}
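// Illustrative example: ReplicationFactor = 3, VF = 4 with i32 elements on
// AVX512F gives 12 destination elements that widen to one v16i32, so the
// replication is costed as a single one-source (VPERMD-class) shuffle.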
4889
4890InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
4891                                            MaybeAlign Alignment,
4892                                            unsigned AddressSpace,
4893                                            TTI::TargetCostKind CostKind,
4894                                            TTI::OperandValueInfo OpInfo,
4895 const Instruction *I) {
4896 // TODO: Handle other cost kinds.
4897  if (CostKind != TTI::TCK_RecipThroughput) {
4898    if (auto *SI = dyn_cast_or_null<StoreInst>(I)) {
4899 // Store instruction with index and scale costs 2 Uops.
4900 // Check the preceding GEP to identify non-const indices.
4901 if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) {
4902 if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
4903 return TTI::TCC_Basic * 2;
4904 }
4905 }
4906 return TTI::TCC_Basic;
4907 }
4908
4909 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
4910 "Invalid Opcode");
4911 // Type legalization can't handle structs
4912 if (TLI->getValueType(DL, Src, true) == MVT::Other)
4913 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
4914 CostKind);
4915
4916 // Legalize the type.
4917 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
4918
4919 auto *VTy = dyn_cast<FixedVectorType>(Src);
4920
4921  InstructionCost Cost = 0;
4922
4923 // Add a cost for constant load to vector.
4924 if (Opcode == Instruction::Store && OpInfo.isConstant())
4925 Cost += getMemoryOpCost(Instruction::Load, Src, DL.getABITypeAlign(Src),
4926 /*AddressSpace=*/0, CostKind);
4927
4928 // Handle the simple case of non-vectors.
4929 // NOTE: this assumes that legalization never creates vector from scalars!
4930 if (!VTy || !LT.second.isVector()) {
4931 // Each load/store unit costs 1.
4932 return (LT.second.isFloatingPoint() ? Cost : 0) + LT.first * 1;
4933 }
4934
4935 bool IsLoad = Opcode == Instruction::Load;
4936
4937 Type *EltTy = VTy->getElementType();
4938
4939 const int EltTyBits = DL.getTypeSizeInBits(EltTy);
4940
4941 // Source of truth: how many elements were there in the original IR vector?
4942 const unsigned SrcNumElt = VTy->getNumElements();
4943
4944 // How far have we gotten?
4945 int NumEltRemaining = SrcNumElt;
4946 // Note that we intentionally capture by-reference, NumEltRemaining changes.
4947 auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; };
4948
4949 const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8);
4950
4951 // Note that even if we can store 64 bits of an XMM, we still operate on XMM.
4952 const unsigned XMMBits = 128;
4953 if (XMMBits % EltTyBits != 0)
4954 // Vector size must be a multiple of the element size. I.e. no padding.
4955 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
4956 CostKind);
4957 const int NumEltPerXMM = XMMBits / EltTyBits;
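  // e.g. for i16 elements, NumEltPerXMM = 128 / 16 = 8.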
4958
4959 auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM);
4960
4961 for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0;
4962 NumEltRemaining > 0; CurrOpSizeBytes /= 2) {
4963 // How many elements would a single op deal with at once?
4964 if ((8 * CurrOpSizeBytes) % EltTyBits != 0)
4965 // Vector size must be a multiple of the element size. I.e. no padding.
4966 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
4967 CostKind);
4968 int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits;
4969
4970 assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?");
4971 assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||
4972 (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&
4973 "Unless we haven't halved the op size yet, "
4974 "we have less than two op's sized units of work left.");
4975
4976 auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM
4977 ? FixedVectorType::get(EltTy, CurrNumEltPerOp)
4978 : XMMVecTy;
4979
4980 assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&
4981 "After halving sizes, the vector elt count is no longer a multiple "
4982 "of number of elements per operation?");
4983 auto *CoalescedVecTy =
4984 CurrNumEltPerOp == 1
4985 ? CurrVecTy
4986            : FixedVectorType::get(
4987                  IntegerType::get(Src->getContext(),
4988 EltTyBits * CurrNumEltPerOp),
4989 CurrVecTy->getNumElements() / CurrNumEltPerOp);
4990 assert(DL.getTypeSizeInBits(CoalescedVecTy) ==
4991 DL.getTypeSizeInBits(CurrVecTy) &&
4992           "coalescing elements doesn't change vector width.");
4993
4994 while (NumEltRemaining > 0) {
4995      assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumption?");
4996
4997 // Can we use this vector size, as per the remaining element count?
4998 // Iff the vector is naturally aligned, we can do a wide load regardless.
4999 if (NumEltRemaining < CurrNumEltPerOp &&
5000 (!IsLoad || Alignment.valueOrOne() < CurrOpSizeBytes) &&
5001 CurrOpSizeBytes != 1)
5002        break; // Try smaller vector size.
5003
5004 bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;
5005
5006 // If we have fully processed the previous reg, we need to replenish it.
5007 if (SubVecEltsLeft == 0) {
5008 SubVecEltsLeft += CurrVecTy->getNumElements();
5009 // And that's free only for the 0'th subvector of a legalized vector.
5010 if (!Is0thSubVec)
5011          Cost += getShuffleCost(IsLoad ? TTI::SK_InsertSubvector
5012                                        : TTI::SK_ExtractSubvector,
5013                                 VTy, std::nullopt, CostKind, NumEltDone(),
5014 CurrVecTy);
5015 }
5016
5017 // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
5018 // for smaller widths (32/16/8) we have to insert/extract them separately.
5019 // Again, it's free for the 0'th subreg (if op is 32/64 bit wide,
5020 // but let's pretend that it is also true for 16/8 bit wide ops...)
5021 if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) {
5022 int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM;
5023 assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "");
5024 int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp;
5025 APInt DemandedElts =
5026 APInt::getBitsSet(CoalescedVecTy->getNumElements(),
5027 CoalescedVecEltIdx, CoalescedVecEltIdx + 1);
5028 assert(DemandedElts.popcount() == 1 && "Inserting single value");
5029 Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad,
5030 !IsLoad, CostKind);
5031 }
5032
5033 // This isn't exactly right. We're using slow unaligned 32-byte accesses
5034 // as a proxy for a double-pumped AVX memory interface such as on
5035 // Sandybridge.
5036 // Sub-32-bit loads/stores will be slower either with PINSR*/PEXTR* or
5037 // will be scalarized.
5038 if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
5039 Cost += 2;
5040 else if (CurrOpSizeBytes < 4)
5041 Cost += 2;
5042 else
5043 Cost += 1;
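      // e.g. a full 16-byte XMM op costs 1 here, while a 2-byte chunk costs 2
      // to cover the extra insert/extract or scalarization work.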
5044
5045 SubVecEltsLeft -= CurrNumEltPerOp;
5046 NumEltRemaining -= CurrNumEltPerOp;
5047 Alignment = commonAlignment(Alignment.valueOrOne(), CurrOpSizeBytes);
5048 }
5049 }
5050
5051 assert(NumEltRemaining <= 0 && "Should have processed all the elements.");
5052
5053 return Cost;
5054}
5055
5056InstructionCost
5057X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment,
5058                                  unsigned AddressSpace,
5059                                  TTI::TargetCostKind CostKind) {
5060  bool IsLoad = (Instruction::Load == Opcode);
5061 bool IsStore = (Instruction::Store == Opcode);
5062
5063 auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy);
5064 if (!SrcVTy)
5065 // To calculate scalar take the regular cost, without mask
5066 return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind);
5067
5068 unsigned NumElem = SrcVTy->getNumElements();
5069 auto *MaskTy =
5070 FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
5071 if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) ||
5072 (IsStore && !isLegalMaskedStore(SrcVTy, Alignment))) {
5073 // Scalarization
5074 APInt DemandedElts = APInt::getAllOnes(NumElem);
5075    InstructionCost MaskSplitCost = getScalarizationOverhead(
5076        MaskTy, DemandedElts, /*Insert*/ false, /*Extract*/ true, CostKind);
5077 InstructionCost ScalarCompareCost = getCmpSelInstrCost(
5078 Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
5079        CmpInst::BAD_ICMP_PREDICATE, CostKind);
5080    InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
5081 InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
5082    InstructionCost ValueSplitCost = getScalarizationOverhead(
5083        SrcVTy, DemandedElts, IsLoad, IsStore, CostKind);
5084 InstructionCost MemopCost =
5085 NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
5086 Alignment, AddressSpace, CostKind);
5087 return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
5088 }
5089
5090 // Legalize the type.
5091 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcVTy);
5092 auto VT = TLI->getValueType(DL, SrcVTy);
5093  InstructionCost Cost = 0;
5094  MVT Ty = LT.second;
5095 if (Ty == MVT::i16 || Ty == MVT::i32 || Ty == MVT::i64)
5096 // APX masked load/store for scalar is cheap.
5097 return Cost + LT.first;
5098
5099 if (VT.isSimple() && Ty != VT.getSimpleVT() &&
5100 LT.second.getVectorNumElements() == NumElem)
5101 // Promotion requires extend/truncate for data and a shuffle for mask.
5102 Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, std::nullopt,
5103 CostKind, 0, nullptr) +
5104 getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, std::nullopt,
5105 CostKind, 0, nullptr);
5106
5107 else if (LT.first * Ty.getVectorNumElements() > NumElem) {
5108 auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
5109                                           Ty.getVectorNumElements());
5110    // Expanding requires filling the mask with zeroes.
5111 Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, std::nullopt,
5112 CostKind, 0, MaskTy);
5113 }
5114
5115 // Pre-AVX512 - each maskmov load costs 2 + store costs ~8.
5116 if (!ST->hasAVX512())
5117 return Cost + LT.first * (IsLoad ? 2 : 8);
5118
5119 // AVX-512 masked load/store is cheaper
5120 return Cost + LT.first;
5121}
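// e.g. with AVX2, a legal masked vector load adds 2 per legalized vector
// (VMASKMOV) and a masked store adds ~8; with AVX512 both add just LT.first.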
5122
5123InstructionCost
5124X86TTIImpl::getPointersChainCost(ArrayRef<const Value *> Ptrs,
5125                                 const Value *Base,
5126 const TTI::PointersChainInfo &Info,
5127 Type *AccessTy, TTI::TargetCostKind CostKind) {
5128 if (Info.isSameBase() && Info.isKnownStride()) {
5129 // If all the pointers have known stride all the differences are translated
5130 // into constants. X86 memory addressing allows encoding it into
5131 // displacement. So we just need to take the base GEP cost.
5132 if (const auto *BaseGEP = dyn_cast<GetElementPtrInst>(Base)) {
5133 SmallVector<const Value *> Indices(BaseGEP->indices());
5134 return getGEPCost(BaseGEP->getSourceElementType(),
5135 BaseGEP->getPointerOperand(), Indices, nullptr,
5136 CostKind);
5137 }
5138 return TTI::TCC_Free;
5139 }
5140 return BaseT::getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind);
5141}
5142
5143InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty,
5144                                                      ScalarEvolution *SE,
5145 const SCEV *Ptr) {
5146 // Address computations in vectorized code with non-consecutive addresses will
5147 // likely result in more instructions compared to scalar code where the
5148 // computation can more often be merged into the index mode. The resulting
5149 // extra micro-ops can significantly decrease throughput.
5150 const unsigned NumVectorInstToHideOverhead = 10;
5151
5152 // Cost modeling of Strided Access Computation is hidden by the indexing
5153  // modes of X86 regardless of the stride value. We don't believe that there
5154  // is a difference between constant strided access in general and constant
5155 // strided value which is less than or equal to 64.
5156 // Even in the case of (loop invariant) stride whose value is not known at
5157 // compile time, the address computation will not incur more than one extra
5158 // ADD instruction.
5159 if (Ty->isVectorTy() && SE && !ST->hasAVX2()) {
5160 // TODO: AVX2 is the current cut-off because we don't have correct
5161 // interleaving costs for prior ISA's.
5162    if (!BaseT::isStridedAccess(Ptr))
5163      return NumVectorInstToHideOverhead;
5164    if (!BaseT::getConstantStrideStep(SE, Ptr))
5165      return 1;
5166 }
5167
5168 return BaseT::getAddressComputationCost(Ty, SE, Ptr);
5169}
5170
5171InstructionCost
5172X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
5173                                       std::optional<FastMathFlags> FMF,
5174                                       TTI::TargetCostKind CostKind) {
5175  if (TTI::requiresOrderedReduction(FMF))
5176    return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5177
5178  // We use the Intel Architecture Code Analyzer (IACA) to measure the
5179  // throughput and use that as the cost.
5180
5181 static const CostTblEntry SLMCostTbl[] = {
5182 { ISD::FADD, MVT::v2f64, 3 },
5183 { ISD::ADD, MVT::v2i64, 5 },
5184 };
5185
5186 static const CostTblEntry SSE2CostTbl[] = {
5187 { ISD::FADD, MVT::v2f64, 2 },
5188 { ISD::FADD, MVT::v2f32, 2 },
5189 { ISD::FADD, MVT::v4f32, 4 },
5190 { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
5191 { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
5192 { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
5193 { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3".
5194 { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3".
5195 { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
5196 { ISD::ADD, MVT::v2i8, 2 },
5197 { ISD::ADD, MVT::v4i8, 2 },
5198 { ISD::ADD, MVT::v8i8, 2 },
5199 { ISD::ADD, MVT::v16i8, 3 },
5200 };
5201
5202 static const CostTblEntry AVX1CostTbl[] = {
5203 { ISD::FADD, MVT::v4f64, 3 },
5204 { ISD::FADD, MVT::v4f32, 3 },
5205 { ISD::FADD, MVT::v8f32, 4 },
5206 { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
5207 { ISD::ADD, MVT::v4i64, 3 },
5208 { ISD::ADD, MVT::v8i32, 5 },
5209 { ISD::ADD, MVT::v16i16, 5 },
5210 { ISD::ADD, MVT::v32i8, 4 },
5211 };
5212
5213 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5214 assert(ISD && "Invalid opcode");
5215
5216 // Before legalizing the type, give a chance to look up illegal narrow types
5217 // in the table.
5218 // FIXME: Is there a better way to do this?
5219 EVT VT = TLI->getValueType(DL, ValTy);
5220 if (VT.isSimple()) {
5221 MVT MTy = VT.getSimpleVT();
5222 if (ST->useSLMArithCosts())
5223 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5224 return Entry->Cost;
5225
5226 if (ST->hasAVX())
5227 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5228 return Entry->Cost;
5229
5230 if (ST->hasSSE2())
5231 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5232 return Entry->Cost;
5233 }
5234
5235 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5236
5237 MVT MTy = LT.second;
5238
5239 auto *ValVTy = cast<FixedVectorType>(ValTy);
5240
5241 // Special case: vXi8 mul reductions are performed as vXi16.
5242 if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) {
5243 auto *WideSclTy = IntegerType::get(ValVTy->getContext(), 16);
5244 auto *WideVecTy = FixedVectorType::get(WideSclTy, ValVTy->getNumElements());
5245 return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy,
5246                            TargetTransformInfo::CastContextHint::None,
5247                            CostKind) +
5248 getArithmeticReductionCost(Opcode, WideVecTy, FMF, CostKind);
5249 }
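  // e.g. a v16i8 mul reduction is costed as a zext to v16i16 plus a v16i16
  // mul reduction.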
5250
5251 InstructionCost ArithmeticCost = 0;
5252 if (LT.first != 1 && MTy.isVector() &&
5253 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5254 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5255 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5256 MTy.getVectorNumElements());
5257 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5258 ArithmeticCost *= LT.first - 1;
5259 }
5260
5261 if (ST->useSLMArithCosts())
5262 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5263 return ArithmeticCost + Entry->Cost;
5264
5265 if (ST->hasAVX())
5266 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5267 return ArithmeticCost + Entry->Cost;
5268
5269 if (ST->hasSSE2())
5270 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5271 return ArithmeticCost + Entry->Cost;
5272
5273 // FIXME: These assume a naive kshift+binop lowering, which is probably
5274 // conservative in most cases.
5275 static const CostTblEntry AVX512BoolReduction[] = {
5276 { ISD::AND, MVT::v2i1, 3 },
5277 { ISD::AND, MVT::v4i1, 5 },
5278 { ISD::AND, MVT::v8i1, 7 },
5279 { ISD::AND, MVT::v16i1, 9 },
5280 { ISD::AND, MVT::v32i1, 11 },
5281 { ISD::AND, MVT::v64i1, 13 },
5282 { ISD::OR, MVT::v2i1, 3 },
5283 { ISD::OR, MVT::v4i1, 5 },
5284 { ISD::OR, MVT::v8i1, 7 },
5285 { ISD::OR, MVT::v16i1, 9 },
5286 { ISD::OR, MVT::v32i1, 11 },
5287 { ISD::OR, MVT::v64i1, 13 },
5288 };
5289
5290 static const CostTblEntry AVX2BoolReduction[] = {
5291 { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp
5292 { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp
5293 { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp
5294 { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp
5295 };
5296
5297 static const CostTblEntry AVX1BoolReduction[] = {
5298 { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp
5299 { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp
5300 { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5301 { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5302 { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp
5303 { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp
5304 { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5305 { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5306 };
5307
5308 static const CostTblEntry SSE2BoolReduction[] = {
5309 { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp
5310 { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp
5311 { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp
5312 { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp
5313 { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp
5314 { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp
5315 { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp
5316 { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp
5317 };
5318
5319 // Handle bool allof/anyof patterns.
5320 if (ValVTy->getElementType()->isIntegerTy(1)) {
5321 InstructionCost ArithmeticCost = 0;
5322 if (LT.first != 1 && MTy.isVector() &&
5323 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5324 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5325 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5326 MTy.getVectorNumElements());
5327 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5328 ArithmeticCost *= LT.first - 1;
5329 }
5330
5331 if (ST->hasAVX512())
5332 if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy))
5333 return ArithmeticCost + Entry->Cost;
5334 if (ST->hasAVX2())
5335 if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
5336 return ArithmeticCost + Entry->Cost;
5337 if (ST->hasAVX())
5338 if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
5339 return ArithmeticCost + Entry->Cost;
5340 if (ST->hasSSE2())
5341 if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
5342 return ArithmeticCost + Entry->Cost;
5343
5344 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5345 }
5346
5347 unsigned NumVecElts = ValVTy->getNumElements();
5348 unsigned ScalarSize = ValVTy->getScalarSizeInBits();
5349
5350 // Special case power of 2 reductions where the scalar type isn't changed
5351 // by type legalization.
5352 if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
5353 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5354
5355 InstructionCost ReductionCost = 0;
5356
5357 auto *Ty = ValVTy;
5358 if (LT.first != 1 && MTy.isVector() &&
5359 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5360 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5361 Ty = FixedVectorType::get(ValVTy->getElementType(),
5362 MTy.getVectorNumElements());
5363 ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5364 ReductionCost *= LT.first - 1;
5365 NumVecElts = MTy.getVectorNumElements();
5366 }
5367
5368 // Now handle reduction with the legal type, taking into account size changes
5369 // at each level.
5370 while (NumVecElts > 1) {
5371 // Determine the size of the remaining vector we need to reduce.
5372 unsigned Size = NumVecElts * ScalarSize;
5373 NumVecElts /= 2;
5374 // If we're reducing from 256/512 bits, use an extract_subvector.
5375 if (Size > 128) {
5376 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5377 ReductionCost +=
5378          getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt, CostKind,
5379                         NumVecElts, SubTy);
5380 Ty = SubTy;
5381 } else if (Size == 128) {
5382 // Reducing from 128 bits is a permute of v2f64/v2i64.
5383 FixedVectorType *ShufTy;
5384 if (ValVTy->isFloatingPointTy())
5385 ShufTy =
5386 FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2);
5387 else
5388 ShufTy =
5389 FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
5390 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5391 std::nullopt, CostKind, 0, nullptr);
5392 } else if (Size == 64) {
5393 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5394 FixedVectorType *ShufTy;
5395 if (ValVTy->isFloatingPointTy())
5396 ShufTy =
5397 FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4);
5398 else
5399 ShufTy =
5400 FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
5401 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5402 std::nullopt, CostKind, 0, nullptr);
5403 } else {
5404 // Reducing from smaller size is a shift by immediate.
5405 auto *ShiftTy = FixedVectorType::get(
5406 Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size);
5407 ReductionCost += getArithmeticInstrCost(
5408 Instruction::LShr, ShiftTy, CostKind,
5409          {TTI::OK_AnyValue, TTI::OP_None},
5410          {TTI::OK_UniformConstantValue, TTI::OP_None});
5411    }
5412
5413 // Add the arithmetic op for this level.
5414 ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind);
5415 }
5416
5417 // Add the final extract element to the cost.
5418 return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5419 CostKind, 0, nullptr, nullptr);
5420}
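// e.g. a v8f32 fadd reduction on AVX: one 128-bit subvector extract, then
// shuffle+fadd rounds at 128 and 64 bits, plus the final extractelement.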
5421
5422InstructionCost X86TTIImpl::getMinMaxCost(Intrinsic::ID IID, Type *Ty,
5423                                          TTI::TargetCostKind CostKind,
5424                                          FastMathFlags FMF) {
5425 IntrinsicCostAttributes ICA(IID, Ty, {Ty, Ty}, FMF);
5426 return getIntrinsicInstrCost(ICA, CostKind);
5427}
5428
5429InstructionCost
5430X86TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *ValTy,
5431                                   FastMathFlags FMF,
5432                                   TTI::TargetCostKind CostKind) {
5433  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5434
5435 MVT MTy = LT.second;
5436
5437 int ISD;
5438 if (ValTy->isIntOrIntVectorTy()) {
5439 ISD = (IID == Intrinsic::umin || IID == Intrinsic::umax) ? ISD::UMIN
5440 : ISD::SMIN;
5441 } else {
5442 assert(ValTy->isFPOrFPVectorTy() &&
5443           "Expected floating point or integer vector type.");
5444 ISD = (IID == Intrinsic::minnum || IID == Intrinsic::maxnum)
5445 ? ISD::FMINNUM
5446 : ISD::FMINIMUM;
5447 }
5448
5449  // We use the Intel Architecture Code Analyzer (IACA) to measure the
5450  // throughput and use that as the cost.
5451
5452 static const CostTblEntry SSE2CostTbl[] = {
5453 {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw
5454 {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw
5455 {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw
5456 };
5457
5458 static const CostTblEntry SSE41CostTbl[] = {
5459 {ISD::SMIN, MVT::v2i16, 3}, // same as sse2
5460 {ISD::SMIN, MVT::v4i16, 5}, // same as sse2
5461 {ISD::UMIN, MVT::v2i16, 5}, // same as sse2
5462 {ISD::UMIN, MVT::v4i16, 7}, // same as sse2
5463 {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor
5464 {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax
5465 {ISD::SMIN, MVT::v2i8, 3}, // pminsb
5466 {ISD::SMIN, MVT::v4i8, 5}, // pminsb
5467 {ISD::SMIN, MVT::v8i8, 7}, // pminsb
5468 {ISD::SMIN, MVT::v16i8, 6},
5469 {ISD::UMIN, MVT::v2i8, 3}, // same as sse2
5470 {ISD::UMIN, MVT::v4i8, 5}, // same as sse2
5471 {ISD::UMIN, MVT::v8i8, 7}, // same as sse2
5472 {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax
5473 };
5474
5475 static const CostTblEntry AVX1CostTbl[] = {
5476 {ISD::SMIN, MVT::v16i16, 6},
5477 {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax
5478 {ISD::SMIN, MVT::v32i8, 8},
5479 {ISD::UMIN, MVT::v32i8, 8},
5480 };
5481
5482 static const CostTblEntry AVX512BWCostTbl[] = {
5483 {ISD::SMIN, MVT::v32i16, 8},
5484 {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax
5485 {ISD::SMIN, MVT::v64i8, 10},
5486 {ISD::UMIN, MVT::v64i8, 10},
5487 };
5488
5489 // Before legalizing the type, give a chance to look up illegal narrow types
5490 // in the table.
5491 // FIXME: Is there a better way to do this?
5492 EVT VT = TLI->getValueType(DL, ValTy);
5493 if (VT.isSimple()) {
5494 MVT MTy = VT.getSimpleVT();
5495 if (ST->hasBWI())
5496 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5497 return Entry->Cost;
5498
5499 if (ST->hasAVX())
5500 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5501 return Entry->Cost;
5502
5503 if (ST->hasSSE41())
5504 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5505 return Entry->Cost;
5506
5507 if (ST->hasSSE2())
5508 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5509 return Entry->Cost;
5510 }
5511
5512 auto *ValVTy = cast<FixedVectorType>(ValTy);
5513 unsigned NumVecElts = ValVTy->getNumElements();
5514
5515 auto *Ty = ValVTy;
5516 InstructionCost MinMaxCost = 0;
5517 if (LT.first != 1 && MTy.isVector() &&
5518 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5519    // Type needs to be split. We need LT.first - 1 operations.
5520 Ty = FixedVectorType::get(ValVTy->getElementType(),
5521 MTy.getVectorNumElements());
5522 MinMaxCost = getMinMaxCost(IID, Ty, CostKind, FMF);
5523 MinMaxCost *= LT.first - 1;
5524 NumVecElts = MTy.getVectorNumElements();
5525 }
5526
5527 if (ST->hasBWI())
5528 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5529 return MinMaxCost + Entry->Cost;
5530
5531 if (ST->hasAVX())
5532 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5533 return MinMaxCost + Entry->Cost;
5534
5535 if (ST->hasSSE41())
5536 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5537 return MinMaxCost + Entry->Cost;
5538
5539 if (ST->hasSSE2())
5540 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5541 return MinMaxCost + Entry->Cost;
5542
5543 unsigned ScalarSize = ValTy->getScalarSizeInBits();
5544
5545 // Special case power of 2 reductions where the scalar type isn't changed
5546 // by type legalization.
5547 if (!isPowerOf2_32(ValVTy->getNumElements()) ||
5548 ScalarSize != MTy.getScalarSizeInBits())
5549 return BaseT::getMinMaxReductionCost(IID, ValTy, FMF, CostKind);
5550
5551 // Now handle reduction with the legal type, taking into account size changes
5552 // at each level.
5553 while (NumVecElts > 1) {
5554 // Determine the size of the remaining vector we need to reduce.
5555 unsigned Size = NumVecElts * ScalarSize;
5556 NumVecElts /= 2;
5557 // If we're reducing from 256/512 bits, use an extract_subvector.
5558 if (Size > 128) {
5559 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5560 MinMaxCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
5561 CostKind, NumVecElts, SubTy);
5562 Ty = SubTy;
5563 } else if (Size == 128) {
5564 // Reducing from 128 bits is a permute of v2f64/v2i64.
5565 VectorType *ShufTy;
5566 if (ValTy->isFloatingPointTy())
5567 ShufTy =
5568            FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2);
5569      else
5570 ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
5571 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5572 std::nullopt, CostKind, 0, nullptr);
5573 } else if (Size == 64) {
5574 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5575 FixedVectorType *ShufTy;
5576 if (ValTy->isFloatingPointTy())
5577 ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4);
5578 else
5579 ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
5580 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5581 std::nullopt, CostKind, 0, nullptr);
5582 } else {
5583 // Reducing from smaller size is a shift by immediate.
5584 auto *ShiftTy = FixedVectorType::get(
5585 Type::getIntNTy(ValTy->getContext(), Size), 128 / Size);
5586 MinMaxCost += getArithmeticInstrCost(
5587 Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput,
5588          {TTI::OK_AnyValue, TTI::OP_None},
5589          {TTI::OK_UniformConstantValue, TTI::OP_None});
5590    }
5591
5592 // Add the arithmetic op for this level.
5593 MinMaxCost += getMinMaxCost(IID, Ty, CostKind, FMF);
5594 }
5595
5596 // Add the final extract element to the cost.
5597 return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5598 CostKind, 0, nullptr, nullptr);
5599}
5600
5601/// Calculate the cost of materializing a 64-bit value. This helper
5602/// method might only calculate a fraction of a larger immediate. Therefore it
5603/// is valid to return a cost of ZERO.
5604InstructionCost X86TTIImpl::getIntImmCost(int64_t Val) {
5605  if (Val == 0)
5606 return TTI::TCC_Free;
5607
5608 if (isInt<32>(Val))
5609 return TTI::TCC_Basic;
5610
5611 return 2 * TTI::TCC_Basic;
5612}
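// e.g. 0 is free (xor idiom), 42 fits in 32 bits and costs TCC_Basic, while
// 1LL << 40 needs a 64-bit move immediate, costing 2 * TCC_Basic.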
5613
5614InstructionCost X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
5615                                          TTI::TargetCostKind CostKind) {
5616  assert(Ty->isIntegerTy());
5617
5618 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5619 if (BitSize == 0)
5620 return ~0U;
5621
5622 // Never hoist constants larger than 128bit, because this might lead to
5623 // incorrect code generation or assertions in codegen.
5624  // FIXME: Create a cost model for types larger than i128 once the codegen
5625 // issues have been fixed.
5626 if (BitSize > 128)
5627 return TTI::TCC_Free;
5628
5629 if (Imm == 0)
5630 return TTI::TCC_Free;
5631
5632 // Sign-extend all constants to a multiple of 64-bit.
5633 APInt ImmVal = Imm;
5634 if (BitSize % 64 != 0)
5635 ImmVal = Imm.sext(alignTo(BitSize, 64));
5636
5637 // Split the constant into 64-bit chunks and calculate the cost for each
5638 // chunk.
5639  InstructionCost Cost = 0;
5640  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
5641 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
5642 int64_t Val = Tmp.getSExtValue();
5643 Cost += getIntImmCost(Val);
5644 }
5645 // We need at least one instruction to materialize the constant.
5646 return std::max<InstructionCost>(1, Cost);
5647}
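// e.g. an i128 immediate of 42 splits into a low chunk costing TCC_Basic and
// an all-zero high chunk that is free, for a total of max(1, 1) = 1.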
5648
5649InstructionCost X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
5650                                              const APInt &Imm, Type *Ty,
5651                                              TTI::TargetCostKind CostKind,
5652                                              Instruction *Inst) {
5653 assert(Ty->isIntegerTy());
5654
5655 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5656 // There is no cost model for constants with a bit size of 0. Return TCC_Free
5657 // here, so that constant hoisting will ignore this constant.
5658 if (BitSize == 0)
5659 return TTI::TCC_Free;
5660
5661 unsigned ImmIdx = ~0U;
5662 switch (Opcode) {
5663 default:
5664 return TTI::TCC_Free;
5665 case Instruction::GetElementPtr:
5666 // Always hoist the base address of a GetElementPtr. This prevents the
5667 // creation of new constants for every base constant that gets constant
5668 // folded with the offset.
5669 if (Idx == 0)
5670 return 2 * TTI::TCC_Basic;
5671 return TTI::TCC_Free;
5672 case Instruction::Store:
5673 ImmIdx = 0;
5674 break;
5675 case Instruction::ICmp:
5676 // This is an imperfect hack to prevent constant hoisting of
5677 // compares that might be trying to check if a 64-bit value fits in
5678 // 32-bits. The backend can optimize these cases using a right shift by 32.
5679    // Ideally we would check the compare predicate here. There are also
5680    // other similar immediates the backend can use shifts for.
5681 if (Idx == 1 && Imm.getBitWidth() == 64) {
5682 uint64_t ImmVal = Imm.getZExtValue();
5683 if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
5684 return TTI::TCC_Free;
5685 }
5686 ImmIdx = 1;
5687 break;
5688 case Instruction::And:
5689 // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
5690 // by using a 32-bit operation with implicit zero extension. Detect such
5691 // immediates here as the normal path expects bit 31 to be sign extended.
5692 if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.isIntN(32))
5693 return TTI::TCC_Free;
5694 ImmIdx = 1;
5695 break;
5696 case Instruction::Add:
5697 case Instruction::Sub:
5698 // For add/sub, we can use the opposite instruction for INT32_MIN.
5699 if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.getZExtValue() == 0x80000000)
5700 return TTI::TCC_Free;
5701 ImmIdx = 1;
5702 break;
5703 case Instruction::UDiv:
5704 case Instruction::SDiv:
5705 case Instruction::URem:
5706 case Instruction::SRem:
5707 // Division by constant is typically expanded later into a different
5708 // instruction sequence. This completely changes the constants.
5709 // Report them as "free" to stop ConstantHoist from marking them as opaque.
5710 return TTI::TCC_Free;
5711 case Instruction::Mul:
5712 case Instruction::Or:
5713 case Instruction::Xor:
5714 ImmIdx = 1;
5715 break;
5716 // Always return TCC_Free for the shift value of a shift instruction.
5717 case Instruction::Shl:
5718 case Instruction::LShr:
5719 case Instruction::AShr:
5720 if (Idx == 1)
5721 return TTI::TCC_Free;
5722 break;
5723 case Instruction::Trunc:
5724 case Instruction::ZExt:
5725 case Instruction::SExt:
5726 case Instruction::IntToPtr:
5727 case Instruction::PtrToInt:
5728 case Instruction::BitCast:
5729 case Instruction::PHI:
5730 case Instruction::Call:
5731 case Instruction::Select:
5732 case Instruction::Ret:
5733 case Instruction::Load:
5734 break;
5735 }
5736
5737 if (Idx == ImmIdx) {
5738 uint64_t NumConstants = divideCeil(BitSize, 64);
5739    InstructionCost Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
5740    return (Cost <= NumConstants * TTI::TCC_Basic)
5741 ? static_cast<int>(TTI::TCC_Free)
5742 : Cost;
5743 }
5744
5745 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
5746}
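// e.g. `and i64 %x, 4294967295` keeps its immediate in place (reported free):
// it encodes as a 32-bit AND with implicit zero extension, so hoisting the
// constant would only hurt.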
5747
5748InstructionCost X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
5749                                                const APInt &Imm, Type *Ty,
5750                                                TTI::TargetCostKind CostKind) {
5751  assert(Ty->isIntegerTy());
5752
5753 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5754 // There is no cost model for constants with a bit size of 0. Return TCC_Free
5755 // here, so that constant hoisting will ignore this constant.
5756 if (BitSize == 0)
5757 return TTI::TCC_Free;
5758
5759 switch (IID) {
5760 default:
5761 return TTI::TCC_Free;
5762 case Intrinsic::sadd_with_overflow:
5763 case Intrinsic::uadd_with_overflow:
5764 case Intrinsic::ssub_with_overflow:
5765 case Intrinsic::usub_with_overflow:
5766 case Intrinsic::smul_with_overflow:
5767 case Intrinsic::umul_with_overflow:
5768 if ((Idx == 1) && Imm.getBitWidth() <= 64 && Imm.isSignedIntN(32))
5769 return TTI::TCC_Free;
5770 break;
5771 case Intrinsic::experimental_stackmap:
5772 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
5773 return TTI::TCC_Free;
5774 break;
5775 case Intrinsic::experimental_patchpoint_void:
5776 case Intrinsic::experimental_patchpoint:
5777 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
5778 return TTI::TCC_Free;
5779 break;
5780 }
5781 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
5782}
5783
5784InstructionCost X86TTIImpl::getCFInstrCost(unsigned Opcode,
5785                                           TTI::TargetCostKind CostKind,
5786                                           const Instruction *I) {
5787  if (CostKind != TTI::TCK_RecipThroughput)
5788    return Opcode == Instruction::PHI ? 0 : 1;
5789 // Branches are assumed to be predicted.
5790 return 0;
5791}
5792
5793int X86TTIImpl::getGatherOverhead() const {
5794 // Some CPUs have more overhead for gather. The specified overhead is relative
5795 // to the Load operation. "2" is the number provided by Intel architects. This
5796 // parameter is used for cost estimation of Gather Op and comparison with
5797 // other alternatives.
5798  // TODO: Remove the explicit hasAVX512()? That would mean we would only
5799 // enable gather with a -march.
5800 if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather()))
5801 return 2;
5802
5803 return 1024;
5804}
5805
5806int X86TTIImpl::getScatterOverhead() const {
5807 if (ST->hasAVX512())
5808 return 2;
5809
5810 return 1024;
5811}
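// The 1024 return value is an artificially high overhead that effectively
// steers cost-based decisions away from gather/scatter on subtargets without
// fast native support.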
5812
5813// Return an average cost of a Gather / Scatter instruction, maybe improved later.
5814InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode,
5815                                            TTI::TargetCostKind CostKind,
5816                                            Type *SrcVTy, const Value *Ptr,
5817 Align Alignment,
5818 unsigned AddressSpace) {
5819
5820 assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
5821 unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
5822
5823 // Try to reduce index size from 64 bit (default for GEP)
5824  // to 32. This is essential for VF 16. If the index can't be reduced to 32,
5825  // the operation will use 16 x 64 indices, which do not fit in a zmm and
5826  // need to be split. Also check that the base pointer is the same for all lanes,
5827 // and that there's at most one variable index.
5828 auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) {
5829 unsigned IndexSize = DL.getPointerSizeInBits();
5830 const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
5831 if (IndexSize < 64 || !GEP)
5832 return IndexSize;
5833
5834 unsigned NumOfVarIndices = 0;
5835 const Value *Ptrs = GEP->getPointerOperand();
5836 if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
5837 return IndexSize;
5838 for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I) {
5839 if (isa<Constant>(GEP->getOperand(I)))
5840 continue;
5841 Type *IndxTy = GEP->getOperand(I)->getType();
5842 if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy))
5843 IndxTy = IndexVTy->getElementType();
5844 if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
5845 !isa<SExtInst>(GEP->getOperand(I))) ||
5846 ++NumOfVarIndices > 1)
5847 return IndexSize; // 64
5848 }
5849 return (unsigned)32;
5850 };
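  // e.g. a gather from p[sext(i32 %i)] can keep 32-bit indices, so 16 lanes
  // fit in one v16i32 index vector instead of splitting a v16i64 one.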
5851
5852 // Trying to reduce IndexSize to 32 bits for vector 16.
5853 // By default the IndexSize is equal to pointer size.
5854 unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
5855 ? getIndexSizeInBits(Ptr, DL)
5856                           : DL.getPointerSizeInBits();
5857
5858 auto *IndexVTy = FixedVectorType::get(
5859 IntegerType::get(SrcVTy->getContext(), IndexSize), VF);
5860 std::pair<InstructionCost, MVT> IdxsLT = getTypeLegalizationCost(IndexVTy);
5861 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(SrcVTy);
5862 InstructionCost::CostType SplitFactor =
5863 *std::max(IdxsLT.first, SrcLT.first).getValue();
5864 if (SplitFactor > 1) {
5865 // Handle splitting of vector of pointers
5866 auto *SplitSrcTy =
5867 FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
5868 return SplitFactor * getGSVectorCost(Opcode, CostKind, SplitSrcTy, Ptr,
5869 Alignment, AddressSpace);
5870 }
5871
5872 // If we didn't split, this will be a single gather/scatter instruction.
5873  if (CostKind == TTI::TCK_CodeSize)
5874    return 1;
5875
5876 // The gather / scatter cost is given by Intel architects. It is a rough
5877  // number since we are looking at one instruction at a time.
5878 const int GSOverhead = (Opcode == Instruction::Load) ? getGatherOverhead()
5879 : getScatterOverhead();
5880 return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
5881 MaybeAlign(Alignment), AddressSpace,
5882 CostKind);
5883}
5884
5885/// Calculate the cost of Gather / Scatter operation
5886InstructionCost X86TTIImpl::getGatherScatterOpCost(
5887    unsigned Opcode, Type *SrcVTy, const Value *Ptr, bool VariableMask,
5888    Align Alignment, TTI::TargetCostKind CostKind,
5889    const Instruction *I = nullptr) {
5890 if ((Opcode == Instruction::Load &&
5891 (!isLegalMaskedGather(SrcVTy, Align(Alignment)) ||
5892 forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
5893 Align(Alignment)))) ||
5894 (Opcode == Instruction::Store &&
5895 (!isLegalMaskedScatter(SrcVTy, Align(Alignment)) ||
5896 forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
5897 Align(Alignment)))))
5898 return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask,
5899 Alignment, CostKind, I);
5900
5901 assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
5902 PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
5903 if (!PtrTy && Ptr->getType()->isVectorTy())
5904 PtrTy = dyn_cast<PointerType>(
5905 cast<VectorType>(Ptr->getType())->getElementType());
5906 assert(PtrTy && "Unexpected type for Ptr argument");
5907 unsigned AddressSpace = PtrTy->getAddressSpace();
5908 return getGSVectorCost(Opcode, CostKind, SrcVTy, Ptr, Alignment,
5909 AddressSpace);
5910}
5911
5912bool X86TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
5913                               const TargetTransformInfo::LSRCost &C2) {
5914 // X86 specific here are "instruction number 1st priority".
5915 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
5916 C1.NumIVMuls, C1.NumBaseAdds,
5917 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
5918 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
5919 C2.NumIVMuls, C2.NumBaseAdds,
5920 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
5921}
5922
5923bool X86TTIImpl::canMacroFuseCmp() {
5924  return ST->hasMacroFusion() || ST->hasBranchFusion();
5925}
5926
5927bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
5928 Type *ScalarTy = DataTy->getScalarType();
5929
5930 // The backend can't handle a single element vector w/o CFCMOV.
5931 if (isa<VectorType>(DataTy) && cast<FixedVectorType>(DataTy)->getNumElements() == 1)
5932 return ST->hasCF() && hasConditionalLoadStoreForType(ScalarTy);
5933
5934 if (!ST->hasAVX())
5935 return false;
5936
5937 if (ScalarTy->isPointerTy())
5938 return true;
5939
5940 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
5941 return true;
5942
5943 if (ScalarTy->isHalfTy() && ST->hasBWI())
5944 return true;
5945
5946 if (ScalarTy->isBFloatTy() && ST->hasBF16())
5947 return true;
5948
5949 if (!ScalarTy->isIntegerTy())
5950 return false;
5951
5952 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
5953 return IntWidth == 32 || IntWidth == 64 ||
5954 ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
5955}
5956
5957bool X86TTIImpl::isLegalMaskedStore(Type *DataType, Align Alignment) {
5958 return isLegalMaskedLoad(DataType, Alignment);
5959}
5960
5961bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) {
5962 unsigned DataSize = DL.getTypeStoreSize(DataType);
5963 // The only supported nontemporal loads are for aligned vectors of 16 or 32
5964 // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2
5965 // (the equivalent stores only require AVX).
5966 if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
5967 return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2();
5968
5969 return false;
5970}
5971
5972bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) {
5973 unsigned DataSize = DL.getTypeStoreSize(DataType);
5974
5975 // SSE4A supports nontemporal stores of float and double at arbitrary
5976 // alignment.
5977 if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
5978 return true;
5979
5980 // Besides the SSE4A subtarget exception above, only aligned stores are
5981  // available nontemporally on any other subtarget. And only stores with a
5982  // size of 4..32 bytes (powers of 2 only) are permitted.
5983 if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
5984 !isPowerOf2_32(DataSize))
5985 return false;
5986
5987 // 32-byte vector nontemporal stores are supported by AVX (the equivalent
5988 // loads require AVX2).
5989 if (DataSize == 32)
5990 return ST->hasAVX();
5991 if (DataSize == 16)
5992 return ST->hasSSE1();
5993 return true;
5994}
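// e.g. a 32-byte-aligned nontemporal store of <8 x float> is legal with AVX
// (VMOVNTPS ymm), while an unaligned scalar double requires SSE4A (MOVNTSD).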
5995
5996bool X86TTIImpl::isLegalBroadcastLoad(Type *ElementTy,
5997                                      ElementCount NumElements) const {
5998 // movddup
5999 return ST->hasSSE3() && !NumElements.isScalable() &&
6000 NumElements.getFixedValue() == 2 &&
6001 ElementTy == Type::getDoubleTy(ElementTy->getContext());
6002}
6003
6004bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy, Align Alignment) {
6005  if (!isa<VectorType>(DataTy))
6006 return false;
6007
6008 if (!ST->hasAVX512())
6009 return false;
6010
6011 // The backend can't handle a single element vector.
6012 if (cast<FixedVectorType>(DataTy)->getNumElements() == 1)
6013 return false;
6014
6015 Type *ScalarTy = cast<VectorType>(DataTy)->getElementType();
6016
6017 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6018 return true;
6019
6020 if (!ScalarTy->isIntegerTy())
6021 return false;
6022
6023 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6024 return IntWidth == 32 || IntWidth == 64 ||
6025 ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
6026}
6027
6028bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy, Align Alignment) {
6029  return isLegalMaskedExpandLoad(DataTy, Alignment);
6030}
6031
6032bool X86TTIImpl::supportsGather() const {
6033 // Some CPUs have better gather performance than others.
6034  // TODO: Remove the explicit ST->hasAVX512()? That would mean we would only
6035 // enable gather with a -march.
6036 return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2());
6037}
6038
6039bool X86TTIImpl::forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) {
6040  // Gather / Scatter for vector 2 is not profitable on KNL / SKX
6041  // A vector-4 gather/scatter instruction does not exist on KNL. We can extend
6042 // it to 8 elements, but zeroing upper bits of the mask vector will add more
6043 // instructions. Right now we give the scalar cost of vector-4 for KNL. TODO:
6044 // Check, maybe the gather/scatter instruction is better in the VariableMask
6045 // case.
6046 unsigned NumElts = cast<FixedVectorType>(VTy)->getNumElements();
6047 return NumElts == 1 ||
6048 (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX())));
6049}
6050
6051bool X86TTIImpl::isLegalMaskedGatherScatter(Type *DataTy, Align Alignment) {
6052  Type *ScalarTy = DataTy->getScalarType();
6053 if (ScalarTy->isPointerTy())
6054 return true;
6055
6056 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6057 return true;
6058
6059 if (!ScalarTy->isIntegerTy())
6060 return false;
6061
6062 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6063 return IntWidth == 32 || IntWidth == 64;
6064}
6065
6066bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
6067  if (!supportsGather() || !ST->preferGather())
6068 return false;
6069 return isLegalMaskedGatherScatter(DataTy, Alignment);
6070}
6071
6072bool X86TTIImpl::isLegalAltInstr(VectorType *VecTy, unsigned Opcode0,
6073 unsigned Opcode1,
6074 const SmallBitVector &OpcodeMask) const {
6075 // ADDSUBPS 4xf32 SSE3
6076 // VADDSUBPS 4xf32 AVX
6077 // VADDSUBPS 8xf32 AVX2
6078 // ADDSUBPD 2xf64 SSE3
6079 // VADDSUBPD 2xf64 AVX
6080 // VADDSUBPD 4xf64 AVX2
6081
6082 unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
6083 assert(OpcodeMask.size() == NumElements && "Mask and VecTy are incompatible");
6084 if (!isPowerOf2_32(NumElements))
6085 return false;
6086 // Check the opcode pattern. We apply the mask on the opcode arguments and
6087 // then check if it is what we expect.
6088 for (int Lane : seq<int>(0, NumElements)) {
6089 unsigned Opc = OpcodeMask.test(Lane) ? Opcode1 : Opcode0;
6090 // We expect FSub for even lanes and FAdd for odd lanes.
6091 if (Lane % 2 == 0 && Opc != Instruction::FSub)
6092 return false;
6093 if (Lane % 2 == 1 && Opc != Instruction::FAdd)
6094 return false;
6095 }
6096 // Now check that the pattern is supported by the target ISA.
6097 Type *ElemTy = cast<VectorType>(VecTy)->getElementType();
6098 if (ElemTy->isFloatTy())
6099 return ST->hasSSE3() && NumElements % 4 == 0;
6100 if (ElemTy->isDoubleTy())
6101 return ST->hasSSE3() && NumElements % 2 == 0;
6102 return false;
6103}
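// e.g. Opcode0 = FSub, Opcode1 = FAdd with OpcodeMask = 0b1010 over
// <4 x float> (fadd in odd lanes) is the ADDSUBPS pattern, requiring SSE3.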
6104
6105bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) {
6106 // AVX2 doesn't support scatter
6107 if (!ST->hasAVX512() || !ST->preferScatter())
6108 return false;
6109 return isLegalMaskedGatherScatter(DataType, Alignment);
6110}
6111
6112bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
6113 EVT VT = TLI->getValueType(DL, DataType);
6114 return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
6115}
6116
6117bool X86TTIImpl::isExpensiveToSpeculativelyExecute(const Instruction* I) {
6118  // FDIV is always expensive, even if it has a very low uop count.
6119 // TODO: Still necessary for recent CPUs with low latency/throughput fdiv?
6120 if (I->getOpcode() == Instruction::FDiv)
6121 return true;
6122
6123  return BaseT::isExpensiveToSpeculativelyExecute(I);
6124}
6125
6126bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
6127  return false;
6128}
6129
6130bool X86TTIImpl::areInlineCompatible(const Function *Caller,
6131                                     const Function *Callee) const {
6132 const TargetMachine &TM = getTLI()->getTargetMachine();
6133
6134 // Work this as a subsetting of subtarget features.
6135 const FeatureBitset &CallerBits =
6136 TM.getSubtargetImpl(*Caller)->getFeatureBits();
6137 const FeatureBitset &CalleeBits =
6138 TM.getSubtargetImpl(*Callee)->getFeatureBits();
6139
6140 // Check whether features are the same (apart from the ignore list).
6141 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
6142 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
6143 if (RealCallerBits == RealCalleeBits)
6144 return true;
6145
6146 // If the features are a subset, we need to additionally check for calls
6147 // that may become ABI-incompatible as a result of inlining.
6148 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
6149 return false;
6150
6151 for (const Instruction &I : instructions(Callee)) {
6152 if (const auto *CB = dyn_cast<CallBase>(&I)) {
6153 // Having more target features is fine for inline ASM.
6154 if (CB->isInlineAsm())
6155 continue;
6156
6157      SmallVector<Type *, 8> Types;
6158      for (Value *Arg : CB->args())
6159 Types.push_back(Arg->getType());
6160 if (!CB->getType()->isVoidTy())
6161 Types.push_back(CB->getType());
6162
6163 // Simple types are always ABI compatible.
6164 auto IsSimpleTy = [](Type *Ty) {
6165 return !Ty->isVectorTy() && !Ty->isAggregateType();
6166 };
6167 if (all_of(Types, IsSimpleTy))
6168 continue;
6169
6170 if (Function *NestedCallee = CB->getCalledFunction()) {
6171 // Assume that intrinsics are always ABI compatible.
6172 if (NestedCallee->isIntrinsic())
6173 continue;
6174
6175 // Do a precise compatibility check.
6176 if (!areTypesABICompatible(Caller, NestedCallee, Types))
6177 return false;
6178 } else {
6179 // We don't know the target features of the callee,
6180 // assume it is incompatible.
6181 return false;
6182 }
6183 }
6184 }
6185 return true;
6186}
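// e.g. an SSE2-only callee may inline into an AVX512 caller (a feature
// subset), unless it contains calls that pass vector or aggregate types,
// where inlining could change the effective ABI.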
6187
6188bool X86TTIImpl::areTypesABICompatible(const Function *Caller,
6189                                       const Function *Callee,
6190 const ArrayRef<Type *> &Types) const {
6191 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
6192 return false;
6193
6194 // If we get here, we know the target features match. If one function
6195 // considers 512-bit vectors legal and the other does not, consider them
6196 // incompatible.
6197 const TargetMachine &TM = getTLI()->getTargetMachine();
6198
6199 if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
6200 TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs())
6201 return true;
6202
6203 // Consider the arguments compatible if they aren't vectors or aggregates.
6204 // FIXME: Look at the size of vectors.
6205 // FIXME: Look at the element types of aggregates to see if there are vectors.
6206 return llvm::none_of(Types,
6207 [](Type *T) { return T->isVectorTy() || T->isAggregateType(); });
6208}
6209
6210X86TTIImpl::TTI::MemCmpExpansionOptions
6211X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
6212  TTI::MemCmpExpansionOptions Options;
6213  Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
6214 Options.NumLoadsPerBlock = 2;
6215 // All GPR and vector loads can be unaligned.
6216 Options.AllowOverlappingLoads = true;
6217 if (IsZeroCmp) {
6218 // Only enable vector loads for equality comparison. Right now the vector
6219 // version is not as fast for three way compare (see #33329).
6220 const unsigned PreferredWidth = ST->getPreferVectorWidth();
6221 if (PreferredWidth >= 512 && ST->hasAVX512() && ST->hasEVEX512())
6222 Options.LoadSizes.push_back(64);
6223 if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32);
6224 if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
6225 }
6226 if (ST->is64Bit()) {
6227 Options.LoadSizes.push_back(8);
6228 }
6229 Options.LoadSizes.push_back(4);
6230 Options.LoadSizes.push_back(2);
6231 Options.LoadSizes.push_back(1);
6232 return Options;
6233}
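// e.g. on a 64-bit AVX2 target, a 48-byte equality memcmp can expand into a
// 32-byte and a (possibly overlapping) 16-byte vector load per buffer instead
// of a libcall.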
6234
6235bool X86TTIImpl::prefersVectorizedAddressing() const {
6236  return supportsGather();
6237}
6238
6239bool X86TTIImpl::supportsEfficientVectorElementLoadStore() const {
6240  return false;
6241}
6242
6243bool X86TTIImpl::enableInterleavedAccessVectorization() {
6244  // TODO: We expect this to be beneficial regardless of arch,
6245 // but there are currently some unexplained performance artifacts on Atom.
6246 // As a temporary solution, disable on Atom.
6247 return !(ST->isAtom());
6248}
6249
6250// Get a cost estimate for interleaved load/store operations and strided loads.
6251// \p Indices contains indices for strided load.
6252// \p Factor - the factor of interleaving.
6253// AVX-512 provides 3-src shuffles that significantly reduce the cost.
6254InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
6255    unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
6256 ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
6257 TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
6258 // VecTy for interleave memop is <VF*Factor x Elt>.
6259 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6260 // VecTy = <12 x i32>.
6261
6262 // Calculate the number of memory operations (NumOfMemOps), required
6263 // for load/store the VecTy.
6264 MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6265 unsigned VecTySize = DL.getTypeStoreSize(VecTy);
6266 unsigned LegalVTSize = LegalVT.getStoreSize();
6267 unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
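  // e.g. a 96-byte VecTy with a 64-byte legal type needs
  // ceil(96 / 64) = 2 memory operations.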
6268
6269 // Get the cost of one memory operation.
6270 auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
6271 LegalVT.getVectorNumElements());
6272 InstructionCost MemOpCost;
6273 bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps;
6274 if (UseMaskedMemOp)
6275 MemOpCost = getMaskedMemoryOpCost(Opcode, SingleMemOpTy, Alignment,
6276                                      AddressSpace, CostKind);
6277  else
6278 MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, MaybeAlign(Alignment),
6279                                AddressSpace, CostKind);
6280
6281 unsigned VF = VecTy->getNumElements() / Factor;
6282 MVT VT =
6283      MVT::getVectorVT(TLI->getSimpleValueType(DL, VecTy->getScalarType()), VF);
6284
6285 InstructionCost MaskCost;
6286 if (UseMaskedMemOp) {
6287 APInt DemandedLoadStoreElts = APInt::getZero(VecTy->getNumElements());
6288 for (unsigned Index : Indices) {
6289 assert(Index < Factor && "Invalid index for interleaved memory op");
6290 for (unsigned Elm = 0; Elm < VF; Elm++)
6291 DemandedLoadStoreElts.setBit(Index + Elm * Factor);
6292 }
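    // e.g. Factor = 3, VF = 4, Indices = {0, 2}: the demanded elements are
    // {0,3,6,9} and {2,5,8,11} of the 12-element interleaved vector.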
6293
6294 Type *I1Type = Type::getInt1Ty(VecTy->getContext());
6295
6296 MaskCost = getReplicationShuffleCost(
6297 I1Type, Factor, VF,
6298 UseMaskForGaps ? DemandedLoadStoreElts
6299                       : APInt::getAllOnes(VecTy->getNumElements()),
6300        CostKind);
6301
6302 // The Gaps mask is invariant and created outside the loop, therefore the
6303 // cost of creating it is not accounted for here. However if we have both
6304 // a MaskForGaps and some other mask that guards the execution of the
6305 // memory access, we need to account for the cost of And-ing the two masks
6306 // inside the loop.
6307 if (UseMaskForGaps) {
6308 auto *MaskVT = FixedVectorType::get(I1Type, VecTy->getNumElements());
6309 MaskCost += getArithmeticInstrCost(BinaryOperator::And, MaskVT, CostKind);
6310 }
6311 }
6312
  if (Opcode == Instruction::Load) {
    // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
    // contain the cost of the optimized shuffle sequence that the
    // X86InterleavedAccess pass will generate.
    // The cost of loads and stores is computed separately from the tables.

    // X86InterleavedAccess supports only the following interleaved-access
    // groups.
    static const CostTblEntry AVX512InterleavedLoadTbl[] = {
        {3, MVT::v16i8, 12}, // (load 48i8 and) deinterleave into 3 x 16i8
        {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8
        {3, MVT::v64i8, 22}, // (load 192i8 and) deinterleave into 3 x 64i8
    };

    if (const auto *Entry =
            CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
      return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
    // If an entry does not exist, fall back to the default implementation.

    // The kind of shuffle depends on the number of loaded values.
    // If we load the entire data in one register, we can use a 1-src shuffle.
    // Otherwise, we'll merge 2 sources in each operation.
    TTI::ShuffleKind ShuffleKind =
        (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;

    InstructionCost ShuffleCost = getShuffleCost(
        ShuffleKind, SingleMemOpTy, std::nullopt, CostKind, 0, nullptr);

    unsigned NumOfLoadsInInterleaveGrp =
        Indices.size() ? Indices.size() : Factor;
    auto *ResultTy = FixedVectorType::get(VecTy->getElementType(),
                                          VecTy->getNumElements() / Factor);
    InstructionCost NumOfResults =
        getTypeLegalizationCost(ResultTy).first * NumOfLoadsInInterleaveGrp;

    // About half of the loads may be folded into the shuffles when we have
    // only one result. If we have more than one result, or the loads are
    // masked, we do not fold loads at all.
    unsigned NumOfUnfoldedLoads =
        UseMaskedMemOp || NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;

    // Get the number of shuffle operations per result.
    unsigned NumOfShufflesPerResult =
        std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));

    // The SK_PermuteTwoSrc shuffle clobbers one of its source operands.
    // When we have more than one destination, we need additional instructions
    // to keep the sources alive.
    InstructionCost NumOfMoves = 0;
    if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
      NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;

    InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
                           MaskCost + NumOfUnfoldedLoads * MemOpCost +
                           NumOfMoves;
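    // Worked instance of the formula above (illustrative only): loading
    // <16 x i32> with Factor = 2 and both members demanded under AVX-512
    // gives NumOfMemOps = 1 (a single legal v16i32 load), so ShuffleKind is
    // SK_PermuteSingleSrc, NumOfResults = 2, NumOfShufflesPerResult = 1,
    // NumOfUnfoldedLoads = 1 and NumOfMoves = 0, i.e.
    // Cost = 2 * ShuffleCost + MaskCost + MemOpCost.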

    return Cost;
  }

  // Store.
  assert(Opcode == Instruction::Store &&
         "Expected Store Instruction at this point");
  // X86InterleavedAccess supports only the following interleaved-access
  // groups.
  static const CostTblEntry AVX512InterleavedStoreTbl[] = {
      {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
      {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
      {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 192i8 (and store)

      {4, MVT::v8i8, 10},  // interleave 4 x 8i8 into 32i8 (and store)
      {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
      {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
      {4, MVT::v64i8, 24}  // interleave 4 x 64i8 into 256i8 (and store)
  };

  if (const auto *Entry =
          CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
    return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
  // If an entry does not exist, fall back to the default implementation.

  // There are no strided stores at the moment, and a store cannot be folded
  // into a shuffle.
  unsigned NumOfSources = Factor; // The number of values to be merged.
  InstructionCost ShuffleCost = getShuffleCost(
      TTI::SK_PermuteTwoSrc, SingleMemOpTy, std::nullopt, CostKind, 0, nullptr);
  unsigned NumOfShufflesPerStore = NumOfSources - 1;

  // The SK_PermuteTwoSrc shuffle clobbers one of its source operands.
  // We need additional instructions to keep the sources alive.
  unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
  InstructionCost Cost =
      MaskCost +
      NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
      NumOfMoves;
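  // Worked instance of the formula above (illustrative only): storing
  // <32 x i32> with Factor = 2 under AVX-512 gives NumOfMemOps = 2 (two
  // v16i32 stores), NumOfShufflesPerStore = 1 and NumOfMoves = 1, i.e.
  // Cost = MaskCost + 2 * (MemOpCost + ShuffleCost) + 1.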
  return Cost;
}

InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *BaseTy, unsigned Factor, ArrayRef<unsigned> Indices,
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) {
  auto *VecTy = cast<FixedVectorType>(BaseTy);

  auto isSupportedOnAVX512 = [&](Type *VecTy) {
    Type *EltTy = cast<VectorType>(VecTy)->getElementType();
    if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
        EltTy->isIntegerTy(32) || EltTy->isPointerTy())
      return true;
    if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy())
      return ST->hasBWI();
    if (EltTy->isBFloatTy())
      return ST->hasBF16();
    return false;
  };
  if (ST->hasAVX512() && isSupportedOnAVX512(VecTy))
    return getInterleavedMemoryOpCostAVX512(
        Opcode, VecTy, Factor, Indices, Alignment,
        AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);

  if (UseMaskForCond || UseMaskForGaps)
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace, CostKind,
                                             UseMaskForCond, UseMaskForGaps);

  // Get an estimate for interleaved load/store operations for SSE-AVX2.
  // As opposed to AVX-512, SSE through AVX2 targets do not have generic
  // shuffles that allow computing the cost using a generic formula as a
  // function of generic shuffles. We therefore use a lookup table instead,
  // filled according to the instruction sequences that codegen currently
  // generates.

  // VecTy for interleave memop is <VF*Factor x Elt>.
  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
  // VecTy = <12 x i32>.
  MVT LegalVT = getTypeLegalizationCost(VecTy).second;

  // This function can be called with VecTy = <6 x i128>, Factor = 3, in which
  // case VF = 2, but v2i128 is an unsupported MVT vector type
  // (see MachineValueType.h::getVectorVT()).
  if (!LegalVT.isVector())
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace, CostKind);

  unsigned VF = VecTy->getNumElements() / Factor;
  Type *ScalarTy = VecTy->getElementType();
  // Deduplicate entries, model floats/pointers as appropriately-sized
  // integers.
  if (!ScalarTy->isIntegerTy())
    ScalarTy =
        Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy));
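  // E.g., a float element is modeled as i32, and double (or a 64-bit
  // pointer) as i64, so FP and pointer groups reuse the integer table
  // entries below.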

  // Get the cost of all the memory operations.
  // FIXME: discount dead loads.
  InstructionCost MemOpCosts = getMemoryOpCost(
      Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, CostKind);

  auto *VT = FixedVectorType::get(ScalarTy, VF);
  EVT ETy = TLI->getValueType(DL, VT);
  if (!ETy.isSimple())
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace, CostKind);

  // TODO: Complete for other data-types and strides.
  // Each combination of Stride, element bit width and VF results in a
  // different sequence, so the cost tables are accessed with
  // Factor (stride) and VectorType = VFxiN.
  // The Cost accounts only for the shuffle sequence; the cost of the
  // loads/stores is accounted for separately.
  //
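  // Lookup example: a stride-3 deinterleave of <24 x i32> (VF = 8) on AVX2
  // is keyed as {3, v8i32} below; with all three members demanded, the total
  // is MemOpCosts plus that entry's full shuffle cost.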
  static const CostTblEntry AVX2InterleavedLoadTbl[] = {
      {2, MVT::v2i8, 2},  // (load 4i8 and) deinterleave into 2 x 2i8
      {2, MVT::v4i8, 2},  // (load 8i8 and) deinterleave into 2 x 4i8
      {2, MVT::v8i8, 2},  // (load 16i8 and) deinterleave into 2 x 8i8
      {2, MVT::v16i8, 4}, // (load 32i8 and) deinterleave into 2 x 16i8
      {2, MVT::v32i8, 6}, // (load 64i8 and) deinterleave into 2 x 32i8

      {2, MVT::v8i16, 6},   // (load 16i16 and) deinterleave into 2 x 8i16
      {2, MVT::v16i16, 9},  // (load 32i16 and) deinterleave into 2 x 16i16
      {2, MVT::v32i16, 18}, // (load 64i16 and) deinterleave into 2 x 32i16

      {2, MVT::v8i32, 4},   // (load 16i32 and) deinterleave into 2 x 8i32
      {2, MVT::v16i32, 8},  // (load 32i32 and) deinterleave into 2 x 16i32
      {2, MVT::v32i32, 16}, // (load 64i32 and) deinterleave into 2 x 32i32

      {2, MVT::v4i64, 4},   // (load 8i64 and) deinterleave into 2 x 4i64
      {2, MVT::v8i64, 8},   // (load 16i64 and) deinterleave into 2 x 8i64
      {2, MVT::v16i64, 16}, // (load 32i64 and) deinterleave into 2 x 16i64
      {2, MVT::v32i64, 32}, // (load 64i64 and) deinterleave into 2 x 32i64

      {3, MVT::v2i8, 3},   // (load 6i8 and) deinterleave into 3 x 2i8
      {3, MVT::v4i8, 3},   // (load 12i8 and) deinterleave into 3 x 4i8
      {3, MVT::v8i8, 6},   // (load 24i8 and) deinterleave into 3 x 8i8
      {3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8
      {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8

      {3, MVT::v2i16, 5},   // (load 6i16 and) deinterleave into 3 x 2i16
      {3, MVT::v4i16, 7},   // (load 12i16 and) deinterleave into 3 x 4i16
      {3, MVT::v8i16, 9},   // (load 24i16 and) deinterleave into 3 x 8i16
      {3, MVT::v16i16, 28}, // (load 48i16 and) deinterleave into 3 x 16i16
      {3, MVT::v32i16, 56}, // (load 96i16 and) deinterleave into 3 x 32i16

      {3, MVT::v2i32, 3},   // (load 6i32 and) deinterleave into 3 x 2i32
      {3, MVT::v4i32, 3},   // (load 12i32 and) deinterleave into 3 x 4i32
      {3, MVT::v8i32, 7},   // (load 24i32 and) deinterleave into 3 x 8i32
      {3, MVT::v16i32, 14}, // (load 48i32 and) deinterleave into 3 x 16i32
      {3, MVT::v32i32, 32}, // (load 96i32 and) deinterleave into 3 x 32i32

      {3, MVT::v2i64, 1},   // (load 6i64 and) deinterleave into 3 x 2i64
      {3, MVT::v4i64, 5},   // (load 12i64 and) deinterleave into 3 x 4i64
      {3, MVT::v8i64, 10},  // (load 24i64 and) deinterleave into 3 x 8i64
      {3, MVT::v16i64, 20}, // (load 48i64 and) deinterleave into 3 x 16i64

      {4, MVT::v2i8, 4},   // (load 8i8 and) deinterleave into 4 x 2i8
      {4, MVT::v4i8, 4},   // (load 16i8 and) deinterleave into 4 x 4i8
      {4, MVT::v8i8, 12},  // (load 32i8 and) deinterleave into 4 x 8i8
      {4, MVT::v16i8, 24}, // (load 64i8 and) deinterleave into 4 x 16i8
      {4, MVT::v32i8, 56}, // (load 128i8 and) deinterleave into 4 x 32i8

      {4, MVT::v2i16, 6},    // (load 8i16 and) deinterleave into 4 x 2i16
      {4, MVT::v4i16, 17},   // (load 16i16 and) deinterleave into 4 x 4i16
      {4, MVT::v8i16, 33},   // (load 32i16 and) deinterleave into 4 x 8i16
      {4, MVT::v16i16, 75},  // (load 64i16 and) deinterleave into 4 x 16i16
      {4, MVT::v32i16, 150}, // (load 128i16 and) deinterleave into 4 x 32i16

      {4, MVT::v2i32, 4},   // (load 8i32 and) deinterleave into 4 x 2i32
      {4, MVT::v4i32, 8},   // (load 16i32 and) deinterleave into 4 x 4i32
      {4, MVT::v8i32, 16},  // (load 32i32 and) deinterleave into 4 x 8i32
      {4, MVT::v16i32, 32}, // (load 64i32 and) deinterleave into 4 x 16i32
      {4, MVT::v32i32, 68}, // (load 128i32 and) deinterleave into 4 x 32i32

      {4, MVT::v2i64, 6},   // (load 8i64 and) deinterleave into 4 x 2i64
      {4, MVT::v4i64, 8},   // (load 16i64 and) deinterleave into 4 x 4i64
      {4, MVT::v8i64, 20},  // (load 32i64 and) deinterleave into 4 x 8i64
      {4, MVT::v16i64, 40}, // (load 64i64 and) deinterleave into 4 x 16i64

      {6, MVT::v2i8, 6},   // (load 12i8 and) deinterleave into 6 x 2i8
      {6, MVT::v4i8, 14},  // (load 24i8 and) deinterleave into 6 x 4i8
      {6, MVT::v8i8, 18},  // (load 48i8 and) deinterleave into 6 x 8i8
      {6, MVT::v16i8, 43}, // (load 96i8 and) deinterleave into 6 x 16i8
      {6, MVT::v32i8, 82}, // (load 192i8 and) deinterleave into 6 x 32i8

      {6, MVT::v2i16, 13},   // (load 12i16 and) deinterleave into 6 x 2i16
      {6, MVT::v4i16, 9},    // (load 24i16 and) deinterleave into 6 x 4i16
      {6, MVT::v8i16, 39},   // (load 48i16 and) deinterleave into 6 x 8i16
      {6, MVT::v16i16, 106}, // (load 96i16 and) deinterleave into 6 x 16i16
      {6, MVT::v32i16, 212}, // (load 192i16 and) deinterleave into 6 x 32i16

      {6, MVT::v2i32, 6},   // (load 12i32 and) deinterleave into 6 x 2i32
      {6, MVT::v4i32, 15},  // (load 24i32 and) deinterleave into 6 x 4i32
      {6, MVT::v8i32, 31},  // (load 48i32 and) deinterleave into 6 x 8i32
      {6, MVT::v16i32, 64}, // (load 96i32 and) deinterleave into 6 x 16i32

      {6, MVT::v2i64, 6},  // (load 12i64 and) deinterleave into 6 x 2i64
      {6, MVT::v4i64, 18}, // (load 24i64 and) deinterleave into 6 x 4i64
      {6, MVT::v8i64, 36}, // (load 48i64 and) deinterleave into 6 x 8i64

      {8, MVT::v8i32, 40} // (load 64i32 and) deinterleave into 8 x 8i32
  };

  static const CostTblEntry SSSE3InterleavedLoadTbl[] = {
      {2, MVT::v4i16, 2}, // (load 8i16 and) deinterleave into 2 x 4i16
  };

  static const CostTblEntry SSE2InterleavedLoadTbl[] = {
      {2, MVT::v2i16, 2}, // (load 4i16 and) deinterleave into 2 x 2i16
      {2, MVT::v4i16, 7}, // (load 8i16 and) deinterleave into 2 x 4i16

      {2, MVT::v2i32, 2}, // (load 4i32 and) deinterleave into 2 x 2i32
      {2, MVT::v4i32, 2}, // (load 8i32 and) deinterleave into 2 x 4i32

      {2, MVT::v2i64, 2}, // (load 4i64 and) deinterleave into 2 x 2i64
  };

  static const CostTblEntry AVX2InterleavedStoreTbl[] = {
      {2, MVT::v16i8, 3}, // interleave 2 x 16i8 into 32i8 (and store)
      {2, MVT::v32i8, 4}, // interleave 2 x 32i8 into 64i8 (and store)

      {2, MVT::v8i16, 3},  // interleave 2 x 8i16 into 16i16 (and store)
      {2, MVT::v16i16, 4}, // interleave 2 x 16i16 into 32i16 (and store)
      {2, MVT::v32i16, 8}, // interleave 2 x 32i16 into 64i16 (and store)

      {2, MVT::v4i32, 2},   // interleave 2 x 4i32 into 8i32 (and store)
      {2, MVT::v8i32, 4},   // interleave 2 x 8i32 into 16i32 (and store)
      {2, MVT::v16i32, 8},  // interleave 2 x 16i32 into 32i32 (and store)
      {2, MVT::v32i32, 16}, // interleave 2 x 32i32 into 64i32 (and store)

      {2, MVT::v2i64, 2},   // interleave 2 x 2i64 into 4i64 (and store)
      {2, MVT::v4i64, 4},   // interleave 2 x 4i64 into 8i64 (and store)
      {2, MVT::v8i64, 8},   // interleave 2 x 8i64 into 16i64 (and store)
      {2, MVT::v16i64, 16}, // interleave 2 x 16i64 into 32i64 (and store)
      {2, MVT::v32i64, 32}, // interleave 2 x 32i64 into 64i64 (and store)

      {3, MVT::v2i8, 4},   // interleave 3 x 2i8 into 6i8 (and store)
      {3, MVT::v4i8, 4},   // interleave 3 x 4i8 into 12i8 (and store)
      {3, MVT::v8i8, 6},   // interleave 3 x 8i8 into 24i8 (and store)
      {3, MVT::v16i8, 11}, // interleave 3 x 16i8 into 48i8 (and store)
      {3, MVT::v32i8, 13}, // interleave 3 x 32i8 into 96i8 (and store)

      {3, MVT::v2i16, 4},   // interleave 3 x 2i16 into 6i16 (and store)
      {3, MVT::v4i16, 6},   // interleave 3 x 4i16 into 12i16 (and store)
      {3, MVT::v8i16, 12},  // interleave 3 x 8i16 into 24i16 (and store)
      {3, MVT::v16i16, 27}, // interleave 3 x 16i16 into 48i16 (and store)
      {3, MVT::v32i16, 54}, // interleave 3 x 32i16 into 96i16 (and store)

      {3, MVT::v2i32, 4},   // interleave 3 x 2i32 into 6i32 (and store)
      {3, MVT::v4i32, 5},   // interleave 3 x 4i32 into 12i32 (and store)
      {3, MVT::v8i32, 11},  // interleave 3 x 8i32 into 24i32 (and store)
      {3, MVT::v16i32, 22}, // interleave 3 x 16i32 into 48i32 (and store)
      {3, MVT::v32i32, 48}, // interleave 3 x 32i32 into 96i32 (and store)

      {3, MVT::v2i64, 4},   // interleave 3 x 2i64 into 6i64 (and store)
      {3, MVT::v4i64, 6},   // interleave 3 x 4i64 into 12i64 (and store)
      {3, MVT::v8i64, 12},  // interleave 3 x 8i64 into 24i64 (and store)
      {3, MVT::v16i64, 24}, // interleave 3 x 16i64 into 48i64 (and store)

      {4, MVT::v2i8, 4},   // interleave 4 x 2i8 into 8i8 (and store)
      {4, MVT::v4i8, 4},   // interleave 4 x 4i8 into 16i8 (and store)
      {4, MVT::v8i8, 4},   // interleave 4 x 8i8 into 32i8 (and store)
      {4, MVT::v16i8, 8},  // interleave 4 x 16i8 into 64i8 (and store)
      {4, MVT::v32i8, 12}, // interleave 4 x 32i8 into 128i8 (and store)

      {4, MVT::v2i16, 2},   // interleave 4 x 2i16 into 8i16 (and store)
      {4, MVT::v4i16, 6},   // interleave 4 x 4i16 into 16i16 (and store)
      {4, MVT::v8i16, 10},  // interleave 4 x 8i16 into 32i16 (and store)
      {4, MVT::v16i16, 32}, // interleave 4 x 16i16 into 64i16 (and store)
      {4, MVT::v32i16, 64}, // interleave 4 x 32i16 into 128i16 (and store)

      {4, MVT::v2i32, 5},   // interleave 4 x 2i32 into 8i32 (and store)
      {4, MVT::v4i32, 6},   // interleave 4 x 4i32 into 16i32 (and store)
      {4, MVT::v8i32, 16},  // interleave 4 x 8i32 into 32i32 (and store)
      {4, MVT::v16i32, 32}, // interleave 4 x 16i32 into 64i32 (and store)
      {4, MVT::v32i32, 64}, // interleave 4 x 32i32 into 128i32 (and store)

      {4, MVT::v2i64, 6},   // interleave 4 x 2i64 into 8i64 (and store)
      {4, MVT::v4i64, 8},   // interleave 4 x 4i64 into 16i64 (and store)
      {4, MVT::v8i64, 20},  // interleave 4 x 8i64 into 32i64 (and store)
      {4, MVT::v16i64, 40}, // interleave 4 x 16i64 into 64i64 (and store)

      {6, MVT::v2i8, 7},   // interleave 6 x 2i8 into 12i8 (and store)
      {6, MVT::v4i8, 9},   // interleave 6 x 4i8 into 24i8 (and store)
      {6, MVT::v8i8, 16},  // interleave 6 x 8i8 into 48i8 (and store)
      {6, MVT::v16i8, 27}, // interleave 6 x 16i8 into 96i8 (and store)
      {6, MVT::v32i8, 90}, // interleave 6 x 32i8 into 192i8 (and store)

      {6, MVT::v2i16, 10},  // interleave 6 x 2i16 into 12i16 (and store)
      {6, MVT::v4i16, 15},  // interleave 6 x 4i16 into 24i16 (and store)
      {6, MVT::v8i16, 21},  // interleave 6 x 8i16 into 48i16 (and store)
      {6, MVT::v16i16, 58}, // interleave 6 x 16i16 into 96i16 (and store)
      {6, MVT::v32i16, 90}, // interleave 6 x 32i16 into 192i16 (and store)

      {6, MVT::v2i32, 9},   // interleave 6 x 2i32 into 12i32 (and store)
      {6, MVT::v4i32, 12},  // interleave 6 x 4i32 into 24i32 (and store)
      {6, MVT::v8i32, 33},  // interleave 6 x 8i32 into 48i32 (and store)
      {6, MVT::v16i32, 66}, // interleave 6 x 16i32 into 96i32 (and store)

      {6, MVT::v2i64, 8},  // interleave 6 x 2i64 into 12i64 (and store)
      {6, MVT::v4i64, 15}, // interleave 6 x 4i64 into 24i64 (and store)
      {6, MVT::v8i64, 30}, // interleave 6 x 8i64 into 48i64 (and store)
  };

  static const CostTblEntry SSE2InterleavedStoreTbl[] = {
      {2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8 (and store)
      {2, MVT::v4i8, 1}, // interleave 2 x 4i8 into 8i8 (and store)
      {2, MVT::v8i8, 1}, // interleave 2 x 8i8 into 16i8 (and store)

      {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16 (and store)
      {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16 (and store)

      {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32 (and store)
  };

  if (Opcode == Instruction::Load) {
    auto GetDiscountedCost = [Factor, NumMembers = Indices.size(),
                              MemOpCosts](const CostTblEntry *Entry) {
      // NOTE: this is just an approximation!
      // It can over- or under-estimate the cost!
      return MemOpCosts + divideCeil(NumMembers * Entry->Cost, Factor);
    };
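    // Worked instance of the discount: with Factor = 3, NumMembers = 2 and a
    // table entry cost of 11 (e.g. {3, v16i8} above), the estimate is
    // MemOpCosts + divideCeil(2 * 11, 3) = MemOpCosts + 8.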

    if (ST->hasAVX2())
      if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor,
                                              ETy.getSimpleVT()))
        return GetDiscountedCost(Entry);

    if (ST->hasSSSE3())
      if (const auto *Entry = CostTableLookup(SSSE3InterleavedLoadTbl, Factor,
                                              ETy.getSimpleVT()))
        return GetDiscountedCost(Entry);

    if (ST->hasSSE2())
      if (const auto *Entry = CostTableLookup(SSE2InterleavedLoadTbl, Factor,
                                              ETy.getSimpleVT()))
        return GetDiscountedCost(Entry);
  } else {
    assert(Opcode == Instruction::Store &&
           "Expected Store Instruction at this point");
    assert((!Indices.size() || Indices.size() == Factor) &&
           "Interleaved store only supports fully-interleaved groups.");
    if (ST->hasAVX2())
      if (const auto *Entry = CostTableLookup(AVX2InterleavedStoreTbl, Factor,
                                              ETy.getSimpleVT()))
        return MemOpCosts + Entry->Cost;

    if (ST->hasSSE2())
      if (const auto *Entry = CostTableLookup(SSE2InterleavedStoreTbl, Factor,
                                              ETy.getSimpleVT()))
        return MemOpCosts + Entry->Cost;
  }

  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                           Alignment, AddressSpace, CostKind,
                                           UseMaskForCond, UseMaskForGaps);
}

InstructionCost X86TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
                                                 StackOffset BaseOffset,
                                                 bool HasBaseReg, int64_t Scale,
                                                 unsigned AddrSpace) const {
  // Scaling factors are not free at all.
  // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
  // takes 2 allocations in the out-of-order engine instead of 1
  // for plain addressing mode, i.e. inst (reg1).
  // E.g.,
  // vaddps (%rsi,%rdx), %ymm0, %ymm1
  // requires two allocations (one for the load, one for the computation),
  // whereas:
  // vaddps (%rsi), %ymm0, %ymm1
  // requires just one, freeing an allocation for other operations and
  // executing fewer micro-operations.
  //
  // For some X86 architectures this is even worse because, for instance for
  // stores, the complex addressing mode forces the instruction to use the
  // "load" ports instead of the dedicated "store" port.
  // E.g., on Haswell:
  // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
  // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
  TargetLoweringBase::AddrMode AM;
  AM.BaseGV = BaseGV;
  AM.BaseOffs = BaseOffset.getFixed();
  AM.HasBaseReg = HasBaseReg;
  AM.Scale = Scale;
  AM.ScalableOffset = BaseOffset.getScalable();
  if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
    // Scale represents reg2 * scale, thus account for 1
    // as soon as we use a second register.
    return AM.Scale != 0;
  return -1;
}