1//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements a TargetTransformInfo analysis pass specific to the
10/// X86 target machine. It uses the target's detailed information to provide
11/// more precise answers to certain TTI queries, while letting the target
12/// independent and default TTI implementations handle the rest.
13///
14//===----------------------------------------------------------------------===//
15/// A note about the cost model numbers used below: they correspond to some
16/// "generic" X86 CPU rather than to a specific CPU model. Usually the numbers
17/// are taken from the CPU on which the feature first appeared. For example,
18/// if a lookup below is guarded by Subtarget.hasSSE42(), the cost is based on
19/// Nehalem, as that was the first CPU to support that feature level and
20/// therefore most likely has the worst case cost, although we may discard an
21/// outlying worst cost from one CPU (e.g. Atom).
22///
23/// Some examples of other technologies/CPUs:
24/// SSE 3 - Pentium4 / Athlon64
25/// SSE 4.1 - Penryn
26/// SSE 4.2 - Nehalem / Silvermont
27/// AVX - Sandy Bridge / Jaguar / Bulldozer
28/// AVX2 - Haswell / Ryzen
29/// AVX-512 - Xeon Phi / Skylake
30///
31/// And some examples of instruction target-dependent costs (latency):
32/// divss sqrtss rsqrtss
33/// AMD K7 11-16 19 3
34/// Piledriver 9-24 13-15 5
35/// Jaguar 14 16 2
36/// Pentium II,III 18 30 2
37/// Nehalem 7-14 7-18 3
38/// Haswell 10-13 11 5
39///
40/// Interpreting the 4 TargetCostKind types:
41/// TCK_RecipThroughput and TCK_Latency should try to match the worst case
42/// values reported by the CPU scheduler models (and llvm-mca).
43/// TCK_CodeSize should match the instruction count (e.g. divss = 1), NOT the
44/// actual encoding size of the instruction.
45/// TCK_SizeAndLatency should match the worst case micro-op counts reported
46/// by the CPU scheduler models (and llvm-mca), to ensure that they are
47/// compatible with the MicroOpBufferSize and LoopMicroOpBufferSize values that
48/// are often used as the cost thresholds where TCK_SizeAndLatency is requested.
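///
/// As a concrete illustration of how the tables below are read (the values are
/// taken from the SSE42 table later in this file), an entry such as
///   { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }
/// means: reciprocal throughput 14, latency 14, code size 1 and size+latency 1
/// for a v4f32 FDIV on that subtarget. Entries that list a single value only
/// provide the reciprocal throughput cost.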
49//===----------------------------------------------------------------------===//
50
51#include "X86TargetTransformInfo.h"
52#include "llvm/Analysis/TargetTransformInfo.h"
53#include "llvm/CodeGen/BasicTTIImpl.h"
54#include "llvm/CodeGen/CostTable.h"
55#include "llvm/CodeGen/TargetLowering.h"
56#include "llvm/IR/InstIterator.h"
57#include "llvm/IR/IntrinsicInst.h"
58#include "llvm/Support/Debug.h"
59#include <optional>
60
61using namespace llvm;
62
63#define DEBUG_TYPE "x86tti"
64
65//===----------------------------------------------------------------------===//
66//
67// X86 cost model.
68//
69//===----------------------------------------------------------------------===//
70
71// Helper struct to store/access costs for each cost kind.
72// TODO: Move this to allow other targets to use it?
73struct CostKindCosts {
74 unsigned RecipThroughputCost = ~0U;
75 unsigned LatencyCost = ~0U;
76 unsigned CodeSizeCost = ~0U;
77 unsigned SizeAndLatencyCost = ~0U;
78
79 std::optional<unsigned>
80 operator[](TargetTransformInfo::TargetCostKind Kind) const {
81 unsigned Cost = ~0U;
82 switch (Kind) {
83 case TargetTransformInfo::TCK_RecipThroughput:
84 Cost = RecipThroughputCost;
85 break;
86 case TargetTransformInfo::TCK_Latency:
87 Cost = LatencyCost;
88 break;
89 case TargetTransformInfo::TCK_CodeSize:
90 Cost = CodeSizeCost;
91 break;
92 case TargetTransformInfo::TCK_SizeAndLatency:
93 Cost = SizeAndLatencyCost;
94 break;
95 }
96 if (Cost == ~0U)
97 return std::nullopt;
98 return Cost;
99 }
100};
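// Illustrative only: every cost table below is consumed through the same
// pattern, where an entry contributes a cost only if it defines one for the
// requested cost kind, e.g.
//   if (const auto *Entry = CostTableLookup(Table, ISD, LT.second))
//     if (auto KindCost = Entry->Cost[CostKind])
//       return LT.first * *KindCost;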
101using CostKindTblEntry = CostTblEntryT<CostKindCosts>;
102
103TargetTransformInfo::PopcntSupportKind
104X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
105 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
106 // TODO: Currently the __builtin_popcount() implementation using SSE3
107 // instructions is inefficient. Once the problem is fixed, we should
108 // call ST->hasSSE3() instead of ST->hasPOPCNT().
109 return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
110}
111
112std::optional<unsigned> X86TTIImpl::getCacheSize(
113 TargetTransformInfo::CacheLevel Level) const {
114 switch (Level) {
115 case TargetTransformInfo::CacheLevel::L1D:
116 // - Penryn
117 // - Nehalem
118 // - Westmere
119 // - Sandy Bridge
120 // - Ivy Bridge
121 // - Haswell
122 // - Broadwell
123 // - Skylake
124 // - Kabylake
125 return 32 * 1024; // 32 KByte
126 case TargetTransformInfo::CacheLevel::L2D:
127 // - Penryn
128 // - Nehalem
129 // - Westmere
130 // - Sandy Bridge
131 // - Ivy Bridge
132 // - Haswell
133 // - Broadwell
134 // - Skylake
135 // - Kabylake
136 return 256 * 1024; // 256 KByte
137 }
138
139 llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
140}
141
142std::optional<unsigned> X86TTIImpl::getCacheAssociativity(
143 TargetTransformInfo::CacheLevel Level) const {
144 // - Penryn
145 // - Nehalem
146 // - Westmere
147 // - Sandy Bridge
148 // - Ivy Bridge
149 // - Haswell
150 // - Broadwell
151 // - Skylake
152 // - Kabylake
153 switch (Level) {
154 case TargetTransformInfo::CacheLevel::L1D:
155 [[fallthrough]];
156 case TargetTransformInfo::CacheLevel::L2D:
157 return 8;
158 }
159
160 llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
161}
162
163unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
164 bool Vector = (ClassID == 1);
165 if (Vector && !ST->hasSSE1())
166 return 0;
167
168 if (ST->is64Bit()) {
169 if (Vector && ST->hasAVX512())
170 return 32;
171 return 16;
172 }
173 return 8;
174}
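// Worked example of the logic above (illustrative): a 64-bit target with
// AVX-512 reports 32 vector registers (ZMM0-ZMM31), a plain 64-bit target
// reports 16, and a 32-bit target reports 8 (or 0 vector registers without SSE).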
175
176TypeSize
177X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
178 unsigned PreferVectorWidth = ST->getPreferVectorWidth();
179 switch (K) {
180 case TargetTransformInfo::RGK_Scalar:
181 return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
182 case TargetTransformInfo::RGK_FixedWidthVector:
183 if (ST->hasAVX512() && ST->hasEVEX512() && PreferVectorWidth >= 512)
184 return TypeSize::getFixed(512);
185 if (ST->hasAVX() && PreferVectorWidth >= 256)
186 return TypeSize::getFixed(256);
187 if (ST->hasSSE1() && PreferVectorWidth >= 128)
188 return TypeSize::getFixed(128);
189 return TypeSize::getFixed(0);
190 case TargetTransformInfo::RGK_ScalableVector:
191 return TypeSize::getScalable(0);
192 }
193
194 llvm_unreachable("Unsupported register kind");
195}
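// For example, a Skylake-AVX512 subtarget compiled with
// -mprefer-vector-width=256 reports a 256-bit fixed-width vector register
// here, even though 512-bit ZMM registers exist, because PreferVectorWidth
// caps the answer.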
196
197unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
198 return getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
199 .getFixedValue();
200}
201
202unsigned X86TTIImpl::getMaxInterleaveFactor(ElementCount VF) {
203 // If the loop will not be vectorized, don't interleave the loop.
204 // Let the regular unroller handle the loop instead, which saves the
205 // overflow check and memory check cost.
206 if (VF.isScalar())
207 return 1;
208
209 if (ST->isAtom())
210 return 1;
211
212 // Sandybridge and Haswell have multiple execution ports and pipelined
213 // vector units.
214 if (ST->hasAVX())
215 return 4;
216
217 return 2;
218}
219
220InstructionCost X86TTIImpl::getArithmeticInstrCost(
221 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
222 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
223 ArrayRef<const Value *> Args,
224 const Instruction *CxtI) {
225
226 // vXi8 multiplications are always promoted to vXi16.
227 // Sub-128-bit types can be extended/packed more efficiently.
228 if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
229 Ty->getPrimitiveSizeInBits() <= 64 && Ty->getScalarSizeInBits() == 8) {
230 Type *WideVecTy =
231 VectorType::getExtendedElementVectorType(cast<VectorType>(Ty));
232 return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
233 TTI::CastContextHint::None,
234 CostKind) +
235 getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
236 TTI::CastContextHint::None,
237 CostKind) +
238 getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info);
239 }
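  // Illustrative example of the promotion above: a 'mul <8 x i8>' is costed as
  //   zext <8 x i8> --> <8 x i16>, mul <8 x i16>, trunc <8 x i16> --> <8 x i8>
  // instead of being scalarized.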
240
241 // Legalize the type.
242 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
243
244 int ISD = TLI->InstructionOpcodeToISD(Opcode);
245 assert(ISD && "Invalid opcode");
246
247 if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() &&
248 (LT.second.getScalarType() == MVT::i32 ||
249 LT.second.getScalarType() == MVT::i64)) {
250 // Check if the operands can be represented as a smaller datatype.
251 bool Op1Signed = false, Op2Signed = false;
252 unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
253 unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
254 unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
255 bool SignedMode = Op1Signed || Op2Signed;
256
257 // If both vXi32 are representable as i15 and at least one is constant,
258 // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we
259 // can treat this as PMADDWD which has the same costs as a vXi16 multiply.
260 if (OpMinSize <= 15 && !ST->isPMADDWDSlow() &&
261 LT.second.getScalarType() == MVT::i32) {
262 bool Op1Constant =
263 isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]);
264 bool Op2Constant =
265 isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]);
266 bool Op1Sext = isa<SExtInst>(Args[0]) &&
267 (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41()));
268 bool Op2Sext = isa<SExtInst>(Args[1]) &&
269 (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41()));
270
271 bool IsZeroExtended = !Op1Signed || !Op2Signed;
272 bool IsConstant = Op1Constant || Op2Constant;
273 bool IsSext = Op1Sext || Op2Sext;
274 if (IsConstant || IsZeroExtended || IsSext)
275 LT.second =
276 MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements());
277 }
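    // Illustrative example: for a 'mul <4 x i32>' whose operands are both known
    // to fit in 15 bits (say, one operand is a constant splat of 1000),
    // LT.second is rewritten to v8i16 above, so the tables below price it like
    // a vXi16 multiply, matching the PMADDWD lowering.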
278
279 // Check if the vXi32 operands can be shrunk into a smaller datatype.
280 // This should match the codegen from reduceVMULWidth.
281 // TODO: Make this generic (!ST->SSE41 || ST->isPMULLDSlow()).
282 if (ST->useSLMArithCosts() && LT.second == MVT::v4i32) {
283 if (OpMinSize <= 7)
284 return LT.first * 3; // pmullw/sext
285 if (!SignedMode && OpMinSize <= 8)
286 return LT.first * 3; // pmullw/zext
287 if (OpMinSize <= 15)
288 return LT.first * 5; // pmullw/pmulhw/pshuf
289 if (!SignedMode && OpMinSize <= 16)
290 return LT.first * 5; // pmullw/pmulhw/pshuf
291 }
292
293 // If both vXi64 are representable as (unsigned) i32, then we can perform
294 // the multiply with a single PMULUDQ instruction.
295 // TODO: Add (SSE41+) PMULDQ handling for signed extensions.
296 if (!SignedMode && OpMinSize <= 32 && LT.second.getScalarType() == MVT::i64)
297 ISD = X86ISD::PMULUDQ;
298 }
299
300 // Vector multiply by pow2 will be simplified to shifts.
301 // Vector multiply by -pow2 will be simplified to shifts/negates.
302 if (ISD == ISD::MUL && Op2Info.isConstant() &&
303 (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2())) {
304 InstructionCost Cost =
305 getArithmeticInstrCost(Instruction::Shl, Ty, CostKind,
306 Op1Info.getNoProps(), Op2Info.getNoProps());
307 if (Op2Info.isNegatedPowerOf2())
308 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
309 return Cost;
310 }
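  // For example (illustrative): 'mul %x, 8' is priced as 'shl %x, 3', and
  // 'mul %x, -8' as 'shl %x, 3' plus a subtract to negate the result.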
311
312 // On X86, vector signed division by a power-of-two constant is
313 // normally expanded to the sequence SRA + SRL + ADD + SRA.
314 // The OperandValue properties may not be the same as that of the previous
315 // operation; conservatively assume OP_None.
316 if ((ISD == ISD::SDIV || ISD == ISD::SREM) &&
317 Op2Info.isConstant() && Op2Info.isPowerOf2()) {
318 InstructionCost Cost =
319 2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
320 Op1Info.getNoProps(), Op2Info.getNoProps());
321 Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
322 Op1Info.getNoProps(), Op2Info.getNoProps());
323 Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
324 Op1Info.getNoProps(), Op2Info.getNoProps());
325
326 if (ISD == ISD::SREM) {
327 // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
328 Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(),
329 Op2Info.getNoProps());
330 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info.getNoProps(),
331 Op2Info.getNoProps());
332 }
333
334 return Cost;
335 }
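  // Illustrative expansion behind the costs above, for 'sdiv <4 x i32> %x, 4':
  //   %sgn  = ashr %x, 31    ; broadcast the sign bit
  //   %bias = lshr %sgn, 30  ; 0, or divisor-1 for negative inputs
  //   %tmp  = add %x, %bias
  //   %res  = ashr %tmp, 2
  // hence the 2*AShr + LShr + Add cost; SREM additionally pays for Mul + Sub.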
336
337 // Vector unsigned division/remainder will be simplified to shifts/masks.
338 if ((ISD == ISD::UDIV || ISD == ISD::UREM) &&
339 Op2Info.isConstant() && Op2Info.isPowerOf2()) {
340 if (ISD == ISD::UDIV)
341 return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
342 Op1Info.getNoProps(), Op2Info.getNoProps());
343 // UREM
344 return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
345 Op1Info.getNoProps(), Op2Info.getNoProps());
346 }
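  // e.g. (illustrative): 'udiv %x, 8' is priced as 'lshr %x, 3' and
  // 'urem %x, 8' as 'and %x, 7'.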
347
348 static const CostKindTblEntry GFNIUniformConstCostTable[] = {
349 { ISD::SHL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
350 { ISD::SRL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
351 { ISD::SRA, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
352 { ISD::SHL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
353 { ISD::SRL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
354 { ISD::SRA, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
355 { ISD::SHL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
356 { ISD::SRL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
357 { ISD::SRA, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
358 };
359
360 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasGFNI())
361 if (const auto *Entry =
362 CostTableLookup(GFNIUniformConstCostTable, ISD, LT.second))
363 if (auto KindCost = Entry->Cost[CostKind])
364 return LT.first * *KindCost;
365
366 static const CostKindTblEntry AVX512BWUniformConstCostTable[] = {
367 { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
368 { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
369 { ISD::SRA, MVT::v16i8, { 1, 8, 4, 5 } }, // psrlw, pand, pxor, psubb.
370 { ISD::SHL, MVT::v32i8, { 1, 8, 2, 3 } }, // psllw + pand.
371 { ISD::SRL, MVT::v32i8, { 1, 8, 2, 3 } }, // psrlw + pand.
372 { ISD::SRA, MVT::v32i8, { 1, 9, 4, 5 } }, // psrlw, pand, pxor, psubb.
373 { ISD::SHL, MVT::v64i8, { 1, 8, 2, 3 } }, // psllw + pand.
374 { ISD::SRL, MVT::v64i8, { 1, 8, 2, 3 } }, // psrlw + pand.
375 { ISD::SRA, MVT::v64i8, { 1, 9, 4, 6 } }, // psrlw, pand, pxor, psubb.
376
377 { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // psllw
378 { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw
379 { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw
380 { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // psllw
381 { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw
382 { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw
383 };
384
385 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasBWI())
386 if (const auto *Entry =
387 CostTableLookup(AVX512BWUniformConstCostTable, ISD, LT.second))
388 if (auto KindCost = Entry->Cost[CostKind])
389 return LT.first * *KindCost;
390
391 static const CostKindTblEntry AVX512UniformConstCostTable[] = {
392 { ISD::SHL, MVT::v64i8, { 2, 12, 5, 6 } }, // psllw + pand.
393 { ISD::SRL, MVT::v64i8, { 2, 12, 5, 6 } }, // psrlw + pand.
394 { ISD::SRA, MVT::v64i8, { 3, 10, 12, 12 } }, // psrlw, pand, pxor, psubb.
395
396 { ISD::SHL, MVT::v16i16, { 2, 7, 4, 4 } }, // psllw + split.
397 { ISD::SRL, MVT::v16i16, { 2, 7, 4, 4 } }, // psrlw + split.
398 { ISD::SRA, MVT::v16i16, { 2, 7, 4, 4 } }, // psraw + split.
399
400 { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } }, // pslld
401 { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } }, // psrld
402 { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } }, // psrad
403 { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } }, // pslld
404 { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } }, // psrld
405 { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } }, // psrad
406
407 { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } }, // psraq
408 { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } }, // psllq
409 { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } }, // psrlq
410 { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } }, // psraq
411 { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } }, // psllq
412 { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } }, // psrlq
413 { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } }, // psraq
414
415 { ISD::SDIV, MVT::v16i32, { 6 } }, // pmuludq sequence
416 { ISD::SREM, MVT::v16i32, { 8 } }, // pmuludq+mul+sub sequence
417 { ISD::UDIV, MVT::v16i32, { 5 } }, // pmuludq sequence
418 { ISD::UREM, MVT::v16i32, { 7 } }, // pmuludq+mul+sub sequence
419 };
420
421 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX512())
422 if (const auto *Entry =
423 CostTableLookup(AVX512UniformConstCostTable, ISD, LT.second))
424 if (auto KindCost = Entry->Cost[CostKind])
425 return LT.first * *KindCost;
426
427 static const CostKindTblEntry AVX2UniformConstCostTable[] = {
428 { ISD::SHL, MVT::v16i8, { 1, 8, 2, 3 } }, // psllw + pand.
429 { ISD::SRL, MVT::v16i8, { 1, 8, 2, 3 } }, // psrlw + pand.
430 { ISD::SRA, MVT::v16i8, { 2, 10, 5, 6 } }, // psrlw, pand, pxor, psubb.
431 { ISD::SHL, MVT::v32i8, { 2, 8, 2, 4 } }, // psllw + pand.
432 { ISD::SRL, MVT::v32i8, { 2, 8, 2, 4 } }, // psrlw + pand.
433 { ISD::SRA, MVT::v32i8, { 3, 10, 5, 9 } }, // psrlw, pand, pxor, psubb.
434
435 { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw
436 { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw
437 { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw
438 { ISD::SHL, MVT::v16i16,{ 2, 2, 1, 2 } }, // psllw
439 { ISD::SRL, MVT::v16i16,{ 2, 2, 1, 2 } }, // psrlw
440 { ISD::SRA, MVT::v16i16,{ 2, 2, 1, 2 } }, // psraw
441
442 { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
443 { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld
444 { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad
445 { ISD::SHL, MVT::v8i32, { 2, 2, 1, 2 } }, // pslld
446 { ISD::SRL, MVT::v8i32, { 2, 2, 1, 2 } }, // psrld
447 { ISD::SRA, MVT::v8i32, { 2, 2, 1, 2 } }, // psrad
448
449 { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq
450 { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq
451 { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle.
452 { ISD::SHL, MVT::v4i64, { 2, 2, 1, 2 } }, // psllq
453 { ISD::SRL, MVT::v4i64, { 2, 2, 1, 2 } }, // psrlq
454 { ISD::SRA, MVT::v4i64, { 4, 4, 3, 6 } }, // psrad + shuffle + split.
455
456 { ISD::SDIV, MVT::v8i32, { 6 } }, // pmuludq sequence
457 { ISD::SREM, MVT::v8i32, { 8 } }, // pmuludq+mul+sub sequence
458 { ISD::UDIV, MVT::v8i32, { 5 } }, // pmuludq sequence
459 { ISD::UREM, MVT::v8i32, { 7 } }, // pmuludq+mul+sub sequence
460 };
461
462 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX2())
463 if (const auto *Entry =
464 CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second))
465 if (auto KindCost = Entry->Cost[CostKind])
466 return LT.first * *KindCost;
467
468 static const CostKindTblEntry AVXUniformConstCostTable[] = {
469 { ISD::SHL, MVT::v16i8, { 2, 7, 2, 3 } }, // psllw + pand.
470 { ISD::SRL, MVT::v16i8, { 2, 7, 2, 3 } }, // psrlw + pand.
471 { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.
472 { ISD::SHL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psllw + pand) + split.
473 { ISD::SRL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psrlw + pand) + split.
474 { ISD::SRA, MVT::v32i8, { 7, 7, 12, 13 } }, // 2*(psrlw, pand, pxor, psubb) + split.
475
476 { ISD::SHL, MVT::v8i16, { 1, 2, 1, 1 } }, // psllw.
477 { ISD::SRL, MVT::v8i16, { 1, 2, 1, 1 } }, // psrlw.
478 { ISD::SRA, MVT::v8i16, { 1, 2, 1, 1 } }, // psraw.
479 { ISD::SHL, MVT::v16i16,{ 3, 6, 4, 5 } }, // psllw + split.
480 { ISD::SRL, MVT::v16i16,{ 3, 6, 4, 5 } }, // psrlw + split.
481 { ISD::SRA, MVT::v16i16,{ 3, 6, 4, 5 } }, // psraw + split.
482
483 { ISD::SHL, MVT::v4i32, { 1, 2, 1, 1 } }, // pslld.
484 { ISD::SRL, MVT::v4i32, { 1, 2, 1, 1 } }, // psrld.
485 { ISD::SRA, MVT::v4i32, { 1, 2, 1, 1 } }, // psrad.
486 { ISD::SHL, MVT::v8i32, { 3, 6, 4, 5 } }, // pslld + split.
487 { ISD::SRL, MVT::v8i32, { 3, 6, 4, 5 } }, // psrld + split.
488 { ISD::SRA, MVT::v8i32, { 3, 6, 4, 5 } }, // psrad + split.
489
490 { ISD::SHL, MVT::v2i64, { 1, 2, 1, 1 } }, // psllq.
491 { ISD::SRL, MVT::v2i64, { 1, 2, 1, 1 } }, // psrlq.
492 { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle.
493 { ISD::SHL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psllq + split.
494 { ISD::SRL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psrlq + split.
495 { ISD::SRA, MVT::v4i64, { 5, 7, 8, 9 } }, // 2 x psrad + shuffle + split.
496
497 { ISD::SDIV, MVT::v8i32, { 14 } }, // 2*pmuludq sequence + split.
498 { ISD::SREM, MVT::v8i32, { 18 } }, // 2*pmuludq+mul+sub sequence + split.
499 { ISD::UDIV, MVT::v8i32, { 12 } }, // 2*pmuludq sequence + split.
500 { ISD::UREM, MVT::v8i32, { 16 } }, // 2*pmuludq+mul+sub sequence + split.
501 };
502
503 // XOP has faster vXi8 shifts.
504 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX() &&
505 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
506 if (const auto *Entry =
507 CostTableLookup(AVXUniformConstCostTable, ISD, LT.second))
508 if (auto KindCost = Entry->Cost[CostKind])
509 return LT.first * *KindCost;
510
511 static const CostKindTblEntry SSE2UniformConstCostTable[] = {
512 { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
513 { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
514 { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.
515
516 { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw.
517 { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw.
518 { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw.
519
520 { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
521 { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld.
522 { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad.
523
524 { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq.
525 { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq.
526 { ISD::SRA, MVT::v2i64, { 3, 5, 6, 6 } }, // 2 x psrad + shuffle.
527
528 { ISD::SDIV, MVT::v4i32, { 6 } }, // pmuludq sequence
529 { ISD::SREM, MVT::v4i32, { 8 } }, // pmuludq+mul+sub sequence
530 { ISD::UDIV, MVT::v4i32, { 5 } }, // pmuludq sequence
531 { ISD::UREM, MVT::v4i32, { 7 } }, // pmuludq+mul+sub sequence
532 };
533
534 // XOP has faster vXi8 shifts.
535 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasSSE2() &&
536 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
537 if (const auto *Entry =
538 CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
539 if (auto KindCost = Entry->Cost[CostKind])
540 return LT.first * *KindCost;
541
542 static const CostKindTblEntry AVX512BWConstCostTable[] = {
543 { ISD::SDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence
544 { ISD::SREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
545 { ISD::UDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence
546 { ISD::UREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
547
548 { ISD::SDIV, MVT::v32i16, { 6 } }, // vpmulhw sequence
549 { ISD::SREM, MVT::v32i16, { 8 } }, // vpmulhw+mul+sub sequence
550 { ISD::UDIV, MVT::v32i16, { 6 } }, // vpmulhuw sequence
551 { ISD::UREM, MVT::v32i16, { 8 } }, // vpmulhuw+mul+sub sequence
552 };
553
554 if (Op2Info.isConstant() && ST->hasBWI())
555 if (const auto *Entry =
556 CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
557 if (auto KindCost = Entry->Cost[CostKind])
558 return LT.first * *KindCost;
559
560 static const CostKindTblEntry AVX512ConstCostTable[] = {
561 { ISD::SDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence
562 { ISD::SREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
563 { ISD::UDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence
564 { ISD::UREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
565
566 { ISD::SDIV, MVT::v32i16, { 12 } }, // 2*vpmulhw sequence
567 { ISD::SREM, MVT::v32i16, { 16 } }, // 2*vpmulhw+mul+sub sequence
568 { ISD::UDIV, MVT::v32i16, { 12 } }, // 2*vpmulhuw sequence
569 { ISD::UREM, MVT::v32i16, { 16 } }, // 2*vpmulhuw+mul+sub sequence
570
571 { ISD::SDIV, MVT::v16i32, { 15 } }, // vpmuldq sequence
572 { ISD::SREM, MVT::v16i32, { 17 } }, // vpmuldq+mul+sub sequence
573 { ISD::UDIV, MVT::v16i32, { 15 } }, // vpmuludq sequence
574 { ISD::UREM, MVT::v16i32, { 17 } }, // vpmuludq+mul+sub sequence
575 };
576
577 if (Op2Info.isConstant() && ST->hasAVX512())
578 if (const auto *Entry =
579 CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
580 if (auto KindCost = Entry->Cost[CostKind])
581 return LT.first * *KindCost;
582
583 static const CostKindTblEntry AVX2ConstCostTable[] = {
584 { ISD::SDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence
585 { ISD::SREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
586 { ISD::UDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence
587 { ISD::UREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
588
589 { ISD::SDIV, MVT::v16i16, { 6 } }, // vpmulhw sequence
590 { ISD::SREM, MVT::v16i16, { 8 } }, // vpmulhw+mul+sub sequence
591 { ISD::UDIV, MVT::v16i16, { 6 } }, // vpmulhuw sequence
592 { ISD::UREM, MVT::v16i16, { 8 } }, // vpmulhuw+mul+sub sequence
593
594 { ISD::SDIV, MVT::v8i32, { 15 } }, // vpmuldq sequence
595 { ISD::SREM, MVT::v8i32, { 19 } }, // vpmuldq+mul+sub sequence
596 { ISD::UDIV, MVT::v8i32, { 15 } }, // vpmuludq sequence
597 { ISD::UREM, MVT::v8i32, { 19 } }, // vpmuludq+mul+sub sequence
598 };
599
600 if (Op2Info.isConstant() && ST->hasAVX2())
601 if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
602 if (auto KindCost = Entry->Cost[CostKind])
603 return LT.first * *KindCost;
604
605 static const CostKindTblEntry AVXConstCostTable[] = {
606 { ISD::SDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split.
607 { ISD::SREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
608 { ISD::UDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split.
609 { ISD::UREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
610
611 { ISD::SDIV, MVT::v16i16, { 14 } }, // 2*pmulhw sequence + split.
612 { ISD::SREM, MVT::v16i16, { 18 } }, // 2*pmulhw+mul+sub sequence + split.
613 { ISD::UDIV, MVT::v16i16, { 14 } }, // 2*pmulhuw sequence + split.
614 { ISD::UREM, MVT::v16i16, { 18 } }, // 2*pmulhuw+mul+sub sequence + split.
615
616 { ISD::SDIV, MVT::v8i32, { 32 } }, // vpmuludq sequence
617 { ISD::SREM, MVT::v8i32, { 38 } }, // vpmuludq+mul+sub sequence
618 { ISD::UDIV, MVT::v8i32, { 32 } }, // 2*pmuludq sequence + split.
619 { ISD::UREM, MVT::v8i32, { 42 } }, // 2*pmuludq+mul+sub sequence + split.
620 };
621
622 if (Op2Info.isConstant() && ST->hasAVX())
623 if (const auto *Entry = CostTableLookup(AVXConstCostTable, ISD, LT.second))
624 if (auto KindCost = Entry->Cost[CostKind])
625 return LT.first * *KindCost;
626
627 static const CostKindTblEntry SSE41ConstCostTable[] = {
628 { ISD::SDIV, MVT::v4i32, { 15 } }, // vpmuludq sequence
629 { ISD::SREM, MVT::v4i32, { 20 } }, // vpmuludq+mul+sub sequence
630 };
631
632 if (Op2Info.isConstant() && ST->hasSSE41())
633 if (const auto *Entry =
634 CostTableLookup(SSE41ConstCostTable, ISD, LT.second))
635 if (auto KindCost = Entry->Cost[CostKind])
636 return LT.first * *KindCost;
637
638 static const CostKindTblEntry SSE2ConstCostTable[] = {
639 { ISD::SDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
640 { ISD::SREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
641 { ISD::UDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
642 { ISD::UREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
643
644 { ISD::SDIV, MVT::v8i16, { 6 } }, // pmulhw sequence
645 { ISD::SREM, MVT::v8i16, { 8 } }, // pmulhw+mul+sub sequence
646 { ISD::UDIV, MVT::v8i16, { 6 } }, // pmulhuw sequence
647 { ISD::UREM, MVT::v8i16, { 8 } }, // pmulhuw+mul+sub sequence
648
649 { ISD::SDIV, MVT::v4i32, { 19 } }, // pmuludq sequence
650 { ISD::SREM, MVT::v4i32, { 24 } }, // pmuludq+mul+sub sequence
651 { ISD::UDIV, MVT::v4i32, { 15 } }, // pmuludq sequence
652 { ISD::UREM, MVT::v4i32, { 20 } }, // pmuludq+mul+sub sequence
653 };
654
655 if (Op2Info.isConstant() && ST->hasSSE2())
656 if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
657 if (auto KindCost = Entry->Cost[CostKind])
658 return LT.first * *KindCost;
659
660 static const CostKindTblEntry AVX512BWUniformCostTable[] = {
661 { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand.
662 { ISD::SRL, MVT::v16i8, { 3,10, 5, 8 } }, // psrlw + pand.
663 { ISD::SRA, MVT::v16i8, { 4,12, 8,12 } }, // psrlw, pand, pxor, psubb.
664 { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand.
665 { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand.
666 { ISD::SRA, MVT::v32i8, { 5,10,10,13 } }, // psrlw, pand, pxor, psubb.
667 { ISD::SHL, MVT::v64i8, { 4, 7, 6, 8 } }, // psllw + pand.
668 { ISD::SRL, MVT::v64i8, { 4, 8, 7,10 } }, // psrlw + pand.
669 { ISD::SRA, MVT::v64i8, { 5,10,10,15 } }, // psrlw, pand, pxor, psubb.
670
671 { ISD::SHL, MVT::v32i16, { 2, 4, 2, 3 } }, // psllw
672 { ISD::SRL, MVT::v32i16, { 2, 4, 2, 3 } }, // psrlw
673 { ISD::SRA, MVT::v32i16, { 2, 4, 2, 3 } }, // psraw
674 };
675
676 if (ST->hasBWI() && Op2Info.isUniform())
677 if (const auto *Entry =
678 CostTableLookup(AVX512BWUniformCostTable, ISD, LT.second))
679 if (auto KindCost = Entry->Cost[CostKind])
680 return LT.first * *KindCost;
681
682 static const CostKindTblEntry AVX512UniformCostTable[] = {
683 { ISD::SHL, MVT::v32i16, { 5,10, 5, 7 } }, // psllw + split.
684 { ISD::SRL, MVT::v32i16, { 5,10, 5, 7 } }, // psrlw + split.
685 { ISD::SRA, MVT::v32i16, { 5,10, 5, 7 } }, // psraw + split.
686
687 { ISD::SHL, MVT::v16i32, { 2, 4, 2, 3 } }, // pslld
688 { ISD::SRL, MVT::v16i32, { 2, 4, 2, 3 } }, // psrld
689 { ISD::SRA, MVT::v16i32, { 2, 4, 2, 3 } }, // psrad
690
691 { ISD::SRA, MVT::v2i64, { 1, 2, 1, 2 } }, // psraq
692 { ISD::SHL, MVT::v4i64, { 1, 4, 1, 2 } }, // psllq
693 { ISD::SRL, MVT::v4i64, { 1, 4, 1, 2 } }, // psrlq
694 { ISD::SRA, MVT::v4i64, { 1, 4, 1, 2 } }, // psraq
695 { ISD::SHL, MVT::v8i64, { 1, 4, 1, 2 } }, // psllq
696 { ISD::SRL, MVT::v8i64, { 1, 4, 1, 2 } }, // psrlq
697 { ISD::SRA, MVT::v8i64, { 1, 4, 1, 2 } }, // psraq
698 };
699
700 if (ST->hasAVX512() && Op2Info.isUniform())
701 if (const auto *Entry =
702 CostTableLookup(AVX512UniformCostTable, ISD, LT.second))
703 if (auto KindCost = Entry->Cost[CostKind])
704 return LT.first * *KindCost;
705
706 static const CostKindTblEntry AVX2UniformCostTable[] = {
707 // Uniform splats are cheaper for the following instructions.
708 { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand.
709 { ISD::SRL, MVT::v16i8, { 3, 9, 5, 8 } }, // psrlw + pand.
710 { ISD::SRA, MVT::v16i8, { 4, 5, 9,13 } }, // psrlw, pand, pxor, psubb.
711 { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand.
712 { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand.
713 { ISD::SRA, MVT::v32i8, { 6, 9,11,16 } }, // psrlw, pand, pxor, psubb.
714
715 { ISD::SHL, MVT::v8i16, { 1, 2, 1, 2 } }, // psllw.
716 { ISD::SRL, MVT::v8i16, { 1, 2, 1, 2 } }, // psrlw.
717 { ISD::SRA, MVT::v8i16, { 1, 2, 1, 2 } }, // psraw.
718 { ISD::SHL, MVT::v16i16, { 2, 4, 2, 3 } }, // psllw.
719 { ISD::SRL, MVT::v16i16, { 2, 4, 2, 3 } }, // psrlw.
720 { ISD::SRA, MVT::v16i16, { 2, 4, 2, 3 } }, // psraw.
721
722 { ISD::SHL, MVT::v4i32, { 1, 2, 1, 2 } }, // pslld
723 { ISD::SRL, MVT::v4i32, { 1, 2, 1, 2 } }, // psrld
724 { ISD::SRA, MVT::v4i32, { 1, 2, 1, 2 } }, // psrad
725 { ISD::SHL, MVT::v8i32, { 2, 4, 2, 3 } }, // pslld
726 { ISD::SRL, MVT::v8i32, { 2, 4, 2, 3 } }, // psrld
727 { ISD::SRA, MVT::v8i32, { 2, 4, 2, 3 } }, // psrad
728
729 { ISD::SHL, MVT::v2i64, { 1, 2, 1, 2 } }, // psllq
730 { ISD::SRL, MVT::v2i64, { 1, 2, 1, 2 } }, // psrlq
731 { ISD::SRA, MVT::v2i64, { 2, 4, 5, 7 } }, // 2 x psrad + shuffle.
732 { ISD::SHL, MVT::v4i64, { 2, 4, 1, 2 } }, // psllq
733 { ISD::SRL, MVT::v4i64, { 2, 4, 1, 2 } }, // psrlq
734 { ISD::SRA, MVT::v4i64, { 4, 6, 5, 9 } }, // 2 x psrad + shuffle.
735 };
736
737 if (ST->hasAVX2() && Op2Info.isUniform())
738 if (const auto *Entry =
739 CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
740 if (auto KindCost = Entry->Cost[CostKind])
741 return LT.first * *KindCost;
742
743 static const CostKindTblEntry AVXUniformCostTable[] = {
744 { ISD::SHL, MVT::v16i8, { 4, 4, 6, 8 } }, // psllw + pand.
745 { ISD::SRL, MVT::v16i8, { 4, 8, 5, 8 } }, // psrlw + pand.
746 { ISD::SRA, MVT::v16i8, { 6, 6, 9,13 } }, // psrlw, pand, pxor, psubb.
747 { ISD::SHL, MVT::v32i8, { 7, 8,11,14 } }, // psllw + pand + split.
748 { ISD::SRL, MVT::v32i8, { 7, 9,10,14 } }, // psrlw + pand + split.
749 { ISD::SRA, MVT::v32i8, { 10,11,16,21 } }, // psrlw, pand, pxor, psubb + split.
750
751 { ISD::SHL, MVT::v8i16, { 1, 3, 1, 2 } }, // psllw.
752 { ISD::SRL, MVT::v8i16, { 1, 3, 1, 2 } }, // psrlw.
753 { ISD::SRA, MVT::v8i16, { 1, 3, 1, 2 } }, // psraw.
754 { ISD::SHL, MVT::v16i16, { 3, 7, 5, 7 } }, // psllw + split.
755 { ISD::SRL, MVT::v16i16, { 3, 7, 5, 7 } }, // psrlw + split.
756 { ISD::SRA, MVT::v16i16, { 3, 7, 5, 7 } }, // psraw + split.
757
758 { ISD::SHL, MVT::v4i32, { 1, 3, 1, 2 } }, // pslld.
759 { ISD::SRL, MVT::v4i32, { 1, 3, 1, 2 } }, // psrld.
760 { ISD::SRA, MVT::v4i32, { 1, 3, 1, 2 } }, // psrad.
761 { ISD::SHL, MVT::v8i32, { 3, 7, 5, 7 } }, // pslld + split.
762 { ISD::SRL, MVT::v8i32, { 3, 7, 5, 7 } }, // psrld + split.
763 { ISD::SRA, MVT::v8i32, { 3, 7, 5, 7 } }, // psrad + split.
764
765 { ISD::SHL, MVT::v2i64, { 1, 3, 1, 2 } }, // psllq.
766 { ISD::SRL, MVT::v2i64, { 1, 3, 1, 2 } }, // psrlq.
767 { ISD::SRA, MVT::v2i64, { 3, 4, 5, 7 } }, // 2 x psrad + shuffle.
768 { ISD::SHL, MVT::v4i64, { 3, 7, 4, 6 } }, // psllq + split.
769 { ISD::SRL, MVT::v4i64, { 3, 7, 4, 6 } }, // psrlq + split.
770 { ISD::SRA, MVT::v4i64, { 6, 7,10,13 } }, // 2 x (2 x psrad + shuffle) + split.
771 };
772
773 // XOP has faster vXi8 shifts.
774 if (ST->hasAVX() && Op2Info.isUniform() &&
775 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
776 if (const auto *Entry =
777 CostTableLookup(AVXUniformCostTable, ISD, LT.second))
778 if (auto KindCost = Entry->Cost[CostKind])
779 return LT.first * *KindCost;
780
781 static const CostKindTblEntry SSE2UniformCostTable[] = {
782 // Uniform splats are cheaper for the following instructions.
783 { ISD::SHL, MVT::v16i8, { 9, 10, 6, 9 } }, // psllw + pand.
784 { ISD::SRL, MVT::v16i8, { 9, 13, 5, 9 } }, // psrlw + pand.
785 { ISD::SRA, MVT::v16i8, { 11, 15, 9,13 } }, // pcmpgtb sequence.
786
787 { ISD::SHL, MVT::v8i16, { 2, 2, 1, 2 } }, // psllw.
788 { ISD::SRL, MVT::v8i16, { 2, 2, 1, 2 } }, // psrlw.
789 { ISD::SRA, MVT::v8i16, { 2, 2, 1, 2 } }, // psraw.
790
791 { ISD::SHL, MVT::v4i32, { 2, 2, 1, 2 } }, // pslld
792 { ISD::SRL, MVT::v4i32, { 2, 2, 1, 2 } }, // psrld.
793 { ISD::SRA, MVT::v4i32, { 2, 2, 1, 2 } }, // psrad.
794
795 { ISD::SHL, MVT::v2i64, { 2, 2, 1, 2 } }, // psllq.
796 { ISD::SRL, MVT::v2i64, { 2, 2, 1, 2 } }, // psrlq.
797 { ISD::SRA, MVT::v2i64, { 5, 9, 5, 7 } }, // 2*psrlq + xor + sub.
798 };
799
800 if (ST->hasSSE2() && Op2Info.isUniform() &&
801 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
802 if (const auto *Entry =
803 CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
804 if (auto KindCost = Entry->Cost[CostKind])
805 return LT.first * *KindCost;
806
807 static const CostKindTblEntry AVX512DQCostTable[] = {
808 { ISD::MUL, MVT::v2i64, { 2, 15, 1, 3 } }, // pmullq
809 { ISD::MUL, MVT::v4i64, { 2, 15, 1, 3 } }, // pmullq
810 { ISD::MUL, MVT::v8i64, { 3, 15, 1, 3 } } // pmullq
811 };
812
813 // Look for AVX512DQ lowering tricks for custom cases.
814 if (ST->hasDQI())
815 if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
816 if (auto KindCost = Entry->Cost[CostKind])
817 return LT.first * *KindCost;
818
819 static const CostKindTblEntry AVX512BWCostTable[] = {
820 { ISD::SHL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsllvw/pack sequence.
821 { ISD::SRL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsrlvw/pack sequence.
822 { ISD::SRA, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsravw/pack sequence.
823 { ISD::SHL, MVT::v32i8, { 4, 23,11,16 } }, // extend/vpsllvw/pack sequence.
824 { ISD::SRL, MVT::v32i8, { 4, 30,12,18 } }, // extend/vpsrlvw/pack sequence.
825 { ISD::SRA, MVT::v32i8, { 6, 13,24,30 } }, // extend/vpsravw/pack sequence.
826 { ISD::SHL, MVT::v64i8, { 6, 19,13,15 } }, // extend/vpsllvw/pack sequence.
827 { ISD::SRL, MVT::v64i8, { 7, 27,15,18 } }, // extend/vpsrlvw/pack sequence.
828 { ISD::SRA, MVT::v64i8, { 15, 15,30,30 } }, // extend/vpsravw/pack sequence.
829
830 { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsllvw
831 { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsrlvw
832 { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsravw
833 { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsllvw
834 { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsrlvw
835 { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsravw
836 { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsllvw
837 { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsrlvw
838 { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsravw
839
840 { ISD::ADD, MVT::v64i8, { 1, 1, 1, 1 } }, // paddb
841 { ISD::ADD, MVT::v32i16, { 1, 1, 1, 1 } }, // paddw
842
843 { ISD::ADD, MVT::v32i8, { 1, 1, 1, 1 } }, // paddb
844 { ISD::ADD, MVT::v16i16, { 1, 1, 1, 1 } }, // paddw
845 { ISD::ADD, MVT::v8i32, { 1, 1, 1, 1 } }, // paddd
846 { ISD::ADD, MVT::v4i64, { 1, 1, 1, 1 } }, // paddq
847
848 { ISD::SUB, MVT::v64i8, { 1, 1, 1, 1 } }, // psubb
849 { ISD::SUB, MVT::v32i16, { 1, 1, 1, 1 } }, // psubw
850
851 { ISD::MUL, MVT::v64i8, { 5, 10,10,11 } },
852 { ISD::MUL, MVT::v32i16, { 1, 5, 1, 1 } }, // pmullw
853
854 { ISD::SUB, MVT::v32i8, { 1, 1, 1, 1 } }, // psubb
855 { ISD::SUB, MVT::v16i16, { 1, 1, 1, 1 } }, // psubw
856 { ISD::SUB, MVT::v8i32, { 1, 1, 1, 1 } }, // psubd
857 { ISD::SUB, MVT::v4i64, { 1, 1, 1, 1 } }, // psubq
858 };
859
860 // Look for AVX512BW lowering tricks for custom cases.
861 if (ST->hasBWI())
862 if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
863 if (auto KindCost = Entry->Cost[CostKind])
864 return LT.first * *KindCost;
865
866 static const CostKindTblEntry AVX512CostTable[] = {
867 { ISD::SHL, MVT::v64i8, { 15, 19,27,33 } }, // vpblendv+split sequence.
868 { ISD::SRL, MVT::v64i8, { 15, 19,30,36 } }, // vpblendv+split sequence.
869 { ISD::SRA, MVT::v64i8, { 37, 37,51,63 } }, // vpblendv+split sequence.
870
871 { ISD::SHL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence.
872 { ISD::SRL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence.
873 { ISD::SRA, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsravd/pack sequence.
874
875 { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } },
876 { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } },
877 { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } },
878 { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } },
879 { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } },
880 { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } },
881 { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } },
882 { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } },
883 { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } },
884
885 { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } },
886 { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } },
887 { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } },
888 { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } },
889 { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } },
890 { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } },
891 { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } },
892 { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } },
893 { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } },
894
895 { ISD::ADD, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*paddb + split
896 { ISD::ADD, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*paddw + split
897
898 { ISD::SUB, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*psubb + split
899 { ISD::SUB, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*psubw + split
900
901 { ISD::AND, MVT::v32i8, { 1, 1, 1, 1 } },
902 { ISD::AND, MVT::v16i16, { 1, 1, 1, 1 } },
903 { ISD::AND, MVT::v8i32, { 1, 1, 1, 1 } },
904 { ISD::AND, MVT::v4i64, { 1, 1, 1, 1 } },
905
906 { ISD::OR, MVT::v32i8, { 1, 1, 1, 1 } },
907 { ISD::OR, MVT::v16i16, { 1, 1, 1, 1 } },
908 { ISD::OR, MVT::v8i32, { 1, 1, 1, 1 } },
909 { ISD::OR, MVT::v4i64, { 1, 1, 1, 1 } },
910
911 { ISD::XOR, MVT::v32i8, { 1, 1, 1, 1 } },
912 { ISD::XOR, MVT::v16i16, { 1, 1, 1, 1 } },
913 { ISD::XOR, MVT::v8i32, { 1, 1, 1, 1 } },
914 { ISD::XOR, MVT::v4i64, { 1, 1, 1, 1 } },
915
916 { ISD::MUL, MVT::v16i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
917 { ISD::MUL, MVT::v8i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
918 { ISD::MUL, MVT::v4i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
919 { ISD::MUL, MVT::v8i64, { 6, 9, 8, 8 } }, // 3*pmuludq/3*shift/2*add
920 { ISD::MUL, MVT::i64, { 1 } }, // Skylake from http://www.agner.org/
921
922 { X86ISD::PMULUDQ, MVT::v8i64, { 1, 5, 1, 1 } },
923
924 { ISD::FNEG, MVT::v8f64, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
925 { ISD::FADD, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
926 { ISD::FADD, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
927 { ISD::FSUB, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
928 { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
929 { ISD::FMUL, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
930 { ISD::FMUL, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
931 { ISD::FMUL, MVT::v2f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
932 { ISD::FMUL, MVT::f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
933
934 { ISD::FDIV, MVT::f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
935 { ISD::FDIV, MVT::v2f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
936 { ISD::FDIV, MVT::v4f64, { 8, 14, 1, 1 } }, // Skylake from http://www.agner.org/
937 { ISD::FDIV, MVT::v8f64, { 16, 23, 1, 3 } }, // Skylake from http://www.agner.org/
938
939 { ISD::FNEG, MVT::v16f32, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
940 { ISD::FADD, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
941 { ISD::FADD, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
942 { ISD::FSUB, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
943 { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
944 { ISD::FMUL, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
945 { ISD::FMUL, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
946 { ISD::FMUL, MVT::v4f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
947 { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
948
949 { ISD::FDIV, MVT::f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
950 { ISD::FDIV, MVT::v4f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
951 { ISD::FDIV, MVT::v8f32, { 5, 11, 1, 1 } }, // Skylake from http://www.agner.org/
952 { ISD::FDIV, MVT::v16f32, { 10, 18, 1, 3 } }, // Skylake from http://www.agner.org/
953 };
954
955 if (ST->hasAVX512())
956 if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
957 if (auto KindCost = Entry->Cost[CostKind])
958 return LT.first * *KindCost;
959
960 static const CostKindTblEntry AVX2ShiftCostTable[] = {
961 // Shifts on vXi64/vXi32 on AVX2 are legal even though we declare them as
962 // custom so that we can detect the cases where the shift amount is a scalar.
963 { ISD::SHL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsllvd (Haswell from agner.org)
964 { ISD::SRL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
965 { ISD::SRA, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsravd (Haswell from agner.org)
966 { ISD::SHL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsllvd (Haswell from agner.org)
967 { ISD::SRL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
968 { ISD::SRA, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsravd (Haswell from agner.org)
969 { ISD::SHL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsllvq (Haswell from agner.org)
970 { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsrlvq (Haswell from agner.org)
971 { ISD::SHL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsllvq (Haswell from agner.org)
972 { ISD::SRL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsrlvq (Haswell from agner.org)
973 };
974
975 if (ST->hasAVX512()) {
976 if (ISD == ISD::SHL && LT.second == MVT::v32i16 && Op2Info.isConstant())
977 // On AVX512, a packed v32i16 shift left by a constant build_vector
978 // is lowered into a vector multiply (vpmullw).
979 return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
980 Op1Info.getNoProps(), Op2Info.getNoProps());
981 }
982
983 // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
984 if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
985 if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
986 Op2Info.isConstant())
987 // On AVX2, a packed v16i16 shift left by a constant build_vector
988 // is lowered into a vector multiply (vpmullw).
989 return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
990 Op1Info.getNoProps(), Op2Info.getNoProps());
991
992 if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
993 if (auto KindCost = Entry->Cost[CostKind])
994 return LT.first * *KindCost;
995 }
996
997 static const CostKindTblEntry XOPShiftCostTable[] = {
998 // 128bit shifts take 1cy, but right shifts require negation beforehand.
999 { ISD::SHL, MVT::v16i8, { 1, 3, 1, 1 } },
1000 { ISD::SRL, MVT::v16i8, { 2, 3, 1, 1 } },
1001 { ISD::SRA, MVT::v16i8, { 2, 3, 1, 1 } },
1002 { ISD::SHL, MVT::v8i16, { 1, 3, 1, 1 } },
1003 { ISD::SRL, MVT::v8i16, { 2, 3, 1, 1 } },
1004 { ISD::SRA, MVT::v8i16, { 2, 3, 1, 1 } },
1005 { ISD::SHL, MVT::v4i32, { 1, 3, 1, 1 } },
1006 { ISD::SRL, MVT::v4i32, { 2, 3, 1, 1 } },
1007 { ISD::SRA, MVT::v4i32, { 2, 3, 1, 1 } },
1008 { ISD::SHL, MVT::v2i64, { 1, 3, 1, 1 } },
1009 { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } },
1010 { ISD::SRA, MVT::v2i64, { 2, 3, 1, 1 } },
1011 // 256bit shifts require splitting if AVX2 didn't catch them above.
1012 { ISD::SHL, MVT::v32i8, { 4, 7, 5, 6 } },
1013 { ISD::SRL, MVT::v32i8, { 6, 7, 5, 6 } },
1014 { ISD::SRA, MVT::v32i8, { 6, 7, 5, 6 } },
1015 { ISD::SHL, MVT::v16i16, { 4, 7, 5, 6 } },
1016 { ISD::SRL, MVT::v16i16, { 6, 7, 5, 6 } },
1017 { ISD::SRA, MVT::v16i16, { 6, 7, 5, 6 } },
1018 { ISD::SHL, MVT::v8i32, { 4, 7, 5, 6 } },
1019 { ISD::SRL, MVT::v8i32, { 6, 7, 5, 6 } },
1020 { ISD::SRA, MVT::v8i32, { 6, 7, 5, 6 } },
1021 { ISD::SHL, MVT::v4i64, { 4, 7, 5, 6 } },
1022 { ISD::SRL, MVT::v4i64, { 6, 7, 5, 6 } },
1023 { ISD::SRA, MVT::v4i64, { 6, 7, 5, 6 } },
1024 };
1025
1026 // Look for XOP lowering tricks.
1027 if (ST->hasXOP()) {
1028 // If the right shift is constant then we'll fold the negation so
1029 // it's as cheap as a left shift.
1030 int ShiftISD = ISD;
1031 if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) && Op2Info.isConstant())
1032 ShiftISD = ISD::SHL;
1033 if (const auto *Entry =
1034 CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
1035 if (auto KindCost = Entry->Cost[CostKind])
1036 return LT.first * *KindCost;
1037 }
1038
1039 if (ISD == ISD::SHL && !Op2Info.isUniform() && Op2Info.isConstant()) {
1040 MVT VT = LT.second;
1041 // A vector shift left by a non-uniform constant can be lowered
1042 // into a vector multiply.
1043 if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
1044 ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
1045 ISD = ISD::MUL;
1046 }
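  // Illustrative example: 'shl <4 x i32> %x, <i32 0, i32 1, i32 2, i32 3>' is
  // lowered as 'mul %x, <i32 1, i32 2, i32 4, i32 8>', so it is priced below
  // as a vector multiply.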
1047
1048 static const CostKindTblEntry GLMCostTable[] = {
1049 { ISD::FDIV, MVT::f32, { 18, 19, 1, 1 } }, // divss
1050 { ISD::FDIV, MVT::v4f32, { 35, 36, 1, 1 } }, // divps
1051 { ISD::FDIV, MVT::f64, { 33, 34, 1, 1 } }, // divsd
1052 { ISD::FDIV, MVT::v2f64, { 65, 66, 1, 1 } }, // divpd
1053 };
1054
1055 if (ST->useGLMDivSqrtCosts())
1056 if (const auto *Entry = CostTableLookup(GLMCostTable, ISD, LT.second))
1057 if (auto KindCost = Entry->Cost[CostKind])
1058 return LT.first * *KindCost;
1059
1060 static const CostKindTblEntry SLMCostTable[] = {
1061 { ISD::MUL, MVT::v4i32, { 11, 11, 1, 7 } }, // pmulld
1062 { ISD::MUL, MVT::v8i16, { 2, 5, 1, 1 } }, // pmullw
1063 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // mulsd
1064 { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // mulss
1065 { ISD::FMUL, MVT::v2f64, { 4, 7, 1, 1 } }, // mulpd
1066 { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // mulps
1067 { ISD::FDIV, MVT::f32, { 17, 19, 1, 1 } }, // divss
1068 { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 6 } }, // divps
1069 { ISD::FDIV, MVT::f64, { 32, 34, 1, 1 } }, // divsd
1070 { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 6 } }, // divpd
1071 { ISD::FADD, MVT::v2f64, { 2, 4, 1, 1 } }, // addpd
1072 { ISD::FSUB, MVT::v2f64, { 2, 4, 1, 1 } }, // subpd
1073 // v2i64/v4i64 mul is custom lowered as a sequence of long
1074 // multiplies (3), shifts (3) and adds (2).
1075 // On SLM, muldq throughput is 2, shift throughput is 1 and addq throughput is 4,
1076 // thus: 3x2 (muldq throughput) + 3x1 (shift throughput) +
1077 // 2x4 (addq throughput) = 17
1078 { ISD::MUL, MVT::v2i64, { 17, 22, 9, 9 } },
1079 // slm addq\subq throughput is 4
1080 { ISD::ADD, MVT::v2i64, { 4, 2, 1, 2 } },
1081 { ISD::SUB, MVT::v2i64, { 4, 2, 1, 2 } },
1082 };
1083
1084 if (ST->useSLMArithCosts())
1085 if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, LT.second))
1086 if (auto KindCost = Entry->Cost[CostKind])
1087 return LT.first * *KindCost;
1088
1089 static const CostKindTblEntry AVX2CostTable[] = {
1090 { ISD::SHL, MVT::v16i8, { 6, 21,11,16 } }, // vpblendvb sequence.
1091 { ISD::SHL, MVT::v32i8, { 6, 23,11,22 } }, // vpblendvb sequence.
1092 { ISD::SHL, MVT::v8i16, { 5, 18, 5,10 } }, // extend/vpsrlvd/pack sequence.
1093 { ISD::SHL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsrlvd/pack sequence.
1094
1095 { ISD::SRL, MVT::v16i8, { 6, 27,12,18 } }, // vpblendvb sequence.
1096 { ISD::SRL, MVT::v32i8, { 8, 30,12,24 } }, // vpblendvb sequence.
1097 { ISD::SRL, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsrlvd/pack sequence.
1098 { ISD::SRL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsrlvd/pack sequence.
1099
1100 { ISD::SRA, MVT::v16i8, { 17, 17,24,30 } }, // vpblendvb sequence.
1101 { ISD::SRA, MVT::v32i8, { 18, 20,24,43 } }, // vpblendvb sequence.
1102 { ISD::SRA, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsravd/pack sequence.
1103 { ISD::SRA, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsravd/pack sequence.
1104 { ISD::SRA, MVT::v2i64, { 4, 5, 5, 5 } }, // srl/xor/sub sequence.
1105 { ISD::SRA, MVT::v4i64, { 8, 8, 5, 9 } }, // srl/xor/sub sequence.
1106
1107 { ISD::SUB, MVT::v32i8, { 1, 1, 1, 2 } }, // psubb
1108 { ISD::ADD, MVT::v32i8, { 1, 1, 1, 2 } }, // paddb
1109 { ISD::SUB, MVT::v16i16, { 1, 1, 1, 2 } }, // psubw
1110 { ISD::ADD, MVT::v16i16, { 1, 1, 1, 2 } }, // paddw
1111 { ISD::SUB, MVT::v8i32, { 1, 1, 1, 2 } }, // psubd
1112 { ISD::ADD, MVT::v8i32, { 1, 1, 1, 2 } }, // paddd
1113 { ISD::SUB, MVT::v4i64, { 1, 1, 1, 2 } }, // psubq
1114 { ISD::ADD, MVT::v4i64, { 1, 1, 1, 2 } }, // paddq
1115
1116 { ISD::MUL, MVT::v16i8, { 5, 18, 6,12 } }, // extend/pmullw/pack
1117 { ISD::MUL, MVT::v32i8, { 6, 11,10,19 } }, // unpack/pmullw
1118 { ISD::MUL, MVT::v16i16, { 2, 5, 1, 2 } }, // pmullw
1119 { ISD::MUL, MVT::v8i32, { 4, 10, 1, 2 } }, // pmulld
1120 { ISD::MUL, MVT::v4i32, { 2, 10, 1, 2 } }, // pmulld
1121 { ISD::MUL, MVT::v4i64, { 6, 10, 8,13 } }, // 3*pmuludq/3*shift/2*add
1122 { ISD::MUL, MVT::v2i64, { 6, 10, 8, 8 } }, // 3*pmuludq/3*shift/2*add
1123
1124 { X86ISD::PMULUDQ, MVT::v4i64, { 1, 5, 1, 1 } },
1125
1126 { ISD::FNEG, MVT::v4f64, { 1, 1, 1, 2 } }, // vxorpd
1127 { ISD::FNEG, MVT::v8f32, { 1, 1, 1, 2 } }, // vxorps
1128
1129 { ISD::FADD, MVT::f64, { 1, 4, 1, 1 } }, // vaddsd
1130 { ISD::FADD, MVT::f32, { 1, 4, 1, 1 } }, // vaddss
1131 { ISD::FADD, MVT::v2f64, { 1, 4, 1, 1 } }, // vaddpd
1132 { ISD::FADD, MVT::v4f32, { 1, 4, 1, 1 } }, // vaddps
1133 { ISD::FADD, MVT::v4f64, { 1, 4, 1, 2 } }, // vaddpd
1134 { ISD::FADD, MVT::v8f32, { 1, 4, 1, 2 } }, // vaddps
1135
1136 { ISD::FSUB, MVT::f64, { 1, 4, 1, 1 } }, // vsubsd
1137 { ISD::FSUB, MVT::f32, { 1, 4, 1, 1 } }, // vsubss
1138 { ISD::FSUB, MVT::v2f64, { 1, 4, 1, 1 } }, // vsubpd
1139 { ISD::FSUB, MVT::v4f32, { 1, 4, 1, 1 } }, // vsubps
1140 { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 2 } }, // vsubpd
1141 { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 2 } }, // vsubps
1142
1143 { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // vmulsd
1144 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // vmulss
1145 { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // vmulpd
1146 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // vmulps
1147 { ISD::FMUL, MVT::v4f64, { 1, 5, 1, 2 } }, // vmulpd
1148 { ISD::FMUL, MVT::v8f32, { 1, 5, 1, 2 } }, // vmulps
1149
1150 { ISD::FDIV, MVT::f32, { 7, 13, 1, 1 } }, // vdivss
1151 { ISD::FDIV, MVT::v4f32, { 7, 13, 1, 1 } }, // vdivps
1152 { ISD::FDIV, MVT::v8f32, { 14, 21, 1, 3 } }, // vdivps
1153 { ISD::FDIV, MVT::f64, { 14, 20, 1, 1 } }, // vdivsd
1154 { ISD::FDIV, MVT::v2f64, { 14, 20, 1, 1 } }, // vdivpd
1155 { ISD::FDIV, MVT::v4f64, { 28, 35, 1, 3 } }, // vdivpd
1156 };
1157
1158 // Look for AVX2 lowering tricks for custom cases.
1159 if (ST->hasAVX2())
1160 if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
1161 if (auto KindCost = Entry->Cost[CostKind])
1162 return LT.first * *KindCost;
1163
1164 static const CostKindTblEntry AVX1CostTable[] = {
1165 // We don't have to scalarize unsupported ops. We can issue two half-sized
1166 // operations, extracting and re-inserting the upper YMM half.
1167 // Two ops + 1 extract + 1 insert = 4.
1168 { ISD::MUL, MVT::v32i8, { 12, 13, 22, 23 } }, // unpack/pmullw + split
1169 { ISD::MUL, MVT::v16i16, { 4, 8, 5, 6 } }, // pmullw + split
1170 { ISD::MUL, MVT::v8i32, { 5, 8, 5, 10 } }, // pmulld + split
1171 { ISD::MUL, MVT::v4i32, { 2, 5, 1, 3 } }, // pmulld
1172 { ISD::MUL, MVT::v4i64, { 12, 15, 19, 20 } },
1173
1174 { ISD::AND, MVT::v32i8, { 1, 1, 1, 2 } }, // vandps
1175 { ISD::AND, MVT::v16i16, { 1, 1, 1, 2 } }, // vandps
1176 { ISD::AND, MVT::v8i32, { 1, 1, 1, 2 } }, // vandps
1177 { ISD::AND, MVT::v4i64, { 1, 1, 1, 2 } }, // vandps
1178
1179 { ISD::OR, MVT::v32i8, { 1, 1, 1, 2 } }, // vorps
1180 { ISD::OR, MVT::v16i16, { 1, 1, 1, 2 } }, // vorps
1181 { ISD::OR, MVT::v8i32, { 1, 1, 1, 2 } }, // vorps
1182 { ISD::OR, MVT::v4i64, { 1, 1, 1, 2 } }, // vorps
1183
1184 { ISD::XOR, MVT::v32i8, { 1, 1, 1, 2 } }, // vxorps
1185 { ISD::XOR, MVT::v16i16, { 1, 1, 1, 2 } }, // vxorps
1186 { ISD::XOR, MVT::v8i32, { 1, 1, 1, 2 } }, // vxorps
1187 { ISD::XOR, MVT::v4i64, { 1, 1, 1, 2 } }, // vxorps
1188
1189 { ISD::SUB, MVT::v32i8, { 4, 2, 5, 6 } }, // psubb + split
1190 { ISD::ADD, MVT::v32i8, { 4, 2, 5, 6 } }, // paddb + split
1191 { ISD::SUB, MVT::v16i16, { 4, 2, 5, 6 } }, // psubw + split
1192 { ISD::ADD, MVT::v16i16, { 4, 2, 5, 6 } }, // paddw + split
1193 { ISD::SUB, MVT::v8i32, { 4, 2, 5, 6 } }, // psubd + split
1194 { ISD::ADD, MVT::v8i32, { 4, 2, 5, 6 } }, // paddd + split
1195 { ISD::SUB, MVT::v4i64, { 4, 2, 5, 6 } }, // psubq + split
1196 { ISD::ADD, MVT::v4i64, { 4, 2, 5, 6 } }, // paddq + split
1197 { ISD::SUB, MVT::v2i64, { 1, 1, 1, 1 } }, // psubq
1198 { ISD::ADD, MVT::v2i64, { 1, 1, 1, 1 } }, // paddq
1199
1200 { ISD::SHL, MVT::v16i8, { 10, 21,11,17 } }, // pblendvb sequence.
1201 { ISD::SHL, MVT::v32i8, { 22, 22,27,40 } }, // pblendvb sequence + split.
1202 { ISD::SHL, MVT::v8i16, { 6, 9,11,11 } }, // pblendvb sequence.
1203 { ISD::SHL, MVT::v16i16, { 13, 16,24,25 } }, // pblendvb sequence + split.
1204 { ISD::SHL, MVT::v4i32, { 3, 11, 4, 6 } }, // pslld/paddd/cvttps2dq/pmulld
1205 { ISD::SHL, MVT::v8i32, { 9, 11,12,17 } }, // pslld/paddd/cvttps2dq/pmulld + split
1206 { ISD::SHL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
1207 { ISD::SHL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.
1208
1209 { ISD::SRL, MVT::v16i8, { 11, 27,12,18 } }, // pblendvb sequence.
1210 { ISD::SRL, MVT::v32i8, { 23, 23,30,43 } }, // pblendvb sequence + split.
1211 { ISD::SRL, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
1212 { ISD::SRL, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
1213 { ISD::SRL, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
1214 { ISD::SRL, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
1215 { ISD::SRL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
1216 { ISD::SRL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.
1217
1218 { ISD::SRA, MVT::v16i8, { 21, 22,24,36 } }, // pblendvb sequence.
1219 { ISD::SRA, MVT::v32i8, { 44, 45,51,76 } }, // pblendvb sequence + split.
1220 { ISD::SRA, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
1221 { ISD::SRA, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
1222 { ISD::SRA, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
1223 { ISD::SRA, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
1224 { ISD::SRA, MVT::v2i64, { 5, 6,10,14 } }, // Shift each lane + blend.
1225 { ISD::SRA, MVT::v4i64, { 12, 12,22,30 } }, // Shift each lane + blend + split.
1226
1227 { ISD::FNEG, MVT::v4f64, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
1228 { ISD::FNEG, MVT::v8f32, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
1229
1230 { ISD::FADD, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1231 { ISD::FADD, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1232 { ISD::FADD, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1233 { ISD::FADD, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1234 { ISD::FADD, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1235 { ISD::FADD, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1236
1237 { ISD::FSUB, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1238 { ISD::FSUB, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1239 { ISD::FSUB, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1240 { ISD::FSUB, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1241 { ISD::FSUB, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1242 { ISD::FSUB, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1243
1244 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1245 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1246 { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1247 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1248 { ISD::FMUL, MVT::v4f64, { 4, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
1249 { ISD::FMUL, MVT::v8f32, { 2, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
1250
1251 { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
1252 { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
1253 { ISD::FDIV, MVT::v8f32, { 28, 29, 1, 3 } }, // SNB from http://www.agner.org/
1254 { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
1255 { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
1256 { ISD::FDIV, MVT::v4f64, { 44, 45, 1, 3 } }, // SNB from http://www.agner.org/
1257 };
1258
1259 if (ST->hasAVX())
1260 if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
1261 if (auto KindCost = Entry->Cost[CostKind])
1262 return LT.first * *KindCost;
1263
1264 static const CostKindTblEntry SSE42CostTable[] = {
1265 { ISD::FADD, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1266 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1267 { ISD::FADD, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1268 { ISD::FADD, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1269
1270 { ISD::FSUB, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1271 { ISD::FSUB, MVT::f32 , { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1272 { ISD::FSUB, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1273 { ISD::FSUB, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1274
1275 { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1276 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1277 { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1278 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1279
1280 { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1281 { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1282 { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1283 { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1284
1285 { ISD::MUL, MVT::v2i64, { 6, 10,10,10 } } // 3*pmuludq/3*shift/2*add
1286 };
1287
1288 if (ST->hasSSE42())
1289 if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
1290 if (auto KindCost = Entry->Cost[CostKind])
1291 return LT.first * *KindCost;
1292
1293 static const CostKindTblEntry SSE41CostTable[] = {
1294 { ISD::SHL, MVT::v16i8, { 15, 24,17,22 } }, // pblendvb sequence.
1295 { ISD::SHL, MVT::v8i16, { 11, 14,11,11 } }, // pblendvb sequence.
1296 { ISD::SHL, MVT::v4i32, { 14, 20, 4,10 } }, // pslld/paddd/cvttps2dq/pmulld
1297
1298 { ISD::SRL, MVT::v16i8, { 16, 27,18,24 } }, // pblendvb sequence.
1299 { ISD::SRL, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1300 { ISD::SRL, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1301 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1302
1303 { ISD::SRA, MVT::v16i8, { 38, 41,30,36 } }, // pblendvb sequence.
1304 { ISD::SRA, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1305 { ISD::SRA, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1306 { ISD::SRA, MVT::v2i64, { 8, 17, 5, 7 } }, // splat+shuffle sequence.
1307
1308 { ISD::MUL, MVT::v16i8, { 5, 18,10,12 } }, // 2*unpack/2*pmullw/2*and/pack
1309 { ISD::MUL, MVT::v4i32, { 2, 11, 1, 1 } } // pmulld (Nehalem from agner.org)
1310 };
1311
1312 if (ST->hasSSE41())
1313 if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
1314 if (auto KindCost = Entry->Cost[CostKind])
1315 return LT.first * *KindCost;
1316
1317 static const CostKindTblEntry SSE2CostTable[] = {
1318 // We don't correctly identify costs of casts because they are marked as
1319 // custom.
1320 { ISD::SHL, MVT::v16i8, { 13, 21,26,28 } }, // cmpgtb sequence.
1321 { ISD::SHL, MVT::v8i16, { 24, 27,16,20 } }, // cmpgtw sequence.
1322 { ISD::SHL, MVT::v4i32, { 17, 19,10,12 } }, // pslld/paddd/cvttps2dq/pmuludq.
1323 { ISD::SHL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1324
1325 { ISD::SRL, MVT::v16i8, { 14, 28,27,30 } }, // cmpgtb sequence.
1326 { ISD::SRL, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1327 { ISD::SRL, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1328 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1329
1330 { ISD::SRA, MVT::v16i8, { 27, 30,54,54 } }, // unpacked cmpgtb sequence.
1331 { ISD::SRA, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1332 { ISD::SRA, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1333 { ISD::SRA, MVT::v2i64, { 8, 11,12,16 } }, // srl/xor/sub splat+shuffle sequence.
1334
1335 { ISD::AND, MVT::v16i8, { 1, 1, 1, 1 } }, // pand
1336 { ISD::AND, MVT::v8i16, { 1, 1, 1, 1 } }, // pand
1337 { ISD::AND, MVT::v4i32, { 1, 1, 1, 1 } }, // pand
1338 { ISD::AND, MVT::v2i64, { 1, 1, 1, 1 } }, // pand
1339
1340 { ISD::OR, MVT::v16i8, { 1, 1, 1, 1 } }, // por
1341 { ISD::OR, MVT::v8i16, { 1, 1, 1, 1 } }, // por
1342 { ISD::OR, MVT::v4i32, { 1, 1, 1, 1 } }, // por
1343 { ISD::OR, MVT::v2i64, { 1, 1, 1, 1 } }, // por
1344
1345 { ISD::XOR, MVT::v16i8, { 1, 1, 1, 1 } }, // pxor
1346 { ISD::XOR, MVT::v8i16, { 1, 1, 1, 1 } }, // pxor
1347 { ISD::XOR, MVT::v4i32, { 1, 1, 1, 1 } }, // pxor
1348 { ISD::XOR, MVT::v2i64, { 1, 1, 1, 1 } }, // pxor
1349
1350 { ISD::ADD, MVT::v2i64, { 1, 2, 1, 2 } }, // paddq
1351 { ISD::SUB, MVT::v2i64, { 1, 2, 1, 2 } }, // psubq
1352
1353 { ISD::MUL, MVT::v16i8, { 5, 18,12,12 } }, // 2*unpack/2*pmullw/2*and/pack
1354 { ISD::MUL, MVT::v8i16, { 1, 5, 1, 1 } }, // pmullw
1355 { ISD::MUL, MVT::v4i32, { 6, 8, 7, 7 } }, // 3*pmuludq/4*shuffle
1356 { ISD::MUL, MVT::v2i64, { 7, 10,10,10 } }, // 3*pmuludq/3*shift/2*add
1357
1358 { X86ISD::PMULUDQ, MVT::v2i64, { 1, 5, 1, 1 } },
1359
1360 { ISD::FDIV, MVT::f32, { 23, 23, 1, 1 } }, // Pentium IV from http://www.agner.org/
1361 { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 1 } }, // Pentium IV from http://www.agner.org/
1362 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // Pentium IV from http://www.agner.org/
1363 { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 1 } }, // Pentium IV from http://www.agner.org/
1364
1365 { ISD::FNEG, MVT::f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1366 { ISD::FNEG, MVT::f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1367 { ISD::FNEG, MVT::v4f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1368 { ISD::FNEG, MVT::v2f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1369
1370 { ISD::FADD, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1371 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1372 { ISD::FADD, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1373
1374 { ISD::FSUB, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1375 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1376 { ISD::FSUB, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1377
1378 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1379 { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1380 };
1381
1382 if (ST->hasSSE2())
1383 if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
1384 if (auto KindCost = Entry->Cost[CostKind])
1385 return LT.first * *KindCost;
1386
1387 static const CostKindTblEntry SSE1CostTable[] = {
1388 { ISD::FDIV, MVT::f32, { 17, 18, 1, 1 } }, // Pentium III from http://www.agner.org/
1389 { ISD::FDIV, MVT::v4f32, { 34, 48, 1, 1 } }, // Pentium III from http://www.agner.org/
1390
1391 { ISD::FNEG, MVT::f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1392 { ISD::FNEG, MVT::v4f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1393
1394 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1395 { ISD::FADD, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1396
1397 { ISD::FSUB, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1398 { ISD::FSUB, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1399
1400 { ISD::FMUL, MVT::f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1401 { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1402 };
1403
1404 if (ST->hasSSE1())
1405 if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
1406 if (auto KindCost = Entry->Cost[CostKind])
1407 return LT.first * *KindCost;
1408
1409 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
1410 { ISD::ADD, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1411 { ISD::SUB, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1412 { ISD::MUL, MVT::i64, { 2, 6, 1, 2 } },
1413 };
1414
1415 if (ST->is64Bit())
1416 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
1417 if (auto KindCost = Entry->Cost[CostKind])
1418 return LT.first * *KindCost;
1419
1420 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
1421 { ISD::ADD, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1422 { ISD::ADD, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1423 { ISD::ADD, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1424
1425 { ISD::SUB, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1426 { ISD::SUB, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1427 { ISD::SUB, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1428
1429 { ISD::MUL, MVT::i8, { 3, 4, 1, 1 } },
1430 { ISD::MUL, MVT::i16, { 2, 4, 1, 1 } },
1431 { ISD::MUL, MVT::i32, { 1, 4, 1, 1 } },
1432
1433 { ISD::FNEG, MVT::f64, { 2, 2, 1, 3 } }, // (x87)
1434 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1435 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1436 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // (x87)
1437 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // (x87)
1438 };
1439
1440 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
1441 if (auto KindCost = Entry->Cost[CostKind])
1442 return LT.first * *KindCost;
1443
1444 // It is not a good idea to vectorize division. We have to scalarize it and
1445 // in the process we will often end up having to spill regular
1446 // registers. The overhead of division is going to dominate most kernels
1447 // anyway, so try hard to prevent vectorization of division - it is
1448 // generally a bad idea. Assume somewhat arbitrarily that we have to be able
1449 // to hide "20 cycles" for each lane.
1450 if (CostKind == TTI::TCK_RecipThroughput && LT.second.isVector() &&
1451 (ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
1452 ISD == ISD::UREM)) {
1453 InstructionCost ScalarCost =
1454 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
1455 Op1Info.getNoProps(), Op2Info.getNoProps());
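 // e.g. a <4 x i32> sdiv at TCK_RecipThroughput with LT.first == 1 comes out
 // below as 20 * 1 * 4 * ScalarCost - deliberately large so the vectorizers
 // treat scalarized division as unprofitable.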
1456 return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
1457 }
1458
1459 // Handle some basic single instruction code size cases.
1460 if (CostKind == TTI::TCK_CodeSize) {
1461 switch (ISD) {
1462 case ISD::FADD:
1463 case ISD::FSUB:
1464 case ISD::FMUL:
1465 case ISD::FDIV:
1466 case ISD::FNEG:
1467 case ISD::AND:
1468 case ISD::OR:
1469 case ISD::XOR:
1470 return LT.first;
1471 break;
1472 }
1473 }
1474
1475 // Fall back to the default implementation.
1476 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1477 Args, CxtI);
1478}
1479
1480InstructionCost
1481X86TTIImpl::getAltInstrCost(VectorType *VecTy, unsigned Opcode0,
1482 unsigned Opcode1, const SmallBitVector &OpcodeMask,
1483 TTI::TargetCostKind CostKind) const {
1484 if (isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask))
1485 return TTI::TCC_Basic;
1486 return InstructionCost::getInvalid();
1487}
1488
1489InstructionCost X86TTIImpl::getShuffleCost(
1490 TTI::ShuffleKind Kind, VectorType *BaseTp, ArrayRef<int> Mask,
1491 TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
1492 ArrayRef<const Value *> Args, const Instruction *CxtI) {
1493 // 64-bit packed float vectors (v2f32) are widened to type v4f32.
1494 // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
1495 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(BaseTp);
1496
1497 Kind = improveShuffleKindFromMask(Kind, Mask, BaseTp, Index, SubTp);
1498
1499 // Recognize a basic concat_vector shuffle.
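 // e.g. concatenating two v4f32 values into a v8f32 is costed as inserting a
 // v4f32 subvector into the upper half (index Mask.size() / 2) of the
 // double-width result type.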
1500 if (Kind == TTI::SK_PermuteTwoSrc &&
1501 Mask.size() == (2 * BaseTp->getElementCount().getKnownMinValue()) &&
1502 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
1503 return getShuffleCost(TTI::SK_InsertSubvector,
1504 VectorType::getDoubleElementsVectorType(BaseTp), Mask,
1505 CostKind, Mask.size() / 2, BaseTp);
1506
1507 // Treat Transpose as 2-op shuffles - there's no difference in lowering.
1508 if (Kind == TTI::SK_Transpose)
1509 Kind = TTI::SK_PermuteTwoSrc;
1510
1511 if (Kind == TTI::SK_Broadcast) {
1512 // For broadcasts we are splatting the first element from the first input
1513 // register, so we only need to reference that input; all the output
1514 // registers are the same.
1515 LT.first = 1;
1516
1517 // If we're broadcasting a load then AVX/AVX2 can do this for free.
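 // e.g. a splat of a one-use 32-bit float load becomes a single vbroadcastss
 // with a memory operand, so no separate shuffle cost is charged.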
1518 using namespace PatternMatch;
1519 if (!Args.empty() && match(Args[0], m_OneUse(m_Load(m_Value()))) &&
1520 (ST->hasAVX2() ||
1521 (ST->hasAVX() && LT.second.getScalarSizeInBits() >= 32)))
1522 return TTI::TCC_Free;
1523 }
1524
1525 // Treat <X x bfloat> shuffles as <X x half>.
1526 if (LT.second.isVector() && LT.second.getScalarType() == MVT::bf16)
1527 LT.second = LT.second.changeVectorElementType(MVT::f16);
1528
1529 // Subvector extractions are free if they start at the beginning of a
1530 // vector and cheap if the subvectors are aligned.
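 // e.g. extracting the low v2f64 of a legal v4f64 is free (effectively a
 // subregister read), while extracting the high v2f64 costs SubLT.first.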
1531 if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
1532 int NumElts = LT.second.getVectorNumElements();
1533 if ((Index % NumElts) == 0)
1534 return 0;
1535 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1536 if (SubLT.second.isVector()) {
1537 int NumSubElts = SubLT.second.getVectorNumElements();
1538 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1539 return SubLT.first;
1540 // Handle some cases for widening legalization. For now we only handle
1541 // cases where the original subvector was naturally aligned and evenly
1542 // fit in its legalized subvector type.
1543 // FIXME: Remove some of the alignment restrictions.
1544 // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
1545 // vectors.
1546 int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
1547 if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
1548 (NumSubElts % OrigSubElts) == 0 &&
1549 LT.second.getVectorElementType() ==
1550 SubLT.second.getVectorElementType() &&
1551 LT.second.getVectorElementType().getSizeInBits() ==
1552 SubLT.second.getVectorElementType().getSizeInBits()) {
1553 assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
1554 "Unexpected number of elements!");
1555 auto *VecTy = FixedVectorType::get(BaseTp->getElementType(),
1556 LT.second.getVectorNumElements());
1557 auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
1558 SubLT.second.getVectorNumElements());
1559 int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
1560 InstructionCost ExtractCost =
1561 getShuffleCost(TTI::SK_ExtractSubvector, VecTy, std::nullopt,
1562 CostKind, ExtractIndex, SubTy);
1563
1564 // If the original size is 32 bits or more, we can use pshufd. Otherwise,
1565 // if we have SSSE3, we can use pshufb.
1566 if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
1567 return ExtractCost + 1; // pshufd or pshufb
1568
1569 assert(SubTp->getPrimitiveSizeInBits() == 16 &&
1570 "Unexpected vector size");
1571
1572 return ExtractCost + 2; // worst case pshufhw + pshufd
1573 }
1574 }
1575 // If the extract subvector is not optimal, treat it as single op shuffle.
1576 Kind = TTI::SK_PermuteSingleSrc;
1577 }
1578
1579 // Subvector insertions are cheap if the subvectors are aligned.
1580 // Note that in general, the insertion starting at the beginning of a vector
1581 // isn't free, because we need to preserve the rest of the wide vector.
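 // e.g. inserting a legal v4f32 at element 4 or 8 of a v16f32 (when v16f32 is
 // legal) costs only the subvector legalization (SubLT.first); an insertion at
 // element 2 is unaligned and is treated as a full two-source permute below.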
1582 if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
1583 int NumElts = LT.second.getVectorNumElements();
1584 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1585 if (SubLT.second.isVector()) {
1586 int NumSubElts = SubLT.second.getVectorNumElements();
1587 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1588 return SubLT.first;
1589 }
1590
1591 // If the insertion isn't aligned, treat it like a 2-op shuffle.
1592 Kind = TTI::SK_PermuteTwoSrc;
1593 }
1594
1595 // Handle some common (illegal) sub-vector types as they are often very cheap
1596 // to shuffle even on targets without PSHUFB.
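 // e.g. reversing a v4i16 is a single pshuflw even without SSSE3, per the
 // SSE2 table below.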
1597 EVT VT = TLI->getValueType(DL, BaseTp);
1598 if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
1599 !ST->hasSSSE3()) {
1600 static const CostTblEntry SSE2SubVectorShuffleTbl[] = {
1601 {TTI::SK_Broadcast, MVT::v4i16, 1}, // pshuflw
1602 {TTI::SK_Broadcast, MVT::v2i16, 1}, // pshuflw
1603 {TTI::SK_Broadcast, MVT::v8i8, 2}, // punpck/pshuflw
1604 {TTI::SK_Broadcast, MVT::v4i8, 2}, // punpck/pshuflw
1605 {TTI::SK_Broadcast, MVT::v2i8, 1}, // punpck
1606
1607 {TTI::SK_Reverse, MVT::v4i16, 1}, // pshuflw
1608 {TTI::SK_Reverse, MVT::v2i16, 1}, // pshuflw
1609 {TTI::SK_Reverse, MVT::v4i8, 3}, // punpck/pshuflw/packus
1610 {TTI::SK_Reverse, MVT::v2i8, 1}, // punpck
1611
1612 {TTI::SK_Splice, MVT::v4i16, 2}, // punpck+psrldq
1613 {TTI::SK_Splice, MVT::v2i16, 2}, // punpck+psrldq
1614 {TTI::SK_Splice, MVT::v4i8, 2}, // punpck+psrldq
1615 {TTI::SK_Splice, MVT::v2i8, 2}, // punpck+psrldq
1616
1617 {TTI::SK_PermuteTwoSrc, MVT::v4i16, 2}, // punpck/pshuflw
1618 {TTI::SK_PermuteTwoSrc, MVT::v2i16, 2}, // punpck/pshuflw
1619 {TTI::SK_PermuteTwoSrc, MVT::v8i8, 7}, // punpck/pshuflw
1620 {TTI::SK_PermuteTwoSrc, MVT::v4i8, 4}, // punpck/pshuflw
1621 {TTI::SK_PermuteTwoSrc, MVT::v2i8, 2}, // punpck
1622
1623 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw
1624 {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw
1625 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 5}, // punpck/pshuflw
1626 {TTI::SK_PermuteSingleSrc, MVT::v4i8, 3}, // punpck/pshuflw
1627 {TTI::SK_PermuteSingleSrc, MVT::v2i8, 1}, // punpck
1628 };
1629
1630 if (ST->hasSSE2())
1631 if (const auto *Entry =
1632 CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
1633 return Entry->Cost;
1634 }
1635
1636 // We are going to permute multiple sources and the result will be in multiple
1637 // destinations. We provide an accurate cost only for splits where the element
1638 // type remains the same.
1639 if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
1640 MVT LegalVT = LT.second;
1641 if (LegalVT.isVector() &&
1642 LegalVT.getVectorElementType().getSizeInBits() ==
1643 BaseTp->getElementType()->getPrimitiveSizeInBits() &&
1644 LegalVT.getVectorNumElements() <
1645 cast<FixedVectorType>(BaseTp)->getNumElements()) {
1646 unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
1647 unsigned LegalVTSize = LegalVT.getStoreSize();
1648 // Number of source vectors after legalization:
1649 unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
1650 // Number of destination vectors after legalization:
1651 InstructionCost NumOfDests = LT.first;
1652
1653 auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
1654 LegalVT.getVectorNumElements());
1655
1656 if (!Mask.empty() && NumOfDests.isValid()) {
1657 // Try to produce a better estimate of the permutation cost.
1658 // 1. Split the source/destination vectors into real registers.
1659 // 2. Do the mask analysis to identify which real registers are
1660 // permuted. If more than one source register is used to build a
1661 // destination register, the cost for this destination register
1662 // is (Number_of_source_registers - 1) * Cost_PermuteTwoSrc. If only one
1663 // source register is used, build the mask and calculate the cost as
1664 // the cost of PermuteSingleSrc.
1665 // Also, for the single register permute we try to identify if the
1666 // destination register is just a copy of the source register or a
1667 // copy of the previous destination register (the cost is
1668 // TTI::TCC_Basic). If the source register is just reused, the cost for
1669 // this operation is 0.
1670 NumOfDests =
1671 getTypeLegalizationCost(
1672 FixedVectorType::get(BaseTp->getElementType(), Mask.size()))
1673 .first;
1674 unsigned E = *NumOfDests.getValue();
1675 unsigned NormalizedVF =
1676 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
1677 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
1678 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
1679 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
1680 copy(Mask, NormalizedMask.begin());
1681 unsigned PrevSrcReg = 0;
1682 ArrayRef<int> PrevRegMask;
1683 InstructionCost Cost = 0;
1684 processShuffleMasks(
1685 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
1686 [this, SingleOpTy, CostKind, &PrevSrcReg, &PrevRegMask,
1687 &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
1688 if (!ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size())) {
1689 // Check if the previous register can be just copied to the next
1690 // one.
1691 if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
1692 PrevRegMask != RegMask)
1693 Cost += getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
1694 RegMask, CostKind, 0, nullptr);
1695 else
1696 // Just a copy of previous destination register.
1697 Cost += TTI::TCC_Basic;
1698 return;
1699 }
1700 if (SrcReg != DestReg &&
1701 any_of(RegMask, [](int I) { return I != PoisonMaskElem; })) {
1702 // Just a copy of the source register.
1703 Cost += TTI::TCC_Basic;
1704 }
1705 PrevSrcReg = SrcReg;
1706 PrevRegMask = RegMask;
1707 },
1708 [this, SingleOpTy, CostKind, &Cost](ArrayRef<int> RegMask,
1709 unsigned /*Unused*/,
1710 unsigned /*Unused*/) {
1711 Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask,
1712 CostKind, 0, nullptr);
1713 });
1714 return Cost;
1715 }
1716
1717 InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
1718 return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
1719 std::nullopt, CostKind, 0, nullptr);
1720 }
1721
1722 return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
1723 }
1724
1725 // For 2-input shuffles, we must account for splitting the 2 inputs into many.
1726 if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
1727 // We assume that source and destination have the same vector type.
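 // e.g. a shuffle split into LT.first == 2 registers is charged below as
 // 2 destinations * (2 * 2 - 1) = 6 legal-width shuffles, since each
 // destination may draw from both halves of both inputs.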
1728 InstructionCost NumOfDests = LT.first;
1729 InstructionCost NumOfShufflesPerDest = LT.first * 2 - 1;
1730 LT.first = NumOfDests * NumOfShufflesPerDest;
1731 }
1732
1733 static const CostTblEntry AVX512VBMIShuffleTbl[] = {
1734 {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
1735 {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb
1736
1737 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
1738 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb
1739
1740 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b
1741 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b
1742 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 2} // vpermt2b
1743 };
1744
1745 if (ST->hasVBMI())
1746 if (const auto *Entry =
1747 CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
1748 return LT.first * Entry->Cost;
1749
1750 static const CostTblEntry AVX512BWShuffleTbl[] = {
1751 {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
1752 {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw
1753 {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
1754
1755 {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw
1756 {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw
1757 {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw
1758 {TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2
1759
1760 {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw
1761 {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw
1762 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw
1763 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw
1764 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16
1765
1766 {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w
1767 {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w
1768 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w
1769 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w
1770 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
1771
1772 {TTI::SK_Select, MVT::v32i16, 1}, // vblendmw
1773 {TTI::SK_Select, MVT::v64i8, 1}, // vblendmb
1774
1775 {TTI::SK_Splice, MVT::v32i16, 2}, // vshufi64x2 + palignr
1776 {TTI::SK_Splice, MVT::v32f16, 2}, // vshufi64x2 + palignr
1777 {TTI::SK_Splice, MVT::v64i8, 2}, // vshufi64x2 + palignr
1778 };
1779
1780 if (ST->hasBWI())
1781 if (const auto *Entry =
1782 CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
1783 return LT.first * Entry->Cost;
1784
1785 static const CostKindTblEntry AVX512ShuffleTbl[] = {
1786 {TTI::SK_Broadcast, MVT::v8f64, { 1, 1, 1, 1 } }, // vbroadcastsd
1787 {TTI::SK_Broadcast, MVT::v16f32, { 1, 1, 1, 1 } }, // vbroadcastss
1788 {TTI::SK_Broadcast, MVT::v8i64, { 1, 1, 1, 1 } }, // vpbroadcastq
1789 {TTI::SK_Broadcast, MVT::v16i32, { 1, 1, 1, 1 } }, // vpbroadcastd
1790 {TTI::SK_Broadcast, MVT::v32i16, { 1, 1, 1, 1 } }, // vpbroadcastw
1791 {TTI::SK_Broadcast, MVT::v32f16, { 1, 1, 1, 1 } }, // vpbroadcastw
1792 {TTI::SK_Broadcast, MVT::v64i8, { 1, 1, 1, 1 } }, // vpbroadcastb
1793
1794 {TTI::SK_Reverse, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
1795 {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1796 {TTI::SK_Reverse, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
1797 {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1798 {TTI::SK_Reverse, MVT::v32i16, { 7, 7, 7, 7 } }, // per mca
1799 {TTI::SK_Reverse, MVT::v32f16, { 7, 7, 7, 7 } }, // per mca
1800 {TTI::SK_Reverse, MVT::v64i8, { 7, 7, 7, 7 } }, // per mca
1801
1802 {TTI::SK_Splice, MVT::v8f64, { 1, 1, 1, 1 } }, // vpalignd
1803 {TTI::SK_Splice, MVT::v4f64, { 1, 1, 1, 1 } }, // vpalignd
1804 {TTI::SK_Splice, MVT::v16f32, { 1, 1, 1, 1 } }, // vpalignd
1805 {TTI::SK_Splice, MVT::v8f32, { 1, 1, 1, 1 } }, // vpalignd
1806 {TTI::SK_Splice, MVT::v8i64, { 1, 1, 1, 1 } }, // vpalignd
1807 {TTI::SK_Splice, MVT::v4i64, { 1, 1, 1, 1 } }, // vpalignd
1808 {TTI::SK_Splice, MVT::v16i32, { 1, 1, 1, 1 } }, // vpalignd
1809 {TTI::SK_Splice, MVT::v8i32, { 1, 1, 1, 1 } }, // vpalignd
1810 {TTI::SK_Splice, MVT::v32i16, { 4, 4, 4, 4 } }, // split + palignr
1811 {TTI::SK_Splice, MVT::v32f16, { 4, 4, 4, 4 } }, // split + palignr
1812 {TTI::SK_Splice, MVT::v64i8, { 4, 4, 4, 4 } }, // split + palignr
1813
1814 {TTI::SK_PermuteSingleSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
1815 {TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermpd
1816 {TTI::SK_PermuteSingleSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermpd
1817 {TTI::SK_PermuteSingleSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1818 {TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermps
1819 {TTI::SK_PermuteSingleSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermps
1820 {TTI::SK_PermuteSingleSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
1821 {TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermq
1822 {TTI::SK_PermuteSingleSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermq
1823 {TTI::SK_PermuteSingleSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1824 {TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermd
1825 {TTI::SK_PermuteSingleSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermd
1826 {TTI::SK_PermuteSingleSrc, MVT::v16i8, { 1, 3, 1, 1 } }, // pshufb
1827
1828 {TTI::SK_PermuteTwoSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermt2pd
1829 {TTI::SK_PermuteTwoSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermt2ps
1830 {TTI::SK_PermuteTwoSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermt2q
1831 {TTI::SK_PermuteTwoSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermt2d
1832 {TTI::SK_PermuteTwoSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermt2pd
1833 {TTI::SK_PermuteTwoSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermt2ps
1834 {TTI::SK_PermuteTwoSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermt2q
1835 {TTI::SK_PermuteTwoSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermt2d
1836 {TTI::SK_PermuteTwoSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermt2pd
1837 {TTI::SK_PermuteTwoSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermt2ps
1838 {TTI::SK_PermuteTwoSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermt2q
1839 {TTI::SK_PermuteTwoSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermt2d
1840
1841 // FIXME: This just applies the type legalization cost rules above
1842 // assuming these completely split.
1843 {TTI::SK_PermuteSingleSrc, MVT::v32i16, { 14, 14, 14, 14 } },
1844 {TTI::SK_PermuteSingleSrc, MVT::v32f16, { 14, 14, 14, 14 } },
1845 {TTI::SK_PermuteSingleSrc, MVT::v64i8, { 14, 14, 14, 14 } },
1846 {TTI::SK_PermuteTwoSrc, MVT::v32i16, { 42, 42, 42, 42 } },
1847 {TTI::SK_PermuteTwoSrc, MVT::v32f16, { 42, 42, 42, 42 } },
1848 {TTI::SK_PermuteTwoSrc, MVT::v64i8, { 42, 42, 42, 42 } },
1849
1850 {TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vpternlogq
1851 {TTI::SK_Select, MVT::v32f16, { 1, 1, 1, 1 } }, // vpternlogq
1852 {TTI::SK_Select, MVT::v64i8, { 1, 1, 1, 1 } }, // vpternlogq
1853 {TTI::SK_Select, MVT::v8f64, { 1, 1, 1, 1 } }, // vblendmpd
1854 {TTI::SK_Select, MVT::v16f32, { 1, 1, 1, 1 } }, // vblendmps
1855 {TTI::SK_Select, MVT::v8i64, { 1, 1, 1, 1 } }, // vblendmq
1856 {TTI::SK_Select, MVT::v16i32, { 1, 1, 1, 1 } }, // vblendmd
1857 };
1858
1859 if (ST->hasAVX512())
1860 if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
1861 if (auto KindCost = Entry->Cost[CostKind])
1862 return LT.first * *KindCost;
1863
1864 static const CostTblEntry AVX2ShuffleTbl[] = {
1865 {TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastpd
1866 {TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastps
1867 {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq
1868 {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd
1869 {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
1870 {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw
1871 {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb
1872
1873 {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd
1874 {TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps
1875 {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq
1876 {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd
1877 {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
1878 {TTI::SK_Reverse, MVT::v16f16, 2}, // vperm2i128 + pshufb
1879 {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb
1880
1881 {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
1882 {TTI::SK_Select, MVT::v16f16, 1}, // vpblendvb
1883 {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb
1884
1885 {TTI::SK_Splice, MVT::v8i32, 2}, // vperm2i128 + vpalignr
1886 {TTI::SK_Splice, MVT::v8f32, 2}, // vperm2i128 + vpalignr
1887 {TTI::SK_Splice, MVT::v16i16, 2}, // vperm2i128 + vpalignr
1888 {TTI::SK_Splice, MVT::v16f16, 2}, // vperm2i128 + vpalignr
1889 {TTI::SK_Splice, MVT::v32i8, 2}, // vperm2i128 + vpalignr
1890
1891 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
1892 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
1893 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
1894 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
1895 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
1896 // + vpblendvb
1897 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 4}, // vperm2i128 + 2*vpshufb
1898 // + vpblendvb
1899 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb
1900 // + vpblendvb
1901
1902 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd
1903 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3}, // 2*vpermps + vblendps
1904 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd
1905 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd
1906 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
1907 // + vpblendvb
1908 {TTI::SK_PermuteTwoSrc, MVT::v16f16, 7}, // 2*vperm2i128 + 4*vpshufb
1909 // + vpblendvb
1910 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb
1911 // + vpblendvb
1912 };
1913
1914 if (ST->hasAVX2())
1915 if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
1916 return LT.first * Entry->Cost;
1917
1918 static const CostTblEntry XOPShuffleTbl[] = {
1919 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd
1920 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps
1921 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd
1922 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps
1923 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
1924 // + vinsertf128
1925 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm
1926 // + vinsertf128
1927
1928 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
1929 // + vinsertf128
1930 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm
1931 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm
1932 // + vinsertf128
1933 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm
1934 };
1935
1936 if (ST->hasXOP())
1937 if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
1938 return LT.first * Entry->Cost;
1939
1940 static const CostTblEntry AVX1ShuffleTbl[] = {
1941 {TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1942 {TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1943 {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1944 {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1945 {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
1946 {TTI::SK_Broadcast, MVT::v16f16, 3}, // vpshuflw + vpshufd + vinsertf128
1947 {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128
1948
1949 {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1950 {TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1951 {TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1952 {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1953 {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
1954 // + vinsertf128
1955 {TTI::SK_Reverse, MVT::v16f16, 4}, // vextractf128 + 2*pshufb
1956 // + vinsertf128
1957 {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb
1958 // + vinsertf128
1959
1960 {TTI::SK_Select, MVT::v4i64, 1}, // vblendpd
1961 {TTI::SK_Select, MVT::v4f64, 1}, // vblendpd
1962 {TTI::SK_Select, MVT::v8i32, 1}, // vblendps
1963 {TTI::SK_Select, MVT::v8f32, 1}, // vblendps
1964 {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
1965 {TTI::SK_Select, MVT::v16f16, 3}, // vpand + vpandn + vpor
1966 {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor
1967
1968 {TTI::SK_Splice, MVT::v4i64, 2}, // vperm2f128 + shufpd
1969 {TTI::SK_Splice, MVT::v4f64, 2}, // vperm2f128 + shufpd
1970 {TTI::SK_Splice, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1971 {TTI::SK_Splice, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1972 {TTI::SK_Splice, MVT::v16i16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
1973 {TTI::SK_Splice, MVT::v16f16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
1974 {TTI::SK_Splice, MVT::v32i8, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
1975
1976 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd
1977 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd
1978 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1979 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1980 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
1981 // + 2*por + vinsertf128
1982 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 8}, // vextractf128 + 4*pshufb
1983 // + 2*por + vinsertf128
1984 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb
1985 // + 2*por + vinsertf128
1986
1987 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd
1988 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd
1989 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1990 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1991 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
1992 // + 4*por + vinsertf128
1993 {TTI::SK_PermuteTwoSrc, MVT::v16f16, 15}, // 2*vextractf128 + 8*pshufb
1994 // + 4*por + vinsertf128
1995 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb
1996 // + 4*por + vinsertf128
1997 };
1998
1999 if (ST->hasAVX())
2000 if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
2001 return LT.first * Entry->Cost;
2002
2003 static const CostTblEntry SSE41ShuffleTbl[] = {
2004 {TTI::SK_Select, MVT::v2i64, 1}, // pblendw
2005 {TTI::SK_Select, MVT::v2f64, 1}, // movsd
2006 {TTI::SK_Select, MVT::v4i32, 1}, // pblendw
2007 {TTI::SK_Select, MVT::v4f32, 1}, // blendps
2008 {TTI::SK_Select, MVT::v8i16, 1}, // pblendw
2009 {TTI::SK_Select, MVT::v8f16, 1}, // pblendw
2010 {TTI::SK_Select, MVT::v16i8, 1} // pblendvb
2011 };
2012
2013 if (ST->hasSSE41())
2014 if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
2015 return LT.first * Entry->Cost;
2016
2017 static const CostTblEntry SSSE3ShuffleTbl[] = {
2018 {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
2019 {TTI::SK_Broadcast, MVT::v8f16, 1}, // pshufb
2020 {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb
2021
2022 {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
2023 {TTI::SK_Reverse, MVT::v8f16, 1}, // pshufb
2024 {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb
2025
2026 {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
2027 {TTI::SK_Select, MVT::v8f16, 3}, // 2*pshufb + por
2028 {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por
2029
2030 {TTI::SK_Splice, MVT::v4i32, 1}, // palignr
2031 {TTI::SK_Splice, MVT::v4f32, 1}, // palignr
2032 {TTI::SK_Splice, MVT::v8i16, 1}, // palignr
2033 {TTI::SK_Splice, MVT::v8f16, 1}, // palignr
2034 {TTI::SK_Splice, MVT::v16i8, 1}, // palignr
2035
2036 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
2037 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // pshufb
2038 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
2039
2040 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
2041 {TTI::SK_PermuteTwoSrc, MVT::v8f16, 3}, // 2*pshufb + por
2042 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
2043 };
2044
2045 if (ST->hasSSSE3())
2046 if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
2047 return LT.first * Entry->Cost;
2048
2049 static const CostTblEntry SSE2ShuffleTbl[] = {
2050 {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd
2051 {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
2052 {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
2053 {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
2054 {TTI::SK_Broadcast, MVT::v8f16, 2}, // pshuflw + pshufd
2055 {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd
2056
2057 {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
2058 {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
2059 {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
2060 {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
2061 {TTI::SK_Reverse, MVT::v8f16, 3}, // pshuflw + pshufhw + pshufd
2062 {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
2063 // + 2*pshufd + 2*unpck + packus
2064
2065 {TTI::SK_Select, MVT::v2i64, 1}, // movsd
2066 {TTI::SK_Select, MVT::v2f64, 1}, // movsd
2067 {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
2068 {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
2069 {TTI::SK_Select, MVT::v8f16, 3}, // pand + pandn + por
2070 {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por
2071
2072 {TTI::SK_Splice, MVT::v2i64, 1}, // shufpd
2073 {TTI::SK_Splice, MVT::v2f64, 1}, // shufpd
2074 {TTI::SK_Splice, MVT::v4i32, 2}, // 2*{unpck,movsd,pshufd}
2075 {TTI::SK_Splice, MVT::v8i16, 3}, // psrldq + psrlldq + por
2076 {TTI::SK_Splice, MVT::v8f16, 3}, // psrldq + psrlldq + por
2077 {TTI::SK_Splice, MVT::v16i8, 3}, // psrldq + psrlldq + por
2078
2079 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
2080 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd
2081 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
2082 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
2083 // + pshufd/unpck
2084 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 5}, // 2*pshuflw + 2*pshufhw
2085 // + pshufd/unpck
2086 { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
2087 // + 2*pshufd + 2*unpck + 2*packus
2088
2089 { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd
2090 { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd
2091 { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd}
2092 { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute
2093 { TTI::SK_PermuteTwoSrc, MVT::v8f16, 8 }, // blend+permute
2094 { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
2095 };
2096
2097 static const CostTblEntry SSE3BroadcastLoadTbl[] = {
2098 {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup
2099 };
2100
2101 if (ST->hasSSE2()) {
2102 bool IsLoad =
2103 llvm::any_of(Args, [](const auto &V) { return isa<LoadInst>(V); });
2104 if (ST->hasSSE3() && IsLoad)
2105 if (const auto *Entry =
2106 CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) {
2107 assert(isLegalBroadcastLoad(BaseTp->getElementType(),
2108 LT.second.getVectorElementCount()) &&
2109 "Table entry missing from isLegalBroadcastLoad()");
2110 return LT.first * Entry->Cost;
2111 }
2112
2113 if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
2114 return LT.first * Entry->Cost;
2115 }
2116
2117 static const CostTblEntry SSE1ShuffleTbl[] = {
2118 { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
2119 { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
2120 { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps
2121 { TTI::SK_Splice, MVT::v4f32, 2 }, // 2*shufps
2122 { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
2123 { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
2124 };
2125
2126 if (ST->hasSSE1())
2127 if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
2128 return LT.first * Entry->Cost;
2129
2130 return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
2131}
2132
2133InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
2134 Type *Src,
2135 TTI::CastContextHint CCH,
2136 TTI::TargetCostKind CostKind,
2137 const Instruction *I) {
2138 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2139 assert(ISD && "Invalid opcode");
2140
2141 // TODO: Allow non-throughput costs that aren't binary.
2142 auto AdjustCost = [&CostKind](InstructionCost Cost,
2143 InstructionCost N = 1) -> InstructionCost {
2144 if (CostKind != TTI::TCK_RecipThroughput)
2145 return Cost == 0 ? 0 : N;
2146 return Cost * N;
2147 };
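 // e.g. AdjustCost(3) stays 3 for TCK_RecipThroughput but collapses to N
 // (default 1) for the other cost kinds, while a free (0-cost) conversion
 // stays free for every cost kind.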
2148
2149 // The cost tables include both specific, custom (non-legal) src/dst type
2150 // conversions and generic, legalized types. We test for custom conversions
2151 // first, before falling back to legalization.
2152 // FIXME: Need a better design of the cost table to handle non-simple types
2153 // and the potentially massive number of combinations (elem_num x src_type x dst_type).
2154 static const TypeConversionCostTblEntry AVX512BWConversionTbl[] {
2155 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
2156 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
2157
2158 // Mask sign extend has an instruction.
2159 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 },
2160 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, 1 },
2161 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 },
2162 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, 1 },
2163 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 },
2164 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, 1 },
2165 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 },
2166 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, 1 },
2167 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 },
2168 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, 1 },
2169 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 },
2170 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 },
2171 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
2172 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 },
2173 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, 1 },
2174 { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, 1 },
2175 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, 1 },
2176
2177 // Mask zero extend is a sext + shift.
2178 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 },
2179 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, 2 },
2180 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 },
2181 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, 2 },
2182 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 },
2183 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, 2 },
2184 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 },
2185 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, 2 },
2186 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 },
2187 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, 2 },
2188 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 },
2189 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 },
2190 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
2191 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 },
2192 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, 2 },
2193 { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, 2 },
2194 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, 2 },
2195
2196 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 },
2197 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, 2 },
2198 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 },
2199 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, 2 },
2200 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 },
2201 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, 2 },
2202 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 },
2203 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, 2 },
2204 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 },
2205 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 },
2206 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 },
2207 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 },
2208 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 },
2209 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 },
2210 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, 2 },
2211 { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, 2 },
2212 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, 2 },
2213
2214 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 2 },
2215 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // widen to zmm
2216 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // vpmovwb
2217 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // vpmovwb
2218 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // vpmovwb
2219 };
2220
2221 static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
2222 // Mask sign extend has an instruction.
2223 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 },
2224 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, 1 },
2225 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 },
2226 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 },
2227 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 },
2228 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i1, 1 },
2229 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 },
2230 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 },
2231
2232 // Mask zero extend is a sext + shift.
2233 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 },
2234 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, 2 },
2235 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 },
2236 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 },
2237 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 },
2238 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i1, 2 },
2239 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 },
2240 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 },
2241
2242 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 },
2243 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, 2 },
2244 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 },
2245 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 },
2246 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },
2247 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 },
2248 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 },
2249 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i64, 2 },
2250
2251 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
2252 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },
2253
2254 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
2255 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },
2256
2257 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, 1 },
2258 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, 1 },
2259
2260 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 },
2261 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 },
2262 };
2263
2264 // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
2265 // 256-bit wide vectors.
2266
2267 static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
2268 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 },
2269 { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 },
2270 { ISD::FP_EXTEND, MVT::v16f64, MVT::v16f32, 4 }, // 2*vcvtps2pd+vextractf64x4
2271 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 },
2272
2273 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
2274 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
2275 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
2276 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 3 }, // sext+vpslld+vptestmd
2277 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
2278 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
2279 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
2280 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 3 }, // sext+vpslld+vptestmd
2281 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // zmm vpslld+vptestmd
2282 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // zmm vpslld+vptestmd
2283 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // zmm vpslld+vptestmd
2284 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 }, // vpslld+vptestmd
2285 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // zmm vpsllq+vptestmq
2286 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // zmm vpsllq+vptestmq
2287 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, // vpsllq+vptestmq
2288 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 2 }, // vpmovdb
2289 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 2 }, // vpmovdb
2290 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 2 }, // vpmovdb
2291 { ISD::TRUNCATE, MVT::v32i8, MVT::v16i32, 2 }, // vpmovdb
2292 { ISD::TRUNCATE, MVT::v64i8, MVT::v16i32, 2 }, // vpmovdb
2293 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2 }, // vpmovdw
2294 { ISD::TRUNCATE, MVT::v32i16, MVT::v16i32, 2 }, // vpmovdw
2295 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 2 }, // vpmovqb
2296 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1 }, // vpshufb
2297 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 2 }, // vpmovqb
2298 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i64, 2 }, // vpmovqb
2299 { ISD::TRUNCATE, MVT::v32i8, MVT::v8i64, 2 }, // vpmovqb
2300 { ISD::TRUNCATE, MVT::v64i8, MVT::v8i64, 2 }, // vpmovqb
2301 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 2 }, // vpmovqw
2302 { ISD::TRUNCATE, MVT::v16i16, MVT::v8i64, 2 }, // vpmovqw
2303 { ISD::TRUNCATE, MVT::v32i16, MVT::v8i64, 2 }, // vpmovqw
2304 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 }, // vpmovqd
2305 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // zmm vpmovqd
2306 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 5 },// 2*vpmovqd+concat+vpmovdb
2307
2308 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, // extend to v16i32
2309 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 8 },
2310 { ISD::TRUNCATE, MVT::v64i8, MVT::v32i16, 8 },
2311
2312 // Sign extend is zmm vpternlogd+vptruncdb.
2313 // Zero extend is zmm broadcast load+vptruncdw.
2314 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 3 },
2315 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 4 },
2316 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 3 },
2317 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 4 },
2318 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 3 },
2319 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 4 },
2320 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 3 },
2321 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 4 },
2322
2323 // Sign extend is zmm vpternlogd+vptruncdw.
2324 // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
2325 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 3 },
2326 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 4 },
2327 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 3 },
2328 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 4 },
2329 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 3 },
2330 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 4 },
2331 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 3 },
2332 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
2333
2334 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // zmm vpternlogd
2335 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // zmm vpternlogd+psrld
2336 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // zmm vpternlogd
2337 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // zmm vpternlogd+psrld
2338 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // zmm vpternlogd
2339 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // zmm vpternlogd+psrld
2340 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // zmm vpternlogq
2341 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // zmm vpternlogq+psrlq
2342 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // zmm vpternlogq
2343 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // zmm vpternlogq+psrlq
2344
2345 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, // vpternlogd
2346 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, // vpternlogd+psrld
2347 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 }, // vpternlogq
2348 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 }, // vpternlogq+psrlq
2349
2350 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
2351 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
2352 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
2353 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
2354 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
2355 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
2356 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
2357 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
2358 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
2359 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
2360
2361 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
2362 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
2363
2364 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
2365 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
2366 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v16i8, 2 },
2367 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 1 },
2368 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
2369 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 1 },
2370 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
2371 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
2372
2373 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
2374 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
2375 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v16i8, 2 },
2376 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 1 },
2377 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
2378 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 1 },
2379 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
2380 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
2381 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 },
2382 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 5 },
2383
2384 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 2 },
2385 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f64, 7 },
2386 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f64,15 },
2387 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f32,11 },
2388 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f64,31 },
2389 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, 3 },
2390 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f64, 7 },
2391 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f32, 5 },
2392 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f64,15 },
2393 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 1 },
2394 { ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f64, 3 },
2395
2396 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 },
2397 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, 3 },
2398 { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, 3 },
2399 { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 },
2400 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 3 },
2401 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, 3 },
2402 };
2403
2404 static const TypeConversionCostTblEntry AVX512BWVLConversionTbl[] {
2405 // Mask sign extend has an instruction.
2406 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 },
2407 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, 1 },
2408 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 },
2409 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, 1 },
2410 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 },
2411 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, 1 },
2412 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 },
2413 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, 1 },
2414 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 },
2415 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, 1 },
2416 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 },
2417 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 },
2418 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
2419 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 },
2420 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1, 1 },
2421 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v64i1, 1 },
2422 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1, 1 },
2423
2424 // Mask zero extend is a sext + shift.
2425 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 },
2426 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, 2 },
2427 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 },
2428 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, 2 },
2429 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 },
2430 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, 2 },
2431 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 },
2432 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, 2 },
2433 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 },
2434 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, 2 },
2435 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 },
2436 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 },
2437 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
2438 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 },
2439 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1, 2 },
2440 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v64i1, 2 },
2441 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1, 2 },
2442
2443 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 },
2444 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, 2 },
2445 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 },
2446 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, 2 },
2447 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 },
2448 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, 2 },
2449 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 },
2450 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, 2 },
2451 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 },
2452 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 },
2453 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 },
2454 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 },
2455 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 },
2456 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 },
2457 { ISD::TRUNCATE, MVT::v32i1, MVT::v16i16, 2 },
2458 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i8, 2 },
2459 { ISD::TRUNCATE, MVT::v64i1, MVT::v16i16, 2 },
2460
2461 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 },
2462 };
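// With AVX512BW+VL the mask sign extends above map to a single vpmovm2b/vpmovm2w;
// the zero extends are the same sext followed by a logical shift right to turn the
// -1 lanes into +1; the truncations to mask are likely a shift of bit 0 into the
// sign bit followed by vpmovb2m/vpmovw2m, hence the costs of 1 vs. 2.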
2463
2464 static const TypeConversionCostTblEntry AVX512DQVLConversionTbl[] = {
2465 // Mask sign extend has an instruction.
2466 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 },
2467 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, 1 },
2468 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 },
2469 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i1, 1 },
2470 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 },
2471 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i1, 1 },
2472 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, 1 },
2473 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 },
2474
2475 // Mask zero extend is a sext + shift.
2476 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 },
2477 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, 2 },
2478 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 },
2479 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i1, 2 },
2480 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 },
2481 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i1, 2 },
2482 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, 2 },
2483 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 },
2484
2485 { ISD::TRUNCATE, MVT::v16i1, MVT::v4i64, 2 },
2486 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, 2 },
2487 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 },
2488 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, 2 },
2489 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 },
2490 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 },
2491 { ISD::TRUNCATE, MVT::v8i1, MVT::v4i64, 2 },
2492 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },
2493
2494 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
2495 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
2496 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
2497 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
2498
2499 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
2500 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
2501 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
2502 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
2503
2504 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v4f32, 1 },
2505 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, 1 },
2506 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
2507 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, 1 },
2508
2509 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v4f32, 1 },
2510 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 },
2511 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
2512 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 },
2513 };
2514
2515 static const TypeConversionCostTblEntry AVX512VLConversionTbl[] = {
2516 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
2517 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
2518 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
2519 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 8 }, // split+2*v8i8
2520 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
2521 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
2522 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
2523 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 8 }, // split+2*v8i16
2524 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // vpslld+vptestmd
2525 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // vpslld+vptestmd
2526 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // vpslld+vptestmd
2527 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, 2 }, // vpslld+vptestmd
2528 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // vpsllq+vptestmq
2529 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // vpsllq+vptestmq
2530 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // vpmovqd
2531 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 }, // vpmovqb
2532 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 }, // vpmovqw
2533 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 }, // vpmovwb
2534
2535 // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
2536 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
2537 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 5 },
2538 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 6 },
2539 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 5 },
2540 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 6 },
2541 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 5 },
2542 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 6 },
2543 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 10 },
2544 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 12 },
2545
2546 // sign extend is vpcmpeq+maskedmove+vpmovdw
2547 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
2548 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 4 },
2549 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 5 },
2550 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 4 },
2551 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 5 },
2552 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 4 },
2553 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 5 },
2554 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 10 },
2555 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 12 },
2556
2557 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // vpternlogd
2558 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // vpternlogd+psrld
2559 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // vpternlogd
2560 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // vpternlogd+psrld
2561 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // vpternlogd
2562 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // vpternlogd+psrld
2563 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, 1 }, // vpternlogd
2564 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, 2 }, // vpternlogd+psrld
2565
2566 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // vpternlogq
2567 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // vpternlogq+psrlq
2568 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // vpternlogq
2569 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // vpternlogq+psrlq
2570
2571 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 1 },
2572 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 1 },
2573 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 1 },
2574 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 1 },
2575 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
2576 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
2577 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 1 },
2578 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 1 },
2579 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
2580 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
2581 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
2582 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
2583
2584 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
2585 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 1 },
2586 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
2587 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 1 },
2588
2589 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 1 },
2590 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 1 },
2591 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
2592 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 1 },
2593 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
2594 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 1 },
2595 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
2596 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
2597 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
2598 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
2599 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 5 },
2600 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
2601 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 5 },
2602
2603 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, 2 },
2604 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 2 },
2605 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f32, 5 },
2606
2607 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 1 },
2608 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 1 },
2609 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
2610 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 1 },
2611 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 1 },
2612 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 },
2613 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 },
2614 };
2615
2616 static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
2617 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
2618 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
2619 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
2620 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
2621 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
2622 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
2623
2624 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 2 },
2625 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 2 },
2626 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 2 },
2627 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 2 },
2628 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
2629 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
2630 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 2 },
2631 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 2 },
2632 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
2633 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
2634 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
2635 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
2636 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
2637 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
2638
2639 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },
2640
2641 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 4 },
2642 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 4 },
2643 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 1 },
2644 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 1 },
2645 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 1 },
2646 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, 4 },
2647 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, 4 },
2648 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 1 },
2649 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 1 },
2650 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 5 },
2651 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 },
2652 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2 },
2653
2654 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 3 },
2655 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 3 },
2656
2657 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, 1 },
2658 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, 1 },
2659 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, 1 },
2660 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 3 },
2661
2662 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 3 },
2663 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 3 },
2664 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, 1 },
2665 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 3 },
2666 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
2667 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4 },
2668 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 3 },
2669 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, 4 },
2670
2671 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 2 },
2672 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 2 },
2673 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 2 },
2674 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
2675 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
2676 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
2677 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 3 },
2678
2679 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 2 },
2680 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 2 },
2681 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 2 },
2682 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
2683 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 },
2684 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
2685 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 2 },
2686 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
2687 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
2688 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 4 },
2689 };
2690
2691 static const TypeConversionCostTblEntry AVXConversionTbl[] = {
2692 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
2693 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
2694 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
2695 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
2696 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
2697 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
2698
2699 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 3 },
2700 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 3 },
2701 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 3 },
2702 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 3 },
2703 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
2704 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
2705 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 3 },
2706 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 3 },
2707 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
2708 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
2709 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 3 },
2710 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 3 },
2711
2712 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 4 },
2713 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 5 },
2714 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 4 },
2715 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 9 },
2716 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, 11 },
2717
2718 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 },
2719 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
2720 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // and+extract+packuswb
2721 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, 5 },
2722 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
2723 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, 5 },
2724 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 3 }, // and+extract+2*packusdw
2725 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 },
2726
2727 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
2728 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 },
2729 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 },
2730 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 4 },
2731 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v16i8, 2 },
2732 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
2733 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v8i16, 2 },
2734 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
2735 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
2736 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 4 },
2737 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 5 },
2738 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 8 },
2739
2740 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 7 },
2741 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, 7 },
2742 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, 6 },
2743 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 4 },
2744 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v16i8, 2 },
2745 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
2746 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v8i16, 2 },
2747 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 4 },
2748 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 4 },
2749 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
2750 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 },
2751 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 },
2752 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 10 },
2753 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 10 },
2754 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 18 },
2755 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
2756 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 10 },
2757
2758 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, 2 },
2759 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f64, 2 },
2760 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v8f32, 2 },
2761 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v4f64, 2 },
2762 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 2 },
2763 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f64, 2 },
2764 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, 2 },
2765 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v4f64, 2 },
2766 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, 2 },
2767 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, 2 },
2768 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 5 },
2769
2770 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v8f32, 2 },
2771 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f64, 2 },
2772 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v8f32, 2 },
2773 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v4f64, 2 },
2774 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 2 },
2775 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f64, 2 },
2776 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, 2 },
2777 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v4f64, 2 },
2778 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 3 },
2779 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
2780 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 6 },
2781 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 7 },
2782 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, 7 },
2783
2784 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 1 },
2785 { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 1 },
2786 };
2787
2788 static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
2789 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, 1 },
2790 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, 1 },
2791 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, 1 },
2792 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, 1 },
2793 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, 1 },
2794 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, 1 },
2795 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, 1 },
2796 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, 1 },
2797 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, 1 },
2798 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, 1 },
2799 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, 1 },
2800 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, 1 },
2801
2802 // These truncates end up widening elements.
2803 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 1 }, // PMOVXZBQ
2804 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 1 }, // PMOVXZWQ
2805 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 1 }, // PMOVXZBD
2806
2807 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 2 },
2808 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 2 },
2809 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 2 },
2810
2811 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 1 },
2812 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 1 },
2813 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 1 },
2814 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 1 },
2815 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 1 },
2816 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
2817 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 1 },
2818 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
2819 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
2820 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 1 },
2821 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
2822
2823 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 1 },
2824 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 1 },
2825 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 4 },
2826 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 4 },
2827 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 1 },
2828 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
2829 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 1 },
2830 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
2831 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 3 },
2832 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 3 },
2833 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 2 },
2834 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 12 },
2835 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 22 },
2836 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 4 },
2837
2838 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 1 },
2839 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 1 },
2840 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 1 },
2841 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 1 },
2842 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, 2 },
2843 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, 2 },
2844 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, 1 },
2845 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, 1 },
2846 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
2847 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, 1 },
2848
2849 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 1 },
2850 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 },
2851 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 1 },
2852 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 4 },
2853 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, 2 },
2854 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, 2 },
2855 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, 1 },
2856 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, 1 },
2857 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 4 },
2858 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
2859 };
2860
2861 static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
2862 // These are somewhat magic numbers justified by comparing the
2863 // output of llvm-mca for our various supported scheduler models
2864 // and taking the worst case scenario.
2865 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 3 },
2866 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 3 },
2867 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 3 },
2868 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 3 },
2869 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 3 },
2870 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 4 },
2871 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 3 },
2872 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 4 },
2873 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 3 },
2874 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4 },
2875 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 8 },
2876 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 8 },
2877
2878 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 3 },
2879 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 3 },
2880 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 8 },
2881 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 9 },
2882 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 4 },
2883 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 4 },
2884 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 4 },
2885 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 4 },
2886 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 7 },
2887 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 7 },
2888 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
2889 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 15 },
2890 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 18 },
2891
2892 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 4 },
2893 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 4 },
2894 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 4 },
2895 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 4 },
2896 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, 6 },
2897 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, 6 },
2898 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, 5 },
2899 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, 5 },
2900 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 4 },
2901 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, 4 },
2902
2903 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 4 },
2904 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 },
2905 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 4 },
2906 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 15 },
2907 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, 6 },
2908 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, 6 },
2909 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, 5 },
2910 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, 5 },
2911 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 8 },
2912 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 8 },
2913
2914 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, 4 },
2915 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, 4 },
2916 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, 2 },
2917 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, 3 },
2918 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, 1 },
2919 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, 2 },
2920 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, 2 },
2921 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, 3 },
2922 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, 1 },
2923 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, 2 },
2924 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, 1 },
2925 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, 2 },
2926
2927 // These truncates are really widening elements.
2928 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 1 }, // PSHUFD
2929 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // PUNPCKLWD+DQ
2930 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // PUNPCKLBW+WD+PSHUFD
2931 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 1 }, // PUNPCKLWD
2932 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // PUNPCKLBW+WD
2933 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 1 }, // PUNPCKLBW
2934
2935 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 2 }, // PAND+PACKUSWB
2936 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 },
2937 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 3 }, // PAND+2*PACKUSWB
2938 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 },
2939 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 1 },
2940 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 3 },
2941 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
2942 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32,10 },
2943 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 4 }, // PAND+3*PACKUSWB
2944 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 2 }, // PSHUFD+PSHUFLW
2945 { ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, 1 }, // PSHUFD
2946 };
2947
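// The tables above are consulted from the most specific subtarget feature down
// to SSE2, first with the exact (simple) source/destination MVTs and then,
// failing that, with the legalized types. For example, a
// sitofp <4 x i32> to <4 x float> on a plain SSE2 target hits the
// SSE2ConversionTbl entry { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 3 }
// and that cost is returned through AdjustCost.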
2948 // Attempt to map directly to (simple) MVT types to let us match custom entries.
2949 EVT SrcTy = TLI->getValueType(DL, Src);
2950 EVT DstTy = TLI->getValueType(DL, Dst);
2951
2952 // The function getSimpleVT only handles simple value types.
2953 if (SrcTy.isSimple() && DstTy.isSimple()) {
2954 MVT SimpleSrcTy = SrcTy.getSimpleVT();
2955 MVT SimpleDstTy = DstTy.getSimpleVT();
2956
2957 if (ST->useAVX512Regs()) {
2958 if (ST->hasBWI())
2959 if (const auto *Entry = ConvertCostTableLookup(
2960 AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2961 return AdjustCost(Entry->Cost);
2962
2963 if (ST->hasDQI())
2964 if (const auto *Entry = ConvertCostTableLookup(
2965 AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2966 return AdjustCost(Entry->Cost);
2967
2968 if (ST->hasAVX512())
2969 if (const auto *Entry = ConvertCostTableLookup(
2970 AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2971 return AdjustCost(Entry->Cost);
2972 }
2973
2974 if (ST->hasBWI())
2975 if (const auto *Entry = ConvertCostTableLookup(
2976 AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2977 return AdjustCost(Entry->Cost);
2978
2979 if (ST->hasDQI())
2980 if (const auto *Entry = ConvertCostTableLookup(
2981 AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2982 return AdjustCost(Entry->Cost);
2983
2984 if (ST->hasAVX512())
2985 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
2986 SimpleDstTy, SimpleSrcTy))
2987 return AdjustCost(Entry->Cost);
2988
2989 if (ST->hasAVX2()) {
2990 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
2991 SimpleDstTy, SimpleSrcTy))
2992 return AdjustCost(Entry->Cost);
2993 }
2994
2995 if (ST->hasAVX()) {
2996 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
2997 SimpleDstTy, SimpleSrcTy))
2998 return AdjustCost(Entry->Cost);
2999 }
3000
3001 if (ST->hasSSE41()) {
3002 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3003 SimpleDstTy, SimpleSrcTy))
3004 return AdjustCost(Entry->Cost);
3005 }
3006
3007 if (ST->hasSSE2()) {
3008 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3009 SimpleDstTy, SimpleSrcTy))
3010 return AdjustCost(Entry->Cost);
3011 }
3012 }
3013
3014 // Fall back to legalized types.
3015 std::pair<InstructionCost, MVT> LTSrc = getTypeLegalizationCost(Src);
3016 std::pair<InstructionCost, MVT> LTDest = getTypeLegalizationCost(Dst);
3017
3018 // If we're truncating to the same legalized type - just assume it's free.
3019 if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second)
3020 return TTI::TCC_Free;
3021
3022 if (ST->useAVX512Regs()) {
3023 if (ST->hasBWI())
3024 if (const auto *Entry = ConvertCostTableLookup(
3025 AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second))
3026 return AdjustCost(Entry->Cost, std::max(LTSrc.first, LTDest.first));
3027
3028 if (ST->hasDQI())
3029 if (const auto *Entry = ConvertCostTableLookup(
3030 AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second))
3031 return AdjustCost(Entry->Cost, std::max(LTSrc.first, LTDest.first));
3032
3033 if (ST->hasAVX512())
3034 if (const auto *Entry = ConvertCostTableLookup(
3035 AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second))
3036 return AdjustCost(Entry->Cost, std::max(LTSrc.first, LTDest.first));
3037 }
3038
3039 if (ST->hasBWI())
3040 if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
3041 LTDest.second, LTSrc.second))
3042 return AdjustCost(Entry->Cost, std::max(LTSrc.first, LTDest.first));
3043
3044 if (ST->hasDQI())
3045 if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
3046 LTDest.second, LTSrc.second))
3047 return AdjustCost(Entry->Cost, std::max(LTSrc.first, LTDest.first));
3048
3049 if (ST->hasAVX512())
3050 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3051 LTDest.second, LTSrc.second))
3052 return AdjustCost(Entry->Cost, std::max(LTSrc.first, LTDest.first));
3053
3054 if (ST->hasAVX2())
3055 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3056 LTDest.second, LTSrc.second))
3057 return AdjustCost(Entry->Cost, std::max(LTSrc.first, LTDest.first));
3058
3059 if (ST->hasAVX())
3060 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3061 LTDest.second, LTSrc.second))
3062 return AdjustCost(Entry->Cost, std::max(LTSrc.first, LTDest.first));
3063
3064 if (ST->hasSSE41())
3065 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3066 LTDest.second, LTSrc.second))
3067 return AdjustCost(Entry->Cost, std::max(LTSrc.first, LTDest.first));
3068
3069 if (ST->hasSSE2())
3070 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3071 LTDest.second, LTSrc.second))
3072 return AdjustCost(Entry->Cost, std::max(LTSrc.first, LTDest.first));
3073
3074 // Fallback: for i8/i16 sitofp/uitofp cases we need to extend the source to
3075 // i32 first and then convert with sitofp.
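// E.g. a uitofp i16 -> float is costed as (zext i16 -> i32) + (sitofp i32 -> float),
// with the extend treated as free when the i16 value is loaded directly from memory.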
3076 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3077 1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) {
3078 Type *ExtSrc = Src->getWithNewBitWidth(32);
3079 unsigned ExtOpc =
3080 (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt;
3081
3082 // For scalar loads the extend would be free.
3083 InstructionCost ExtCost = 0;
3084 if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0))))
3085 ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind);
3086
3087 return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc,
3088 TTI::CastContextHint::None, CostKind);
3089 }
3090
3091 // Fallback: for fptosi/fptoui to i8/i16 we convert to i32 with fptosi first
3092 // and then truncate the result.
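// E.g. a fptoui float -> i16 is costed as (fptosi float -> i32) + (trunc i32 -> i16).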
3093 if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) &&
3094 1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) {
3095 Type *TruncDst = Dst->getWithNewBitWidth(32);
3096 return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) +
3097 getCastInstrCost(Instruction::Trunc, Dst, TruncDst,
3098 TTI::CastContextHint::None, CostKind);
3099 }
3100
3101 return AdjustCost(
3102 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3103}
3104
3105 InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
3106 Type *CondTy,
3107 CmpInst::Predicate VecPred,
3108 TTI::TargetCostKind CostKind,
3109 const Instruction *I) {
3110 // Early out if this type isn't scalar/vector integer/float.
3111 if (!(ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy()))
3112 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3113 I);
3114
3115 // Legalize the type.
3116 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3117
3118 MVT MTy = LT.second;
3119
3120 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3121 assert(ISD && "Invalid opcode");
3122
3123 InstructionCost ExtraCost = 0;
3124 if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
3125 // Some vector comparison predicates cost extra instructions.
3126 // TODO: Adjust ExtraCost based on CostKind?
3127 // TODO: Should we invert this and assume worst case cmp costs
3128 // and reduce for particular predicates?
3129 if (MTy.isVector() &&
3130 !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
3131 (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
3132 ST->hasBWI())) {
3133 // Fallback to I if a specific predicate wasn't specified.
3134 CmpInst::Predicate Pred = VecPred;
3135 if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE ||
3136 Pred == CmpInst::BAD_FCMP_PREDICATE))
3137 Pred = cast<CmpInst>(I)->getPredicate();
3138
3139 bool CmpWithConstant = false;
3140 if (auto *CmpInstr = dyn_cast_or_null<CmpInst>(I))
3141 CmpWithConstant = isa<Constant>(CmpInstr->getOperand(1));
3142
3143 switch (Pred) {
3144 case CmpInst::Predicate::ICMP_NE:
3145 // xor(cmpeq(x,y),-1)
3146 ExtraCost = CmpWithConstant ? 0 : 1;
3147 break;
3148 case CmpInst::Predicate::ICMP_SGE:
3149 case CmpInst::Predicate::ICMP_SLE:
3150 // xor(cmpgt(x,y),-1)
3151 ExtraCost = CmpWithConstant ? 0 : 1;
3152 break;
3153 case CmpInst::Predicate::ICMP_ULT:
3154 case CmpInst::Predicate::ICMP_UGT:
3155 // cmpgt(xor(x,signbit),xor(y,signbit))
3156 // xor(cmpeq(pmaxu(x,y),x),-1)
3157 ExtraCost = CmpWithConstant ? 1 : 2;
3158 break;
3159 case CmpInst::Predicate::ICMP_ULE:
3160 case CmpInst::Predicate::ICMP_UGE:
3161 if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
3162 (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
3163 // cmpeq(psubus(x,y),0)
3164 // cmpeq(pminu(x,y),x)
3165 ExtraCost = 1;
3166 } else {
3167 // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
3168 ExtraCost = CmpWithConstant ? 2 : 3;
3169 }
3170 break;
3171 case CmpInst::Predicate::FCMP_ONE:
3172 case CmpInst::Predicate::FCMP_UEQ:
3173 // Without AVX we need to expand FCMP_ONE/FCMP_UEQ cases.
3174 // Use FCMP_UEQ expansion - FCMP_ONE should be the same.
3175 if (CondTy && !ST->hasAVX())
3176 return getCmpSelInstrCost(Opcode, ValTy, CondTy,
3177 CmpInst::Predicate::FCMP_UNO, CostKind) +
3178 getCmpSelInstrCost(Opcode, ValTy, CondTy,
3179 CmpInst::Predicate::FCMP_OEQ, CostKind) +
3180 getArithmeticInstrCost(Instruction::Or, CondTy, CostKind);
3181
3182 break;
3183 case CmpInst::Predicate::BAD_ICMP_PREDICATE:
3184 case CmpInst::Predicate::BAD_FCMP_PREDICATE:
3185 // Assume worst case scenario and add the maximum extra cost.
3186 ExtraCost = 3;
3187 break;
3188 default:
3189 break;
3190 }
3191 }
3192 }
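// ExtraCost is charged per legalized vector operation, on top of the base
// compare/select cost looked up in the tables below.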
3193
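// Each 4-element cost below is { RecipThroughput, Latency, CodeSize, SizeAndLatency },
// matching the CostKindCosts layout used by CostKindTblEntry.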
3194 static const CostKindTblEntry SLMCostTbl[] = {
3195 // slm pcmpeq/pcmpgt throughput is 2
3196 { ISD::SETCC, MVT::v2i64, { 2, 5, 1, 2 } },
3197 // slm pblendvb/blendvpd/blendvps throughput is 4
3198 { ISD::SELECT, MVT::v2f64, { 4, 4, 1, 3 } }, // vblendvpd
3199 { ISD::SELECT, MVT::v4f32, { 4, 4, 1, 3 } }, // vblendvps
3200 { ISD::SELECT, MVT::v2i64, { 4, 4, 1, 3 } }, // pblendvb
3201 { ISD::SELECT, MVT::v8i32, { 4, 4, 1, 3 } }, // pblendvb
3202 { ISD::SELECT, MVT::v8i16, { 4, 4, 1, 3 } }, // pblendvb
3203 { ISD::SELECT, MVT::v16i8, { 4, 4, 1, 3 } }, // pblendvb
3204 };
3205
3206 static const CostKindTblEntry AVX512BWCostTbl[] = {
3207 { ISD::SETCC, MVT::v32i16, { 1, 1, 1, 1 } },
3208 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 1 } },
3209 { ISD::SETCC, MVT::v64i8, { 1, 1, 1, 1 } },
3210 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 1 } },
3211
3212 { ISD::SELECT, MVT::v32i16, { 1, 1, 1, 1 } },
3213 { ISD::SELECT, MVT::v64i8, { 1, 1, 1, 1 } },
3214 };
3215
3216 static const CostKindTblEntry AVX512CostTbl[] = {
3217 { ISD::SETCC, MVT::v8f64, { 1, 4, 1, 1 } },
3218 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 1 } },
3219 { ISD::SETCC, MVT::v16f32, { 1, 4, 1, 1 } },
3220 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 1 } },
3221
3222 { ISD::SETCC, MVT::v8i64, { 1, 1, 1, 1 } },
3223 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 1 } },
3224 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3225 { ISD::SETCC, MVT::v16i32, { 1, 1, 1, 1 } },
3226 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 1 } },
3227 { ISD::SETCC, MVT::v32i16, { 3, 7, 5, 5 } },
3228 { ISD::SETCC, MVT::v64i8, { 3, 7, 5, 5 } },
3229
3230 { ISD::SELECT, MVT::v8i64, { 1, 1, 1, 1 } },
3231 { ISD::SELECT, MVT::v4i64, { 1, 1, 1, 1 } },
3232 { ISD::SELECT, MVT::v2i64, { 1, 1, 1, 1 } },
3233 { ISD::SELECT, MVT::v16i32, { 1, 1, 1, 1 } },
3234 { ISD::SELECT, MVT::v8i32, { 1, 1, 1, 1 } },
3235 { ISD::SELECT, MVT::v4i32, { 1, 1, 1, 1 } },
3236 { ISD::SELECT, MVT::v8f64, { 1, 1, 1, 1 } },
3237 { ISD::SELECT, MVT::v4f64, { 1, 1, 1, 1 } },
3238 { ISD::SELECT, MVT::v2f64, { 1, 1, 1, 1 } },
3239 { ISD::SELECT, MVT::f64, { 1, 1, 1, 1 } },
3240 { ISD::SELECT, MVT::v16f32, { 1, 1, 1, 1 } },
3241 { ISD::SELECT, MVT::v8f32 , { 1, 1, 1, 1 } },
3242 { ISD::SELECT, MVT::v4f32, { 1, 1, 1, 1 } },
3243 { ISD::SELECT, MVT::f32 , { 1, 1, 1, 1 } },
3244
3245 { ISD::SELECT, MVT::v32i16, { 2, 2, 4, 4 } },
3246 { ISD::SELECT, MVT::v16i16, { 1, 1, 1, 1 } },
3247 { ISD::SELECT, MVT::v8i16, { 1, 1, 1, 1 } },
3248 { ISD::SELECT, MVT::v64i8, { 2, 2, 4, 4 } },
3249 { ISD::SELECT, MVT::v32i8, { 1, 1, 1, 1 } },
3250 { ISD::SELECT, MVT::v16i8, { 1, 1, 1, 1 } },
3251 };
3252
3253 static const CostKindTblEntry AVX2CostTbl[] = {
3254 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 2 } },
3255 { ISD::SETCC, MVT::v2f64, { 1, 4, 1, 1 } },
3256 { ISD::SETCC, MVT::f64, { 1, 4, 1, 1 } },
3257 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 2 } },
3258 { ISD::SETCC, MVT::v4f32, { 1, 4, 1, 1 } },
3259 { ISD::SETCC, MVT::f32, { 1, 4, 1, 1 } },
3260
3261 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 2 } },
3262 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 2 } },
3263 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 2 } },
3264 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 2 } },
3265
3266 { ISD::SELECT, MVT::v4f64, { 2, 2, 1, 2 } }, // vblendvpd
3267 { ISD::SELECT, MVT::v8f32, { 2, 2, 1, 2 } }, // vblendvps
3268 { ISD::SELECT, MVT::v4i64, { 2, 2, 1, 2 } }, // pblendvb
3269 { ISD::SELECT, MVT::v8i32, { 2, 2, 1, 2 } }, // pblendvb
3270 { ISD::SELECT, MVT::v16i16, { 2, 2, 1, 2 } }, // pblendvb
3271 { ISD::SELECT, MVT::v32i8, { 2, 2, 1, 2 } }, // pblendvb
3272 };
3273
3274 static const CostKindTblEntry XOPCostTbl[] = {
3275 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3276 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3277 };
3278
3279 static const CostKindTblEntry AVX1CostTbl[] = {
3280 { ISD::SETCC, MVT::v4f64, { 2, 3, 1, 2 } },
3281 { ISD::SETCC, MVT::v2f64, { 1, 3, 1, 1 } },
3282 { ISD::SETCC, MVT::f64, { 1, 3, 1, 1 } },
3283 { ISD::SETCC, MVT::v8f32, { 2, 3, 1, 2 } },
3284 { ISD::SETCC, MVT::v4f32, { 1, 3, 1, 1 } },
3285 { ISD::SETCC, MVT::f32, { 1, 3, 1, 1 } },
3286
3287 // AVX1 does not support 8-wide integer compare.
3288 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3289 { ISD::SETCC, MVT::v8i32, { 4, 2, 5, 6 } },
3290 { ISD::SETCC, MVT::v16i16, { 4, 2, 5, 6 } },
3291 { ISD::SETCC, MVT::v32i8, { 4, 2, 5, 6 } },
3292
3293 { ISD::SELECT, MVT::v4f64, { 3, 3, 1, 2 } }, // vblendvpd
3294 { ISD::SELECT, MVT::v8f32, { 3, 3, 1, 2 } }, // vblendvps
3295 { ISD::SELECT, MVT::v4i64, { 3, 3, 1, 2 } }, // vblendvpd
3296 { ISD::SELECT, MVT::v8i32, { 3, 3, 1, 2 } }, // vblendvps
3297 { ISD::SELECT, MVT::v16i16, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3298 { ISD::SELECT, MVT::v32i8, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3299 };
3300
3301 static const CostKindTblEntry SSE42CostTbl[] = {
3302 { ISD::SETCC, MVT::v2i64, { 1, 2, 1, 2 } },
3303 };
3304
3305 static const CostKindTblEntry SSE41CostTbl[] = {
3306 { ISD::SETCC, MVT::v2f64, { 1, 5, 1, 1 } },
3307 { ISD::SETCC, MVT::v4f32, { 1, 5, 1, 1 } },
3308
3309 { ISD::SELECT, MVT::v2f64, { 2, 2, 1, 2 } }, // blendvpd
3310 { ISD::SELECT, MVT::f64, { 2, 2, 1, 2 } }, // blendvpd
3311 { ISD::SELECT, MVT::v4f32, { 2, 2, 1, 2 } }, // blendvps
3312 { ISD::SELECT, MVT::f32 , { 2, 2, 1, 2 } }, // blendvps
3313 { ISD::SELECT, MVT::v2i64, { 2, 2, 1, 2 } }, // pblendvb
3314 { ISD::SELECT, MVT::v4i32, { 2, 2, 1, 2 } }, // pblendvb
3315 { ISD::SELECT, MVT::v8i16, { 2, 2, 1, 2 } }, // pblendvb
3316 { ISD::SELECT, MVT::v16i8, { 2, 2, 1, 2 } }, // pblendvb
3317 };
3318
3319 static const CostKindTblEntry SSE2CostTbl[] = {
3320 { ISD::SETCC, MVT::v2f64, { 2, 5, 1, 1 } },
3321 { ISD::SETCC, MVT::f64, { 1, 5, 1, 1 } },
3322
3323 { ISD::SETCC, MVT::v2i64, { 5, 4, 5, 5 } }, // pcmpeqd/pcmpgtd expansion
3324 { ISD::SETCC, MVT::v4i32, { 1, 1, 1, 1 } },
3325 { ISD::SETCC, MVT::v8i16, { 1, 1, 1, 1 } },
3326 { ISD::SETCC, MVT::v16i8, { 1, 1, 1, 1 } },
3327
3328 { ISD::SELECT, MVT::v2f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3329 { ISD::SELECT, MVT::f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3330 { ISD::SELECT, MVT::v2i64, { 2, 2, 3, 3 } }, // pand + pandn + por
3331 { ISD::SELECT, MVT::v4i32, { 2, 2, 3, 3 } }, // pand + pandn + por
3332 { ISD::SELECT, MVT::v8i16, { 2, 2, 3, 3 } }, // pand + pandn + por
3333 { ISD::SELECT, MVT::v16i8, { 2, 2, 3, 3 } }, // pand + pandn + por
3334 };
3335
3336 static const CostKindTblEntry SSE1CostTbl[] = {
3337 { ISD::SETCC, MVT::v4f32, { 2, 5, 1, 1 } },
3338 { ISD::SETCC, MVT::f32, { 1, 5, 1, 1 } },
3339
3340 { ISD::SELECT, MVT::v4f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3341 { ISD::SELECT, MVT::f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3342 };
3343
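// As with the conversion tables, the lookups below run from the most specific
// feature set to the most generic; the first entry that defines a cost for the
// requested cost kind wins.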
3344 if (ST->useSLMArithCosts())
3345 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
3346 if (auto KindCost = Entry->Cost[CostKind])
3347 return LT.first * (ExtraCost + *KindCost);
3348
3349 if (ST->hasBWI())
3350 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
3351 if (auto KindCost = Entry->Cost[CostKind])
3352 return LT.first * (ExtraCost + *KindCost);
3353
3354 if (ST->hasAVX512())
3355 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
3356 if (auto KindCost = Entry->Cost[CostKind])
3357 return LT.first * (ExtraCost + *KindCost);
3358
3359 if (ST->hasAVX2())
3360 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
3361 if (auto KindCost = Entry->Cost[CostKind])
3362 return LT.first * (ExtraCost + *KindCost);
3363
3364 if (ST->hasXOP())
3365 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
3366 if (auto KindCost = Entry->Cost[CostKind])
3367 return LT.first * (ExtraCost + *KindCost);
3368
3369 if (ST->hasAVX())
3370 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
3371 if (auto KindCost = Entry->Cost[CostKind])
3372 return LT.first * (ExtraCost + *KindCost);
3373
3374 if (ST->hasSSE42())
3375 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
3376 if (auto KindCost = Entry->Cost[CostKind])
3377 return LT.first * (ExtraCost + *KindCost);
3378
3379 if (ST->hasSSE41())
3380 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
3381 if (auto KindCost = Entry->Cost[CostKind])
3382 return LT.first * (ExtraCost + *KindCost);
3383
3384 if (ST->hasSSE2())
3385 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
3386 if (auto KindCost = Entry->Cost[CostKind])
3387 return LT.first * (ExtraCost + *KindCost);
3388
3389 if (ST->hasSSE1())
3390 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
3391 if (auto KindCost = Entry->Cost[CostKind])
3392 return LT.first * (ExtraCost + *KindCost);
3393
3394 // Assume a 3cy latency for fp select ops.
3395 if (CostKind == TTI::TCK_Latency && Opcode == Instruction::Select)
3396 if (ValTy->getScalarType()->isFloatingPointTy())
3397 return 3;
3398
3399 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
3400}
3401
3403
3407 // Costs should match the codegen from:
3408 // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
3409 // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
3410 // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
3411 // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
3412 // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
3413
3414 // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
3415 // specialized in these tables yet.
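// Entries with a single value (e.g. { 1 }) only define a reciprocal throughput
// cost; for other cost kinds the lookup falls through to a later table or the
// base implementation.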
3416 static const CostKindTblEntry AVX512VBMI2CostTbl[] = {
3417 { ISD::FSHL, MVT::v8i64, { 1, 1, 1, 1 } },
3418 { ISD::FSHL, MVT::v4i64, { 1, 1, 1, 1 } },
3419 { ISD::FSHL, MVT::v2i64, { 1, 1, 1, 1 } },
3420 { ISD::FSHL, MVT::v16i32, { 1, 1, 1, 1 } },
3421 { ISD::FSHL, MVT::v8i32, { 1, 1, 1, 1 } },
3422 { ISD::FSHL, MVT::v4i32, { 1, 1, 1, 1 } },
3423 { ISD::FSHL, MVT::v32i16, { 1, 1, 1, 1 } },
3424 { ISD::FSHL, MVT::v16i16, { 1, 1, 1, 1 } },
3425 { ISD::FSHL, MVT::v8i16, { 1, 1, 1, 1 } },
3426 { ISD::ROTL, MVT::v32i16, { 1, 1, 1, 1 } },
3427 { ISD::ROTL, MVT::v16i16, { 1, 1, 1, 1 } },
3428 { ISD::ROTL, MVT::v8i16, { 1, 1, 1, 1 } },
3429 { ISD::ROTR, MVT::v32i16, { 1, 1, 1, 1 } },
3430 { ISD::ROTR, MVT::v16i16, { 1, 1, 1, 1 } },
3431 { ISD::ROTR, MVT::v8i16, { 1, 1, 1, 1 } },
3432 { X86ISD::VROTLI, MVT::v32i16, { 1, 1, 1, 1 } },
3433 { X86ISD::VROTLI, MVT::v16i16, { 1, 1, 1, 1 } },
3434 { X86ISD::VROTLI, MVT::v8i16, { 1, 1, 1, 1 } },
3435 };
3436 static const CostKindTblEntry AVX512BITALGCostTbl[] = {
3437 { ISD::CTPOP, MVT::v32i16, { 1, 1, 1, 1 } },
3438 { ISD::CTPOP, MVT::v64i8, { 1, 1, 1, 1 } },
3439 { ISD::CTPOP, MVT::v16i16, { 1, 1, 1, 1 } },
3440 { ISD::CTPOP, MVT::v32i8, { 1, 1, 1, 1 } },
3441 { ISD::CTPOP, MVT::v8i16, { 1, 1, 1, 1 } },
3442 { ISD::CTPOP, MVT::v16i8, { 1, 1, 1, 1 } },
3443 };
3444 static const CostKindTblEntry AVX512VPOPCNTDQCostTbl[] = {
3445 { ISD::CTPOP, MVT::v8i64, { 1, 1, 1, 1 } },
3446 { ISD::CTPOP, MVT::v16i32, { 1, 1, 1, 1 } },
3447 { ISD::CTPOP, MVT::v4i64, { 1, 1, 1, 1 } },
3448 { ISD::CTPOP, MVT::v8i32, { 1, 1, 1, 1 } },
3449 { ISD::CTPOP, MVT::v2i64, { 1, 1, 1, 1 } },
3450 { ISD::CTPOP, MVT::v4i32, { 1, 1, 1, 1 } },
3451 };
3452 static const CostKindTblEntry AVX512CDCostTbl[] = {
3453 { ISD::CTLZ, MVT::v8i64, { 1, 5, 1, 1 } },
3454 { ISD::CTLZ, MVT::v16i32, { 1, 5, 1, 1 } },
3455 { ISD::CTLZ, MVT::v32i16, { 18, 27, 23, 27 } },
3456 { ISD::CTLZ, MVT::v64i8, { 3, 16, 9, 11 } },
3457 { ISD::CTLZ, MVT::v4i64, { 1, 5, 1, 1 } },
3458 { ISD::CTLZ, MVT::v8i32, { 1, 5, 1, 1 } },
3459 { ISD::CTLZ, MVT::v16i16, { 8, 19, 11, 13 } },
3460 { ISD::CTLZ, MVT::v32i8, { 2, 11, 9, 10 } },
3461 { ISD::CTLZ, MVT::v2i64, { 1, 5, 1, 1 } },
3462 { ISD::CTLZ, MVT::v4i32, { 1, 5, 1, 1 } },
3463 { ISD::CTLZ, MVT::v8i16, { 3, 15, 4, 6 } },
3464 { ISD::CTLZ, MVT::v16i8, { 2, 10, 9, 10 } },
3465
3466 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3467 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3468 { ISD::CTTZ, MVT::v4i64, { 1, 8, 6, 6 } },
3469 { ISD::CTTZ, MVT::v8i32, { 1, 8, 6, 6 } },
3470 { ISD::CTTZ, MVT::v2i64, { 1, 8, 6, 6 } },
3471 { ISD::CTTZ, MVT::v4i32, { 1, 8, 6, 6 } },
3472 };
3473 static const CostKindTblEntry AVX512BWCostTbl[] = {
3474 { ISD::ABS, MVT::v32i16, { 1, 1, 1, 1 } },
3475 { ISD::ABS, MVT::v64i8, { 1, 1, 1, 1 } },
3476 { ISD::BITREVERSE, MVT::v2i64, { 3, 10, 10, 11 } },
3477 { ISD::BITREVERSE, MVT::v4i64, { 3, 11, 10, 11 } },
3478 { ISD::BITREVERSE, MVT::v8i64, { 3, 12, 10, 14 } },
3479 { ISD::BITREVERSE, MVT::v4i32, { 3, 10, 10, 11 } },
3480 { ISD::BITREVERSE, MVT::v8i32, { 3, 11, 10, 11 } },
3481 { ISD::BITREVERSE, MVT::v16i32, { 3, 12, 10, 14 } },
3482 { ISD::BITREVERSE, MVT::v8i16, { 3, 10, 10, 11 } },
3483 { ISD::BITREVERSE, MVT::v16i16, { 3, 11, 10, 11 } },
3484 { ISD::BITREVERSE, MVT::v32i16, { 3, 12, 10, 14 } },
3485 { ISD::BITREVERSE, MVT::v16i8, { 2, 5, 9, 9 } },
3486 { ISD::BITREVERSE, MVT::v32i8, { 2, 5, 9, 9 } },
3487 { ISD::BITREVERSE, MVT::v64i8, { 2, 5, 9, 12 } },
3488 { ISD::BSWAP, MVT::v2i64, { 1, 1, 1, 2 } },
3489 { ISD::BSWAP, MVT::v4i64, { 1, 1, 1, 2 } },
3490 { ISD::BSWAP, MVT::v8i64, { 1, 1, 1, 2 } },
3491 { ISD::BSWAP, MVT::v4i32, { 1, 1, 1, 2 } },
3492 { ISD::BSWAP, MVT::v8i32, { 1, 1, 1, 2 } },
3493 { ISD::BSWAP, MVT::v16i32, { 1, 1, 1, 2 } },
3494 { ISD::BSWAP, MVT::v8i16, { 1, 1, 1, 2 } },
3495 { ISD::BSWAP, MVT::v16i16, { 1, 1, 1, 2 } },
3496 { ISD::BSWAP, MVT::v32i16, { 1, 1, 1, 2 } },
3497 { ISD::CTLZ, MVT::v8i64, { 8, 22, 23, 23 } },
3498 { ISD::CTLZ, MVT::v16i32, { 8, 23, 25, 25 } },
3499 { ISD::CTLZ, MVT::v32i16, { 4, 15, 15, 16 } },
3500 { ISD::CTLZ, MVT::v64i8, { 3, 12, 10, 9 } },
3501 { ISD::CTPOP, MVT::v2i64, { 3, 7, 10, 10 } },
3502 { ISD::CTPOP, MVT::v4i64, { 3, 7, 10, 10 } },
3503 { ISD::CTPOP, MVT::v8i64, { 3, 8, 10, 12 } },
3504 { ISD::CTPOP, MVT::v4i32, { 7, 11, 14, 14 } },
3505 { ISD::CTPOP, MVT::v8i32, { 7, 11, 14, 14 } },
3506 { ISD::CTPOP, MVT::v16i32, { 7, 12, 14, 16 } },
3507 { ISD::CTPOP, MVT::v8i16, { 2, 7, 11, 11 } },
3508 { ISD::CTPOP, MVT::v16i16, { 2, 7, 11, 11 } },
3509 { ISD::CTPOP, MVT::v32i16, { 3, 7, 11, 13 } },
3510 { ISD::CTPOP, MVT::v16i8, { 2, 4, 8, 8 } },
3511 { ISD::CTPOP, MVT::v32i8, { 2, 4, 8, 8 } },
3512 { ISD::CTPOP, MVT::v64i8, { 2, 5, 8, 10 } },
3513 { ISD::CTTZ, MVT::v8i16, { 3, 9, 14, 14 } },
3514 { ISD::CTTZ, MVT::v16i16, { 3, 9, 14, 14 } },
3515 { ISD::CTTZ, MVT::v32i16, { 3, 10, 14, 16 } },
3516 { ISD::CTTZ, MVT::v16i8, { 2, 6, 11, 11 } },
3517 { ISD::CTTZ, MVT::v32i8, { 2, 6, 11, 11 } },
3518 { ISD::CTTZ, MVT::v64i8, { 3, 7, 11, 13 } },
3519 { ISD::ROTL, MVT::v32i16, { 2, 8, 6, 8 } },
3520 { ISD::ROTL, MVT::v16i16, { 2, 8, 6, 7 } },
3521 { ISD::ROTL, MVT::v8i16, { 2, 7, 6, 7 } },
3522 { ISD::ROTL, MVT::v64i8, { 5, 6, 11, 12 } },
3523 { ISD::ROTL, MVT::v32i8, { 5, 15, 7, 10 } },
3524 { ISD::ROTL, MVT::v16i8, { 5, 15, 7, 10 } },
3525 { ISD::ROTR, MVT::v32i16, { 2, 8, 6, 8 } },
3526 { ISD::ROTR, MVT::v16i16, { 2, 8, 6, 7 } },
3527 { ISD::ROTR, MVT::v8i16, { 2, 7, 6, 7 } },
3528 { ISD::ROTR, MVT::v64i8, { 5, 6, 12, 14 } },
3529 { ISD::ROTR, MVT::v32i8, { 5, 14, 6, 9 } },
3530 { ISD::ROTR, MVT::v16i8, { 5, 14, 6, 9 } },
3531 { X86ISD::VROTLI, MVT::v32i16, { 2, 5, 3, 3 } },
3532 { X86ISD::VROTLI, MVT::v16i16, { 1, 5, 3, 3 } },
3533 { X86ISD::VROTLI, MVT::v8i16, { 1, 5, 3, 3 } },
3534 { X86ISD::VROTLI, MVT::v64i8, { 2, 9, 3, 4 } },
3535 { X86ISD::VROTLI, MVT::v32i8, { 1, 9, 3, 4 } },
3536 { X86ISD::VROTLI, MVT::v16i8, { 1, 8, 3, 4 } },
3537 { ISD::SADDSAT, MVT::v32i16, { 1 } },
3538 { ISD::SADDSAT, MVT::v64i8, { 1 } },
3539 { ISD::SMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3540 { ISD::SMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3541 { ISD::SMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3542 { ISD::SMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3543 { ISD::SSUBSAT, MVT::v32i16, { 1 } },
3544 { ISD::SSUBSAT, MVT::v64i8, { 1 } },
3545 { ISD::UADDSAT, MVT::v32i16, { 1 } },
3546 { ISD::UADDSAT, MVT::v64i8, { 1 } },
3547 { ISD::UMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3548 { ISD::UMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3549 { ISD::UMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3550 { ISD::UMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3551 { ISD::USUBSAT, MVT::v32i16, { 1 } },
3552 { ISD::USUBSAT, MVT::v64i8, { 1 } },
3553 };
3554 static const CostKindTblEntry AVX512CostTbl[] = {
3555 { ISD::ABS, MVT::v8i64, { 1, 1, 1, 1 } },
3556 { ISD::ABS, MVT::v4i64, { 1, 1, 1, 1 } },
3557 { ISD::ABS, MVT::v2i64, { 1, 1, 1, 1 } },
3558 { ISD::ABS, MVT::v16i32, { 1, 1, 1, 1 } },
3559 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 1 } },
3560 { ISD::ABS, MVT::v32i16, { 2, 7, 4, 4 } },
3561 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 1 } },
3562 { ISD::ABS, MVT::v64i8, { 2, 7, 4, 4 } },
3563 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 1 } },
3564 { ISD::BITREVERSE, MVT::v8i64, { 9, 13, 20, 20 } },
3565 { ISD::BITREVERSE, MVT::v16i32, { 9, 13, 20, 20 } },
3566 { ISD::BITREVERSE, MVT::v32i16, { 9, 13, 20, 20 } },
3567 { ISD::BITREVERSE, MVT::v64i8, { 6, 11, 17, 17 } },
3568 { ISD::BSWAP, MVT::v8i64, { 4, 7, 5, 5 } },
3569 { ISD::BSWAP, MVT::v16i32, { 4, 7, 5, 5 } },
3570 { ISD::BSWAP, MVT::v32i16, { 4, 7, 5, 5 } },
3571 { ISD::CTLZ, MVT::v8i64, { 10, 28, 32, 32 } },
3572 { ISD::CTLZ, MVT::v16i32, { 12, 30, 38, 38 } },
3573 { ISD::CTLZ, MVT::v32i16, { 8, 15, 29, 29 } },
3574 { ISD::CTLZ, MVT::v64i8, { 6, 11, 19, 19 } },
3575 { ISD::CTPOP, MVT::v8i64, { 16, 16, 19, 19 } },
3576 { ISD::CTPOP, MVT::v16i32, { 24, 19, 27, 27 } },
3577 { ISD::CTPOP, MVT::v32i16, { 18, 15, 22, 22 } },
3578 { ISD::CTPOP, MVT::v64i8, { 12, 11, 16, 16 } },
3579 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3580 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3581 { ISD::CTTZ, MVT::v32i16, { 7, 17, 27, 27 } },
3582 { ISD::CTTZ, MVT::v64i8, { 6, 13, 21, 21 } },
3583 { ISD::ROTL, MVT::v8i64, { 1, 1, 1, 1 } },
3584 { ISD::ROTL, MVT::v4i64, { 1, 1, 1, 1 } },
3585 { ISD::ROTL, MVT::v2i64, { 1, 1, 1, 1 } },
3586 { ISD::ROTL, MVT::v16i32, { 1, 1, 1, 1 } },
3587 { ISD::ROTL, MVT::v8i32, { 1, 1, 1, 1 } },
3588 { ISD::ROTL, MVT::v4i32, { 1, 1, 1, 1 } },
3589 { ISD::ROTR, MVT::v8i64, { 1, 1, 1, 1 } },
3590 { ISD::ROTR, MVT::v4i64, { 1, 1, 1, 1 } },
3591 { ISD::ROTR, MVT::v2i64, { 1, 1, 1, 1 } },
3592 { ISD::ROTR, MVT::v16i32, { 1, 1, 1, 1 } },
3593 { ISD::ROTR, MVT::v8i32, { 1, 1, 1, 1 } },
3594 { ISD::ROTR, MVT::v4i32, { 1, 1, 1, 1 } },
3595 { X86ISD::VROTLI, MVT::v8i64, { 1, 1, 1, 1 } },
3596 { X86ISD::VROTLI, MVT::v4i64, { 1, 1, 1, 1 } },
3597 { X86ISD::VROTLI, MVT::v2i64, { 1, 1, 1, 1 } },
3598 { X86ISD::VROTLI, MVT::v16i32, { 1, 1, 1, 1 } },
3599 { X86ISD::VROTLI, MVT::v8i32, { 1, 1, 1, 1 } },
3600 { X86ISD::VROTLI, MVT::v4i32, { 1, 1, 1, 1 } },
3601 { ISD::SMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3602 { ISD::SMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3603 { ISD::SMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3604 { ISD::SMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3605 { ISD::SMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3606 { ISD::SMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3607 { ISD::SMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3608 { ISD::SMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3609 { ISD::SMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3610 { ISD::SMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3611 { ISD::SMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3612 { ISD::SMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3613 { ISD::UMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3614 { ISD::UMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3615 { ISD::UMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3616 { ISD::UMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3617 { ISD::UMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3618 { ISD::UMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3619 { ISD::UMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3620 { ISD::UMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3621 { ISD::UMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3622 { ISD::UMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3623 { ISD::UMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3624 { ISD::UMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3625 { ISD::USUBSAT, MVT::v16i32, { 2 } }, // pmaxud + psubd
3626 { ISD::USUBSAT, MVT::v2i64, { 2 } }, // pmaxuq + psubq
3627 { ISD::USUBSAT, MVT::v4i64, { 2 } }, // pmaxuq + psubq
3628 { ISD::USUBSAT, MVT::v8i64, { 2 } }, // pmaxuq + psubq
3629 { ISD::UADDSAT, MVT::v16i32, { 3 } }, // not + pminud + paddd
3630 { ISD::UADDSAT, MVT::v2i64, { 3 } }, // not + pminuq + paddq
3631 { ISD::UADDSAT, MVT::v4i64, { 3 } }, // not + pminuq + paddq
3632 { ISD::UADDSAT, MVT::v8i64, { 3 } }, // not + pminuq + paddq
3633 { ISD::SADDSAT, MVT::v32i16, { 2 } },
3634 { ISD::SADDSAT, MVT::v64i8, { 2 } },
3635 { ISD::SSUBSAT, MVT::v32i16, { 2 } },
3636 { ISD::SSUBSAT, MVT::v64i8, { 2 } },
3637 { ISD::UADDSAT, MVT::v32i16, { 2 } },
3638 { ISD::UADDSAT, MVT::v64i8, { 2 } },
3639 { ISD::USUBSAT, MVT::v32i16, { 2 } },
3640 { ISD::USUBSAT, MVT::v64i8, { 2 } },
3641 { ISD::FMAXNUM, MVT::f32, { 2, 2, 3, 3 } },
3642 { ISD::FMAXNUM, MVT::v4f32, { 1, 1, 3, 3 } },
3643 { ISD::FMAXNUM, MVT::v8f32, { 2, 2, 3, 3 } },
3644 { ISD::FMAXNUM, MVT::v16f32, { 4, 4, 3, 3 } },
3645 { ISD::FMAXNUM, MVT::f64, { 2, 2, 3, 3 } },
3646 { ISD::FMAXNUM, MVT::v2f64, { 1, 1, 3, 3 } },
3647 { ISD::FMAXNUM, MVT::v4f64, { 2, 2, 3, 3 } },
3648 { ISD::FMAXNUM, MVT::v8f64, { 3, 3, 3, 3 } },
3649 { ISD::FSQRT, MVT::f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3650 { ISD::FSQRT, MVT::v4f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3651 { ISD::FSQRT, MVT::v8f32, { 6, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3652 { ISD::FSQRT, MVT::v16f32, { 12, 20, 1, 3 } }, // Skylake from http://www.agner.org/
3653 { ISD::FSQRT, MVT::f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3654 { ISD::FSQRT, MVT::v2f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3655 { ISD::FSQRT, MVT::v4f64, { 12, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3656 { ISD::FSQRT, MVT::v8f64, { 24, 32, 1, 3 } }, // Skylake from http://www.agner.org/
3657 };
3658 static const CostKindTblEntry XOPCostTbl[] = {
3659 { ISD::BITREVERSE, MVT::v4i64, { 3, 6, 5, 6 } },
3660 { ISD::BITREVERSE, MVT::v8i32, { 3, 6, 5, 6 } },
3661 { ISD::BITREVERSE, MVT::v16i16, { 3, 6, 5, 6 } },
3662 { ISD::BITREVERSE, MVT::v32i8, { 3, 6, 5, 6 } },
3663 { ISD::BITREVERSE, MVT::v2i64, { 2, 7, 1, 1 } },
3664 { ISD::BITREVERSE, MVT::v4i32, { 2, 7, 1, 1 } },
3665 { ISD::BITREVERSE, MVT::v8i16, { 2, 7, 1, 1 } },
3666 { ISD::BITREVERSE, MVT::v16i8, { 2, 7, 1, 1 } },
3667 { ISD::BITREVERSE, MVT::i64, { 2, 2, 3, 4 } },
3668 { ISD::BITREVERSE, MVT::i32, { 2, 2, 3, 4 } },
3669 { ISD::BITREVERSE, MVT::i16, { 2, 2, 3, 4 } },
3670 { ISD::BITREVERSE, MVT::i8, { 2, 2, 3, 4 } },
3671 // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
3672 { ISD::ROTL, MVT::v4i64, { 4, 7, 5, 6 } },
3673 { ISD::ROTL, MVT::v8i32, { 4, 7, 5, 6 } },
3674 { ISD::ROTL, MVT::v16i16, { 4, 7, 5, 6 } },
3675 { ISD::ROTL, MVT::v32i8, { 4, 7, 5, 6 } },
3676 { ISD::ROTL, MVT::v2i64, { 1, 3, 1, 1 } },
3677 { ISD::ROTL, MVT::v4i32, { 1, 3, 1, 1 } },
3678 { ISD::ROTL, MVT::v8i16, { 1, 3, 1, 1 } },
3679 { ISD::ROTL, MVT::v16i8, { 1, 3, 1, 1 } },
3680 { ISD::ROTR, MVT::v4i64, { 4, 7, 8, 9 } },
3681 { ISD::ROTR, MVT::v8i32, { 4, 7, 8, 9 } },
3682 { ISD::ROTR, MVT::v16i16, { 4, 7, 8, 9 } },
3683 { ISD::ROTR, MVT::v32i8, { 4, 7, 8, 9 } },
3684 { ISD::ROTR, MVT::v2i64, { 1, 3, 3, 3 } },
3685 { ISD::ROTR, MVT::v4i32, { 1, 3, 3, 3 } },
3686 { ISD::ROTR, MVT::v8i16, { 1, 3, 3, 3 } },
3687 { ISD::ROTR, MVT::v16i8, { 1, 3, 3, 3 } },
3688 { X86ISD::VROTLI, MVT::v4i64, { 4, 7, 5, 6 } },
3689 { X86ISD::VROTLI, MVT::v8i32, { 4, 7, 5, 6 } },
3690 { X86ISD::VROTLI, MVT::v16i16, { 4, 7, 5, 6 } },
3691 { X86ISD::VROTLI, MVT::v32i8, { 4, 7, 5, 6 } },
3692 { X86ISD::VROTLI, MVT::v2i64, { 1, 3, 1, 1 } },
3693 { X86ISD::VROTLI, MVT::v4i32, { 1, 3, 1, 1 } },
3694 { X86ISD::VROTLI, MVT::v8i16, { 1, 3, 1, 1 } },
3695 { X86ISD::VROTLI, MVT::v16i8, { 1, 3, 1, 1 } },
3696 };
3697 static const CostKindTblEntry AVX2CostTbl[] = {
3698 { ISD::ABS, MVT::v2i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3699 { ISD::ABS, MVT::v4i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3700 { ISD::ABS, MVT::v4i32, { 1, 1, 1, 1 } },
3701 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 2 } },
3702 { ISD::ABS, MVT::v8i16, { 1, 1, 1, 1 } },
3703 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 2 } },
3704 { ISD::ABS, MVT::v16i8, { 1, 1, 1, 1 } },
3705 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 2 } },
3706 { ISD::BITREVERSE, MVT::v2i64, { 3, 11, 10, 11 } },
3707 { ISD::BITREVERSE, MVT::v4i64, { 5, 11, 10, 17 } },
3708 { ISD::BITREVERSE, MVT::v4i32, { 3, 11, 10, 11 } },
3709 { ISD::BITREVERSE, MVT::v8i32, { 5, 11, 10, 17 } },
3710 { ISD::BITREVERSE, MVT::v8i16, { 3, 11, 10, 11 } },
3711 { ISD::BITREVERSE, MVT::v16i16, { 5, 11, 10, 17 } },
3712 { ISD::BITREVERSE, MVT::v16i8, { 3, 6, 9, 9 } },
3713 { ISD::BITREVERSE, MVT::v32i8, { 4, 5, 9, 15 } },
3714 { ISD::BSWAP, MVT::v2i64, { 1, 2, 1, 2 } },
3715 { ISD::BSWAP, MVT::v4i64, { 1, 3, 1, 2 } },
3716 { ISD::BSWAP, MVT::v4i32, { 1, 2, 1, 2 } },
3717 { ISD::BSWAP, MVT::v8i32, { 1, 3, 1, 2 } },
3718 { ISD::BSWAP, MVT::v8i16, { 1, 2, 1, 2 } },
3719 { ISD::BSWAP, MVT::v16i16, { 1, 3, 1, 2 } },
3720 { ISD::CTLZ, MVT::v2i64, { 7, 18, 24, 25 } },
3721 { ISD::CTLZ, MVT::v4i64, { 14, 18, 24, 44 } },
3722 { ISD::CTLZ, MVT::v4i32, { 5, 16, 19, 20 } },
3723 { ISD::CTLZ, MVT::v8i32, { 10, 16, 19, 34 } },
3724 { ISD::CTLZ, MVT::v8i16, { 4, 13, 14, 15 } },
3725 { ISD::CTLZ, MVT::v16i16, { 6, 14, 14, 24 } },
3726 { ISD::CTLZ, MVT::v16i8, { 3, 12, 9, 10 } },
3727 { ISD::CTLZ, MVT::v32i8, { 4, 12, 9, 14 } },
3728 { ISD::CTPOP, MVT::v2i64, { 3, 9, 10, 10 } },
3729 { ISD::CTPOP, MVT::v4i64, { 4, 9, 10, 14 } },
3730 { ISD::CTPOP, MVT::v4i32, { 7, 12, 14, 14 } },
3731 { ISD::CTPOP, MVT::v8i32, { 7, 12, 14, 18 } },
3732 { ISD::CTPOP, MVT::v8i16, { 3, 7, 11, 11 } },
3733 { ISD::CTPOP, MVT::v16i16, { 6, 8, 11, 18 } },
3734 { ISD::CTPOP, MVT::v16i8, { 2, 5, 8, 8 } },
3735 { ISD::CTPOP, MVT::v32i8, { 3, 5, 8, 12 } },
3736 { ISD::CTTZ, MVT::v2i64, { 4, 11, 13, 13 } },
3737 { ISD::CTTZ, MVT::v4i64, { 5, 11, 13, 20 } },
3738 { ISD::CTTZ, MVT::v4i32, { 7, 14, 17, 17 } },
3739 { ISD::CTTZ, MVT::v8i32, { 7, 15, 17, 24 } },
3740 { ISD::CTTZ, MVT::v8i16, { 4, 9, 14, 14 } },
3741 { ISD::CTTZ, MVT::v16i16, { 6, 9, 14, 24 } },
3742 { ISD::CTTZ, MVT::v16i8, { 3, 7, 11, 11 } },
3743 { ISD::CTTZ, MVT::v32i8, { 5, 7, 11, 18 } },
3744 { ISD::SADDSAT, MVT::v16i16, { 1 } },
3745 { ISD::SADDSAT, MVT::v32i8, { 1 } },
3746 { ISD::SMAX, MVT::v2i64, { 2, 7, 2, 3 } },
3747 { ISD::SMAX, MVT::v4i64, { 2, 7, 2, 3 } },
3748 { ISD::SMAX, MVT::v8i32, { 1, 1, 1, 2 } },
3749 { ISD::SMAX, MVT::v16i16, { 1, 1, 1, 2 } },
3750 { ISD::SMAX, MVT::v32i8, { 1, 1, 1, 2 } },
3751 { ISD::SMIN, MVT::v2i64, { 2, 7, 2, 3 } },
3752 { ISD::SMIN, MVT::v4i64, { 2, 7, 2, 3 } },
3753 { ISD::SMIN, MVT::v8i32, { 1, 1, 1, 2 } },
3754 { ISD::SMIN, MVT::v16i16, { 1, 1, 1, 2 } },
3755 { ISD::SMIN, MVT::v32i8, { 1, 1, 1, 2 } },
3756 { ISD::SSUBSAT, MVT::v16i16, { 1 } },
3757 { ISD::SSUBSAT, MVT::v32i8, { 1 } },
3758 { ISD::UADDSAT, MVT::v16i16, { 1 } },
3759 { ISD::UADDSAT, MVT::v32i8, { 1 } },
3760 { ISD::UADDSAT, MVT::v8i32, { 3 } }, // not + pminud + paddd
3761 { ISD::UMAX, MVT::v2i64, { 2, 8, 5, 6 } },
3762 { ISD::UMAX, MVT::v4i64, { 2, 8, 5, 8 } },
3763 { ISD::UMAX, MVT::v8i32, { 1, 1, 1, 2 } },
3764 { ISD::UMAX, MVT::v16i16, { 1, 1, 1, 2 } },
3765 { ISD::UMAX, MVT::v32i8, { 1, 1, 1, 2 } },
3766 { ISD::UMIN, MVT::v2i64, { 2, 8, 5, 6 } },
3767 { ISD::UMIN, MVT::v4i64, { 2, 8, 5, 8 } },
3768 { ISD::UMIN, MVT::v8i32, { 1, 1, 1, 2 } },
3769 { ISD::UMIN, MVT::v16i16, { 1, 1, 1, 2 } },
3770 { ISD::UMIN, MVT::v32i8, { 1, 1, 1, 2 } },
3771 { ISD::USUBSAT, MVT::v16i16, { 1 } },
3772 { ISD::USUBSAT, MVT::v32i8, { 1 } },
3773 { ISD::USUBSAT, MVT::v8i32, { 2 } }, // pmaxud + psubd
3774 { ISD::FMAXNUM, MVT::f32, { 2, 7, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
3775 { ISD::FMAXNUM, MVT::v4f32, { 2, 7, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3776 { ISD::FMAXNUM, MVT::v8f32, { 3, 7, 3, 6 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3777 { ISD::FMAXNUM, MVT::f64, { 2, 7, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
3778 { ISD::FMAXNUM, MVT::v2f64, { 2, 7, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3779 { ISD::FMAXNUM, MVT::v4f64, { 3, 7, 3, 6 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3780 { ISD::FSQRT, MVT::f32, { 7, 15, 1, 1 } }, // vsqrtss
3781 { ISD::FSQRT, MVT::v4f32, { 7, 15, 1, 1 } }, // vsqrtps
3782 { ISD::FSQRT, MVT::v8f32, { 14, 21, 1, 3 } }, // vsqrtps
3783 { ISD::FSQRT, MVT::f64, { 14, 21, 1, 1 } }, // vsqrtsd
3784 { ISD::FSQRT, MVT::v2f64, { 14, 21, 1, 1 } }, // vsqrtpd
3785 { ISD::FSQRT, MVT::v4f64, { 28, 35, 1, 3 } }, // vsqrtpd
3786 };
3787 static const CostKindTblEntry AVX1CostTbl[] = {
3788 { ISD::ABS, MVT::v4i64, { 6, 8, 6, 12 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3789 { ISD::ABS, MVT::v8i32, { 3, 6, 4, 5 } },
3790 { ISD::ABS, MVT::v16i16, { 3, 6, 4, 5 } },
3791 { ISD::ABS, MVT::v32i8, { 3, 6, 4, 5 } },
3792 { ISD::BITREVERSE, MVT::v4i64, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
3793 { ISD::BITREVERSE, MVT::v2i64, { 8, 13, 10, 16 } },
3794 { ISD::BITREVERSE, MVT::v8i32, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
3795 { ISD::BITREVERSE, MVT::v4i32, { 8, 13, 10, 16 } },
3796 { ISD::BITREVERSE, MVT::v16i16, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
3797 { ISD::BITREVERSE, MVT::v8i16, { 8, 13, 10, 16 } },
3798 { ISD::BITREVERSE, MVT::v32i8, { 13, 15, 17, 26 } }, // 2 x 128-bit Op + extract/insert
3799 { ISD::BITREVERSE, MVT::v16i8, { 7, 7, 9, 13 } },
3800 { ISD::BSWAP, MVT::v4i64, { 5, 6, 5, 10 } },
3801 { ISD::BSWAP, MVT::v2i64, { 2, 2, 1, 3 } },
3802 { ISD::BSWAP, MVT::v8i32, { 5, 6, 5, 10 } },
3803 { ISD::BSWAP, MVT::v4i32, { 2, 2, 1, 3 } },
3804 { ISD::BSWAP, MVT::v16i16, { 5, 6, 5, 10 } },
3805 { ISD::BSWAP, MVT::v8i16, { 2, 2, 1, 3 } },
3806 { ISD::CTLZ, MVT::v4i64, { 29, 33, 49, 58 } }, // 2 x 128-bit Op + extract/insert
3807 { ISD::CTLZ, MVT::v2i64, { 14, 24, 24, 28 } },
3808 { ISD::CTLZ, MVT::v8i32, { 24, 28, 39, 48 } }, // 2 x 128-bit Op + extract/insert
3809 { ISD::CTLZ, MVT::v4i32, { 12, 20, 19, 23 } },
3810 { ISD::CTLZ, MVT::v16i16, { 19, 22, 29, 38 } }, // 2 x 128-bit Op + extract/insert
3811 { ISD::CTLZ, MVT::v8i16, { 9, 16, 14, 18 } },
3812 { ISD::CTLZ, MVT::v32i8, { 14, 15, 19, 28 } }, // 2 x 128-bit Op + extract/insert
3813 { ISD::CTLZ, MVT::v16i8, { 7, 12, 9, 13 } },
3814 { ISD::CTPOP, MVT::v4i64, { 14, 18, 19, 28 } }, // 2 x 128-bit Op + extract/insert
3815 { ISD::CTPOP, MVT::v2i64, { 7, 14, 10, 14 } },
3816 { ISD::CTPOP, MVT::v8i32, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
3817 { ISD::CTPOP, MVT::v4i32, { 9, 20, 14, 18 } },
3818 { ISD::CTPOP, MVT::v16i16, { 16, 21, 22, 31 } }, // 2 x 128-bit Op + extract/insert
3819 { ISD::CTPOP, MVT::v8i16, { 8, 18, 11, 15 } },
3820 { ISD::CTPOP, MVT::v32i8, { 13, 15, 16, 25 } }, // 2 x 128-bit Op + extract/insert
3821 { ISD::CTPOP, MVT::v16i8, { 6, 12, 8, 12 } },
3822 { ISD::CTTZ, MVT::v4i64, { 17, 22, 24, 33 } }, // 2 x 128-bit Op + extract/insert
3823 { ISD::CTTZ, MVT::v2i64, { 9, 19, 13, 17 } },
3824 { ISD::CTTZ, MVT::v8i32, { 21, 27, 32, 41 } }, // 2 x 128-bit Op + extract/insert
3825 { ISD::CTTZ, MVT::v4i32, { 11, 24, 17, 21 } },
3826 { ISD::CTTZ, MVT::v16i16, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
3827 { ISD::CTTZ, MVT::v8i16, { 9, 21, 14, 18 } },
3828 { ISD::CTTZ, MVT::v32i8, { 15, 18, 21, 30 } }, // 2 x 128-bit Op + extract/insert
3829 { ISD::CTTZ, MVT::v16i8, { 8, 16, 11, 15 } },
3830 { ISD::SADDSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
3831 { ISD::SADDSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
3832 { ISD::SMAX, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
3833 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 4 } },
3834 { ISD::SMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3835 { ISD::SMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3836 { ISD::SMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3837 { ISD::SMIN, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
3838 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
3839 { ISD::SMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3840 { ISD::SMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3841 { ISD::SMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3842 { ISD::SSUBSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
3843 { ISD::SSUBSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
3844 { ISD::UADDSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
3845 { ISD::UADDSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
3846 { ISD::UADDSAT, MVT::v8i32, { 8 } }, // 2 x 128-bit Op + extract/insert
3847 { ISD::UMAX, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
3848 { ISD::UMAX, MVT::v2i64, { 4, 8, 5, 7 } },
3849 { ISD::UMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3850 { ISD::UMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3851 { ISD::UMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3852 { ISD::UMIN, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
3853 { ISD::UMIN, MVT::v2i64, { 4, 8, 5, 7 } },
3854 { ISD::UMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3855 { ISD::UMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3856 { ISD::UMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3857 { ISD::USUBSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
3858 { ISD::USUBSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
3859 { ISD::USUBSAT, MVT::v8i32, { 6 } }, // 2 x 128-bit Op + extract/insert
3860 { ISD::FMAXNUM, MVT::f32, { 3, 6, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
3861 { ISD::FMAXNUM, MVT::v4f32, { 3, 6, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3862 { ISD::FMAXNUM, MVT::v8f32, { 5, 7, 3, 10 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3863 { ISD::FMAXNUM, MVT::f64, { 3, 6, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
3864 { ISD::FMAXNUM, MVT::v2f64, { 3, 6, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3865 { ISD::FMAXNUM, MVT::v4f64, { 5, 7, 3, 10 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3866 { ISD::FSQRT, MVT::f32, { 21, 21, 1, 1 } }, // vsqrtss
3867 { ISD::FSQRT, MVT::v4f32, { 21, 21, 1, 1 } }, // vsqrtps
3868 { ISD::FSQRT, MVT::v8f32, { 42, 42, 1, 3 } }, // vsqrtps
3869 { ISD::FSQRT, MVT::f64, { 27, 27, 1, 1 } }, // vsqrtsd
3870 { ISD::FSQRT, MVT::v2f64, { 27, 27, 1, 1 } }, // vsqrtpd
3871 { ISD::FSQRT, MVT::v4f64, { 54, 54, 1, 3 } }, // vsqrtpd
3872 };
3873 static const CostKindTblEntry GFNICostTbl[] = {
3874 { ISD::BITREVERSE, MVT::i8, { 3, 3, 3, 4 } }, // gf2p8affineqb
3875 { ISD::BITREVERSE, MVT::i16, { 3, 3, 4, 6 } }, // gf2p8affineqb
3876 { ISD::BITREVERSE, MVT::i32, { 3, 3, 4, 5 } }, // gf2p8affineqb
3877 { ISD::BITREVERSE, MVT::i64, { 3, 3, 4, 6 } }, // gf2p8affineqb
3878 { ISD::BITREVERSE, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
3879 { ISD::BITREVERSE, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
3880 { ISD::BITREVERSE, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
3881 { ISD::BITREVERSE, MVT::v8i16, { 1, 8, 2, 4 } }, // gf2p8affineqb
3882 { ISD::BITREVERSE, MVT::v16i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
3883 { ISD::BITREVERSE, MVT::v32i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
3884 { ISD::BITREVERSE, MVT::v4i32, { 1, 8, 2, 4 } }, // gf2p8affineqb
3885 { ISD::BITREVERSE, MVT::v8i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
3886 { ISD::BITREVERSE, MVT::v16i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
3887 { ISD::BITREVERSE, MVT::v2i64, { 1, 8, 2, 4 } }, // gf2p8affineqb
3888 { ISD::BITREVERSE, MVT::v4i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
3889 { ISD::BITREVERSE, MVT::v8i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
3890 { X86ISD::VROTLI, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
3891 { X86ISD::VROTLI, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
3892 { X86ISD::VROTLI, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
3893 };
3894 static const CostKindTblEntry GLMCostTbl[] = {
3895 { ISD::FSQRT, MVT::f32, { 19, 20, 1, 1 } }, // sqrtss
3896 { ISD::FSQRT, MVT::v4f32, { 37, 41, 1, 5 } }, // sqrtps
3897 { ISD::FSQRT, MVT::f64, { 34, 35, 1, 1 } }, // sqrtsd
3898 { ISD::FSQRT, MVT::v2f64, { 67, 71, 1, 5 } }, // sqrtpd
3899 };
3900 static const CostKindTblEntry SLMCostTbl[] = {
3901 { ISD::BSWAP, MVT::v2i64, { 5, 5, 1, 5 } },
3902 { ISD::BSWAP, MVT::v4i32, { 5, 5, 1, 5 } },
3903 { ISD::BSWAP, MVT::v8i16, { 5, 5, 1, 5 } },
3904 { ISD::FSQRT, MVT::f32, { 20, 20, 1, 1 } }, // sqrtss
3905 { ISD::FSQRT, MVT::v4f32, { 40, 41, 1, 5 } }, // sqrtps
3906 { ISD::FSQRT, MVT::f64, { 35, 35, 1, 1 } }, // sqrtsd
3907 { ISD::FSQRT, MVT::v2f64, { 70, 71, 1, 5 } }, // sqrtpd
3908 };
3909 static const CostKindTblEntry SSE42CostTbl[] = {
3910 { ISD::USUBSAT, MVT::v4i32, { 2 } }, // pmaxud + psubd
3911 { ISD::UADDSAT, MVT::v4i32, { 3 } }, // not + pminud + paddd
3912 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } }, // MAXSS + CMPUNORDSS + BLENDVPS
3913 { ISD::FMAXNUM, MVT::v4f32, { 4, 4, 4, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3914 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } }, // MAXSD + CMPUNORDSD + BLENDVPD
3915 { ISD::FMAXNUM, MVT::v2f64, { 4, 4, 4, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3916 { ISD::FSQRT, MVT::f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
3917 { ISD::FSQRT, MVT::v4f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
3918 };
3919 static const CostKindTblEntry SSE41CostTbl[] = {
3920 { ISD::ABS, MVT::v2i64, { 3, 4, 3, 5 } }, // BLENDVPD(X,PSUBQ(0,X),X)
3921 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 3 } },
3922 { ISD::SMAX, MVT::v4i32, { 1, 1, 1, 1 } },
3923 { ISD::SMAX, MVT::v16i8, { 1, 1, 1, 1 } },
3924 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
3925 { ISD::SMIN, MVT::v4i32, { 1, 1, 1, 1 } },
3926 { ISD::SMIN, MVT::v16i8, { 1, 1, 1, 1 } },
3927 { ISD::UMAX, MVT::v2i64, { 2, 11, 6, 7 } },
3928 { ISD::UMAX, MVT::v4i32, { 1, 1, 1, 1 } },
3929 { ISD::UMAX, MVT::v8i16, { 1, 1, 1, 1 } },
3930 { ISD::UMIN, MVT::v2i64, { 2, 11, 6, 7 } },
3931 { ISD::UMIN, MVT::v4i32, { 1, 1, 1, 1 } },
3932 { ISD::UMIN, MVT::v8i16, { 1, 1, 1, 1 } },
3933 };
3934 static const CostKindTblEntry SSSE3CostTbl[] = {
3935 { ISD::ABS, MVT::v4i32, { 1, 2, 1, 1 } },
3936 { ISD::ABS, MVT::v8i16, { 1, 2, 1, 1 } },
3937 { ISD::ABS, MVT::v16i8, { 1, 2, 1, 1 } },
3938 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 11, 21 } },
3939 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 11, 21 } },
3940 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 11, 21 } },
3941 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 10, 16 } },
3942 { ISD::BSWAP, MVT::v2i64, { 2, 3, 1, 5 } },
3943 { ISD::BSWAP, MVT::v4i32, { 2, 3, 1, 5 } },
3944 { ISD::BSWAP, MVT::v8i16, { 2, 3, 1, 5 } },
3945 { ISD::CTLZ, MVT::v2i64, { 18, 28, 28, 35 } },
3946 { ISD::CTLZ, MVT::v4i32, { 15, 20, 22, 28 } },
3947 { ISD::CTLZ, MVT::v8i16, { 13, 17, 16, 22 } },
3948 { ISD::CTLZ, MVT::v16i8, { 11, 15, 10, 16 } },
3949 { ISD::CTPOP, MVT::v2i64, { 13, 19, 12, 18 } },
3950 { ISD::CTPOP, MVT::v4i32, { 18, 24, 16, 22 } },
3951 { ISD::CTPOP, MVT::v8i16, { 13, 18, 14, 20 } },
3952 { ISD::CTPOP, MVT::v16i8, { 11, 12, 10, 16 } },
3953 { ISD::CTTZ, MVT::v2i64, { 13, 25, 15, 22 } },
3954 { ISD::CTTZ, MVT::v4i32, { 18, 26, 19, 25 } },
3955 { ISD::CTTZ, MVT::v8i16, { 13, 20, 17, 23 } },
3956 { ISD::CTTZ, MVT::v16i8, { 11, 16, 13, 19 } }
3957 };
3958 static const CostKindTblEntry SSE2CostTbl[] = {
3959 { ISD::ABS, MVT::v2i64, { 3, 6, 5, 5 } },
3960 { ISD::ABS, MVT::v4i32, { 1, 4, 4, 4 } },
3961 { ISD::ABS, MVT::v8i16, { 1, 2, 3, 3 } },
3962 { ISD::ABS, MVT::v16i8, { 1, 2, 3, 3 } },
3963 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 32, 32 } },
3964 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 30, 30 } },
3965 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 25, 25 } },
3966 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 21, 21 } },
3967 { ISD::BSWAP, MVT::v2i64, { 5, 6, 11, 11 } },
3968 { ISD::BSWAP, MVT::v4i32, { 5, 5, 9, 9 } },
3969 { ISD::BSWAP, MVT::v8i16, { 5, 5, 4, 5 } },
3970 { ISD::CTLZ, MVT::v2i64, { 10, 45, 36, 38 } },
3971 { ISD::CTLZ, MVT::v4i32, { 10, 45, 38, 40 } },
3972 { ISD::CTLZ, MVT::v8i16, { 9, 38, 32, 34 } },
3973 { ISD::CTLZ, MVT::v16i8, { 8, 39, 29, 32 } },
3974 { ISD::CTPOP, MVT::v2i64, { 12, 26, 16, 18 } },
3975 { ISD::CTPOP, MVT::v4i32, { 15, 29, 21, 23 } },
3976 { ISD::CTPOP, MVT::v8i16, { 13, 25, 18, 20 } },
3977 { ISD::CTPOP, MVT::v16i8, { 10, 21, 14, 16 } },
3978 { ISD::CTTZ, MVT::v2i64, { 14, 28, 19, 21 } },
3979 { ISD::CTTZ, MVT::v4i32, { 18, 31, 24, 26 } },
3980 { ISD::CTTZ, MVT::v8i16, { 16, 27, 21, 23 } },
3981 { ISD::CTTZ, MVT::v16i8, { 13, 23, 17, 19 } },
3982 { ISD::SADDSAT, MVT::v8i16, { 1 } },
3983 { ISD::SADDSAT, MVT::v16i8, { 1 } },
3984 { ISD::SMAX, MVT::v2i64, { 4, 8, 15, 15 } },
3985 { ISD::SMAX, MVT::v4i32, { 2, 4, 5, 5 } },
3986 { ISD::SMAX, MVT::v8i16, { 1, 1, 1, 1 } },
3987 { ISD::SMAX, MVT::v16i8, { 2, 4, 5, 5 } },
3988 { ISD::SMIN, MVT::v2i64, { 4, 8, 15, 15 } },
3989 { ISD::SMIN, MVT::v4i32, { 2, 4, 5, 5 } },
3990 { ISD::SMIN, MVT::v8i16, { 1, 1, 1, 1 } },
3991 { ISD::SMIN, MVT::v16i8, { 2, 4, 5, 5 } },
3992 { ISD::SSUBSAT, MVT::v8i16, { 1 } },
3993 { ISD::SSUBSAT, MVT::v16i8, { 1 } },
3994 { ISD::UADDSAT, MVT::v8i16, { 1 } },
3995 { ISD::UADDSAT, MVT::v16i8, { 1 } },
3996 { ISD::UMAX, MVT::v2i64, { 4, 8, 15, 15 } },
3997 { ISD::UMAX, MVT::v4i32, { 2, 5, 8, 8 } },
3998 { ISD::UMAX, MVT::v8i16, { 1, 3, 3, 3 } },
3999 { ISD::UMAX, MVT::v16i8, { 1, 1, 1, 1 } },
4000 { ISD::UMIN, MVT::v2i64, { 4, 8, 15, 15 } },
4001 { ISD::UMIN, MVT::v4i32, { 2, 5, 8, 8 } },
4002 { ISD::UMIN, MVT::v8i16, { 1, 3, 3, 3 } },
4003 { ISD::UMIN, MVT::v16i8, { 1, 1, 1, 1 } },
4004 { ISD::USUBSAT, MVT::v8i16, { 1 } },
4005 { ISD::USUBSAT, MVT::v16i8, { 1 } },
4006 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } },
4007 { ISD::FMAXNUM, MVT::v2f64, { 4, 6, 6, 6 } },
4008 { ISD::FSQRT, MVT::f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
4009 { ISD::FSQRT, MVT::v2f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
4010 };
4011 static const CostKindTblEntry SSE1CostTbl[] = {
4012 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } },
4013 { ISD::FMAXNUM, MVT::v4f32, { 4, 6, 6, 6 } },
4014 { ISD::FSQRT, MVT::f32, { 28, 30, 1, 2 } }, // Pentium III from http://www.agner.org/
4015 { ISD::FSQRT, MVT::v4f32, { 56, 56, 1, 2 } }, // Pentium III from http://www.agner.org/
4016 };
4017 static const CostKindTblEntry BMI64CostTbl[] = { // 64-bit targets
4018 { ISD::CTTZ, MVT::i64, { 1 } },
4019 };
4020 static const CostKindTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
4021 { ISD::CTTZ, MVT::i32, { 1 } },
4022 { ISD::CTTZ, MVT::i16, { 1 } },
4023 { ISD::CTTZ, MVT::i8, { 1 } },
4024 };
4025 static const CostKindTblEntry LZCNT64CostTbl[] = { // 64-bit targets
4026 { ISD::CTLZ, MVT::i64, { 1 } },
4027 };
4028 static const CostKindTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
4029 { ISD::CTLZ, MVT::i32, { 1 } },
4030 { ISD::CTLZ, MVT::i16, { 2 } },
4031 { ISD::CTLZ, MVT::i8, { 2 } },
4032 };
4033 static const CostKindTblEntry POPCNT64CostTbl[] = { // 64-bit targets
4034 { ISD::CTPOP, MVT::i64, { 1, 1, 1, 1 } }, // popcnt
4035 };
4036 static const CostKindTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
4037 { ISD::CTPOP, MVT::i32, { 1, 1, 1, 1 } }, // popcnt
4038 { ISD::CTPOP, MVT::i16, { 1, 1, 2, 2 } }, // popcnt(zext())
4039 { ISD::CTPOP, MVT::i8, { 1, 1, 2, 2 } }, // popcnt(zext())
4040 };
4041 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
4042 { ISD::ABS, MVT::i64, { 1, 2, 3, 4 } }, // SUB+CMOV
4043 { ISD::BITREVERSE, MVT::i64, { 10, 12, 20, 22 } },
4044 { ISD::BSWAP, MVT::i64, { 1, 2, 1, 2 } },
4045 { ISD::CTLZ, MVT::i64, { 4 } }, // BSR+XOR or BSR+XOR+CMOV
4046 { ISD::CTLZ_ZERO_UNDEF, MVT::i64,{ 1, 1, 1, 1 } }, // BSR+XOR
4047 { ISD::CTTZ, MVT::i64, { 3 } }, // TEST+BSF+CMOV/BRANCH
4048 { ISD::CTTZ_ZERO_UNDEF, MVT::i64,{ 1, 1, 1, 1 } }, // BSF
4049 { ISD::CTPOP, MVT::i64, { 10, 6, 19, 19 } },
4050 { ISD::ROTL, MVT::i64, { 2, 3, 1, 3 } },
4051 { ISD::ROTR, MVT::i64, { 2, 3, 1, 3 } },
4052 { X86ISD::VROTLI, MVT::i64, { 1, 1, 1, 1 } },
4053 { ISD::FSHL, MVT::i64, { 4, 4, 1, 4 } },
4054 { ISD::SMAX, MVT::i64, { 1, 3, 2, 3 } },
4055 { ISD::SMIN, MVT::i64, { 1, 3, 2, 3 } },
4056 { ISD::UMAX, MVT::i64, { 1, 3, 2, 3 } },
4057 { ISD::UMIN, MVT::i64, { 1, 3, 2, 3 } },
4058 { ISD::SADDO, MVT::i64, { 1 } },
4059 { ISD::UADDO, MVT::i64, { 1 } },
4060 { ISD::UMULO, MVT::i64, { 2 } }, // mulq + seto
4061 };
4062 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
4063 { ISD::ABS, MVT::i32, { 1, 2, 3, 4 } }, // SUB+XOR+SRA or SUB+CMOV
4064 { ISD::ABS, MVT::i16, { 2, 2, 3, 4 } }, // SUB+XOR+SRA or SUB+CMOV
4065 { ISD::ABS, MVT::i8, { 2, 4, 4, 4 } }, // SUB+XOR+SRA
4066 { ISD::BITREVERSE, MVT::i32, { 9, 12, 17, 19 } },
4067 { ISD::BITREVERSE, MVT::i16, { 9, 12, 17, 19 } },
4068 { ISD::BITREVERSE, MVT::i8, { 7, 9, 13, 14 } },
4069 { ISD::BSWAP, MVT::i32, { 1, 1, 1, 1 } },
4070 { ISD::BSWAP, MVT::i16, { 1, 2, 1, 2 } }, // ROL
4071 { ISD::CTLZ, MVT::i32, { 4 } }, // BSR+XOR or BSR+XOR+CMOV
4072 { ISD::CTLZ, MVT::i16, { 4 } }, // BSR+XOR or BSR+XOR+CMOV
4073 { ISD::CTLZ, MVT::i8, { 4 } }, // BSR+XOR or BSR+XOR+CMOV
4074 { ISD::CTLZ_ZERO_UNDEF, MVT::i32,{ 1, 1, 1, 1 } }, // BSR+XOR
4075 { ISD::CTLZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 3, 3 } }, // BSR+XOR
4076 { ISD::CTLZ_ZERO_UNDEF, MVT::i8, { 2, 2, 3, 3 } }, // BSR+XOR
4077 { ISD::CTTZ, MVT::i32, { 3 } }, // TEST+BSF+CMOV/BRANCH
4078 { ISD::CTTZ, MVT::i16, { 3 } }, // TEST+BSF+CMOV/BRANCH
4079 { ISD::CTTZ, MVT::i8, { 3 } }, // TEST+BSF+CMOV/BRANCH
4080 { ISD::CTTZ_ZERO_UNDEF, MVT::i32,{ 1, 1, 1, 1 } }, // BSF
4081 { ISD::CTTZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 1, 1 } }, // BSF
4082 { ISD::CTTZ_ZERO_UNDEF, MVT::i8, { 2, 2, 1, 1 } }, // BSF
4083 { ISD::CTPOP, MVT::i32, { 8, 7, 15, 15 } },
4084 { ISD::CTPOP, MVT::i16, { 9, 8, 17, 17 } },
4085 { ISD::CTPOP, MVT::i8, { 7, 6, 6, 6 } },
4086 { ISD::ROTL, MVT::i32, { 2, 3, 1, 3 } },
4087 { ISD::ROTL, MVT::i16, { 2, 3, 1, 3 } },
4088 { ISD::ROTL, MVT::i8, { 2, 3, 1, 3 } },
4089 { ISD::ROTR, MVT::i32, { 2, 3, 1, 3 } },
4090 { ISD::ROTR, MVT::i16, { 2, 3, 1, 3 } },
4091 { ISD::ROTR, MVT::i8, { 2, 3, 1, 3 } },
4092 { X86ISD::VROTLI, MVT::i32, { 1, 1, 1, 1 } },
4093 { X86ISD::VROTLI, MVT::i16, { 1, 1, 1, 1 } },
4094 { X86ISD::VROTLI, MVT::i8, { 1, 1, 1, 1 } },
4095 { ISD::FSHL, MVT::i32, { 4, 4, 1, 4 } },
4096 { ISD::FSHL, MVT::i16, { 4, 4, 2, 5 } },
4097 { ISD::FSHL, MVT::i8, { 4, 4, 2, 5 } },
4098 { ISD::SMAX, MVT::i32, { 1, 2, 2, 3 } },
4099 { ISD::SMAX, MVT::i16, { 1, 4, 2, 4 } },
4100 { ISD::SMAX, MVT::i8, { 1, 4, 2, 4 } },
4101 { ISD::SMIN, MVT::i32, { 1, 2, 2, 3 } },
4102 { ISD::SMIN, MVT::i16, { 1, 4, 2, 4 } },
4103 { ISD::SMIN, MVT::i8, { 1, 4, 2, 4 } },
4104 { ISD::UMAX, MVT::i32, { 1, 2, 2, 3 } },
4105 { ISD::UMAX, MVT::i16, { 1, 4, 2, 4 } },
4106 { ISD::UMAX, MVT::i8, { 1, 4, 2, 4 } },
4107 { ISD::UMIN, MVT::i32, { 1, 2, 2, 3 } },
4108 { ISD::UMIN, MVT::i16, { 1, 4, 2, 4 } },
4109 { ISD::UMIN, MVT::i8, { 1, 4, 2, 4 } },
4110 { ISD::SADDO, MVT::i32, { 1 } },
4111 { ISD::SADDO, MVT::i16, { 1 } },
4112 { ISD::SADDO, MVT::i8, { 1 } },
4113 { ISD::UADDO, MVT::i32, { 1 } },
4114 { ISD::UADDO, MVT::i16, { 1 } },
4115 { ISD::UADDO, MVT::i8, { 1 } },
4116 { ISD::UMULO, MVT::i32, { 2 } }, // mul + seto
4117 { ISD::UMULO, MVT::i16, { 2 } },
4118 { ISD::UMULO, MVT::i8, { 2 } },
4119 };
4120
4121 Type *RetTy = ICA.getReturnType();
4122 Type *OpTy = RetTy;
4123 Intrinsic::ID IID = ICA.getID();
4124 unsigned ISD = ISD::DELETED_NODE;
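 // Map the intrinsic ID to the ISD node used as the lookup key in the cost
 // tables above; intrinsics with matching costs deliberately share a node.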
4125 switch (IID) {
4126 default:
4127 break;
4128 case Intrinsic::abs:
4129 ISD = ISD::ABS;
4130 break;
4131 case Intrinsic::bitreverse:
4132 ISD = ISD::BITREVERSE;
4133 break;
4134 case Intrinsic::bswap:
4135 ISD = ISD::BSWAP;
4136 break;
4137 case Intrinsic::ctlz:
4138 ISD = ISD::CTLZ;
4139 break;
4140 case Intrinsic::ctpop:
4141 ISD = ISD::CTPOP;
4142 break;
4143 case Intrinsic::cttz:
4144 ISD = ISD::CTTZ;
4145 break;
4146 case Intrinsic::fshl:
4147 ISD = ISD::FSHL;
4148 if (!ICA.isTypeBasedOnly()) {
4149 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4150 if (Args[0] == Args[1]) {
4151 ISD = ISD::ROTL;
4152 // Handle uniform constant rotation amounts.
4153 // TODO: Handle funnel-shift cases.
4154 const APInt *Amt;
4155 if (Args[2] &&
4156 PatternMatch::match(Args[2], PatternMatch::m_APIntAllowUndef(Amt)))
4157 ISD = X86ISD::VROTLI;
4158 }
4159 }
4160 break;
4161 case Intrinsic::fshr:
4162 // FSHR has same costs so don't duplicate.
4163 ISD = ISD::FSHL;
4164 if (!ICA.isTypeBasedOnly()) {
4165 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4166 if (Args[0] == Args[1]) {
4167 ISD = ISD::ROTR;
4168 // Handle uniform constant rotation amount.
4169 // TODO: Handle funnel-shift cases.
4170 const APInt *Amt;
4171 if (Args[2] &&
4172 PatternMatch::match(Args[2], PatternMatch::m_APIntAllowUndef(Amt)))
4173 ISD = X86ISD::VROTLI;
4174 }
4175 }
4176 break;
4177 case Intrinsic::lrint:
4178 case Intrinsic::llrint:
4179 // X86 can use the CVTP2SI instructions to lower lrint/llrint calls, which
4180 // have the same costs as the CVTTP2SI (fptosi) instructions
4181 if (!ICA.isTypeBasedOnly()) {
4182 const SmallVectorImpl<Type *> &ArgTys = ICA.getArgTypes();
4183 return getCastInstrCost(Instruction::FPToSI, RetTy, ArgTys[0],
4184 TTI::CastContextHint::None, CostKind);
4185 }
4186 break;
4187 case Intrinsic::maxnum:
4188 case Intrinsic::minnum:
4189 // FMINNUM has same costs so don't duplicate.
4190 ISD = ISD::FMAXNUM;
4191 break;
4192 case Intrinsic::sadd_sat:
4193 ISD = ISD::SADDSAT;
4194 break;
4195 case Intrinsic::smax:
4196 ISD = ISD::SMAX;
4197 break;
4198 case Intrinsic::smin:
4199 ISD = ISD::SMIN;
4200 break;
4201 case Intrinsic::ssub_sat:
4202 ISD = ISD::SSUBSAT;
4203 break;
4204 case Intrinsic::uadd_sat:
4205 ISD = ISD::UADDSAT;
4206 break;
4207 case Intrinsic::umax:
4208 ISD = ISD::UMAX;
4209 break;
4210 case Intrinsic::umin:
4211 ISD = ISD::UMIN;
4212 break;
4213 case Intrinsic::usub_sat:
4214 ISD = ISD::USUBSAT;
4215 break;
4216 case Intrinsic::sqrt:
4217 ISD = ISD::FSQRT;
4218 break;
4219 case Intrinsic::sadd_with_overflow:
4220 case Intrinsic::ssub_with_overflow:
4221 // SSUBO has same costs so don't duplicate.
4222 ISD = ISD::SADDO;
4223 OpTy = RetTy->getContainedType(0);
4224 break;
4225 case Intrinsic::uadd_with_overflow:
4226 case Intrinsic::usub_with_overflow:
4227 // USUBO has same costs so don't duplicate.
4228 ISD = ISD::UADDO;
4229 OpTy = RetTy->getContainedType(0);
4230 break;
4231 case Intrinsic::umul_with_overflow:
4232 case Intrinsic::smul_with_overflow:
4233 // SMULO has same costs so don't duplicate.
4234 ISD = ISD::UMULO;
4235 OpTy = RetTy->getContainedType(0);
4236 break;
4237 }
4238
4239 if (ISD != ISD::DELETED_NODE) {
4240 // Legalize the type.
4241 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(OpTy);
4242 MVT MTy = LT.second;
4243
4244 // Without BMI/LZCNT see if we're only looking for a *_ZERO_UNDEF cost.
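 // (A true is-zero-poison argument means the zero-input guard is not needed,
 // so the plain BSF/BSR *_ZERO_UNDEF entries apply.)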
4245 if (((ISD == ISD::CTTZ && !ST->hasBMI()) ||
4246 (ISD == ISD::CTLZ && !ST->hasLZCNT())) &&
4247 !MTy.isVector() && !ICA.isTypeBasedOnly()) {
4248 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4249 if (auto *Cst = dyn_cast<ConstantInt>(Args[1]))
4250 if (Cst->isAllOnesValue())
4251 ISD = ISD == ISD::CTTZ ? ISD::CTTZ_ZERO_UNDEF : ISD::CTLZ_ZERO_UNDEF;
4252 }
4253
4254 // FSQRT is a single instruction.
4255 if (ISD == ISD::FSQRT && CostKind == TTI::TCK_CodeSize)
4256 return LT.first;
4257
4258 auto adjustTableCost = [](int ISD, unsigned Cost,
4259 InstructionCost LegalizationCost,
4260 FastMathFlags FMF) {
4261 // If there are no NANs to deal with, then these are reduced to a
4262 // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we
4263 // assume is used in the non-fast case.
4264 if (ISD == ISD::FMAXNUM || ISD == ISD::FMINNUM) {
4265 if (FMF.noNaNs())
4266 return LegalizationCost * 1;
4267 }
4268 return LegalizationCost * (int)Cost;
4269 };
4270
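 // The feature checks below run from the most specific subtarget feature down
 // to the baseline, so the first matching table entry wins. For example, with
 // AVX2 but no AVX512, a v16i32 ISD::ABS legalizes to two v8i32 ops, so
 // LT.first is 2 and adjustTableCost doubles the v8i32 table entry.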
4271 if (ST->useGLMDivSqrtCosts())
4272 if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
4273 if (auto KindCost = Entry->Cost[CostKind])
4274 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4275 ICA.getFlags());
4276
4277 if (ST->useSLMArithCosts())
4278 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
4279 if (auto KindCost = Entry->Cost[CostKind])
4280 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4281 ICA.getFlags());
4282
4283 if (ST->hasVBMI2())
4284 if (const auto *Entry = CostTableLookup(AVX512VBMI2CostTbl, ISD, MTy))
4285 if (auto KindCost = Entry->Cost[CostKind])
4286 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4287 ICA.getFlags());
4288
4289 if (ST->hasBITALG())
4290 if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy))
4291 if (auto KindCost = Entry->Cost[CostKind])
4292 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4293 ICA.getFlags());
4294
4295 if (ST->hasVPOPCNTDQ())
4296 if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy))
4297 if (auto KindCost = Entry->Cost[CostKind])
4298 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4299 ICA.getFlags());
4300
4301 if (ST->hasGFNI())
4302 if (const auto *Entry = CostTableLookup(GFNICostTbl, ISD, MTy))
4303 if (auto KindCost = Entry->Cost[CostKind])
4304 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4305 ICA.getFlags());
4306
4307 if (ST->hasCDI())
4308 if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
4309 if (auto KindCost = Entry->Cost[CostKind])
4310 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4311 ICA.getFlags());
4312
4313 if (ST->hasBWI())
4314 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
4315 if (auto KindCost = Entry->Cost[CostKind])
4316 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4317 ICA.getFlags());
4318
4319 if (ST->hasAVX512())
4320 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
4321 if (auto KindCost = Entry->Cost[CostKind])
4322 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4323 ICA.getFlags());
4324
4325 if (ST->hasXOP())
4326 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
4327 if (auto KindCost = Entry->Cost[CostKind])
4328 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4329 ICA.getFlags());
4330
4331 if (ST->hasAVX2())
4332 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
4333 if (auto KindCost = Entry->Cost[CostKind])
4334 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4335 ICA.getFlags());
4336
4337 if (ST->hasAVX())
4338 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
4339 if (auto KindCost = Entry->Cost[CostKind])
4340 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4341 ICA.getFlags());
4342
4343 if (ST->hasSSE42())
4344 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
4345 if (auto KindCost = Entry->Cost[CostKind])
4346 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4347 ICA.getFlags());
4348
4349 if (ST->hasSSE41())
4350 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
4351 if (auto KindCost = Entry->Cost[CostKind])
4352 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4353 ICA.getFlags());
4354
4355 if (ST->hasSSSE3())
4356 if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
4357 if (auto KindCost = Entry->Cost[CostKind])
4358 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4359 ICA.getFlags());
4360
4361 if (ST->hasSSE2())
4362 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
4363 if (auto KindCost = Entry->Cost[CostKind])
4364 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4365 ICA.getFlags());
4366
4367 if (ST->hasSSE1())
4368 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
4369 if (auto KindCost = Entry->Cost[CostKind])
4370 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4371 ICA.getFlags());
4372
4373 if (ST->hasBMI()) {
4374 if (ST->is64Bit())
4375 if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
4376 if (auto KindCost = Entry->Cost[CostKind])
4377 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4378 ICA.getFlags());
4379
4380 if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
4381 if (auto KindCost = Entry->Cost[CostKind])
4382 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4383 ICA.getFlags());
4384 }
4385
4386 if (ST->hasLZCNT()) {
4387 if (ST->is64Bit())
4388 if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
4389 if (auto KindCost = Entry->Cost[CostKind])
4390 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4391 ICA.getFlags());
4392
4393 if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
4394 if (auto KindCost = Entry->Cost[CostKind])
4395 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4396 ICA.getFlags());
4397 }
4398
4399 if (ST->hasPOPCNT()) {
4400 if (ST->is64Bit())
4401 if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
4402 if (auto KindCost = Entry->Cost[CostKind])
4403 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4404 ICA.getFlags());
4405
4406 if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
4407 if (auto KindCost = Entry->Cost[CostKind])
4408 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4409 ICA.getFlags());
4410 }
4411
4412 if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) {
4413 if (const Instruction *II = ICA.getInst()) {
4414 if (II->hasOneUse() && isa<StoreInst>(II->user_back()))
4415 return TTI::TCC_Free;
4416 if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) {
4417 if (LI->hasOneUse())
4418 return TTI::TCC_Free;
4419 }
4420 }
4421 }
4422
4423 if (ST->is64Bit())
4424 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
4425 if (auto KindCost = Entry->Cost[CostKind])
4426 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4427 ICA.getFlags());
4428
4429 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
4430 if (auto KindCost = Entry->Cost[CostKind])
4431 return adjustTableCost(Entry->ISD, *KindCost, LT.first, ICA.getFlags());
4432 }
4433
4434 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
4435}
4436
4437InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
4438 TTI::TargetCostKind CostKind,
4439 unsigned Index, Value *Op0,
4440 Value *Op1) {
4441 static const CostTblEntry SLMCostTbl[] = {
4442 { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 },
4443 { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 },
4444 { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 },
4445 { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 7 }
4446 };
4447
4448 assert(Val->isVectorTy() && "This must be a vector type");
4449 Type *ScalarType = Val->getScalarType();
4450 InstructionCost RegisterFileMoveCost = 0;
4451
4452 // Non-immediate extraction/insertion can be handled as a sequence of
4453 // aliased loads+stores via the stack.
4454 if (Index == -1U && (Opcode == Instruction::ExtractElement ||
4455 Opcode == Instruction::InsertElement)) {
4456 // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns:
4457 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
4458
4459 // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling.
4460 assert(isa<FixedVectorType>(Val) && "Fixed vector type expected");
4461 Align VecAlign = DL.getPrefTypeAlign(Val);
4462 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
4463
4464 // Extract - store vector to stack, load scalar.
4465 if (Opcode == Instruction::ExtractElement) {
4466 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4467 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
4468 CostKind);
4469 }
4470 // Insert - store vector to stack, store scalar, load vector.
4471 if (Opcode == Instruction::InsertElement) {
4472 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4473 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
4474 CostKind) +
4475 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind);
4476 }
4477 }
4478
4479 if (Index != -1U && (Opcode == Instruction::ExtractElement ||
4480 Opcode == Instruction::InsertElement)) {
4481 // Extraction of vXi1 elements is now efficiently handled by MOVMSK.
4482 if (Opcode == Instruction::ExtractElement &&
4483 ScalarType->getScalarSizeInBits() == 1 &&
4484 cast<FixedVectorType>(Val)->getNumElements() > 1)
4485 return 1;
4486
4487 // Legalize the type.
4488 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
4489
4490 // This type is legalized to a scalar type.
4491 if (!LT.second.isVector())
4492 return 0;
4493
4494 // The type may be split. Normalize the index to the new type.
4495 unsigned SizeInBits = LT.second.getSizeInBits();
4496 unsigned NumElts = LT.second.getVectorNumElements();
4497 unsigned SubNumElts = NumElts;
4498 Index = Index % NumElts;
4499
4500 // For >128-bit vectors, we need to extract higher 128-bit subvectors.
4501 // For inserts, we also need to insert the subvector back.
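 // (e.g. with AVX2, extracting element 6 of a v8i32 pays one 128-bit subvector
 // extract and is then costed as extracting element 2 of that 128-bit half.)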
4502 if (SizeInBits > 128) {
4503 assert((SizeInBits % 128) == 0 && "Illegal vector");
4504 unsigned NumSubVecs = SizeInBits / 128;
4505 SubNumElts = NumElts / NumSubVecs;
4506 if (SubNumElts <= Index) {
4507 RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
4508 Index %= SubNumElts;
4509 }
4510 }
4511
4512 MVT MScalarTy = LT.second.getScalarType();
4513 auto IsCheapPInsrPExtrInsertPS = [&]() {
4514 // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
4515 // Also, assume insertps is relatively cheap on all >= SSE41 targets.
4516 return (MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4517 (MScalarTy.isInteger() && ST->hasSSE41()) ||
4518 (MScalarTy == MVT::f32 && ST->hasSSE41() &&
4519 Opcode == Instruction::InsertElement);
4520 };
4521
4522 if (Index == 0) {
4523 // Floating point scalars are already located in index #0.
4524 // Many insertions to #0 can fold away for scalar fp-ops, so let's assume
4525 // true for all.
4526 if (ScalarType->isFloatingPointTy() &&
4527 (Opcode != Instruction::InsertElement || !Op0 ||
4528 isa<UndefValue>(Op0)))
4529 return RegisterFileMoveCost;
4530
4531 if (Opcode == Instruction::InsertElement &&
4532 isa_and_nonnull<UndefValue>(Op0)) {
4533 // Consider the gather cost to be cheap.
4534 if (isa_and_nonnull<LoadInst>(Op1))
4535 return RegisterFileMoveCost;
4536 if (!IsCheapPInsrPExtrInsertPS()) {
4537 // mov constant-to-GPR + movd/movq GPR -> XMM.
4538 if (isa_and_nonnull<Constant>(Op1) && Op1->getType()->isIntegerTy())
4539 return 2 + RegisterFileMoveCost;
4540 // Assume movd/movq GPR -> XMM is relatively cheap on all targets.
4541 return 1 + RegisterFileMoveCost;
4542 }
4543 }
4544
4545 // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
4546 if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
4547 return 1 + RegisterFileMoveCost;
4548 }
4549
4550 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4551 assert(ISD && "Unexpected vector opcode");
4552 if (ST->useSLMArithCosts())
4553 if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
4554 return Entry->Cost + RegisterFileMoveCost;
4555
4556 // Consider cheap cases.
4557 if (IsCheapPInsrPExtrInsertPS())
4558 return 1 + RegisterFileMoveCost;
4559
4560 // For extractions we just need to shuffle the element to index 0, which
4561 // should be very cheap (assume cost = 1). For insertions we need to shuffle
4562 // the element to its destination. In both cases we must handle the
4563 // subvector move(s).
4564 // If the vector type is already less than 128-bits then don't reduce it.
4565 // TODO: Under what circumstances should we shuffle using the full width?
4566 InstructionCost ShuffleCost = 1;
4567 if (Opcode == Instruction::InsertElement) {
4568 auto *SubTy = cast<VectorType>(Val);
4569 EVT VT = TLI->getValueType(DL, Val);
4570 if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
4571 SubTy = FixedVectorType::get(ScalarType, SubNumElts);
4572 ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, std::nullopt,
4573 CostKind, 0, SubTy);
4574 }
4575 int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
4576 return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
4577 }
4578
4579 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1) +
4580 RegisterFileMoveCost;
4581}
4582
4583InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
4584 const APInt &DemandedElts,
4585 bool Insert, bool Extract,
4586 TTI::TargetCostKind CostKind) {
4587 assert(DemandedElts.getBitWidth() ==
4588 cast<FixedVectorType>(Ty)->getNumElements() &&
4589 "Vector size mismatch");
4590
4591 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4592 MVT MScalarTy = LT.second.getScalarType();
4593 unsigned LegalVectorBitWidth = LT.second.getSizeInBits();
4594 InstructionCost Cost = 0;
4595
4596 constexpr unsigned LaneBitWidth = 128;
4597 assert((LegalVectorBitWidth < LaneBitWidth ||
4598 (LegalVectorBitWidth % LaneBitWidth) == 0) &&
4599 "Illegal vector");
4600
4601 const int NumLegalVectors = *LT.first.getValue();
4602 assert(NumLegalVectors >= 0 && "Negative cost!");
4603
4604 // For insertions, an ISD::BUILD_VECTOR style vector initialization can be much
4605 // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT.
4606 if (Insert) {
4607 if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4608 (MScalarTy.isInteger() && ST->hasSSE41()) ||
4609 (MScalarTy == MVT::f32 && ST->hasSSE41())) {
4610 // For types we can insert directly, insertion into 128-bit sub vectors is
4611 // cheap, followed by a cheap chain of concatenations.
4612 if (LegalVectorBitWidth <= LaneBitWidth) {
4613 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert,
4614 /*Extract*/ false, CostKind);
4615 } else {
4616 // In each 128-lane, if at least one index is demanded but not all
4617 // indices are demanded and this 128-lane is not the first 128-lane of
4618 // the legalized-vector, then this 128-lane needs an extracti128; if in
4619 // each 128-lane, there is at least one demanded index, this 128-lane
4620 // needs an inserti128.
4621
4622 // The following cases will help you build a better understanding:
4623 // Assume we insert several elements into a v8i32 vector in avx2,
4624 // Case#1: inserting into index 1 needs vpinsrd + inserti128.
4625 // Case#2: inserting into index 5 needs extracti128 + vpinsrd +
4626 // inserti128.
4627 // Case#3: inserting into indices 4,5,6,7 needs 4*vpinsrd + inserti128.
4628 assert((LegalVectorBitWidth % LaneBitWidth) == 0 && "Illegal vector");
4629 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
4630 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
4631 unsigned NumLegalElts =
4632 LT.second.getVectorNumElements() * NumLegalVectors;
4633 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
4634 "Vector has been legalized to smaller element count");
4635 assert((NumLegalElts % NumLanesTotal) == 0 &&
4636 "Unexpected elts per lane");
4637 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
4638
4639 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
4640 auto *LaneTy =
4641 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
4642
4643 for (unsigned I = 0; I != NumLanesTotal; ++I) {
4644 APInt LaneEltMask = WidenedDemandedElts.extractBits(
4645 NumEltsPerLane, NumEltsPerLane * I);
4646 if (LaneEltMask.isZero())
4647 continue;
4648 // FIXME: we don't need to extract if all non-demanded elements
4649 // are legalization-inserted padding.
4650 if (!LaneEltMask.isAllOnes())
4651 Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
4652 CostKind, I * NumEltsPerLane, LaneTy);
4653 Cost += BaseT::getScalarizationOverhead(LaneTy, LaneEltMask, Insert,
4654 /*Extract*/ false, CostKind);
4655 }
4656
4657 APInt AffectedLanes =
4658 APIntOps::ScaleBitMask(WidenedDemandedElts, NumLanesTotal);
4659 APInt FullyAffectedLegalVectors = APIntOps::ScaleBitMask(
4660 AffectedLanes, NumLegalVectors, /*MatchAllBits=*/true);
4661 for (int LegalVec = 0; LegalVec != NumLegalVectors; ++LegalVec) {
4662 for (unsigned Lane = 0; Lane != NumLegalLanes; ++Lane) {
4663 unsigned I = NumLegalLanes * LegalVec + Lane;
4664 // No need to insert unaffected lane; or lane 0 of each legal vector
4665 // iff ALL lanes of that vector were affected and will be inserted.
4666 if (!AffectedLanes[I] ||
4667 (Lane == 0 && FullyAffectedLegalVectors[LegalVec]))
4668 continue;
4669 Cost += getShuffleCost(TTI::SK_InsertSubvector, Ty, std::nullopt,
4670 CostKind, I * NumEltsPerLane, LaneTy);
4671 }
4672 }
4673 }
4674 } else if (LT.second.isVector()) {
4675 // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
4676 // integer element as a SCALAR_TO_VECTOR, then we build the vector as a
4677 // series of UNPCK followed by CONCAT_VECTORS - all of these can be
4678 // considered cheap.
4679 if (Ty->isIntOrIntVectorTy())
4680 Cost += DemandedElts.popcount();
4681
4682 // Get the smaller of the legalized or original pow2-extended number of
4683 // vector elements, which represents the number of unpacks we'll end up
4684 // performing.
4685 unsigned NumElts = LT.second.getVectorNumElements();
4686 unsigned Pow2Elts =
4687 PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements());
4688 Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
4689 }
4690 }
4691
4692 if (Extract) {
4693 // vXi1 can be efficiently extracted with MOVMSK.
4694 // TODO: AVX512 predicate mask handling.
4695 // NOTE: This doesn't work well for roundtrip scalarization.
4696 if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) {
4697 unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
4698 unsigned MaxElts = ST->hasAVX2() ? 32 : 16;
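 // One MOVMSK covers up to MaxElts mask bits, so we need
 // ceil(NumElts / MaxElts) of them (e.g. a v64i1 extract needs two on AVX2).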
4699 unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts;
4700 return MOVMSKCost;
4701 }
4702
4703 if (LT.second.isVector()) {
4704 unsigned NumLegalElts =
4705 LT.second.getVectorNumElements() * NumLegalVectors;
4706 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
4707 "Vector has been legalized to smaller element count");
4708
4709 // If we're extracting elements from a 128-bit subvector lane,
4710 // we only need to extract each lane once, not for every element.
4711 if (LegalVectorBitWidth > LaneBitWidth) {
4712 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
4713 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
4714 assert((NumLegalElts % NumLanesTotal) == 0 &&
4715 "Unexpected elts per lane");
4716 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
4717
4718 // Add cost for each demanded 128-bit subvector extraction.
4719 // Luckily this is a lot easier than for insertion.
4720 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
4721 auto *LaneTy =
4722 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
4723
4724 for (unsigned I = 0; I != NumLanesTotal; ++I) {
4725 APInt LaneEltMask = WidenedDemandedElts.extractBits(
4726 NumEltsPerLane, I * NumEltsPerLane);
4727 if (LaneEltMask.isZero())
4728 continue;
4729 Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
4730 CostKind, I * NumEltsPerLane, LaneTy);
4731 Cost += BaseT::getScalarizationOverhead(
4732 LaneTy, LaneEltMask, /*Insert*/ false, Extract, CostKind);
4733 }
4734
4735 return Cost;
4736 }
4737 }
4738
4739 // Fallback to default extraction.
4740 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ false,
4741 Extract, CostKind);
4742 }
4743
4744 return Cost;
4745}
4746
4747InstructionCost
4748X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
4749 int VF, const APInt &DemandedDstElts,
4750 TTI::TargetCostKind CostKind) {
4751 const unsigned EltTyBits = DL.getTypeSizeInBits(EltTy);
4752 // We don't differentiate element types here, only element bit width.
4753 EltTy = IntegerType::getIntNTy(EltTy->getContext(), EltTyBits);
4754
4755 auto bailout = [&]() {
4756 return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF,
4757 DemandedDstElts, CostKind);
4758 };
4759
4760 // For now, only deal with AVX512 cases.
4761 if (!ST->hasAVX512())
4762 return bailout();
4763
4764 // Do we have a native shuffle for this element type, or should we promote?
4765 unsigned PromEltTyBits = EltTyBits;
4766 switch (EltTyBits) {
4767 case 32:
4768 case 64:
4769 break; // AVX512F.
4770 case 16:
4771 if (!ST->hasBWI())
4772 PromEltTyBits = 32; // promote to i32, AVX512F.
4773 break; // AVX512BW
4774 case 8:
4775 if (!ST->hasVBMI())
4776 PromEltTyBits = 32; // promote to i32, AVX512F.
4777 break; // AVX512VBMI
4778 case 1:
4779 // There is no support for shuffling i1 elements. We *must* promote.
4780 if (ST->hasBWI()) {
4781 if (ST->hasVBMI())
4782 PromEltTyBits = 8; // promote to i8, AVX512VBMI.
4783 else
4784 PromEltTyBits = 16; // promote to i16, AVX512BW.
4785 break;
4786 }
4787 PromEltTyBits = 32; // promote to i32, AVX512F.
4788 break;
4789 default:
4790 return bailout();
4791 }
4792 auto *PromEltTy = IntegerType::getIntNTy(EltTy->getContext(), PromEltTyBits);
4793
4794 auto *SrcVecTy = FixedVectorType::get(EltTy, VF);
4795 auto *PromSrcVecTy = FixedVectorType::get(PromEltTy, VF);
4796
4797 int NumDstElements = VF * ReplicationFactor;
4798 auto *PromDstVecTy = FixedVectorType::get(PromEltTy, NumDstElements);
4799 auto *DstVecTy = FixedVectorType::get(EltTy, NumDstElements);
4800
4801 // Legalize the types.
4802 MVT LegalSrcVecTy = getTypeLegalizationCost(SrcVecTy).second;
4803 MVT LegalPromSrcVecTy = getTypeLegalizationCost(PromSrcVecTy).second;
4804 MVT LegalPromDstVecTy = getTypeLegalizationCost(PromDstVecTy).second;
4805 MVT LegalDstVecTy = getTypeLegalizationCost(DstVecTy).second;
4806 // They should have legalized into vector types.
4807 if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() ||
4808 !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector())
4809 return bailout();
4810
4811 if (PromEltTyBits != EltTyBits) {
4812 // If we have to perform the shuffle with a wider elt type than our data type,
4813 // then we will first need to anyext (we don't care about the new bits)
4814 // the source elements, and then truncate Dst elements.
4815 InstructionCost PromotionCost;
4816 PromotionCost += getCastInstrCost(
4817 Instruction::SExt, /*Dst=*/PromSrcVecTy, /*Src=*/SrcVecTy,
4818 TTI::CastContextHint::None, CostKind);
4819 PromotionCost +=
4820 getCastInstrCost(Instruction::Trunc, /*Dst=*/DstVecTy,
4821 /*Src=*/PromDstVecTy,
4822 TTI::CastContextHint::None, CostKind);
4823 return PromotionCost + getReplicationShuffleCost(PromEltTy,
4824 ReplicationFactor, VF,
4825 DemandedDstElts, CostKind);
4826 }
4827
4828 assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits &&
4829 LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() &&
4830 "We expect that the legalization doesn't affect the element width, "
4831 "doesn't coalesce/split elements.");
4832
4833 unsigned NumEltsPerDstVec = LegalDstVecTy.getVectorNumElements();
4834 unsigned NumDstVectors =
4835 divideCeil(DstVecTy->getNumElements(), NumEltsPerDstVec);
4836
4837 auto *SingleDstVecTy = FixedVectorType::get(EltTy, NumEltsPerDstVec);
4838
4839 // Not all the produced Dst elements may be demanded. In our case,
4840 // given that a single Dst vector is formed by a single shuffle,
4841 // if all elements that will form a single Dst vector aren't demanded,
4842 // then we won't need to do that shuffle, so adjust the cost accordingly.
4843 APInt DemandedDstVectors = APIntOps::ScaleBitMask(
4844 DemandedDstElts.zext(NumDstVectors * NumEltsPerDstVec), NumDstVectors);
4845 unsigned NumDstVectorsDemanded = DemandedDstVectors.popcount();
4846
4847 InstructionCost SingleShuffleCost = getShuffleCost(
4848 TTI::SK_PermuteSingleSrc, SingleDstVecTy, /*Mask=*/std::nullopt, CostKind,
4849 /*Index=*/0, /*SubTp=*/nullptr);
4850 return NumDstVectorsDemanded * SingleShuffleCost;
4851}
4852
4853InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
4854 MaybeAlign Alignment,
4855 unsigned AddressSpace,
4856 TTI::TargetCostKind CostKind,
4857 TTI::OperandValueInfo OpInfo,
4858 const Instruction *I) {
4859 // TODO: Handle other cost kinds.
4860 if (CostKind != TTI::TCK_RecipThroughput) {
4861 if (auto *SI = dyn_cast_or_null<StoreInst>(I)) {
4862 // Store instruction with index and scale costs 2 Uops.
4863 // Check the preceding GEP to identify non-const indices.
4864 if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) {
4865 if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
4866 return TTI::TCC_Basic * 2;
4867 }
4868 }
4869 return TTI::TCC_Basic;
4870 }
4871
4872 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
4873 "Invalid Opcode");
4874 // Type legalization can't handle structs
4875 if (TLI->getValueType(DL, Src, true) == MVT::Other)
4876 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
4877 CostKind);
4878
4879 // Legalize the type.
4880 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
4881
4882 auto *VTy = dyn_cast<FixedVectorType>(Src);
4883
4884 InstructionCost Cost = 0;
4885
4886 // Add a cost for constant load to vector.
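 // (the constant store operand itself has to be materialized with a load first).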
4887 if (Opcode == Instruction::Store && OpInfo.isConstant())
4888 Cost += getMemoryOpCost(Instruction::Load, Src, DL.getABITypeAlign(Src),
4889 /*AddressSpace=*/0, CostKind);
4890
4891 // Handle the simple case of non-vectors.
4892 // NOTE: this assumes that legalization never creates vector from scalars!
4893 if (!VTy || !LT.second.isVector()) {
4894 // Each load/store unit costs 1.
4895 return (LT.second.isFloatingPoint() ? Cost : 0) + LT.first * 1;
4896 }
4897
4898 bool IsLoad = Opcode == Instruction::Load;
4899
4900 Type *EltTy = VTy->getElementType();
4901
4902 const int EltTyBits = DL.getTypeSizeInBits(EltTy);
4903
4904 // Source of truth: how many elements were there in the original IR vector?
4905 const unsigned SrcNumElt = VTy->getNumElements();
4906
4907 // How far have we gotten?
4908 int NumEltRemaining = SrcNumElt;
4909 // Note that we intentionally capture by reference, as NumEltRemaining changes.
4910 auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; };
4911
4912 const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8);
4913
4914 // Note that even if we can store 64 bits of an XMM, we still operate on XMM.
4915 const unsigned XMMBits = 128;
4916 if (XMMBits % EltTyBits != 0)
4917 // Vector size must be a multiple of the element size. I.e. no padding.
4918 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
4919 CostKind);
4920 const int NumEltPerXMM = XMMBits / EltTyBits;
4921
4922 auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM);
4923
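 // Greedily cover the remaining elements with the widest legal op, halving the
 // op width whenever the remainder can no longer (safely) use a full-width op.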
4924 for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0;
4925 NumEltRemaining > 0; CurrOpSizeBytes /= 2) {
4926 // How many elements would a single op deal with at once?
4927 if ((8 * CurrOpSizeBytes) % EltTyBits != 0)
4928 // Vector size must be a multiple of the element size. I.e. no padding.
4929 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
4930 CostKind);
4931 int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits;
4932
4933 assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?");
4934 assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||
4935 (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&
4936 "Unless we haven't halved the op size yet, "
4937 "we have less than two op's sized units of work left.");
4938
4939 auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM
4940 ? FixedVectorType::get(EltTy, CurrNumEltPerOp)
4941 : XMMVecTy;
4942
4943 assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&
4944 "After halving sizes, the vector elt count is no longer a multiple "
4945 "of number of elements per operation?");
4946 auto *CoalescedVecTy =
4947 CurrNumEltPerOp == 1
4948 ? CurrVecTy
4949 : FixedVectorType::get(
4950 IntegerType::get(Src->getContext(),
4951 EltTyBits * CurrNumEltPerOp),
4952 CurrVecTy->getNumElements() / CurrNumEltPerOp);
4953 assert(DL.getTypeSizeInBits(CoalescedVecTy) ==
4954 DL.getTypeSizeInBits(CurrVecTy) &&
4955 "coalesciing elements doesn't change vector width.");
4956
4957 while (NumEltRemaining > 0) {
4958 assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumption?");
4959
4960 // Can we use this vector size, as per the remaining element count?
4961 // Iff the vector is naturally aligned, we can do a wide load regardless.
4962 if (NumEltRemaining < CurrNumEltPerOp &&
4963 (!IsLoad || Alignment.valueOrOne() < CurrOpSizeBytes) &&
4964 CurrOpSizeBytes != 1)
4965 break; // Try smaller vector size.
4966
4967 bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;
4968
4969 // If we have fully processed the previous reg, we need to replenish it.
4970 if (SubVecEltsLeft == 0) {
4971 SubVecEltsLeft += CurrVecTy->getNumElements();
4972 // And that's free only for the 0'th subvector of a legalized vector.
4973 if (!Is0thSubVec)
4974 Cost += getShuffleCost(IsLoad ? TTI::ShuffleKind::SK_InsertSubvector
4975 : TTI::ShuffleKind::SK_ExtractSubvector,
4976 VTy, std::nullopt, CostKind, NumEltDone(),
4977 CurrVecTy);
4978 }
4979
4980 // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
4981 // for smaller widths (32/16/8) we have to insert/extract them separately.
4982 // Again, it's free for the 0'th subreg (if op is 32/64 bit wide,
4983 // but let's pretend that it is also true for 16/8 bit wide ops...)
4984 if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) {
4985 int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM;
4986 assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "");
4987 int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp;
4988 APInt DemandedElts =
4989 APInt::getBitsSet(CoalescedVecTy->getNumElements(),
4990 CoalescedVecEltIdx, CoalescedVecEltIdx + 1);
4991 assert(DemandedElts.popcount() == 1 && "Inserting single value");
4992 Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad,
4993 !IsLoad, CostKind);
4994 }
4995
4996 // This isn't exactly right. We're using slow unaligned 32-byte accesses
4997 // as a proxy for a double-pumped AVX memory interface such as on
4998 // Sandybridge.
4999 // Sub-32-bit loads/stores will be slower either with PINSR*/PEXTR* or
5000 // will be scalarized.
5001 if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
5002 Cost += 2;
5003 else if (CurrOpSizeBytes < 4)
5004 Cost += 2;
5005 else
5006 Cost += 1;
5007
5008 SubVecEltsLeft -= CurrNumEltPerOp;
5009 NumEltRemaining -= CurrNumEltPerOp;
5010 Alignment = commonAlignment(Alignment.valueOrOne(), CurrOpSizeBytes);
5011 }
5012 }
5013
5014 assert(NumEltRemaining <= 0 && "Should have processed all the elements.");
5015
5016 return Cost;
5017}
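// Illustrative walk-through (a sketch, not from the upstream file): costing a
// load of <3 x i32> with 4-byte alignment on SSE2, where the legal type is
// v4i32. The loop above skips the 16-byte op (only three elements remain and
// the access is not 16-byte aligned), charges one 8-byte op for elements 0..1
// (cost 1, the 0'th subvector is free), then one 4-byte op for element 2 plus
// the insert/extract scalarization overhead, giving roughly 2 + that overhead.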
5018
5019 InstructionCost
5020 X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment,
5021 unsigned AddressSpace,
5022 TTI::TargetCostKind CostKind) {
5023 bool IsLoad = (Instruction::Load == Opcode);
5024 bool IsStore = (Instruction::Store == Opcode);
5025
5026 auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy);
5027 if (!SrcVTy)
5028 // For a scalar, just take the regular memory op cost, without the mask.
5029 return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind);
5030
5031 unsigned NumElem = SrcVTy->getNumElements();
5032 auto *MaskTy =
5033 FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
5034 if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) ||
5035 (IsStore && !isLegalMaskedStore(SrcVTy, Alignment))) {
5036 // Scalarization
5037 APInt DemandedElts = APInt::getAllOnes(NumElem);
5038 InstructionCost MaskSplitCost = getScalarizationOverhead(
5039 MaskTy, DemandedElts, /*Insert*/ false, /*Extract*/ true, CostKind);
5040 InstructionCost ScalarCompareCost = getCmpSelInstrCost(
5041 Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
5042 CmpInst::BAD_ICMP_PREDICATE, CostKind);
5043 InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
5044 InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
5045 InstructionCost ValueSplitCost = getScalarizationOverhead(
5046 SrcVTy, DemandedElts, IsLoad, IsStore, CostKind);
5047 InstructionCost MemopCost =
5048 NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
5049 Alignment, AddressSpace, CostKind);
5050 return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
5051 }
5052
5053 // Legalize the type.
5054 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcVTy);
5055 auto VT = TLI->getValueType(DL, SrcVTy);
5056 InstructionCost Cost = 0;
5057 if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
5058 LT.second.getVectorNumElements() == NumElem)
5059 // Promotion requires extend/truncate for data and a shuffle for mask.
5060 Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, std::nullopt,
5061 CostKind, 0, nullptr) +
5062 getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, std::nullopt,
5063 CostKind, 0, nullptr);
5064
5065 else if (LT.first * LT.second.getVectorNumElements() > NumElem) {
5066 auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
5067 LT.second.getVectorNumElements());
5068 // Expanding requires filling the mask with zeroes.
5069 Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, std::nullopt,
5070 CostKind, 0, MaskTy);
5071 }
5072
5073 // Pre-AVX512 - each maskmov load costs 2 + store costs ~8.
5074 if (!ST->hasAVX512())
5075 return Cost + LT.first * (IsLoad ? 2 : 8);
5076
5077 // AVX-512 masked load/store is cheaper
5078 return Cost + LT.first;
5079}
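// Rough usage sketch (illustrative, not authoritative): a masked load of
// <8 x float> on an AVX-only target is already legal (LT.first == 1), so the
// function returns about 2 for the load and about 8 for the store; with
// AVX-512 both drop to about 1, plus any promotion/expansion shuffles added
// above.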
5080
5083 const Value *Base,
5084 const TTI::PointersChainInfo &Info,
5085 Type *AccessTy, TTI::TargetCostKind CostKind) {
5086 if (Info.isSameBase() && Info.isKnownStride()) {
5087 // If all the pointers have known stride all the differences are translated
5088 // into constants. X86 memory addressing allows encoding it into
5089 // displacement. So we just need to take the base GEP cost.
5090 if (const auto *BaseGEP = dyn_cast<GetElementPtrInst>(Base)) {
5091 SmallVector<const Value *> Indices(BaseGEP->indices());
5092 return getGEPCost(BaseGEP->getSourceElementType(),
5093 BaseGEP->getPointerOperand(), Indices, nullptr,
5094 CostKind);
5095 }
5096 return TTI::TCC_Free;
5097 }
5098 return BaseT::getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind);
5099}
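// Example of the same-base / known-stride case above (a sketch): an unrolled
// loop touching p[0], p[4], p[8], ... through one base GEP. Every offset can
// be folded into an x86 displacement, so only the base GEP itself is charged
// and the remaining pointers in the chain are treated as free.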
5100
5102 ScalarEvolution *SE,
5103 const SCEV *Ptr) {
5104 // Address computations in vectorized code with non-consecutive addresses will
5105 // likely result in more instructions compared to scalar code where the
5106 // computation can more often be merged into the index mode. The resulting
5107 // extra micro-ops can significantly decrease throughput.
5108 const unsigned NumVectorInstToHideOverhead = 10;
5109
5110 // The cost of strided access computation is hidden by the indexing
5111 // modes of X86 regardless of the stride value. We don't believe that there
5112 // is a difference between constant strided access in general and a constant
5113 // strided value which is less than or equal to 64.
5114 // Even in the case of (loop invariant) stride whose value is not known at
5115 // compile time, the address computation will not incur more than one extra
5116 // ADD instruction.
5117 if (Ty->isVectorTy() && SE && !ST->hasAVX2()) {
5118 // TODO: AVX2 is the current cut-off because we don't have correct
5119 // interleaving costs for prior ISA's.
5120 if (!BaseT::isStridedAccess(Ptr))
5121 return NumVectorInstToHideOverhead;
5122 if (!BaseT::getConstantStrideStep(SE, Ptr))
5123 return 1;
5124 }
5125
5126 return BaseT::getAddressComputationCost(Ty, SE, Ptr);
5127}
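// Intuition for the constants above (illustrative): a scalar access with a
// constant or loop-invariant stride folds into a base+index*scale addressing
// mode, e.g. movss (%rdi,%rax,4), %xmm0, so its address computation is
// essentially free, whereas a vector of non-consecutive addresses typically
// needs extra vector ADDs, which NumVectorInstToHideOverhead penalizes.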
5128
5129 InstructionCost
5130 X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
5131 std::optional<FastMathFlags> FMF,
5132 TTI::TargetCostKind CostKind) {
5133 if (TTI::requiresOrderedReduction(FMF))
5134 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5135
5136 // We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
5137 // and use that as the cost.
5138
5139 static const CostTblEntry SLMCostTbl[] = {
5140 { ISD::FADD, MVT::v2f64, 3 },
5141 { ISD::ADD, MVT::v2i64, 5 },
5142 };
5143
5144 static const CostTblEntry SSE2CostTbl[] = {
5145 { ISD::FADD, MVT::v2f64, 2 },
5146 { ISD::FADD, MVT::v2f32, 2 },
5147 { ISD::FADD, MVT::v4f32, 4 },
5148 { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
5149 { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
5150 { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
5151 { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3".
5152 { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3".
5153 { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
5154 { ISD::ADD, MVT::v2i8, 2 },
5155 { ISD::ADD, MVT::v4i8, 2 },
5156 { ISD::ADD, MVT::v8i8, 2 },
5157 { ISD::ADD, MVT::v16i8, 3 },
5158 };
5159
5160 static const CostTblEntry AVX1CostTbl[] = {
5161 { ISD::FADD, MVT::v4f64, 3 },
5162 { ISD::FADD, MVT::v4f32, 3 },
5163 { ISD::FADD, MVT::v8f32, 4 },
5164 { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
5165 { ISD::ADD, MVT::v4i64, 3 },
5166 { ISD::ADD, MVT::v8i32, 5 },
5167 { ISD::ADD, MVT::v16i16, 5 },
5168 { ISD::ADD, MVT::v32i8, 4 },
5169 };
5170
5171 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5172 assert(ISD && "Invalid opcode");
5173
5174 // Before legalizing the type, give a chance to look up illegal narrow types
5175 // in the table.
5176 // FIXME: Is there a better way to do this?
5177 EVT VT = TLI->getValueType(DL, ValTy);
5178 if (VT.isSimple()) {
5179 MVT MTy = VT.getSimpleVT();
5180 if (ST->useSLMArithCosts())
5181 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5182 return Entry->Cost;
5183
5184 if (ST->hasAVX())
5185 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5186 return Entry->Cost;
5187
5188 if (ST->hasSSE2())
5189 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5190 return Entry->Cost;
5191 }
5192
5193 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5194
5195 MVT MTy = LT.second;
5196
5197 auto *ValVTy = cast<FixedVectorType>(ValTy);
5198
5199 // Special case: vXi8 mul reductions are performed as vXi16.
5200 if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) {
5201 auto *WideSclTy = IntegerType::get(ValVTy->getContext(), 16);
5202 auto *WideVecTy = FixedVectorType::get(WideSclTy, ValVTy->getNumElements());
5203 return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy,
5204 TTI::CastContextHint::None,
5205 CostKind) +
5206 getArithmeticReductionCost(Opcode, WideVecTy, FMF, CostKind);
5207 }
5208
5209 InstructionCost ArithmeticCost = 0;
5210 if (LT.first != 1 && MTy.isVector() &&
5211 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5212 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5213 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5214 MTy.getVectorNumElements());
5215 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5216 ArithmeticCost *= LT.first - 1;
5217 }
5218
5219 if (ST->useSLMArithCosts())
5220 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5221 return ArithmeticCost + Entry->Cost;
5222
5223 if (ST->hasAVX())
5224 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5225 return ArithmeticCost + Entry->Cost;
5226
5227 if (ST->hasSSE2())
5228 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5229 return ArithmeticCost + Entry->Cost;
5230
5231 // FIXME: These assume a naive kshift+binop lowering, which is probably
5232 // conservative in most cases.
5233 static const CostTblEntry AVX512BoolReduction[] = {
5234 { ISD::AND, MVT::v2i1, 3 },
5235 { ISD::AND, MVT::v4i1, 5 },
5236 { ISD::AND, MVT::v8i1, 7 },
5237 { ISD::AND, MVT::v16i1, 9 },
5238 { ISD::AND, MVT::v32i1, 11 },
5239 { ISD::AND, MVT::v64i1, 13 },
5240 { ISD::OR, MVT::v2i1, 3 },
5241 { ISD::OR, MVT::v4i1, 5 },
5242 { ISD::OR, MVT::v8i1, 7 },
5243 { ISD::OR, MVT::v16i1, 9 },
5244 { ISD::OR, MVT::v32i1, 11 },
5245 { ISD::OR, MVT::v64i1, 13 },
5246 };
5247
5248 static const CostTblEntry AVX2BoolReduction[] = {
5249 { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp
5250 { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp
5251 { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp
5252 { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp
5253 };
5254
5255 static const CostTblEntry AVX1BoolReduction[] = {
5256 { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp
5257 { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp
5258 { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5259 { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5260 { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp
5261 { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp
5262 { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5263 { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5264 };
5265
5266 static const CostTblEntry SSE2BoolReduction[] = {
5267 { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp
5268 { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp
5269 { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp
5270 { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp
5271 { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp
5272 { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp
5273 { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp
5274 { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp
5275 };
5276
5277 // Handle bool allof/anyof patterns.
5278 if (ValVTy->getElementType()->isIntegerTy(1)) {
5279 InstructionCost ArithmeticCost = 0;
5280 if (LT.first != 1 && MTy.isVector() &&
5281 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5282 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5283 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5284 MTy.getVectorNumElements());
5285 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5286 ArithmeticCost *= LT.first - 1;
5287 }
5288
5289 if (ST->hasAVX512())
5290 if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy))
5291 return ArithmeticCost + Entry->Cost;
5292 if (ST->hasAVX2())
5293 if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
5294 return ArithmeticCost + Entry->Cost;
5295 if (ST->hasAVX())
5296 if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
5297 return ArithmeticCost + Entry->Cost;
5298 if (ST->hasSSE2())
5299 if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
5300 return ArithmeticCost + Entry->Cost;
5301
5302 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5303 }
5304
5305 unsigned NumVecElts = ValVTy->getNumElements();
5306 unsigned ScalarSize = ValVTy->getScalarSizeInBits();
5307
5308 // Special case power of 2 reductions where the scalar type isn't changed
5309 // by type legalization.
5310 if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
5311 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5312
5313 InstructionCost ReductionCost = 0;
5314
5315 auto *Ty = ValVTy;
5316 if (LT.first != 1 && MTy.isVector() &&
5317 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5318 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5319 Ty = FixedVectorType::get(ValVTy->getElementType(),
5320 MTy.getVectorNumElements());
5321 ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5322 ReductionCost *= LT.first - 1;
5323 NumVecElts = MTy.getVectorNumElements();
5324 }
5325
5326 // Now handle reduction with the legal type, taking into account size changes
5327 // at each level.
5328 while (NumVecElts > 1) {
5329 // Determine the size of the remaining vector we need to reduce.
5330 unsigned Size = NumVecElts * ScalarSize;
5331 NumVecElts /= 2;
5332 // If we're reducing from 256/512 bits, use an extract_subvector.
5333 if (Size > 128) {
5334 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5335 ReductionCost +=
5336 getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt, CostKind,
5337 NumVecElts, SubTy);
5338 Ty = SubTy;
5339 } else if (Size == 128) {
5340 // Reducing from 128 bits is a permute of v2f64/v2i64.
5341 FixedVectorType *ShufTy;
5342 if (ValVTy->isFloatingPointTy())
5343 ShufTy =
5344 FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2);
5345 else
5346 ShufTy =
5347 FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
5348 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5349 std::nullopt, CostKind, 0, nullptr);
5350 } else if (Size == 64) {
5351 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5352 FixedVectorType *ShufTy;
5353 if (ValVTy->isFloatingPointTy())
5354 ShufTy =
5355 FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4);
5356 else
5357 ShufTy =
5358 FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
5359 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5360 std::nullopt, CostKind, 0, nullptr);
5361 } else {
5362 // Reducing from smaller size is a shift by immediate.
5363 auto *ShiftTy = FixedVectorType::get(
5364 Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size);
5365 ReductionCost += getArithmeticInstrCost(
5366 Instruction::LShr, ShiftTy, CostKind,
5367 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
5368 {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None});
5369 }
5370
5371 // Add the arithmetic op for this level.
5372 ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind);
5373 }
5374
5375 // Add the final extract element to the cost.
5376 return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5377 CostKind, 0, nullptr, nullptr);
5378}
5379
5380 InstructionCost X86TTIImpl::getMinMaxCost(Intrinsic::ID IID, Type *Ty,
5381 TTI::TargetCostKind CostKind,
5382 FastMathFlags FMF) {
5383 IntrinsicCostAttributes ICA(IID, Ty, {Ty, Ty}, FMF);
5384 return getIntrinsicInstrCost(ICA, CostKind);
5385}
5386
5387 InstructionCost
5388 X86TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *ValTy,
5389 FastMathFlags FMF,
5390 TTI::TargetCostKind CostKind) {
5391 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5392
5393 MVT MTy = LT.second;
5394
5395 int ISD;
5396 if (ValTy->isIntOrIntVectorTy()) {
5397 ISD = (IID == Intrinsic::umin || IID == Intrinsic::umax) ? ISD::UMIN
5398 : ISD::SMIN;
5399 } else {
5400 assert(ValTy->isFPOrFPVectorTy() &&
5401 "Expected floating point or integer vector type.");
5402 ISD = (IID == Intrinsic::minnum || IID == Intrinsic::maxnum)
5403 ? ISD::FMINNUM
5404 : ISD::FMINIMUM;
5405 }
5406
5407 // We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
5408 // and use that as the cost.
5409
5410 static const CostTblEntry SSE2CostTbl[] = {
5411 {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw
5412 {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw
5413 {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw
5414 };
5415
5416 static const CostTblEntry SSE41CostTbl[] = {
5417 {ISD::SMIN, MVT::v2i16, 3}, // same as sse2
5418 {ISD::SMIN, MVT::v4i16, 5}, // same as sse2
5419 {ISD::UMIN, MVT::v2i16, 5}, // same as sse2
5420 {ISD::UMIN, MVT::v4i16, 7}, // same as sse2
5421 {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor
5422 {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax
5423 {ISD::SMIN, MVT::v2i8, 3}, // pminsb
5424 {ISD::SMIN, MVT::v4i8, 5}, // pminsb
5425 {ISD::SMIN, MVT::v8i8, 7}, // pminsb
5426 {ISD::SMIN, MVT::v16i8, 6},
5427 {ISD::UMIN, MVT::v2i8, 3}, // same as sse2
5428 {ISD::UMIN, MVT::v4i8, 5}, // same as sse2
5429 {ISD::UMIN, MVT::v8i8, 7}, // same as sse2
5430 {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax
5431 };
5432
5433 static const CostTblEntry AVX1CostTbl[] = {
5434 {ISD::SMIN, MVT::v16i16, 6},
5435 {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax
5436 {ISD::SMIN, MVT::v32i8, 8},
5437 {ISD::UMIN, MVT::v32i8, 8},
5438 };
5439
5440 static const CostTblEntry AVX512BWCostTbl[] = {
5441 {ISD::SMIN, MVT::v32i16, 8},
5442 {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax
5443 {ISD::SMIN, MVT::v64i8, 10},
5444 {ISD::UMIN, MVT::v64i8, 10},
5445 };
5446
5447 // Before legalizing the type, give a chance to look up illegal narrow types
5448 // in the table.
5449 // FIXME: Is there a better way to do this?
5450 EVT VT = TLI->getValueType(DL, ValTy);
5451 if (VT.isSimple()) {
5452 MVT MTy = VT.getSimpleVT();
5453 if (ST->hasBWI())
5454 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5455 return Entry->Cost;
5456
5457 if (ST->hasAVX())
5458 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5459 return Entry->Cost;
5460
5461 if (ST->hasSSE41())
5462 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5463 return Entry->Cost;
5464
5465 if (ST->hasSSE2())
5466 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5467 return Entry->Cost;
5468 }
5469
5470 auto *ValVTy = cast<FixedVectorType>(ValTy);
5471 unsigned NumVecElts = ValVTy->getNumElements();
5472
5473 auto *Ty = ValVTy;
5474 InstructionCost MinMaxCost = 0;
5475 if (LT.first != 1 && MTy.isVector() &&
5476 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5477 // Type needs to be split. We need LT.first - 1 operations.
5478 Ty = FixedVectorType::get(ValVTy->getElementType(),
5479 MTy.getVectorNumElements());
5480 MinMaxCost = getMinMaxCost(IID, Ty, CostKind, FMF);
5481 MinMaxCost *= LT.first - 1;
5482 NumVecElts = MTy.getVectorNumElements();
5483 }
5484
5485 if (ST->hasBWI())
5486 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5487 return MinMaxCost + Entry->Cost;
5488
5489 if (ST->hasAVX())
5490 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5491 return MinMaxCost + Entry->Cost;
5492
5493 if (ST->hasSSE41())
5494 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5495 return MinMaxCost + Entry->Cost;
5496
5497 if (ST->hasSSE2())
5498 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5499 return MinMaxCost + Entry->Cost;
5500
5501 unsigned ScalarSize = ValTy->getScalarSizeInBits();
5502
5503 // Special case power of 2 reductions where the scalar type isn't changed
5504 // by type legalization.
5505 if (!isPowerOf2_32(ValVTy->getNumElements()) ||
5506 ScalarSize != MTy.getScalarSizeInBits())
5507 return BaseT::getMinMaxReductionCost(IID, ValTy, FMF, CostKind);
5508
5509 // Now handle reduction with the legal type, taking into account size changes
5510 // at each level.
5511 while (NumVecElts > 1) {
5512 // Determine the size of the remaining vector we need to reduce.
5513 unsigned Size = NumVecElts * ScalarSize;
5514 NumVecElts /= 2;
5515 // If we're reducing from 256/512 bits, use an extract_subvector.
5516 if (Size > 128) {
5517 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5518 MinMaxCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
5519 CostKind, NumVecElts, SubTy);
5520 Ty = SubTy;
5521 } else if (Size == 128) {
5522 // Reducing from 128 bits is a permute of v2f64/v2i64.
5523 VectorType *ShufTy;
5524 if (ValTy->isFloatingPointTy())
5525 ShufTy =
5526 FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2);
5527 else
5528 ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
5529 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5530 std::nullopt, CostKind, 0, nullptr);
5531 } else if (Size == 64) {
5532 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5533 FixedVectorType *ShufTy;
5534 if (ValTy->isFloatingPointTy())
5535 ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4);
5536 else
5537 ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
5538 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5539 std::nullopt, CostKind, 0, nullptr);
5540 } else {
5541 // Reducing from smaller size is a shift by immediate.
5542 auto *ShiftTy = FixedVectorType::get(
5543 Type::getIntNTy(ValTy->getContext(), Size), 128 / Size);
5544 MinMaxCost += getArithmeticInstrCost(
5545 Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput,
5546 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
5547 {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None});
5548 }
5549
5550 // Add the arithmetic op for this level.
5551 MinMaxCost += getMinMaxCost(IID, Ty, CostKind, FMF);
5552 }
5553
5554 // Add the final extract element to the cost.
5555 return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5556 CostKind, 0, nullptr, nullptr);
5557}
5558
5559/// Calculate the cost of materializing a 64-bit value. This helper
5560/// method might only calculate a fraction of a larger immediate. Therefore it
5561/// is valid to return a cost of ZERO.
5562 InstructionCost X86TTIImpl::getIntImmCost(int64_t Val) {
5563 if (Val == 0)
5564 return TTI::TCC_Free;
5565
5566 if (isInt<32>(Val))
5567 return TTI::TCC_Basic;
5568
5569 return 2 * TTI::TCC_Basic;
5570}
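// Resulting costs (a sketch): 0 is free, any value that fits in a
// sign-extended 32-bit immediate costs one TCC_Basic (a single imm32
// operand), and a full 64-bit constant costs two (roughly a movabs-style
// materialization).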
5571
5572 InstructionCost X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
5573 TTI::TargetCostKind CostKind) {
5574 assert(Ty->isIntegerTy());
5575
5576 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5577 if (BitSize == 0)
5578 return ~0U;
5579
5580 // Never hoist constants larger than 128 bits, because this might lead to
5581 // incorrect code generation or assertions in codegen.
5582 // FIXME: Create a cost model for types larger than i128 once the codegen
5583 // issues have been fixed.
5584 if (BitSize > 128)
5585 return TTI::TCC_Free;
5586
5587 if (Imm == 0)
5588 return TTI::TCC_Free;
5589
5590 // Sign-extend all constants to a multiple of 64-bit.
5591 APInt ImmVal = Imm;
5592 if (BitSize % 64 != 0)
5593 ImmVal = Imm.sext(alignTo(BitSize, 64));
5594
5595 // Split the constant into 64-bit chunks and calculate the cost for each
5596 // chunk.
5597 InstructionCost Cost = 0;
5598 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
5599 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
5600 int64_t Val = Tmp.getSExtValue();
5601 Cost += getIntImmCost(Val);
5602 }
5603 // We need at least one instruction to materialize the constant.
5604 return std::max<InstructionCost>(1, Cost);
5605}
5606
5607 InstructionCost X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
5608 const APInt &Imm, Type *Ty,
5609 TTI::TargetCostKind CostKind,
5610 Instruction *Inst) {
5611 assert(Ty->isIntegerTy());
5612
5613 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5614 // There is no cost model for constants with a bit size of 0. Return TCC_Free
5615 // here, so that constant hoisting will ignore this constant.
5616 if (BitSize == 0)
5617 return TTI::TCC_Free;
5618
5619 unsigned ImmIdx = ~0U;
5620 switch (Opcode) {
5621 default:
5622 return TTI::TCC_Free;
5623 case Instruction::GetElementPtr:
5624 // Always hoist the base address of a GetElementPtr. This prevents the
5625 // creation of new constants for every base constant that gets constant
5626 // folded with the offset.
5627 if (Idx == 0)
5628 return 2 * TTI::TCC_Basic;
5629 return TTI::TCC_Free;
5630 case Instruction::Store:
5631 ImmIdx = 0;
5632 break;
5633 case Instruction::ICmp:
5634 // This is an imperfect hack to prevent constant hoisting of
5635 // compares that might be trying to check if a 64-bit value fits in
5636 // 32-bits. The backend can optimize these cases using a right shift by 32.
5637 // Ideally we would check the compare predicate here. There are also other
5638 // similar immediates the backend can use shifts for.
5639 if (Idx == 1 && Imm.getBitWidth() == 64) {
5640 uint64_t ImmVal = Imm.getZExtValue();
5641 if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
5642 return TTI::TCC_Free;
5643 }
5644 ImmIdx = 1;
5645 break;
5646 case Instruction::And:
5647 // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
5648 // by using a 32-bit operation with implicit zero extension. Detect such
5649 // immediates here as the normal path expects bit 31 to be sign extended.
5650 if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.isIntN(32))
5651 return TTI::TCC_Free;
5652 ImmIdx = 1;
5653 break;
5654 case Instruction::Add:
5655 case Instruction::Sub:
5656 // For add/sub, we can use the opposite instruction for INT32_MIN.
5657 if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.getZExtValue() == 0x80000000)
5658 return TTI::TCC_Free;
5659 ImmIdx = 1;
5660 break;
5661 case Instruction::UDiv:
5662 case Instruction::SDiv:
5663 case Instruction::URem:
5664 case Instruction::SRem:
5665 // Division by constant is typically expanded later into a different
5666 // instruction sequence. This completely changes the constants.
5667 // Report them as "free" to stop ConstantHoist from marking them as opaque.
5668 return TTI::TCC_Free;
5669 case Instruction::Mul:
5670 case Instruction::Or:
5671 case Instruction::Xor:
5672 ImmIdx = 1;
5673 break;
5674 // Always return TCC_Free for the shift value of a shift instruction.
5675 case Instruction::Shl:
5676 case Instruction::LShr:
5677 case Instruction::AShr:
5678 if (Idx == 1)
5679 return TTI::TCC_Free;
5680 break;
5681 case Instruction::Trunc:
5682 case Instruction::ZExt:
5683 case Instruction::SExt:
5684 case Instruction::IntToPtr:
5685 case Instruction::PtrToInt:
5686 case Instruction::BitCast:
5687 case Instruction::PHI:
5688 case Instruction::Call:
5689 case Instruction::Select:
5690 case Instruction::Ret:
5691 case Instruction::Load:
5692 break;
5693 }
5694
5695 if (Idx == ImmIdx) {
5696 uint64_t NumConstants = divideCeil(BitSize, 64);
5697 InstructionCost Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
5698 return (Cost <= NumConstants * TTI::TCC_Basic)
5699 ? static_cast<int>(TTI::TCC_Free)
5700 : Cost;
5701 }
5702
5703 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
5704}
5705
5706 InstructionCost X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
5707 const APInt &Imm, Type *Ty,
5708 TTI::TargetCostKind CostKind) {
5709 assert(Ty->isIntegerTy());
5710
5711 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5712 // There is no cost model for constants with a bit size of 0. Return TCC_Free
5713 // here, so that constant hoisting will ignore this constant.
5714 if (BitSize == 0)
5715 return TTI::TCC_Free;
5716
5717 switch (IID) {
5718 default:
5719 return TTI::TCC_Free;
5720 case Intrinsic::sadd_with_overflow:
5721 case Intrinsic::uadd_with_overflow:
5722 case Intrinsic::ssub_with_overflow:
5723 case Intrinsic::usub_with_overflow:
5724 case Intrinsic::smul_with_overflow:
5725 case Intrinsic::umul_with_overflow:
5726 if ((Idx == 1) && Imm.getBitWidth() <= 64 && Imm.isSignedIntN(32))
5727 return TTI::TCC_Free;
5728 break;
5729 case Intrinsic::experimental_stackmap:
5730 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
5731 return TTI::TCC_Free;
5732 break;
5733 case Intrinsic::experimental_patchpoint_void:
5734 case Intrinsic::experimental_patchpoint:
5735 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
5736 return TTI::TCC_Free;
5737 break;
5738 }
5739 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
5740}
5741
5742 InstructionCost X86TTIImpl::getCFInstrCost(unsigned Opcode,
5743 TTI::TargetCostKind CostKind,
5744 const Instruction *I) {
5745 if (CostKind != TTI::TCK_RecipThroughput)
5746 return Opcode == Instruction::PHI ? 0 : 1;
5747 // Branches are assumed to be predicted.
5748 return 0;
5749}
5750
5751int X86TTIImpl::getGatherOverhead() const {
5752 // Some CPUs have more overhead for gather. The specified overhead is relative
5753 // to the Load operation. "2" is the number provided by Intel architects. This
5754 // parameter is used for cost estimation of Gather Op and comparison with
5755 // other alternatives.
5756 // TODO: Remove the explicit hasAVX512()? That would mean we would only
5757 // enable gather with a -march.
5758 if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather()))
5759 return 2;
5760
5761 return 1024;
5762}
5763
5764int X86TTIImpl::getScatterOverhead() const {
5765 if (ST->hasAVX512())
5766 return 2;
5767
5768 return 1024;
5769}
5770
5771// Return an average cost of Gather / Scatter instruction, maybe improved later.
5772// FIXME: Add TargetCostKind support.
5773InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode,
5774 TTI::TargetCostKind CostKind,
5775 Type *SrcVTy, const Value *Ptr,
5776 Align Alignment,
5777 unsigned AddressSpace) {
5778
5779 assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
5780 unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
5781
5782 // Try to reduce index size from 64 bit (default for GEP)
5783 // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
5784 // operation will use 16 x 64 indices which do not fit in a zmm and needs
5785 // to split. Also check that the base pointer is the same for all lanes,
5786 // and that there's at most one variable index.
5787 auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) {
5788 unsigned IndexSize = DL.getPointerSizeInBits();
5789 const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
5790 if (IndexSize < 64 || !GEP)
5791 return IndexSize;
5792
5793 unsigned NumOfVarIndices = 0;
5794 const Value *Ptrs = GEP->getPointerOperand();
5795 if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
5796 return IndexSize;
5797 for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I) {
5798 if (isa<Constant>(GEP->getOperand(I)))
5799 continue;
5800 Type *IndxTy = GEP->getOperand(I)->getType();
5801 if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy))
5802 IndxTy = IndexVTy->getElementType();
5803 if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
5804 !isa<SExtInst>(GEP->getOperand(I))) ||
5805 ++NumOfVarIndices > 1)
5806 return IndexSize; // 64
5807 }
5808 return (unsigned)32;
5809 };
5810
5811 // Trying to reduce IndexSize to 32 bits for vector 16.
5812 // By default the IndexSize is equal to pointer size.
5813 unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
5814 ? getIndexSizeInBits(Ptr, DL)
5815 : DL.getPointerSizeInBits();
5816
5817 auto *IndexVTy = FixedVectorType::get(
5818 IntegerType::get(SrcVTy->getContext(), IndexSize), VF);
5819 std::pair<InstructionCost, MVT> IdxsLT = getTypeLegalizationCost(IndexVTy);
5820 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(SrcVTy);
5821 InstructionCost::CostType SplitFactor =
5822 *std::max(IdxsLT.first, SrcLT.first).getValue();
5823 if (SplitFactor > 1) {
5824 // Handle splitting of vector of pointers
5825 auto *SplitSrcTy =
5826 FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
5827 return SplitFactor * getGSVectorCost(Opcode, CostKind, SplitSrcTy, Ptr,
5828 Alignment, AddressSpace);
5829 }
5830
5831 // The gather / scatter cost is given by Intel architects. It is a rough
5832 // number since we are looking at one instruction at a time.
5833 const int GSOverhead = (Opcode == Instruction::Load)
5834 ? getGatherOverhead()
5835 : getScatterOverhead();
5836 return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
5837 MaybeAlign(Alignment), AddressSpace,
5838 CostKind);
5839}
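// Illustrative example of the index-size reduction above (a sketch): a gather
// of 16 x float whose GEP index is a sign-extended i32 can use dword indices
// in a single vpgatherdps, whereas genuine i64 indices legalize to two
// qword-indexed gathers, which is what the SplitFactor handling models.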
5840
5841/// Return the cost of full scalarization of gather / scatter operation.
5842///
5843/// Opcode - Load or Store instruction.
5844/// SrcVTy - The type of the data vector that should be gathered or scattered.
5845/// VariableMask - The mask is non-constant at compile time.
5846/// Alignment - Alignment for one element.
5847/// AddressSpace - pointer[s] address space.
5848/// TODO: Remove this and use getCommonMaskedMemoryOpCost directly.
5849InstructionCost X86TTIImpl::getGSScalarCost(unsigned Opcode,
5850 TTI::TargetCostKind CostKind,
5851 Type *SrcVTy, bool VariableMask,
5852 Align Alignment,
5853 unsigned AddressSpace) {
5854 Type *ScalarTy = SrcVTy->getScalarType();
5855 unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
5856 APInt DemandedElts = APInt::getAllOnes(VF);
5857
5858 InstructionCost MaskUnpackCost = 0;
5859 if (VariableMask) {
5860 auto *MaskTy =
5861 FixedVectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
5862 MaskUnpackCost = getScalarizationOverhead(
5863 MaskTy, DemandedElts, /*Insert=*/false, /*Extract=*/true, CostKind);
5864 InstructionCost ScalarCompareCost = getCmpSelInstrCost(
5865 Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()), nullptr,
5866 CmpInst::BAD_ICMP_PREDICATE, CostKind);
5867 InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
5868 MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
5869 }
5870
5871 InstructionCost AddressUnpackCost = getScalarizationOverhead(
5872 FixedVectorType::get(PointerType::getUnqual(ScalarTy->getContext()), VF),
5873 DemandedElts, /*Insert=*/false, /*Extract=*/true, CostKind);
5874
5875 // The cost of the scalar loads/stores.
5876 InstructionCost MemoryOpCost =
5877 VF * getMemoryOpCost(Opcode, ScalarTy, MaybeAlign(Alignment),
5878 AddressSpace, CostKind);
5879
5880 // The cost of forming the vector from loaded scalars/
5881 // scalarizing the vector to perform scalar stores.
5882 InstructionCost InsertExtractCost = getScalarizationOverhead(
5883 cast<FixedVectorType>(SrcVTy), DemandedElts,
5884 /*Insert=*/Opcode == Instruction::Load,
5885 /*Extract=*/Opcode == Instruction::Store, CostKind);
5886
5887 return AddressUnpackCost + MemoryOpCost + MaskUnpackCost + InsertExtractCost;
5888}
5889
5890/// Calculate the cost of Gather / Scatter operation
5891 InstructionCost X86TTIImpl::getGatherScatterOpCost(
5892 unsigned Opcode, Type *SrcVTy, const Value *Ptr, bool VariableMask,
5893 Align Alignment, TTI::TargetCostKind CostKind,
5894 const Instruction *I = nullptr) {
5895 if (CostKind != TTI::TCK_RecipThroughput) {
5896 if ((Opcode == Instruction::Load &&
5897 isLegalMaskedGather(SrcVTy, Align(Alignment)) &&
5898 !forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
5899 Align(Alignment))) ||
5900 (Opcode == Instruction::Store &&
5901 isLegalMaskedScatter(SrcVTy, Align(Alignment)) &&
5902 !forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
5903 Align(Alignment))))
5904 return 1;
5905 return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask,
5906 Alignment, CostKind, I);
5907 }
5908
5909 assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
5910 PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
5911 if (!PtrTy && Ptr->getType()->isVectorTy())
5912 PtrTy = dyn_cast<PointerType>(
5913 cast<VectorType>(Ptr->getType())->getElementType());
5914 assert(PtrTy && "Unexpected type for Ptr argument");
5915 unsigned AddressSpace = PtrTy->getAddressSpace();
5916
5917 if ((Opcode == Instruction::Load &&
5918 (!isLegalMaskedGather(SrcVTy, Align(Alignment)) ||
5919 forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
5920 Align(Alignment)))) ||
5921 (Opcode == Instruction::Store &&
5922 (!isLegalMaskedScatter(SrcVTy, Align(Alignment)) ||
5923 forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
5924 Align(Alignment)))))
5925 return getGSScalarCost(Opcode, CostKind, SrcVTy, VariableMask, Alignment,
5926 AddressSpace);
5927
5928 return getGSVectorCost(Opcode, CostKind, SrcVTy, Ptr, Alignment,
5929 AddressSpace);
5930}
5931
5932 bool X86TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
5933 const TargetTransformInfo::LSRCost &C2) {
5934 // X86 specific here are "instruction number 1st priority".
5935 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
5936 C1.NumIVMuls, C1.NumBaseAdds,
5937 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
5938 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
5939 C2.NumIVMuls, C2.NumBaseAdds,
5940 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
5941}
5942
5943 bool X86TTIImpl::canMacroFuseCmp() {
5944 return ST->hasMacroFusion() || ST->hasBranchFusion();
5945}
5946
5947bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
5948 if (!ST->hasAVX())
5949 return false;
5950
5951 // The backend can't handle a single element vector.
5952 if (isa<VectorType>(DataTy) &&
5953 cast<FixedVectorType>(DataTy)->getNumElements() == 1)
5954 return false;
5955 Type *ScalarTy = DataTy->getScalarType();
5956
5957 if (ScalarTy->isPointerTy())
5958 return true;
5959
5960 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
5961 return true;
5962
5963 if (ScalarTy->isHalfTy() && ST->hasBWI())
5964 return true;
5965
5966 if (ScalarTy->isBFloatTy() && ST->hasBF16())
5967 return true;
5968
5969 if (!ScalarTy->isIntegerTy())
5970 return false;
5971
5972 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
5973 return IntWidth == 32 || IntWidth == 64 ||
5974 ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
5975}
5976
5977bool X86TTIImpl::isLegalMaskedStore(Type *DataType, Align Alignment) {
5978 return isLegalMaskedLoad(DataType, Alignment);
5979}
5980
5981bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) {
5982 unsigned DataSize = DL.getTypeStoreSize(DataType);
5983 // The only supported nontemporal loads are for aligned vectors of 16 or 32
5984 // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2
5985 // (the equivalent stores only require AVX).
5986 if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
5987 return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2();
5988
5989 return false;
5990}
5991
5992bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) {
5993 unsigned DataSize = DL.getTypeStoreSize(DataType);
5994
5995 // SSE4A supports nontemporal stores of float and double at arbitrary
5996 // alignment.
5997 if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
5998 return true;
5999
6000 // Besides the SSE4A subtarget exception above, only aligned stores are
6001 // available nontemporally on any other subtarget. And only stores with a size
6002 // of 4..32 bytes (powers of 2, only) are permitted.
6003 if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
6004 !isPowerOf2_32(DataSize))
6005 return false;
6006
6007 // 32-byte vector nontemporal stores are supported by AVX (the equivalent
6008 // loads require AVX2).
6009 if (DataSize == 32)
6010 return ST->hasAVX();
6011 if (DataSize == 16)
6012 return ST->hasSSE1();
6013 return true;
6014}
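// A few concrete cases for the rules above (illustrative): a 16-byte aligned
// <4 x float> store maps to MOVNTPS and only needs SSE1; a 32-byte aligned
// <8 x float> store needs AVX for VMOVNTPS; and with SSE4A a scalar float or
// double can use MOVNTSS / MOVNTSD at any alignment.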
6015
6016 bool X86TTIImpl::isLegalBroadcastLoad(Type *ElementTy,
6017 ElementCount NumElements) const {
6018 // movddup
6019 return ST->hasSSE3() && !NumElements.isScalable() &&
6020 NumElements.getFixedValue() == 2 &&
6021 ElementTy == Type::getDoubleTy(ElementTy->getContext());
6022}
6023
6024 bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy, Align Alignment) {
6025 if (!isa<VectorType>(DataTy))
6026 return false;
6027
6028 if (!ST->hasAVX512())
6029 return false;
6030
6031 // The backend can't handle a single element vector.
6032 if (cast<FixedVectorType>(DataTy)->getNumElements() == 1)
6033 return false;
6034
6035 Type *ScalarTy = cast<VectorType>(DataTy)->getElementType();
6036
6037 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6038 return true;
6039
6040 if (!ScalarTy->isIntegerTy())
6041 return false;
6042
6043 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6044 return IntWidth == 32 || IntWidth == 64 ||
6045 ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
6046}
6047
6048 bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy, Align Alignment) {
6049 return isLegalMaskedExpandLoad(DataTy, Alignment);
6050}
6051
6052bool X86TTIImpl::supportsGather() const {
6053 // Some CPUs have better gather performance than others.
6054 // TODO: Remove the explicit ST->hasAVX512()? That would mean we would only
6055 // enable gather with a -march.
6056 return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2());
6057}
6058
6059 bool X86TTIImpl::forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) {
6060 // Gather / Scatter for vector 2 is not profitable on KNL / SKX
6061 // Vector-4 of gather/scatter instruction does not exist on KNL. We can extend
6062 // it to 8 elements, but zeroing upper bits of the mask vector will add more
6063 // instructions. Right now we give the scalar cost of vector-4 for KNL. TODO:
6064 // Check, maybe the gather/scatter instruction is better in the VariableMask
6065 // case.
6066 unsigned NumElts = cast<FixedVectorType>(VTy)->getNumElements();
6067 return NumElts == 1 ||
6068 (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX())));
6069}
6070
6071 bool X86TTIImpl::isLegalMaskedGatherScatter(Type *DataTy, Align Alignment) {
6072 Type *ScalarTy = DataTy->getScalarType();
6073 if (ScalarTy->isPointerTy())
6074 return true;
6075
6076 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6077 return true;
6078
6079 if (!ScalarTy->isIntegerTy())
6080 return false;
6081
6082 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6083 return IntWidth == 32 || IntWidth == 64;
6084}
6085
6086 bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
6087 if (!supportsGather() || !ST->preferGather())
6088 return false;
6089 return isLegalMaskedGatherScatter(DataTy, Alignment);
6090}
6091
6092bool X86TTIImpl::isLegalAltInstr(VectorType *VecTy, unsigned Opcode0,
6093 unsigned Opcode1,
6094 const SmallBitVector &OpcodeMask) const {
6095 // ADDSUBPS 4xf32 SSE3
6096 // VADDSUBPS 4xf32 AVX
6097 // VADDSUBPS 8xf32 AVX2
6098 // ADDSUBPD 2xf64 SSE3
6099 // VADDSUBPD 2xf64 AVX
6100 // VADDSUBPD 4xf64 AVX2
6101
6102 unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
6103 assert(OpcodeMask.size() == NumElements && "Mask and VecTy are incompatible");
6104 if (!isPowerOf2_32(NumElements))
6105 return false;
6106 // Check the opcode pattern. We apply the mask on the opcode arguments and
6107 // then check if it is what we expect.
6108 for (int Lane : seq<int>(0, NumElements)) {
6109 unsigned Opc = OpcodeMask.test(Lane) ? Opcode1 : Opcode0;
6110 // We expect FSub for even lanes and FAdd for odd lanes.
6111 if (Lane % 2 == 0 && Opc != Instruction::FSub)
6112 return false;
6113 if (Lane % 2 == 1 && Opc != Instruction::FAdd)
6114 return false;
6115 }
6116 // Now check that the pattern is supported by the target ISA.
6117 Type *ElemTy = cast<VectorType>(VecTy)->getElementType();
6118 if (ElemTy->isFloatTy())
6119 return ST->hasSSE3() && NumElements % 4 == 0;
6120 if (ElemTy->isDoubleTy())
6121 return ST->hasSSE3() && NumElements % 2 == 0;
6122 return false;
6123}
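// Example of an accepted pattern (a sketch): for a <4 x float> candidate
// whose OpcodeMask resolves even lanes (0, 2) to FSub and odd lanes (1, 3)
// to FAdd, the checks above succeed and the pattern corresponds to ADDSUBPS
// on SSE3-capable targets.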
6124
6125bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) {
6126 // AVX2 doesn't support scatter
6127 if (!ST->hasAVX512() || !ST->preferScatter())
6128 return false;
6129 return isLegalMaskedGatherScatter(DataType, Alignment);
6130}
6131
6132bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
6133 EVT VT = TLI->getValueType(DL, DataType);
6134 return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
6135}
6136
6137 bool X86TTIImpl::isExpensiveToSpeculativelyExecute(const Instruction *I) {
6138 // FDIV is always expensive, even if it has a very low uop count.
6139 // TODO: Still necessary for recent CPUs with low latency/throughput fdiv?
6140 if (I->getOpcode() == Instruction::FDiv)
6141 return true;
6142
6143 return BaseT::isExpensiveToSpeculativelyExecute(I);
6144 }
6145
6146 bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
6147 return false;
6148}
6149
6150 bool X86TTIImpl::areInlineCompatible(const Function *Caller,
6151 const Function *Callee) const {
6152 const TargetMachine &TM = getTLI()->getTargetMachine();
6153
6154 // Work this as a subsetting of subtarget features.
6155 const FeatureBitset &CallerBits =
6156 TM.getSubtargetImpl(*Caller)->getFeatureBits();
6157 const FeatureBitset &CalleeBits =
6158 TM.getSubtargetImpl(*Callee)->getFeatureBits();
6159
6160 // Check whether features are the same (apart from the ignore list).
6161 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
6162 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
6163 if (RealCallerBits == RealCalleeBits)
6164 return true;
6165
6166 // If the features are a subset, we need to additionally check for calls
6167 // that may become ABI-incompatible as a result of inlining.
6168 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
6169 return false;
6170
6171 for (const Instruction &I : instructions(Callee)) {
6172 if (const auto *CB = dyn_cast<CallBase>(&I)) {
6173 // Having more target features is fine for inline ASM.
6174 if (CB->isInlineAsm())
6175 continue;
6176
6177 SmallVector<Type *, 8> Types;
6178 for (Value *Arg : CB->args())
6179 Types.push_back(Arg->getType());
6180 if (!CB->getType()->isVoidTy())
6181 Types.push_back(CB->getType());
6182
6183 // Simple types are always ABI compatible.
6184 auto IsSimpleTy = [](Type *Ty) {
6185 return !Ty->isVectorTy() && !Ty->isAggregateType();
6186 };
6187 if (all_of(Types, IsSimpleTy))
6188 continue;
6189
6190 if (Function *NestedCallee = CB->getCalledFunction()) {
6191 // Assume that intrinsics are always ABI compatible.
6192 if (NestedCallee->isIntrinsic())
6193 continue;
6194
6195 // Do a precise compatibility check.
6196 if (!areTypesABICompatible(Caller, NestedCallee, Types))
6197 return false;
6198 } else {
6199 // We don't know the target features of the callee,
6200 // assume it is incompatible.
6201 return false;
6202 }
6203 }
6204 }
6205 return true;
6206}
6207
6208 bool X86TTIImpl::areTypesABICompatible(const Function *Caller,
6209 const Function *Callee,
6210 const ArrayRef<Type *> &Types) const {
6211 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
6212 return false;
6213
6214 // If we get here, we know the target features match. If one function
6215 // considers 512-bit vectors legal and the other does not, consider them
6216 // incompatible.
6217 const TargetMachine &TM = getTLI()->getTargetMachine();
6218
6219 if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
6220 TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs())
6221 return true;
6222
6223 // Consider the arguments compatible if they aren't vectors or aggregates.
6224 // FIXME: Look at the size of vectors.
6225 // FIXME: Look at the element types of aggregates to see if there are vectors.
6226 return llvm::none_of(Types,
6227 [](Type *T) { return T->isVectorTy() || T->isAggregateType(); });
6228}
6229
6230 TTI::MemCmpExpansionOptions
6231 X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
6232 TTI::MemCmpExpansionOptions Options;
6233 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
6234 Options.NumLoadsPerBlock = 2;
6235 // All GPR and vector loads can be unaligned.
6236 Options.AllowOverlappingLoads = true;
6237 if (IsZeroCmp) {
6238 // Only enable vector loads for equality comparison. Right now the vector
6239 // version is not as fast for three way compare (see #33329).
6240 const unsigned PreferredWidth = ST->getPreferVectorWidth();
6241 if (PreferredWidth >= 512 && ST->hasAVX512() && ST->hasEVEX512())
6242 Options.LoadSizes.push_back(64);
6243 if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32);
6244 if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
6245 }
6246 if (ST->is64Bit()) {
6247 Options.LoadSizes.push_back(8);
6248 }
6249 Options.LoadSizes.push_back(4);
6250 Options.LoadSizes.push_back(2);
6251 Options.LoadSizes.push_back(1);
6252 return Options;
6253}
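// Illustrative result (a sketch): on a 64-bit AVX2 target with a 256-bit
// preferred vector width, an equality-only memcmp expansion gets load sizes
// {32, 16, 8, 4, 2, 1}, while a three-way compare is limited to the GPR
// sizes {8, 4, 2, 1}.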
6254
6255 bool X86TTIImpl::prefersVectorizedAddressing() const {
6256 return supportsGather();
6257}
6258
6259 bool X86TTIImpl::supportsEfficientVectorElementLoadStore() const {
6260 return false;
6261}
6262
6263 bool X86TTIImpl::enableInterleavedAccessVectorization() {
6264 // TODO: We expect this to be beneficial regardless of arch,
6265 // but there are currently some unexplained performance artifacts on Atom.
6266 // As a temporary solution, disable on Atom.
6267 return !(ST->isAtom());
6268}
6269
6270// Get estimation for interleaved load/store operations and strided load.
6271// \p Indices contains indices for strided load.
6272// \p Factor - the factor of interleaving.
6273// AVX-512 provides 3-src shuffles that significantly reduces the cost.
6274 InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
6275 unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
6276 ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
6277 TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
6278 // VecTy for interleave memop is <VF*Factor x Elt>.
6279 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6280 // VecTy = <12 x i32>.
6281
6282 // Calculate the number of memory operations (NumOfMemOps), required
6283 // for load/store the VecTy.
6284 MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6285 unsigned VecTySize = DL.getTypeStoreSize(VecTy);
6286 unsigned LegalVTSize = LegalVT.getStoreSize();
6287 unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
6288
6289 // Get the cost of one memory operation.
6290 auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
6291 LegalVT.getVectorNumElements());
6292 InstructionCost MemOpCost;
6293 bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps;
6294 if (UseMaskedMemOp)
6295 MemOpCost = getMaskedMemoryOpCost(Opcode, SingleMemOpTy, Alignment,
6296 AddressSpace, CostKind);
6297 else
6298 MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, MaybeAlign(Alignment),
6299 AddressSpace, CostKind);
6300
6301 unsigned VF = VecTy->getNumElements() / Factor;
6302 MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);
6303
6304 InstructionCost MaskCost;
6305 if (UseMaskedMemOp) {
6306 APInt DemandedLoadStoreElts = APInt::getZero(VecTy->getNumElements());
6307 for (unsigned Index : Indices) {
6308 assert(Index < Factor && "Invalid index for interleaved memory op");
6309 for (unsigned Elm = 0; Elm < VF; Elm++)
6310 DemandedLoadStoreElts.setBit(Index + Elm * Factor);
6311 }
6312
6313 Type *I1Type = Type::getInt1Ty(VecTy->getContext());
6314
6315 MaskCost = getReplicationShuffleCost(
6316 I1Type, Factor, VF,
6317 UseMaskForGaps ? DemandedLoadStoreElts
6318 : APInt::getAllOnes(VecTy->getNumElements()),
6319 CostKind);
6320
6321 // The Gaps mask is invariant and created outside the loop, therefore the
6322 // cost of creating it is not accounted for here. However if we have both
6323 // a MaskForGaps and some other mask that guards the execution of the
6324 // memory access, we need to account for the cost of And-ing the two masks
6325 // inside the loop.
6326 if (UseMaskForGaps) {
6327 auto *MaskVT = FixedVectorType::get(I1Type, VecTy->getNumElements());
6328 MaskCost += getArithmeticInstrCost(BinaryOperator::And, MaskVT, CostKind);
6329 }
6330 }
6331
6332 if (Opcode == Instruction::Load) {
6333 // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
6334 // contain the cost of the optimized shuffle sequence that the
6335 // X86InterleavedAccess pass will generate.
6336 // The cost of loads and stores are computed separately from the table.
6337
6338 // X86InterleavedAccess support only the following interleaved-access group.
6339 static const CostTblEntry AVX512InterleavedLoadTbl[] = {
6340 {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8
6341 {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8
6342 {3, MVT::v64i8, 22}, //(load 96i8 and) deinterleave into 3 x 32i8
6343 };
6344
6345 if (const auto *Entry =
6346 CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
6347 return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6348 // If an entry does not exist, fall back to the default implementation.
6349
6350 // Kind of shuffle depends on number of loaded values.
6351 // If we load the entire data in one register, we can use a 1-src shuffle.
6352 // Otherwise, we'll merge 2 sources in each operation.
6353 TTI::ShuffleKind ShuffleKind =
6354 (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
6355
6356 InstructionCost ShuffleCost = getShuffleCost(
6357 ShuffleKind, SingleMemOpTy, std::nullopt, CostKind, 0, nullptr);
6358
6359 unsigned NumOfLoadsInInterleaveGrp =
6360 Indices.size() ? Indices.size() : Factor;
6361 auto *ResultTy = FixedVectorType::get(VecTy->getElementType(),
6362 VecTy->getNumElements() / Factor);
6363 InstructionCost NumOfResults =
6364 getTypeLegalizationCost(ResultTy).first * NumOfLoadsInInterleaveGrp;
6365
6366 // About a half of the loads may be folded in shuffles when we have only
6367 // one result. If we have more than one result, or the loads are masked,
6368 // we do not fold loads at all.
6369 unsigned NumOfUnfoldedLoads =
6370 UseMaskedMemOp || NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
6371
6372 // Get a number of shuffle operations per result.
6373 unsigned NumOfShufflesPerResult =
6374 std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
6375
6376 // The two-source shuffle (SK_PermuteTwoSrc) clobbers one of its source operands.
6377 // When we have more than one destination, we need additional instructions
6378 // to keep sources.
6379 InstructionCost NumOfMoves = 0;
6380 if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
6381 NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
6382
6383 InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
6384 MaskCost + NumOfUnfoldedLoads * MemOpCost +
6385 NumOfMoves;
6386
6387 return Cost;
6388 }
6389
6390 // Store.
6391 assert(Opcode == Instruction::Store &&
6392 "Expected Store Instruction at this point");
6393 // X86InterleavedAccess support only the following interleaved-access group.
6394 static const CostTblEntry AVX512InterleavedStoreTbl[] = {
6395 {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
6396 {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
6397 {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 96i8 (and store)
6398
6399 {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store)
6400 {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
6401 {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
6402 {4, MVT::v64i8, 24} // interleave 4 x 32i8 into 256i8 (and store)
6403 };
6404
6405 if (const auto *Entry =
6406 CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
6407 return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6408 // If an entry does not exist, fall back to the default implementation.
6409
6410 // There are no strided stores at the moment, and a store can't be folded
6411 // into a shuffle.
6412 unsigned NumOfSources = Factor; // The number of values to be merged.
6413 InstructionCost ShuffleCost = getShuffleCost(
6414 TTI::SK_PermuteTwoSrc, SingleMemOpTy, std::nullopt, CostKind, 0, nullptr);
6415 unsigned NumOfShufflesPerStore = NumOfSources - 1;
6416
6417 // The two-source shuffle (SK_PermuteTwoSrc) clobbers one of its source operands.
6418 // We need additional instructions to keep sources.
6419 unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
6420 InstructionCost Cost =
6421 MaskCost +
6422 NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
6423 NumOfMoves;
6424 return Cost;
6425}
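// Worked example for the store table above (illustrative): interleaving
// 3 x <16 x i8> into a 48-byte stream adds the table entry of 12 shuffle
// units on top of MaskCost and the NumOfMemOps * MemOpCost term; groups not
// in either table fall back to the generic shuffle-based formula computed
// after the lookup.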
6426
6427 InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
6428 unsigned Opcode, Type *BaseTy, unsigned Factor, ArrayRef<unsigned> Indices,
6429 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
6430 bool UseMaskForCond, bool UseMaskForGaps) {
6431 auto *VecTy = cast<FixedVectorType>(BaseTy);
6432
6433 auto isSupportedOnAVX512 = [&](Type *VecTy) {
6434 Type *EltTy = cast<VectorType>(VecTy)->getElementType();
6435 if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
6436 EltTy->isIntegerTy(32) || EltTy->isPointerTy())
6437 return true;
6438 if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy())
6439 return ST->hasBWI();
6440 if (EltTy->isBFloatTy())
6441 return ST->hasBF16();
6442 return false;
6443 };
6444 if (ST->hasAVX512() && isSupportedOnAVX512(VecTy))
6445 return getInterleavedMemoryOpCostAVX512(
6446 Opcode, VecTy, Factor, Indices, Alignment,
6447 AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
6448
6449 if (UseMaskForCond || UseMaskForGaps)
6450 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6451 Alignment, AddressSpace, CostKind,
6452 UseMaskForCond, UseMaskForGaps);
6453
6454 // Get estimation for interleaved load/store operations for SSE-AVX2.
6455 // As opposed to AVX-512, SSE and AVX2 do not have generic shuffles that allow
6456 // computing the cost using a generic formula as a function of generic
6457 // shuffles. We therefore use a lookup table instead, filled according to
6458 // the instruction sequences that codegen currently generates.
6459
6460 // VecTy for interleave memop is <VF*Factor x Elt>.
6461 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6462 // VecTy = <12 x i32>.
6463 MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6464
6465 // This function can be called with VecTy=<6xi128>, Factor=3, in which case
6466 // the VF=2, while v2i128 is an unsupported MVT vector type
6467 // (see MachineValueType.h::getVectorVT()).
6468 if (!LegalVT.isVector())
6469 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6470 Alignment, AddressSpace, CostKind);
6471
6472 unsigned VF = VecTy->getNumElements() / Factor;
6473 Type *ScalarTy = VecTy->getElementType();
6474 // Deduplicate entries, model floats/pointers as appropriately-sized integers.
6475 if (!ScalarTy->isIntegerTy())
6476 ScalarTy =
6477 Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy));
6478
6479 // Get the cost of all the memory operations.
6480 // FIXME: discount dead loads.
6481 InstructionCost MemOpCosts = getMemoryOpCost(
6482 Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, CostKind);
6483
6484 auto *VT = FixedVectorType::get(ScalarTy, VF);
6485 EVT ETy = TLI->getValueType(DL, VT);
6486 if (!ETy.isSimple())
6487 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6488 Alignment, AddressSpace, CostKind);
6489
6490 // TODO: Complete for other data-types and strides.
6491 // Each combination of Stride, element bit width and VF results in a different
6492 // sequence; The cost tables are therefore accessed with:
6493 // Factor (stride) and VectorType=VFxiN.
6494 // The Cost accounts only for the shuffle sequence;
6495 // The cost of the loads/stores is accounted for separately.
6496 //
6497 static const CostTblEntry AVX2InterleavedLoadTbl[] = {
6498 {2, MVT::v2i8, 2}, // (load 4i8 and) deinterleave into 2 x 2i8
6499 {2, MVT::v4i8, 2}, // (load 8i8 and) deinterleave into 2 x 4i8
6500 {2, MVT::v8i8, 2}, // (load 16i8 and) deinterleave into 2 x 8i8
6501 {2, MVT::v16i8, 4}, // (load 32i8 and) deinterleave into 2 x 16i8
6502 {2, MVT::v32i8, 6}, // (load 64i8 and) deinterleave into 2 x 32i8
6503
6504 {2, MVT::v8i16, 6}, // (load 16i16 and) deinterleave into 2 x 8i16
6505 {2, MVT::v16i16, 9}, // (load 32i16 and) deinterleave into 2 x 16i16
6506 {2, MVT::v32i16, 18}, // (load 64i16 and) deinterleave into 2 x 32i16
6507
6508 {2, MVT::v8i32, 4}, // (load 16i32 and) deinterleave into 2 x 8i32
6509 {2, MVT::v16i32, 8}, // (load 32i32 and) deinterleave into 2 x 16i32
6510 {2, MVT::v32i32, 16}, // (load 64i32 and) deinterleave into 2 x 32i32
6511
6512 {2, MVT::v4i64, 4}, // (load 8i64 and) deinterleave into 2 x 4i64
6513 {2, MVT::v8i64, 8}, // (load 16i64 and) deinterleave into 2 x 8i64
6514 {2, MVT::v16i64, 16}, // (load 32i64 and) deinterleave into 2 x 16i64
6515 {2, MVT::v32i64, 32}, // (load 64i64 and) deinterleave into 2 x 32i64
6516
6517 {3, MVT::v2i8, 3}, // (load 6i8 and) deinterleave into 3 x 2i8
6518 {3, MVT::v4i8, 3}, // (load 12i8 and) deinterleave into 3 x 4i8
6519 {3, MVT::v8i8, 6}, // (load 24i8 and) deinterleave into 3 x 8i8
6520 {3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8
6521 {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8
6522
6523 {3, MVT::v2i16, 5}, // (load 6i16 and) deinterleave into 3 x 2i16
6524 {3, MVT::v4i16, 7}, // (load 12i16 and) deinterleave into 3 x 4i16
6525 {3, MVT::v8i16, 9}, // (load 24i16 and) deinterleave into 3 x 8i16
6526 {3, MVT::v16i16, 28}, // (load 48i16 and) deinterleave into 3 x 16i16
6527 {3, MVT::v32i16, 56}, // (load 96i16 and) deinterleave into 3 x 32i16
6528
6529 {3, MVT::v2i32, 3}, // (load 6i32 and) deinterleave into 3 x 2i32
6530 {3, MVT::v4i32, 3}, // (load 12i32 and) deinterleave into 3 x 4i32
6531 {3, MVT::v8i32, 7}, // (load 24i32 and) deinterleave into 3 x 8i32
6532 {3, MVT::v16i32, 14}, // (load 48i32 and) deinterleave into 3 x 16i32
6533 {3, MVT::v32i32, 32}, // (load 96i32 and) deinterleave into 3 x 32i32
6534
6535 {3, MVT::v2i64, 1}, // (load 6i64 and) deinterleave into 3 x 2i64
6536 {3, MVT::v4i64, 5}, // (load 12i64 and) deinterleave into 3 x 4i64
6537 {3, MVT::v8i64, 10}, // (load 24i64 and) deinterleave into 3 x 8i64
6538 {3, MVT::v16i64, 20}, // (load 48i64 and) deinterleave into 3 x 16i64
6539
6540 {4, MVT::v2i8, 4}, // (load 8i8 and) deinterleave into 4 x 2i8
6541 {4, MVT::v4i8, 4}, // (load 16i8 and) deinterleave into 4 x 4i8
6542 {4, MVT::v8i8, 12}, // (load 32i8 and) deinterleave into 4 x 8i8
6543 {4, MVT::v16i8, 24}, // (load 64i8 and) deinterleave into 4 x 16i8
6544 {4, MVT::v32i8, 56}, // (load 128i8 and) deinterleave into 4 x 32i8
6545
6546 {4, MVT::v2i16, 6}, // (load 8i16 and) deinterleave into 4 x 2i16
6547 {4, MVT::v4i16, 17}, // (load 16i16 and) deinterleave into 4 x 4i16
6548 {4, MVT::v8i16, 33}, // (load 32i16 and) deinterleave into 4 x 8i16
6549 {4, MVT::v16i16, 75}, // (load 64i16 and) deinterleave into 4 x 16i16
6550 {4, MVT::v32i16, 150}, // (load 128i16 and) deinterleave into 4 x 32i16
6551
6552 {4, MVT::v2i32, 4}, // (load 8i32 and) deinterleave into 4 x 2i32
6553 {4, MVT::v4i32, 8}, // (load 16i32 and) deinterleave into 4 x 4i32
6554 {4, MVT::v8i32, 16}, // (load 32i32 and) deinterleave into 4 x 8i32
6555 {4, MVT::v16i32, 32}, // (load 64i32 and) deinterleave into 4 x 16i32
6556 {4, MVT::v32i32, 68}, // (load 128i32 and) deinterleave into 4 x 32i32
6557
6558 {4, MVT::v2i64, 6}, // (load 8i64 and) deinterleave into 4 x 2i64
6559 {4, MVT::v4i64, 8}, // (load 16i64 and) deinterleave into 4 x 4i64
6560 {4, MVT::v8i64, 20}, // (load 32i64 and) deinterleave into 4 x 8i64
6561 {4, MVT::v16i64, 40}, // (load 64i64 and) deinterleave into 4 x 16i64
6562
6563 {6, MVT::v2i8, 6}, // (load 12i8 and) deinterleave into 6 x 2i8
6564 {6, MVT::v4i8, 14}, // (load 24i8 and) deinterleave into 6 x 4i8
6565 {6, MVT::v8i8, 18}, // (load 48i8 and) deinterleave into 6 x 8i8
6566 {6, MVT::v16i8, 43}, // (load 96i8 and) deinterleave into 6 x 16i8
6567 {6, MVT::v32i8, 82}, // (load 192i8 and) deinterleave into 6 x 32i8
6568
6569 {6, MVT::v2i16, 13}, // (load 12i16 and) deinterleave into 6 x 2i16
6570 {6, MVT::v4i16, 9}, // (load 24i16 and) deinterleave into 6 x 4i16
6571 {6, MVT::v8i16, 39}, // (load 48i16 and) deinterleave into 6 x 8i16
6572 {6, MVT::v16i16, 106}, // (load 96i16 and) deinterleave into 6 x 16i16
6573 {6, MVT::v32i16, 212}, // (load 192i16 and) deinterleave into 6 x 32i16
6574
6575 {6, MVT::v2i32, 6}, // (load 12i32 and) deinterleave into 6 x 2i32
6576 {6, MVT::v4i32, 15}, // (load 24i32 and) deinterleave into 6 x 4i32
6577 {6, MVT::v8i32, 31}, // (load 48i32 and) deinterleave into 6 x 8i32
6578 {6, MVT::v16i32, 64}, // (load 96i32 and) deinterleave into 6 x 16i32
6579
6580 {6, MVT::v2i64, 6}, // (load 12i64 and) deinterleave into 6 x 2i64
6581 {6, MVT::v4i64, 18}, // (load 24i64 and) deinterleave into 6 x 4i64
6582 {6, MVT::v8i64, 36}, // (load 48i64 and) deinterleave into 6 x 8i64
6583
6584 {8, MVT::v8i32, 40} // (load 64i32 and) deinterleave into 8 x 8i32
6585 };
6586
6587 static const CostTblEntry SSSE3InterleavedLoadTbl[] = {
6588 {2, MVT::v4i16, 2}, // (load 8i16 and) deinterleave into 2 x 4i16
6589 };
6590
6591 static const CostTblEntry SSE2InterleavedLoadTbl[] = {
6592 {2, MVT::v2i16, 2}, // (load 4i16 and) deinterleave into 2 x 2i16
6593 {2, MVT::v4i16, 7}, // (load 8i16 and) deinterleave into 2 x 4i16
6594
6595 {2, MVT::v2i32, 2}, // (load 4i32 and) deinterleave into 2 x 2i32
6596 {2, MVT::v4i32, 2}, // (load 8i32 and) deinterleave into 2 x 4i32
6597
6598 {2, MVT::v2i64, 2}, // (load 4i64 and) deinterleave into 2 x 2i64
6599 };
6600
6601 static const CostTblEntry AVX2InterleavedStoreTbl[] = {
6602 {2, MVT::v16i8, 3}, // interleave 2 x 16i8 into 32i8 (and store)
6603 {2, MVT::v32i8, 4}, // interleave 2 x 32i8 into 64i8 (and store)
6604
6605 {2, MVT::v8i16, 3}, // interleave 2 x 8i16 into 16i16 (and store)
6606 {2, MVT::v16i16, 4}, // interleave 2 x 16i16 into 32i16 (and store)
6607 {2, MVT::v32i16, 8}, // interleave 2 x 32i16 into 64i16 (and store)
6608
6609 {2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32 (and store)
6610 {2, MVT::v8i32, 4}, // interleave 2 x 8i32 into 16i32 (and store)
6611 {2, MVT::v16i32, 8}, // interleave 2 x 16i32 into 32i32 (and store)
6612 {2, MVT::v32i32, 16}, // interleave 2 x 32i32 into 64i32 (and store)
6613
6614 {2, MVT::v2i64, 2}, // interleave 2 x 2i64 into 4i64 (and store)
6615 {2, MVT::v4i64, 4}, // interleave 2 x 4i64 into 8i64 (and store)
6616 {2, MVT::v8i64, 8}, // interleave 2 x 8i64 into 16i64 (and store)
6617 {2, MVT::v16i64, 16}, // interleave 2 x 16i64 into 32i64 (and store)
6618 {2, MVT::v32i64, 32}, // interleave 2 x 32i64 into 64i64 (and store)
6619
6620 {3, MVT::v2i8, 4}, // interleave 3 x 2i8 into 6i8 (and store)
6621 {3, MVT::v4i8, 4}, // interleave 3 x 4i8 into 12i8 (and store)
6622 {3, MVT::v8i8, 6}, // interleave 3 x 8i8 into 24i8 (and store)
6623 {3, MVT::v16i8, 11}, // interleave 3 x 16i8 into 48i8 (and store)
6624 {3, MVT::v32i8, 13}, // interleave 3 x 32i8 into 96i8 (and store)
6625
6626 {3, MVT::v2i16, 4}, // interleave 3 x 2i16 into 6i16 (and store)
6627 {3, MVT::v4i16, 6}, // interleave 3 x 4i16 into 12i16 (and store)
6628 {3, MVT::v8i16, 12}, // interleave 3 x 8i16 into 24i16 (and store)
6629 {3, MVT::v16i16, 27}, // interleave 3 x 16i16 into 48i16 (and store)
6630 {3, MVT::v32i16, 54}, // interleave 3 x 32i16 into 96i16 (and store)
6631
6632 {3, MVT::v2i32, 4}, // interleave 3 x 2i32 into 6i32 (and store)
6633 {3, MVT::v4i32, 5}, // interleave 3 x 4i32 into 12i32 (and store)
6634 {3, MVT::v8i32, 11}, // interleave 3 x 8i32 into 24i32 (and store)
6635 {3, MVT::v16i32, 22}, // interleave 3 x 16i32 into 48i32 (and store)
6636 {3, MVT::v32i32, 48}, // interleave 3 x 32i32 into 96i32 (and store)
6637
6638 {3, MVT::v2i64, 4}, // interleave 3 x 2i64 into 6i64 (and store)
6639 {3, MVT::v4i64, 6}, // interleave 3 x 4i64 into 12i64 (and store)
6640 {3, MVT::v8i64, 12}, // interleave 3 x 8i64 into 24i64 (and store)
6641 {3, MVT::v16i64, 24}, // interleave 3 x 16i64 into 48i64 (and store)
6642
6643 {4, MVT::v2i8, 4}, // interleave 4 x 2i8 into 8i8 (and store)
6644 {4, MVT::v4i8, 4}, // interleave 4 x 4i8 into 16i8 (and store)
6645 {4, MVT::v8i8, 4}, // interleave 4 x 8i8 into 32i8 (and store)
6646 {4, MVT::v16i8, 8}, // interleave 4 x 16i8 into 64i8 (and store)
6647 {4, MVT::v32i8, 12}, // interleave 4 x 32i8 into 128i8 (and store)
6648
6649 {4, MVT::v2i16, 2}, // interleave 4 x 2i16 into 8i16 (and store)
6650 {4, MVT::v4i16, 6}, // interleave 4 x 4i16 into 16i16 (and store)
6651 {4, MVT::v8i16, 10}, // interleave 4 x 8i16 into 32i16 (and store)
6652 {4, MVT::v16i16, 32}, // interleave 4 x 16i16 into 64i16 (and store)
6653 {4, MVT::v32i16, 64}, // interleave 4 x 32i16 into 128i16 (and store)
6654
6655 {4, MVT::v2i32, 5}, // interleave 4 x 2i32 into 8i32 (and store)
6656 {4, MVT::v4i32, 6}, // interleave 4 x 4i32 into 16i32 (and store)
6657 {4, MVT::v8i32, 16}, // interleave 4 x 8i32 into 32i32 (and store)
6658 {4, MVT::v16i32, 32}, // interleave 4 x 16i32 into 64i32 (and store)
6659 {4, MVT::v32i32, 64}, // interleave 4 x 32i32 into 128i32 (and store)
6660
6661 {4, MVT::v2i64, 6}, // interleave 4 x 2i64 into 8i64 (and store)
6662 {4, MVT::v4i64, 8}, // interleave 4 x 4i64 into 16i64 (and store)
6663 {4, MVT::v8i64, 20}, // interleave 4 x 8i64 into 32i64 (and store)
6664 {4, MVT::v16i64, 40}, // interleave 4 x 16i64 into 64i64 (and store)
6665
6666 {6, MVT::v2i8, 7}, // interleave 6 x 2i8 into 12i8 (and store)
6667 {6, MVT::v4i8, 9}, // interleave 6 x 4i8 into 24i8 (and store)
6668 {6, MVT::v8i8, 16}, // interleave 6 x 8i8 into 48i8 (and store)
6669 {6, MVT::v16i8, 27}, // interleave 6 x 16i8 into 96i8 (and store)
6670 {6, MVT::v32i8, 90}, // interleave 6 x 32i8 into 192i8 (and store)
6671
6672 {6, MVT::v2i16, 10}, // interleave 6 x 2i16 into 12i16 (and store)
6673 {6, MVT::v4i16, 15}, // interleave 6 x 4i16 into 24i16 (and store)
6674 {6, MVT::v8i16, 21}, // interleave 6 x 8i16 into 48i16 (and store)
6675 {6, MVT::v16i16, 58}, // interleave 6 x 16i16 into 96i16 (and store)
6676 {6, MVT::v32i16, 90}, // interleave 6 x 32i16 into 192i16 (and store)
6677
6678 {6, MVT::v2i32, 9}, // interleave 6 x 2i32 into 12i32 (and store)
6679 {6, MVT::v4i32, 12}, // interleave 6 x 4i32 into 24i32 (and store)
6680 {6, MVT::v8i32, 33}, // interleave 6 x 8i32 into 48i32 (and store)
6681 {6, MVT::v16i32, 66}, // interleave 6 x 16i32 into 96i32 (and store)
6682
6683 {6, MVT::v2i64, 8}, // interleave 6 x 2i64 into 12i64 (and store)
6684 {6, MVT::v4i64, 15}, // interleave 6 x 4i64 into 24i64 (and store)
6685 {6, MVT::v8i64, 30}, // interleave 6 x 8i64 into 48i64 (and store)
6686 };
6687
6688 static const CostTblEntry SSE2InterleavedStoreTbl[] = {
6689 {2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8 (and store)
6690 {2, MVT::v4i8, 1}, // interleave 2 x 4i8 into 8i8 (and store)
6691 {2, MVT::v8i8, 1}, // interleave 2 x 8i8 into 16i8 (and store)
6692
6693 {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16 (and store)
6694 {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16 (and store)
6695
6696 {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32 (and store)
6697 };
6698
6699 if (Opcode == Instruction::Load) {
6700 auto GetDiscountedCost = [Factor, NumMembers = Indices.size(),
6701 MemOpCosts](const CostTblEntry *Entry) {
6702 // NOTE: this is just an approximation!
6703 // It can over- or under-estimate the cost!
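// E.g., with Factor = 3 and Entry->Cost = 7, a group that uses only two of
// its three members (NumMembers = 2) is charged
// MemOpCosts + ceil(2 * 7 / 3) = MemOpCosts + 5.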
6704 return MemOpCosts + divideCeil(NumMembers * Entry->Cost, Factor);
6705 };
6706
6707 if (ST->hasAVX2())
6708 if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor,
6709 ETy.getSimpleVT()))
6710 return GetDiscountedCost(Entry);
6711
6712 if (ST->hasSSSE3())
6713 if (const auto *Entry = CostTableLookup(SSSE3InterleavedLoadTbl, Factor,
6714 ETy.getSimpleVT()))
6715 return GetDiscountedCost(Entry);
6716
6717 if (ST->hasSSE2())
6718 if (const auto *Entry = CostTableLookup(SSE2InterleavedLoadTbl, Factor,
6719 ETy.getSimpleVT()))
6720 return GetDiscountedCost(Entry);
6721 } else {
6722 assert(Opcode == Instruction::Store &&
6723 "Expected Store Instruction at this point");
6724 assert((!Indices.size() || Indices.size() == Factor) &&
6725 "Interleaved store only supports fully-interleaved groups.");
6726 if (ST->hasAVX2())
6727 if (const auto *Entry = CostTableLookup(AVX2InterleavedStoreTbl, Factor,
6728 ETy.getSimpleVT()))
6729 return MemOpCosts + Entry->Cost;
6730
6731 if (ST->hasSSE2())
6732 if (const auto *Entry = CostTableLookup(SSE2InterleavedStoreTbl, Factor,
6733 ETy.getSimpleVT()))
6734 return MemOpCosts + Entry->Cost;
6735 }
6736
6737 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6738 Alignment, AddressSpace, CostKind,
6739 UseMaskForCond, UseMaskForGaps);
6740}
6741
6742InstructionCost X86TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
6743 int64_t BaseOffset,
6744 bool HasBaseReg, int64_t Scale,
6745 unsigned AddrSpace) const {
6746 // Scaling factors are not free at all.
6747 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
6748 // takes 2 allocations in the out-of-order engine instead of 1
6749 // for the plain addressing mode, i.e. inst (reg1).
6750 // E.g.,
6751 // vaddps (%rsi,%rdx), %ymm0, %ymm1
6752 // requires two allocations (one for the load, one for the computation),
6753 // whereas:
6754 // vaddps (%rsi), %ymm0, %ymm1
6755 // requires just one allocation, freeing up slots for other operations
6756 // and executing fewer micro operations.
6757 //
6758 // For some X86 architectures this is even worse: for stores, for example,
6759 // a complex addressing mode forces the instruction to use the "load"
6760 // ports instead of the dedicated "store" port.
6761 // E.g., on Haswell:
6762 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
6763 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
6764 TargetLoweringBase::AddrMode AM;
6765 AM.BaseGV = BaseGV;
6766 AM.BaseOffs = BaseOffset;
6767 AM.HasBaseReg = HasBaseReg;
6768 AM.Scale = Scale;
6769 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
6770 // Scale represents reg2 * scale, thus account for 1
6771 // as soon as we use a second register.
6772 return AM.Scale != 0;
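// A negative return value tells callers that this addressing mode is not
// supported for Ty at all, not merely that it is more expensive.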
6773 return -1;
6774}