1//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements a TargetTransformInfo analysis pass specific to the
10/// X86 target machine. It uses the target's detailed information to provide
11/// more precise answers to certain TTI queries, while letting the target
12/// independent and default TTI implementations handle the rest.
13///
14//===----------------------------------------------------------------------===//
15/// A note about the cost model numbers used below: rather than being tied
16/// to a specific CPU model, they correspond to a "generic" X86 CPU. Usually
17/// the numbers correspond to the CPU where the feature first appeared. For
18/// example, if we do Subtarget.hasSSE42() in the lookups below, the cost is
19/// based on Nehalem, as that was the first CPU to support that feature level
20/// and thus most likely has the worst case cost, although we may discard an
21/// outlying worst cost from one CPU (e.g. Atom).
22///
23/// Some examples of other technologies/CPUs:
24/// SSE 3 - Pentium4 / Athlon64
25/// SSE 4.1 - Penryn
26/// SSE 4.2 - Nehalem / Silvermont
27/// AVX - Sandy Bridge / Jaguar / Bulldozer
28/// AVX2 - Haswell / Ryzen
29/// AVX-512 - Xeon Phi / Skylake
30///
31/// And some examples of target-dependent instruction costs (latency):
32/// divss sqrtss rsqrtss
33/// AMD K7 11-16 19 3
34/// Piledriver 9-24 13-15 5
35/// Jaguar 14 16 2
36/// Pentium II,III 18 30 2
37/// Nehalem 7-14 7-18 3
38/// Haswell 10-13 11 5
39///
40/// Interpreting the 4 TargetCostKind types:
41/// TCK_RecipThroughput and TCK_Latency should try to match the worst case
42/// values reported by the CPU scheduler models (and llvm-mca).
43/// TCK_CodeSize should match the instruction count (e.g. divss = 1), NOT the
44/// actual encoding size of the instruction.
45/// TCK_SizeAndLatency should match the worst case micro-op counts reported
46/// by the CPU scheduler models (and llvm-mca), to ensure that they are
47/// compatible with the MicroOpBufferSize and LoopMicroOpBufferSize values,
48/// which are often used as the cost thresholds where TCK_SizeAndLatency is requested.
49//===----------------------------------------------------------------------===//
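//
// NOTE: As an informal cross-check (illustrative only, not part of the cost
// model), the RecipThroughput/Latency numbers in the tables below can be
// compared against the scheduler models by piping a small assembly snippet
// through llvm-mca for the CPU of interest, e.g.:
//
//   $ echo "vdivps %ymm1, %ymm0, %ymm2" | llvm-mca -mcpu=skylake
//
// and reading the reported throughput and latency for the instruction.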
50
51#include "X86TargetTransformInfo.h"
52#include "llvm/Analysis/TargetTransformInfo.h"
53#include "llvm/CodeGen/BasicTTIImpl.h"
54#include "llvm/CodeGen/CostTable.h"
55#include "llvm/CodeGen/TargetLowering.h"
56#include "llvm/IR/InstIterator.h"
57#include "llvm/IR/IntrinsicInst.h"
58#include "llvm/Support/Debug.h"
59#include <optional>
60
61using namespace llvm;
62
63#define DEBUG_TYPE "x86tti"
64
65//===----------------------------------------------------------------------===//
66//
67// X86 cost model.
68//
69//===----------------------------------------------------------------------===//
70
71 // Helper struct to store/access costs for each cost kind.
72 // TODO: Move this to allow other targets to use it?
73struct CostKindCosts {
74 unsigned RecipThroughputCost = ~0U;
75 unsigned LatencyCost = ~0U;
76 unsigned CodeSizeCost = ~0U;
77 unsigned SizeAndLatencyCost = ~0U;
78
79 std::optional<unsigned>
80 operator[](TargetTransformInfo::TargetCostKind Kind) const {
81 unsigned Cost = ~0U;
82 switch (Kind) {
83 case TTI::TCK_RecipThroughput:
84 Cost = RecipThroughputCost;
85 break;
86 case TTI::TCK_Latency:
87 Cost = LatencyCost;
88 break;
89 case TTI::TCK_CodeSize:
90 Cost = CodeSizeCost;
91 break;
92 case TTI::TCK_SizeAndLatency:
93 Cost = SizeAndLatencyCost;
94 break;
95 }
96 if (Cost == ~0U)
97 return std::nullopt;
98 return Cost;
99 }
100};
101using CostKindTblEntry = CostTblEntryT<CostKindCosts>;
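// For example (illustrative only), a table entry of the form
//   { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }
// stores its per-kind costs in the field order above, i.e. conceptually:
//   CostKindCosts C = {/*RecipThroughput*/ 1, /*Latency*/ 7,
//                      /*CodeSize*/ 2, /*SizeAndLatency*/ 3};
//   assert(*C[TTI::TCK_Latency] == 7);
// A kind whose cost was left as ~0U yields std::nullopt from operator[].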
102
103TargetTransformInfo::PopcntSupportKind
104X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
105 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
106 // TODO: Currently the __builtin_popcount() implementation using SSE3
107 // instructions is inefficient. Once the problem is fixed, we should
108 // call ST->hasSSE3() instead of ST->hasPOPCNT().
109 return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
110}
111
112std::optional<unsigned> X86TTIImpl::getCacheSize(
113 TargetTransformInfo::CacheLevel Level) const {
114 switch (Level) {
115 case TargetTransformInfo::CacheLevel::L1D:
116 // - Penryn
117 // - Nehalem
118 // - Westmere
119 // - Sandy Bridge
120 // - Ivy Bridge
121 // - Haswell
122 // - Broadwell
123 // - Skylake
124 // - Kabylake
125 return 32 * 1024; // 32 KByte
126 case TargetTransformInfo::CacheLevel::L2:
127 // - Penryn
128 // - Nehalem
129 // - Westmere
130 // - Sandy Bridge
131 // - Ivy Bridge
132 // - Haswell
133 // - Broadwell
134 // - Skylake
135 // - Kabylake
136 return 256 * 1024; // 256 KByte
137 }
138
139 llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
140}
141
142std::optional<unsigned> X86TTIImpl::getCacheAssociativity(
143 TargetTransformInfo::CacheLevel Level) const {
144 // - Penryn
145 // - Nehalem
146 // - Westmere
147 // - Sandy Bridge
148 // - Ivy Bridge
149 // - Haswell
150 // - Broadwell
151 // - Skylake
152 // - Kabylake
153 switch (Level) {
154 case TargetTransformInfo::CacheLevel::L1D:
155 [[fallthrough]];
156 case TargetTransformInfo::CacheLevel::L2:
157 return 8;
158 }
159
160 llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
161}
162
163unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
164 bool Vector = (ClassID == 1);
165 if (Vector && !ST->hasSSE1())
166 return 0;
167
168 if (ST->is64Bit()) {
169 if (Vector && ST->hasAVX512())
170 return 32;
171 return 16;
172 }
173 return 8;
174}
175
176TypeSize
177X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
178 unsigned PreferVectorWidth = ST->getPreferVectorWidth();
179 switch (K) {
180 case TargetTransformInfo::RGK_Scalar:
181 return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
182 case TargetTransformInfo::RGK_FixedWidthVector:
183 if (ST->hasAVX512() && ST->hasEVEX512() && PreferVectorWidth >= 512)
184 return TypeSize::getFixed(512);
185 if (ST->hasAVX() && PreferVectorWidth >= 256)
186 return TypeSize::getFixed(256);
187 if (ST->hasSSE1() && PreferVectorWidth >= 128)
188 return TypeSize::getFixed(128);
189 return TypeSize::getFixed(0);
190 case TargetTransformInfo::RGK_ScalableVector:
191 return TypeSize::getScalable(0);
192 }
193
194 llvm_unreachable("Unsupported register kind");
195}
196
197unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
198 return getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
199 .getFixedValue();
200}
201
202unsigned X86TTIImpl::getMaxInterleaveFactor(ElementCount VF) {
203 // If the loop will not be vectorized, don't interleave the loop.
204 // Let the regular unroller unroll the loop instead, which saves the
205 // overflow check and memory check cost.
206 if (VF.isScalar())
207 return 1;
208
209 if (ST->isAtom())
210 return 1;
211
212 // Sandybridge and Haswell have multiple execution ports and pipelined
213 // vector units.
214 if (ST->hasAVX())
215 return 4;
216
217 return 2;
218}
219
221 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
224 const Instruction *CxtI) {
225
226 // vXi8 multiplications are always promoted to vXi16.
227 // Sub-128-bit types can be extended/packed more efficiently.
228 if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
229 Ty->getPrimitiveSizeInBits() <= 64 && Ty->getScalarSizeInBits() == 8) {
230 Type *WideVecTy =
231 VectorType::getExtendedElementVectorType(cast<VectorType>(Ty));
232 return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
233 TTI::CastContextHint::None,
234 CostKind) +
235 getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
236 TTI::CastContextHint::None,
237 CostKind) +
238 getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info);
239 }
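// For example (illustrative), a mul <8 x i8> is lowered roughly as:
//   %a16 = zext <8 x i8> %a to <8 x i16>
//   %b16 = zext <8 x i8> %b to <8 x i16>
//   %m16 = mul <8 x i16> %a16, %b16
//   %m   = trunc <8 x i16> %m16 to <8 x i8>
// and is therefore charged above as one ZExt cast + one Trunc cast + the
// widened <8 x i16> multiply.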
240
241 // Legalize the type.
242 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
243
244 int ISD = TLI->InstructionOpcodeToISD(Opcode);
245 assert(ISD && "Invalid opcode");
246
247 if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() &&
248 (LT.second.getScalarType() == MVT::i32 ||
249 LT.second.getScalarType() == MVT::i64)) {
250 // Check if the operands can be represented as a smaller datatype.
251 bool Op1Signed = false, Op2Signed = false;
252 unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
253 unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
254 unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
255 bool SignedMode = Op1Signed || Op2Signed;
256
257 // If both vXi32 are representable as i15 and at least one is constant,
258 // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we
259 // can treat this as PMADDWD which has the same costs as a vXi16 multiply.
260 if (OpMinSize <= 15 && !ST->isPMADDWDSlow() &&
261 LT.second.getScalarType() == MVT::i32) {
262 bool Op1Constant =
263 isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]);
264 bool Op2Constant =
265 isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]);
266 bool Op1Sext = isa<SExtInst>(Args[0]) &&
267 (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41()));
268 bool Op2Sext = isa<SExtInst>(Args[1]) &&
269 (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41()));
270
271 bool IsZeroExtended = !Op1Signed || !Op2Signed;
272 bool IsConstant = Op1Constant || Op2Constant;
273 bool IsSext = Op1Sext || Op2Sext;
274 if (IsConstant || IsZeroExtended || IsSext)
275 LT.second =
276 MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements());
277 }
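// For example (illustrative):
//   %x = zext <4 x i8> %a to <4 x i32>
//   %m = mul <4 x i32> %x, <i32 100, i32 100, i32 100, i32 100>
// Both operands need at most 15 bits, one is constant and the other is
// zero-extended, so LT.second is rewritten to v8i16 above and the multiply
// is costed like a vXi16 (PMADDWD-style) multiply rather than a full 32-bit
// vector multiply (e.g. PMULLD).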
278
279 // Check if the vXi32 operands can be shrunk into a smaller datatype.
280 // This should match the codegen from reduceVMULWidth.
281 // TODO: Make this generic (!ST->SSE41 || ST->isPMULLDSlow()).
282 if (ST->useSLMArithCosts() && LT.second == MVT::v4i32) {
283 if (OpMinSize <= 7)
284 return LT.first * 3; // pmullw/sext
285 if (!SignedMode && OpMinSize <= 8)
286 return LT.first * 3; // pmullw/zext
287 if (OpMinSize <= 15)
288 return LT.first * 5; // pmullw/pmulhw/pshuf
289 if (!SignedMode && OpMinSize <= 16)
290 return LT.first * 5; // pmullw/pmulhw/pshuf
291 }
292
293 // If both vXi64 are representable as (unsigned) i32, then we can perform
294 // the multiply with a single PMULUDQ instruction.
295 // TODO: Add (SSE41+) PMULDQ handling for signed extensions.
296 if (!SignedMode && OpMinSize <= 32 && LT.second.getScalarType() == MVT::i64)
297 ISD = X86ISD::PMULUDQ;
298 }
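// For example (illustrative), when both i64 operands are zero-extended from
// i32:
//   %a64 = zext <2 x i32> %a to <2 x i64>
//   %b64 = zext <2 x i32> %b to <2 x i64>
//   %m   = mul <2 x i64> %a64, %b64
// the cost is looked up as X86ISD::PMULUDQ below (a single pmuludq per
// 128-bit lane) rather than as the generic 3*pmuludq/3*shift/2*add expansion.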
299
300 // Vector multiply by pow2 will be simplified to shifts.
301 // Vector multiply by -pow2 will be simplified to shifts/negates.
302 if (ISD == ISD::MUL && Op2Info.isConstant() &&
303 (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2())) {
304 InstructionCost Cost =
305 getArithmeticInstrCost(Instruction::Shl, Ty, CostKind,
306 Op1Info.getNoProps(), Op2Info.getNoProps());
307 if (Op2Info.isNegatedPowerOf2())
308 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
309 return Cost;
310 }
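// For example (illustrative):
//   mul <4 x i32> %x, <splat 8>   is costed as   shl <4 x i32> %x, <splat 3>
//   mul <4 x i32> %x, <splat -8>  is costed as   shl + sub (negate)
// matching the shift/negate simplifications described above.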
311
312 // On X86, vector signed division by a constant power-of-two is
313 // normally expanded to the sequence SRA + SRL + ADD + SRA.
314 // The OperandValue properties may not be the same as that of the previous
315 // operation; conservatively assume OP_None.
316 if ((ISD == ISD::SDIV || ISD == ISD::SREM) &&
317 Op2Info.isConstant() && Op2Info.isPowerOf2()) {
318 InstructionCost Cost =
319 2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
320 Op1Info.getNoProps(), Op2Info.getNoProps());
321 Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
322 Op1Info.getNoProps(), Op2Info.getNoProps());
323 Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
324 Op1Info.getNoProps(), Op2Info.getNoProps());
325
326 if (ISD == ISD::SREM) {
327 // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
328 Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(),
329 Op2Info.getNoProps());
330 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info.getNoProps(),
331 Op2Info.getNoProps());
332 }
333
334 return Cost;
335 }
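// For example (illustrative), sdiv <4 x i32> %x, <splat 4> is modeled as the
// usual power-of-two expansion:
//   %sgn  = ashr <4 x i32> %x, <splat 31>    ; all-ones if negative
//   %bias = lshr <4 x i32> %sgn, <splat 30>  ; 3 if negative, else 0
//   %tmp  = add  <4 x i32> %x, %bias
//   %res  = ashr <4 x i32> %tmp, <splat 2>
// i.e. the 2*AShr + LShr + Add charged above; srem additionally pays for the
// mul + sub of the (X - (X/C)*C) form.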
336
337 // Vector unsigned division/remainder will be simplified to shifts/masks.
338 if ((ISD == ISD::UDIV || ISD == ISD::UREM) &&
339 Op2Info.isConstant() && Op2Info.isPowerOf2()) {
340 if (ISD == ISD::UDIV)
341 return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
342 Op1Info.getNoProps(), Op2Info.getNoProps());
343 // UREM
344 return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
345 Op1Info.getNoProps(), Op2Info.getNoProps());
346 }
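// For example (illustrative), udiv <4 x i32> %x, <splat 8> is costed as a
// single lshr by 3, and urem <4 x i32> %x, <splat 8> as a single 'and' with 7.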
347
348 static const CostKindTblEntry AVX512BWUniformConstCostTable[] = {
349 { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
350 { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
351 { ISD::SRA, MVT::v16i8, { 1, 8, 4, 5 } }, // psrlw, pand, pxor, psubb.
352 { ISD::SHL, MVT::v32i8, { 1, 8, 2, 3 } }, // psllw + pand.
353 { ISD::SRL, MVT::v32i8, { 1, 8, 2, 3 } }, // psrlw + pand.
354 { ISD::SRA, MVT::v32i8, { 1, 9, 4, 5 } }, // psrlw, pand, pxor, psubb.
355 { ISD::SHL, MVT::v64i8, { 1, 8, 2, 3 } }, // psllw + pand.
356 { ISD::SRL, MVT::v64i8, { 1, 8, 2, 3 } }, // psrlw + pand.
357 { ISD::SRA, MVT::v64i8, { 1, 9, 4, 6 } }, // psrlw, pand, pxor, psubb.
358
359 { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // psllw
360 { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw
361 { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // psraw
362 { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // psllw
363 { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw
364 { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // psraw
365 };
366
367 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasBWI())
368 if (const auto *Entry =
369 CostTableLookup(AVX512BWUniformConstCostTable, ISD, LT.second))
370 if (auto KindCost = Entry->Cost[CostKind])
371 return LT.first * *KindCost;
372
373 static const CostKindTblEntry AVX512UniformConstCostTable[] = {
374 { ISD::SHL, MVT::v64i8, { 2, 12, 5, 6 } }, // psllw + pand.
375 { ISD::SRL, MVT::v64i8, { 2, 12, 5, 6 } }, // psrlw + pand.
376 { ISD::SRA, MVT::v64i8, { 3, 10, 12, 12 } }, // psrlw, pand, pxor, psubb.
377
378 { ISD::SHL, MVT::v16i16, { 2, 7, 4, 4 } }, // psllw + split.
379 { ISD::SRL, MVT::v16i16, { 2, 7, 4, 4 } }, // psrlw + split.
380 { ISD::SRA, MVT::v16i16, { 2, 7, 4, 4 } }, // psraw + split.
381
382 { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } }, // pslld
383 { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } }, // psrld
384 { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } }, // psrad
385 { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } }, // pslld
386 { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } }, // psrld
387 { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } }, // psrad
388
389 { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } }, // psraq
390 { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } }, // psllq
391 { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } }, // psrlq
392 { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } }, // psraq
393 { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } }, // psllq
394 { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } }, // psrlq
395 { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } }, // psraq
396
397 { ISD::SDIV, MVT::v16i32, { 6 } }, // pmuludq sequence
398 { ISD::SREM, MVT::v16i32, { 8 } }, // pmuludq+mul+sub sequence
399 { ISD::UDIV, MVT::v16i32, { 5 } }, // pmuludq sequence
400 { ISD::UREM, MVT::v16i32, { 7 } }, // pmuludq+mul+sub sequence
401 };
402
403 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX512())
404 if (const auto *Entry =
405 CostTableLookup(AVX512UniformConstCostTable, ISD, LT.second))
406 if (auto KindCost = Entry->Cost[CostKind])
407 return LT.first * *KindCost;
408
409 static const CostKindTblEntry AVX2UniformConstCostTable[] = {
410 { ISD::SHL, MVT::v16i8, { 1, 8, 2, 3 } }, // psllw + pand.
411 { ISD::SRL, MVT::v16i8, { 1, 8, 2, 3 } }, // psrlw + pand.
412 { ISD::SRA, MVT::v16i8, { 2, 10, 5, 6 } }, // psrlw, pand, pxor, psubb.
413 { ISD::SHL, MVT::v32i8, { 2, 8, 2, 4 } }, // psllw + pand.
414 { ISD::SRL, MVT::v32i8, { 2, 8, 2, 4 } }, // psrlw + pand.
415 { ISD::SRA, MVT::v32i8, { 3, 10, 5, 9 } }, // psrlw, pand, pxor, psubb.
416
417 { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw
418 { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw
419 { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw
420 { ISD::SHL, MVT::v16i16,{ 2, 2, 1, 2 } }, // psllw
421 { ISD::SRL, MVT::v16i16,{ 2, 2, 1, 2 } }, // psrlw
422 { ISD::SRA, MVT::v16i16,{ 2, 2, 1, 2 } }, // psraw
423
424 { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
425 { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld
426 { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad
427 { ISD::SHL, MVT::v8i32, { 2, 2, 1, 2 } }, // pslld
428 { ISD::SRL, MVT::v8i32, { 2, 2, 1, 2 } }, // psrld
429 { ISD::SRA, MVT::v8i32, { 2, 2, 1, 2 } }, // psrad
430
431 { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq
432 { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq
433 { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle.
434 { ISD::SHL, MVT::v4i64, { 2, 2, 1, 2 } }, // psllq
435 { ISD::SRL, MVT::v4i64, { 2, 2, 1, 2 } }, // psrlq
436 { ISD::SRA, MVT::v4i64, { 4, 4, 3, 6 } }, // psrad + shuffle + split.
437
438 { ISD::SDIV, MVT::v8i32, { 6 } }, // pmuludq sequence
439 { ISD::SREM, MVT::v8i32, { 8 } }, // pmuludq+mul+sub sequence
440 { ISD::UDIV, MVT::v8i32, { 5 } }, // pmuludq sequence
441 { ISD::UREM, MVT::v8i32, { 7 } }, // pmuludq+mul+sub sequence
442 };
443
444 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX2())
445 if (const auto *Entry =
446 CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second))
447 if (auto KindCost = Entry->Cost[CostKind])
448 return LT.first * *KindCost;
449
450 static const CostKindTblEntry AVXUniformConstCostTable[] = {
451 { ISD::SHL, MVT::v16i8, { 2, 7, 2, 3 } }, // psllw + pand.
452 { ISD::SRL, MVT::v16i8, { 2, 7, 2, 3 } }, // psrlw + pand.
453 { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.
454 { ISD::SHL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psllw + pand) + split.
455 { ISD::SRL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psrlw + pand) + split.
456 { ISD::SRA, MVT::v32i8, { 7, 7, 12, 13 } }, // 2*(psrlw, pand, pxor, psubb) + split.
457
458 { ISD::SHL, MVT::v8i16, { 1, 2, 1, 1 } }, // psllw.
459 { ISD::SRL, MVT::v8i16, { 1, 2, 1, 1 } }, // psrlw.
460 { ISD::SRA, MVT::v8i16, { 1, 2, 1, 1 } }, // psraw.
461 { ISD::SHL, MVT::v16i16,{ 3, 6, 4, 5 } }, // psllw + split.
462 { ISD::SRL, MVT::v16i16,{ 3, 6, 4, 5 } }, // psrlw + split.
463 { ISD::SRA, MVT::v16i16,{ 3, 6, 4, 5 } }, // psraw + split.
464
465 { ISD::SHL, MVT::v4i32, { 1, 2, 1, 1 } }, // pslld.
466 { ISD::SRL, MVT::v4i32, { 1, 2, 1, 1 } }, // psrld.
467 { ISD::SRA, MVT::v4i32, { 1, 2, 1, 1 } }, // psrad.
468 { ISD::SHL, MVT::v8i32, { 3, 6, 4, 5 } }, // pslld + split.
469 { ISD::SRL, MVT::v8i32, { 3, 6, 4, 5 } }, // psrld + split.
470 { ISD::SRA, MVT::v8i32, { 3, 6, 4, 5 } }, // psrad + split.
471
472 { ISD::SHL, MVT::v2i64, { 1, 2, 1, 1 } }, // psllq.
473 { ISD::SRL, MVT::v2i64, { 1, 2, 1, 1 } }, // psrlq.
474 { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle.
475 { ISD::SHL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psllq + split.
476 { ISD::SRL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psrlq + split.
477 { ISD::SRA, MVT::v4i64, { 5, 7, 8, 9 } }, // 2 x psrad + shuffle + split.
478
479 { ISD::SDIV, MVT::v8i32, { 14 } }, // 2*pmuludq sequence + split.
480 { ISD::SREM, MVT::v8i32, { 18 } }, // 2*pmuludq+mul+sub sequence + split.
481 { ISD::UDIV, MVT::v8i32, { 12 } }, // 2*pmuludq sequence + split.
482 { ISD::UREM, MVT::v8i32, { 16 } }, // 2*pmuludq+mul+sub sequence + split.
483 };
484
485 // XOP has faster vXi8 shifts.
486 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX() &&
487 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
488 if (const auto *Entry =
489 CostTableLookup(AVXUniformConstCostTable, ISD, LT.second))
490 if (auto KindCost = Entry->Cost[CostKind])
491 return LT.first * *KindCost;
492
493 static const CostKindTblEntry SSE2UniformConstCostTable[] = {
494 { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
495 { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
496 { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.
497
498 { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw.
499 { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw.
500 { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw.
501
502 { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
503 { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld.
504 { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad.
505
506 { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq.
507 { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq.
508 { ISD::SRA, MVT::v2i64, { 3, 5, 6, 6 } }, // 2 x psrad + shuffle.
509
510 { ISD::SDIV, MVT::v4i32, { 6 } }, // pmuludq sequence
511 { ISD::SREM, MVT::v4i32, { 8 } }, // pmuludq+mul+sub sequence
512 { ISD::UDIV, MVT::v4i32, { 5 } }, // pmuludq sequence
513 { ISD::UREM, MVT::v4i32, { 7 } }, // pmuludq+mul+sub sequence
514 };
515
516 // XOP has faster vXi8 shifts.
517 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasSSE2() &&
518 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
519 if (const auto *Entry =
520 CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
521 if (auto KindCost = Entry->Cost[CostKind])
522 return LT.first * *KindCost;
523
524 static const CostKindTblEntry AVX512BWConstCostTable[] = {
525 { ISD::SDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence
526 { ISD::SREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
527 { ISD::UDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence
528 { ISD::UREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
529
530 { ISD::SDIV, MVT::v32i16, { 6 } }, // vpmulhw sequence
531 { ISD::SREM, MVT::v32i16, { 8 } }, // vpmulhw+mul+sub sequence
532 { ISD::UDIV, MVT::v32i16, { 6 } }, // vpmulhuw sequence
533 { ISD::UREM, MVT::v32i16, { 8 } }, // vpmulhuw+mul+sub sequence
534 };
535
536 if (Op2Info.isConstant() && ST->hasBWI())
537 if (const auto *Entry =
538 CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
539 if (auto KindCost = Entry->Cost[CostKind])
540 return LT.first * *KindCost;
541
542 static const CostKindTblEntry AVX512ConstCostTable[] = {
543 { ISD::SDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence
544 { ISD::SREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
545 { ISD::UDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence
546 { ISD::UREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
547
548 { ISD::SDIV, MVT::v32i16, { 12 } }, // 2*vpmulhw sequence
549 { ISD::SREM, MVT::v32i16, { 16 } }, // 2*vpmulhw+mul+sub sequence
550 { ISD::UDIV, MVT::v32i16, { 12 } }, // 2*vpmulhuw sequence
551 { ISD::UREM, MVT::v32i16, { 16 } }, // 2*vpmulhuw+mul+sub sequence
552
553 { ISD::SDIV, MVT::v16i32, { 15 } }, // vpmuldq sequence
554 { ISD::SREM, MVT::v16i32, { 17 } }, // vpmuldq+mul+sub sequence
555 { ISD::UDIV, MVT::v16i32, { 15 } }, // vpmuludq sequence
556 { ISD::UREM, MVT::v16i32, { 17 } }, // vpmuludq+mul+sub sequence
557 };
558
559 if (Op2Info.isConstant() && ST->hasAVX512())
560 if (const auto *Entry =
561 CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
562 if (auto KindCost = Entry->Cost[CostKind])
563 return LT.first * *KindCost;
564
565 static const CostKindTblEntry AVX2ConstCostTable[] = {
566 { ISD::SDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence
567 { ISD::SREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
568 { ISD::UDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence
569 { ISD::UREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
570
571 { ISD::SDIV, MVT::v16i16, { 6 } }, // vpmulhw sequence
572 { ISD::SREM, MVT::v16i16, { 8 } }, // vpmulhw+mul+sub sequence
573 { ISD::UDIV, MVT::v16i16, { 6 } }, // vpmulhuw sequence
574 { ISD::UREM, MVT::v16i16, { 8 } }, // vpmulhuw+mul+sub sequence
575
576 { ISD::SDIV, MVT::v8i32, { 15 } }, // vpmuldq sequence
577 { ISD::SREM, MVT::v8i32, { 19 } }, // vpmuldq+mul+sub sequence
578 { ISD::UDIV, MVT::v8i32, { 15 } }, // vpmuludq sequence
579 { ISD::UREM, MVT::v8i32, { 19 } }, // vpmuludq+mul+sub sequence
580 };
581
582 if (Op2Info.isConstant() && ST->hasAVX2())
583 if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
584 if (auto KindCost = Entry->Cost[CostKind])
585 return LT.first * *KindCost;
586
587 static const CostKindTblEntry AVXConstCostTable[] = {
588 { ISD::SDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split.
589 { ISD::SREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
590 { ISD::UDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split.
591 { ISD::UREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
592
593 { ISD::SDIV, MVT::v16i16, { 14 } }, // 2*pmulhw sequence + split.
594 { ISD::SREM, MVT::v16i16, { 18 } }, // 2*pmulhw+mul+sub sequence + split.
595 { ISD::UDIV, MVT::v16i16, { 14 } }, // 2*pmulhuw sequence + split.
596 { ISD::UREM, MVT::v16i16, { 18 } }, // 2*pmulhuw+mul+sub sequence + split.
597
598 { ISD::SDIV, MVT::v8i32, { 32 } }, // vpmuludq sequence
599 { ISD::SREM, MVT::v8i32, { 38 } }, // vpmuludq+mul+sub sequence
600 { ISD::UDIV, MVT::v8i32, { 32 } }, // 2*pmuludq sequence + split.
601 { ISD::UREM, MVT::v8i32, { 42 } }, // 2*pmuludq+mul+sub sequence + split.
602 };
603
604 if (Op2Info.isConstant() && ST->hasAVX())
605 if (const auto *Entry = CostTableLookup(AVXConstCostTable, ISD, LT.second))
606 if (auto KindCost = Entry->Cost[CostKind])
607 return LT.first * *KindCost;
608
609 static const CostKindTblEntry SSE41ConstCostTable[] = {
610 { ISD::SDIV, MVT::v4i32, { 15 } }, // vpmuludq sequence
611 { ISD::SREM, MVT::v4i32, { 20 } }, // vpmuludq+mul+sub sequence
612 };
613
614 if (Op2Info.isConstant() && ST->hasSSE41())
615 if (const auto *Entry =
616 CostTableLookup(SSE41ConstCostTable, ISD, LT.second))
617 if (auto KindCost = Entry->Cost[CostKind])
618 return LT.first * *KindCost;
619
620 static const CostKindTblEntry SSE2ConstCostTable[] = {
621 { ISD::SDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
622 { ISD::SREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
623 { ISD::UDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
624 { ISD::UREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
625
626 { ISD::SDIV, MVT::v8i16, { 6 } }, // pmulhw sequence
627 { ISD::SREM, MVT::v8i16, { 8 } }, // pmulhw+mul+sub sequence
628 { ISD::UDIV, MVT::v8i16, { 6 } }, // pmulhuw sequence
629 { ISD::UREM, MVT::v8i16, { 8 } }, // pmulhuw+mul+sub sequence
630
631 { ISD::SDIV, MVT::v4i32, { 19 } }, // pmuludq sequence
632 { ISD::SREM, MVT::v4i32, { 24 } }, // pmuludq+mul+sub sequence
633 { ISD::UDIV, MVT::v4i32, { 15 } }, // pmuludq sequence
634 { ISD::UREM, MVT::v4i32, { 20 } }, // pmuludq+mul+sub sequence
635 };
636
637 if (Op2Info.isConstant() && ST->hasSSE2())
638 if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
639 if (auto KindCost = Entry->Cost[CostKind])
640 return LT.first * *KindCost;
641
642 static const CostKindTblEntry AVX512BWUniformCostTable[] = {
643 { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand.
644 { ISD::SRL, MVT::v16i8, { 3,10, 5, 8 } }, // psrlw + pand.
645 { ISD::SRA, MVT::v16i8, { 4,12, 8,12 } }, // psrlw, pand, pxor, psubb.
646 { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand.
647 { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand.
648 { ISD::SRA, MVT::v32i8, { 5,10,10,13 } }, // psrlw, pand, pxor, psubb.
649 { ISD::SHL, MVT::v64i8, { 4, 7, 6, 8 } }, // psllw + pand.
650 { ISD::SRL, MVT::v64i8, { 4, 8, 7,10 } }, // psrlw + pand.
651 { ISD::SRA, MVT::v64i8, { 5,10,10,15 } }, // psrlw, pand, pxor, psubb.
652
653 { ISD::SHL, MVT::v32i16, { 2, 4, 2, 3 } }, // psllw
654 { ISD::SRL, MVT::v32i16, { 2, 4, 2, 3 } }, // psrlw
655 { ISD::SRA, MVT::v32i16, { 2, 4, 2, 3 } }, // psraw
656 };
657
658 if (ST->hasBWI() && Op2Info.isUniform())
659 if (const auto *Entry =
660 CostTableLookup(AVX512BWUniformCostTable, ISD, LT.second))
661 if (auto KindCost = Entry->Cost[CostKind])
662 return LT.first * *KindCost;
663
664 static const CostKindTblEntry AVX512UniformCostTable[] = {
665 { ISD::SHL, MVT::v32i16, { 5,10, 5, 7 } }, // psllw + split.
666 { ISD::SRL, MVT::v32i16, { 5,10, 5, 7 } }, // psrlw + split.
667 { ISD::SRA, MVT::v32i16, { 5,10, 5, 7 } }, // psraw + split.
668
669 { ISD::SHL, MVT::v16i32, { 2, 4, 2, 3 } }, // pslld
670 { ISD::SRL, MVT::v16i32, { 2, 4, 2, 3 } }, // psrld
671 { ISD::SRA, MVT::v16i32, { 2, 4, 2, 3 } }, // psrad
672
673 { ISD::SRA, MVT::v2i64, { 1, 2, 1, 2 } }, // psraq
674 { ISD::SHL, MVT::v4i64, { 1, 4, 1, 2 } }, // psllq
675 { ISD::SRL, MVT::v4i64, { 1, 4, 1, 2 } }, // psrlq
676 { ISD::SRA, MVT::v4i64, { 1, 4, 1, 2 } }, // psraq
677 { ISD::SHL, MVT::v8i64, { 1, 4, 1, 2 } }, // psllq
678 { ISD::SRL, MVT::v8i64, { 1, 4, 1, 2 } }, // psrlq
679 { ISD::SRA, MVT::v8i64, { 1, 4, 1, 2 } }, // psraq
680 };
681
682 if (ST->hasAVX512() && Op2Info.isUniform())
683 if (const auto *Entry =
684 CostTableLookup(AVX512UniformCostTable, ISD, LT.second))
685 if (auto KindCost = Entry->Cost[CostKind])
686 return LT.first * *KindCost;
687
688 static const CostKindTblEntry AVX2UniformCostTable[] = {
689 // Uniform splats are cheaper for the following instructions.
690 { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand.
691 { ISD::SRL, MVT::v16i8, { 3, 9, 5, 8 } }, // psrlw + pand.
692 { ISD::SRA, MVT::v16i8, { 4, 5, 9,13 } }, // psrlw, pand, pxor, psubb.
693 { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand.
694 { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand.
695 { ISD::SRA, MVT::v32i8, { 6, 9,11,16 } }, // psrlw, pand, pxor, psubb.
696
697 { ISD::SHL, MVT::v8i16, { 1, 2, 1, 2 } }, // psllw.
698 { ISD::SRL, MVT::v8i16, { 1, 2, 1, 2 } }, // psrlw.
699 { ISD::SRA, MVT::v8i16, { 1, 2, 1, 2 } }, // psraw.
700 { ISD::SHL, MVT::v16i16, { 2, 4, 2, 3 } }, // psllw.
701 { ISD::SRL, MVT::v16i16, { 2, 4, 2, 3 } }, // psrlw.
702 { ISD::SRA, MVT::v16i16, { 2, 4, 2, 3 } }, // psraw.
703
704 { ISD::SHL, MVT::v4i32, { 1, 2, 1, 2 } }, // pslld
705 { ISD::SRL, MVT::v4i32, { 1, 2, 1, 2 } }, // psrld
706 { ISD::SRA, MVT::v4i32, { 1, 2, 1, 2 } }, // psrad
707 { ISD::SHL, MVT::v8i32, { 2, 4, 2, 3 } }, // pslld
708 { ISD::SRL, MVT::v8i32, { 2, 4, 2, 3 } }, // psrld
709 { ISD::SRA, MVT::v8i32, { 2, 4, 2, 3 } }, // psrad
710
711 { ISD::SHL, MVT::v2i64, { 1, 2, 1, 2 } }, // psllq
712 { ISD::SRL, MVT::v2i64, { 1, 2, 1, 2 } }, // psrlq
713 { ISD::SRA, MVT::v2i64, { 2, 4, 5, 7 } }, // 2 x psrad + shuffle.
714 { ISD::SHL, MVT::v4i64, { 2, 4, 1, 2 } }, // psllq
715 { ISD::SRL, MVT::v4i64, { 2, 4, 1, 2 } }, // psrlq
716 { ISD::SRA, MVT::v4i64, { 4, 6, 5, 9 } }, // 2 x psrad + shuffle.
717 };
718
719 if (ST->hasAVX2() && Op2Info.isUniform())
720 if (const auto *Entry =
721 CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
722 if (auto KindCost = Entry->Cost[CostKind])
723 return LT.first * *KindCost;
724
725 static const CostKindTblEntry AVXUniformCostTable[] = {
726 { ISD::SHL, MVT::v16i8, { 4, 4, 6, 8 } }, // psllw + pand.
727 { ISD::SRL, MVT::v16i8, { 4, 8, 5, 8 } }, // psrlw + pand.
728 { ISD::SRA, MVT::v16i8, { 6, 6, 9,13 } }, // psrlw, pand, pxor, psubb.
729 { ISD::SHL, MVT::v32i8, { 7, 8,11,14 } }, // psllw + pand + split.
730 { ISD::SRL, MVT::v32i8, { 7, 9,10,14 } }, // psrlw + pand + split.
731 { ISD::SRA, MVT::v32i8, { 10,11,16,21 } }, // psrlw, pand, pxor, psubb + split.
732
733 { ISD::SHL, MVT::v8i16, { 1, 3, 1, 2 } }, // psllw.
734 { ISD::SRL, MVT::v8i16, { 1, 3, 1, 2 } }, // psrlw.
735 { ISD::SRA, MVT::v8i16, { 1, 3, 1, 2 } }, // psraw.
736 { ISD::SHL, MVT::v16i16, { 3, 7, 5, 7 } }, // psllw + split.
737 { ISD::SRL, MVT::v16i16, { 3, 7, 5, 7 } }, // psrlw + split.
738 { ISD::SRA, MVT::v16i16, { 3, 7, 5, 7 } }, // psraw + split.
739
740 { ISD::SHL, MVT::v4i32, { 1, 3, 1, 2 } }, // pslld.
741 { ISD::SRL, MVT::v4i32, { 1, 3, 1, 2 } }, // psrld.
742 { ISD::SRA, MVT::v4i32, { 1, 3, 1, 2 } }, // psrad.
743 { ISD::SHL, MVT::v8i32, { 3, 7, 5, 7 } }, // pslld + split.
744 { ISD::SRL, MVT::v8i32, { 3, 7, 5, 7 } }, // psrld + split.
745 { ISD::SRA, MVT::v8i32, { 3, 7, 5, 7 } }, // psrad + split.
746
747 { ISD::SHL, MVT::v2i64, { 1, 3, 1, 2 } }, // psllq.
748 { ISD::SRL, MVT::v2i64, { 1, 3, 1, 2 } }, // psrlq.
749 { ISD::SRA, MVT::v2i64, { 3, 4, 5, 7 } }, // 2 x psrad + shuffle.
750 { ISD::SHL, MVT::v4i64, { 3, 7, 4, 6 } }, // psllq + split.
751 { ISD::SRL, MVT::v4i64, { 3, 7, 4, 6 } }, // psrlq + split.
752 { ISD::SRA, MVT::v4i64, { 6, 7,10,13 } }, // 2 x (2 x psrad + shuffle) + split.
753 };
754
755 // XOP has faster vXi8 shifts.
756 if (ST->hasAVX() && Op2Info.isUniform() &&
757 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
758 if (const auto *Entry =
759 CostTableLookup(AVXUniformCostTable, ISD, LT.second))
760 if (auto KindCost = Entry->Cost[CostKind])
761 return LT.first * *KindCost;
762
763 static const CostKindTblEntry SSE2UniformCostTable[] = {
764 // Uniform splats are cheaper for the following instructions.
765 { ISD::SHL, MVT::v16i8, { 9, 10, 6, 9 } }, // psllw + pand.
766 { ISD::SRL, MVT::v16i8, { 9, 13, 5, 9 } }, // psrlw + pand.
767 { ISD::SRA, MVT::v16i8, { 11, 15, 9,13 } }, // pcmpgtb sequence.
768
769 { ISD::SHL, MVT::v8i16, { 2, 2, 1, 2 } }, // psllw.
770 { ISD::SRL, MVT::v8i16, { 2, 2, 1, 2 } }, // psrlw.
771 { ISD::SRA, MVT::v8i16, { 2, 2, 1, 2 } }, // psraw.
772
773 { ISD::SHL, MVT::v4i32, { 2, 2, 1, 2 } }, // pslld
774 { ISD::SRL, MVT::v4i32, { 2, 2, 1, 2 } }, // psrld.
775 { ISD::SRA, MVT::v4i32, { 2, 2, 1, 2 } }, // psrad.
776
777 { ISD::SHL, MVT::v2i64, { 2, 2, 1, 2 } }, // psllq.
778 { ISD::SRL, MVT::v2i64, { 2, 2, 1, 2 } }, // psrlq.
779 { ISD::SRA, MVT::v2i64, { 5, 9, 5, 7 } }, // 2*psrlq + xor + sub.
780 };
781
782 if (ST->hasSSE2() && Op2Info.isUniform() &&
783 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
784 if (const auto *Entry =
785 CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
786 if (auto KindCost = Entry->Cost[CostKind])
787 return LT.first * *KindCost;
788
789 static const CostKindTblEntry AVX512DQCostTable[] = {
790 { ISD::MUL, MVT::v2i64, { 2, 15, 1, 3 } }, // pmullq
791 { ISD::MUL, MVT::v4i64, { 2, 15, 1, 3 } }, // pmullq
792 { ISD::MUL, MVT::v8i64, { 3, 15, 1, 3 } } // pmullq
793 };
794
795 // Look for AVX512DQ lowering tricks for custom cases.
796 if (ST->hasDQI())
797 if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
798 if (auto KindCost = Entry->Cost[CostKind])
799 return LT.first * *KindCost;
800
801 static const CostKindTblEntry AVX512BWCostTable[] = {
802 { ISD::SHL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsllvw/pack sequence.
803 { ISD::SRL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsrlvw/pack sequence.
804 { ISD::SRA, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsravw/pack sequence.
805 { ISD::SHL, MVT::v32i8, { 4, 23,11,16 } }, // extend/vpsllvw/pack sequence.
806 { ISD::SRL, MVT::v32i8, { 4, 30,12,18 } }, // extend/vpsrlvw/pack sequence.
807 { ISD::SRA, MVT::v32i8, { 6, 13,24,30 } }, // extend/vpsravw/pack sequence.
808 { ISD::SHL, MVT::v64i8, { 6, 19,13,15 } }, // extend/vpsllvw/pack sequence.
809 { ISD::SRL, MVT::v64i8, { 7, 27,15,18 } }, // extend/vpsrlvw/pack sequence.
810 { ISD::SRA, MVT::v64i8, { 15, 15,30,30 } }, // extend/vpsravw/pack sequence.
811
812 { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsllvw
813 { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsrlvw
814 { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsravw
815 { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsllvw
816 { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsrlvw
817 { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsravw
818 { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsllvw
819 { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsrlvw
820 { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsravw
821
822 { ISD::ADD, MVT::v64i8, { 1, 1, 1, 1 } }, // paddb
823 { ISD::ADD, MVT::v32i16, { 1, 1, 1, 1 } }, // paddw
824
825 { ISD::ADD, MVT::v32i8, { 1, 1, 1, 1 } }, // paddb
826 { ISD::ADD, MVT::v16i16, { 1, 1, 1, 1 } }, // paddw
827 { ISD::ADD, MVT::v8i32, { 1, 1, 1, 1 } }, // paddd
828 { ISD::ADD, MVT::v4i64, { 1, 1, 1, 1 } }, // paddq
829
830 { ISD::SUB, MVT::v64i8, { 1, 1, 1, 1 } }, // psubb
831 { ISD::SUB, MVT::v32i16, { 1, 1, 1, 1 } }, // psubw
832
833 { ISD::MUL, MVT::v64i8, { 5, 10,10,11 } },
834 { ISD::MUL, MVT::v32i16, { 1, 5, 1, 1 } }, // pmullw
835
836 { ISD::SUB, MVT::v32i8, { 1, 1, 1, 1 } }, // psubb
837 { ISD::SUB, MVT::v16i16, { 1, 1, 1, 1 } }, // psubw
838 { ISD::SUB, MVT::v8i32, { 1, 1, 1, 1 } }, // psubd
839 { ISD::SUB, MVT::v4i64, { 1, 1, 1, 1 } }, // psubq
840 };
841
842 // Look for AVX512BW lowering tricks for custom cases.
843 if (ST->hasBWI())
844 if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
845 if (auto KindCost = Entry->Cost[CostKind])
846 return LT.first * *KindCost;
847
848 static const CostKindTblEntry AVX512CostTable[] = {
849 { ISD::SHL, MVT::v64i8, { 15, 19,27,33 } }, // vpblendv+split sequence.
850 { ISD::SRL, MVT::v64i8, { 15, 19,30,36 } }, // vpblendv+split sequence.
851 { ISD::SRA, MVT::v64i8, { 37, 37,51,63 } }, // vpblendv+split sequence.
852
853 { ISD::SHL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence.
854 { ISD::SRL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence.
855 { ISD::SRA, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsravd/pack sequence.
856
857 { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } },
858 { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } },
859 { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } },
860 { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } },
861 { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } },
862 { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } },
863 { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } },
864 { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } },
865 { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } },
866
867 { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } },
868 { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } },
869 { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } },
870 { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } },
871 { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } },
872 { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } },
873 { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } },
874 { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } },
875 { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } },
876
877 { ISD::ADD, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*paddb + split
878 { ISD::ADD, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*paddw + split
879
880 { ISD::SUB, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*psubb + split
881 { ISD::SUB, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*psubw + split
882
883 { ISD::AND, MVT::v32i8, { 1, 1, 1, 1 } },
884 { ISD::AND, MVT::v16i16, { 1, 1, 1, 1 } },
885 { ISD::AND, MVT::v8i32, { 1, 1, 1, 1 } },
886 { ISD::AND, MVT::v4i64, { 1, 1, 1, 1 } },
887
888 { ISD::OR, MVT::v32i8, { 1, 1, 1, 1 } },
889 { ISD::OR, MVT::v16i16, { 1, 1, 1, 1 } },
890 { ISD::OR, MVT::v8i32, { 1, 1, 1, 1 } },
891 { ISD::OR, MVT::v4i64, { 1, 1, 1, 1 } },
892
893 { ISD::XOR, MVT::v32i8, { 1, 1, 1, 1 } },
894 { ISD::XOR, MVT::v16i16, { 1, 1, 1, 1 } },
895 { ISD::XOR, MVT::v8i32, { 1, 1, 1, 1 } },
896 { ISD::XOR, MVT::v4i64, { 1, 1, 1, 1 } },
897
898 { ISD::MUL, MVT::v16i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
899 { ISD::MUL, MVT::v8i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
900 { ISD::MUL, MVT::v4i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
901 { ISD::MUL, MVT::v8i64, { 6, 9, 8, 8 } }, // 3*pmuludq/3*shift/2*add
902 { ISD::MUL, MVT::i64, { 1 } }, // Skylake from http://www.agner.org/
903
904 { X86ISD::PMULUDQ, MVT::v8i64, { 1, 5, 1, 1 } },
905
906 { ISD::FNEG, MVT::v8f64, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
907 { ISD::FADD, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
908 { ISD::FADD, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
909 { ISD::FSUB, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
910 { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
911 { ISD::FMUL, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
912 { ISD::FMUL, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
913 { ISD::FMUL, MVT::v2f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
914 { ISD::FMUL, MVT::f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
915
916 { ISD::FDIV, MVT::f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
917 { ISD::FDIV, MVT::v2f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
918 { ISD::FDIV, MVT::v4f64, { 8, 14, 1, 1 } }, // Skylake from http://www.agner.org/
919 { ISD::FDIV, MVT::v8f64, { 16, 23, 1, 3 } }, // Skylake from http://www.agner.org/
920
921 { ISD::FNEG, MVT::v16f32, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
922 { ISD::FADD, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
923 { ISD::FADD, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
924 { ISD::FSUB, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
925 { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
926 { ISD::FMUL, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
927 { ISD::FMUL, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
928 { ISD::FMUL, MVT::v4f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
929 { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
930
931 { ISD::FDIV, MVT::f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
932 { ISD::FDIV, MVT::v4f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
933 { ISD::FDIV, MVT::v8f32, { 5, 11, 1, 1 } }, // Skylake from http://www.agner.org/
934 { ISD::FDIV, MVT::v16f32, { 10, 18, 1, 3 } }, // Skylake from http://www.agner.org/
935 };
936
937 if (ST->hasAVX512())
938 if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
939 if (auto KindCost = Entry->Cost[CostKind])
940 return LT.first * *KindCost;
941
942 static const CostKindTblEntry AVX2ShiftCostTable[] = {
943 // Shifts on vXi64/vXi32 on AVX2 are legal even though we declare them as
944 // custom in order to detect the cases where the shift amount is a scalar.
945 { ISD::SHL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsllvd (Haswell from agner.org)
946 { ISD::SRL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
947 { ISD::SRA, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsravd (Haswell from agner.org)
948 { ISD::SHL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsllvd (Haswell from agner.org)
949 { ISD::SRL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
950 { ISD::SRA, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsravd (Haswell from agner.org)
951 { ISD::SHL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsllvq (Haswell from agner.org)
952 { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsrlvq (Haswell from agner.org)
953 { ISD::SHL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsllvq (Haswell from agner.org)
954 { ISD::SRL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsrlvq (Haswell from agner.org)
955 };
956
957 if (ST->hasAVX512()) {
958 if (ISD == ISD::SHL && LT.second == MVT::v32i16 && Op2Info.isConstant())
959 // On AVX512, a packed v32i16 shift left by a constant build_vector
960 // is lowered into a vector multiply (vpmullw).
961 return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
962 Op1Info.getNoProps(), Op2Info.getNoProps());
963 }
964
965 // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
966 if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
967 if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
968 Op2Info.isConstant())
969 // On AVX2, a packed v16i16 shift left by a constant build_vector
970 // is lowered into a vector multiply (vpmullw).
971 return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
972 Op1Info.getNoProps(), Op2Info.getNoProps());
973
974 if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
975 if (auto KindCost = Entry->Cost[CostKind])
976 return LT.first * *KindCost;
977 }
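// For example (illustrative), on AVX2 a shift by a constant build_vector such
// as:
//   %s = shl <16 x i16> %x, <i16 0, i16 1, i16 2, i16 3, ...>
// is lowered as a multiply by the corresponding powers of two:
//   %s = mul <16 x i16> %x, <i16 1, i16 2, i16 4, i16 8, ...>
// (a single vpmullw), which is why the queries above are forwarded to
// Instruction::Mul.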
978
979 static const CostKindTblEntry XOPShiftCostTable[] = {
980 // 128bit shifts take 1cy, but right shifts require negation beforehand.
981 { ISD::SHL, MVT::v16i8, { 1, 3, 1, 1 } },
982 { ISD::SRL, MVT::v16i8, { 2, 3, 1, 1 } },
983 { ISD::SRA, MVT::v16i8, { 2, 3, 1, 1 } },
984 { ISD::SHL, MVT::v8i16, { 1, 3, 1, 1 } },
985 { ISD::SRL, MVT::v8i16, { 2, 3, 1, 1 } },
986 { ISD::SRA, MVT::v8i16, { 2, 3, 1, 1 } },
987 { ISD::SHL, MVT::v4i32, { 1, 3, 1, 1 } },
988 { ISD::SRL, MVT::v4i32, { 2, 3, 1, 1 } },
989 { ISD::SRA, MVT::v4i32, { 2, 3, 1, 1 } },
990 { ISD::SHL, MVT::v2i64, { 1, 3, 1, 1 } },
991 { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } },
992 { ISD::SRA, MVT::v2i64, { 2, 3, 1, 1 } },
993 // 256bit shifts require splitting if AVX2 didn't catch them above.
994 { ISD::SHL, MVT::v32i8, { 4, 7, 5, 6 } },
995 { ISD::SRL, MVT::v32i8, { 6, 7, 5, 6 } },
996 { ISD::SRA, MVT::v32i8, { 6, 7, 5, 6 } },
997 { ISD::SHL, MVT::v16i16, { 4, 7, 5, 6 } },
998 { ISD::SRL, MVT::v16i16, { 6, 7, 5, 6 } },
999 { ISD::SRA, MVT::v16i16, { 6, 7, 5, 6 } },
1000 { ISD::SHL, MVT::v8i32, { 4, 7, 5, 6 } },
1001 { ISD::SRL, MVT::v8i32, { 6, 7, 5, 6 } },
1002 { ISD::SRA, MVT::v8i32, { 6, 7, 5, 6 } },
1003 { ISD::SHL, MVT::v4i64, { 4, 7, 5, 6 } },
1004 { ISD::SRL, MVT::v4i64, { 6, 7, 5, 6 } },
1005 { ISD::SRA, MVT::v4i64, { 6, 7, 5, 6 } },
1006 };
1007
1008 // Look for XOP lowering tricks.
1009 if (ST->hasXOP()) {
1010 // If the right shift is constant then we'll fold the negation so
1011 // it's as cheap as a left shift.
1012 int ShiftISD = ISD;
1013 if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) && Op2Info.isConstant())
1014 ShiftISD = ISD::SHL;
1015 if (const auto *Entry =
1016 CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
1017 if (auto KindCost = Entry->Cost[CostKind])
1018 return LT.first * *KindCost;
1019 }
1020
1021 if (ISD == ISD::SHL && !Op2Info.isUniform() && Op2Info.isConstant()) {
1022 MVT VT = LT.second;
1023 // A vector shift left by a non-uniform constant can be lowered
1024 // into a vector multiply.
1025 if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
1026 ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
1027 ISD = ISD::MUL;
1028 }
1029
1030 static const CostKindTblEntry GLMCostTable[] = {
1031 { ISD::FDIV, MVT::f32, { 18, 19, 1, 1 } }, // divss
1032 { ISD::FDIV, MVT::v4f32, { 35, 36, 1, 1 } }, // divps
1033 { ISD::FDIV, MVT::f64, { 33, 34, 1, 1 } }, // divsd
1034 { ISD::FDIV, MVT::v2f64, { 65, 66, 1, 1 } }, // divpd
1035 };
1036
1037 if (ST->useGLMDivSqrtCosts())
1038 if (const auto *Entry = CostTableLookup(GLMCostTable, ISD, LT.second))
1039 if (auto KindCost = Entry->Cost[CostKind])
1040 return LT.first * *KindCost;
1041
1042 static const CostKindTblEntry SLMCostTable[] = {
1043 { ISD::MUL, MVT::v4i32, { 11, 11, 1, 7 } }, // pmulld
1044 { ISD::MUL, MVT::v8i16, { 2, 5, 1, 1 } }, // pmullw
1045 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // mulsd
1046 { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // mulss
1047 { ISD::FMUL, MVT::v2f64, { 4, 7, 1, 1 } }, // mulpd
1048 { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // mulps
1049 { ISD::FDIV, MVT::f32, { 17, 19, 1, 1 } }, // divss
1050 { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 6 } }, // divps
1051 { ISD::FDIV, MVT::f64, { 32, 34, 1, 1 } }, // divsd
1052 { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 6 } }, // divpd
1053 { ISD::FADD, MVT::v2f64, { 2, 4, 1, 1 } }, // addpd
1054 { ISD::FSUB, MVT::v2f64, { 2, 4, 1, 1 } }, // subpd
1055 // v2i64/v4i64 mul is custom lowered as a series of long
1056 // multiplies(3), shifts(3) and adds(2).
1057 // SLM muldq throughput is 2 and addq throughput is 4,
1058 // thus: 3x2 (muldq throughput) + 3x1 (shift throughput) +
1059 // 2x4 (addq throughput) = 17
1060 { ISD::MUL, MVT::v2i64, { 17, 22, 9, 9 } },
1061 // slm addq\subq throughput is 4
1062 { ISD::ADD, MVT::v2i64, { 4, 2, 1, 2 } },
1063 { ISD::SUB, MVT::v2i64, { 4, 2, 1, 2 } },
1064 };
1065
1066 if (ST->useSLMArithCosts())
1067 if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, LT.second))
1068 if (auto KindCost = Entry->Cost[CostKind])
1069 return LT.first * *KindCost;
1070
1071 static const CostKindTblEntry AVX2CostTable[] = {
1072 { ISD::SHL, MVT::v16i8, { 6, 21,11,16 } }, // vpblendvb sequence.
1073 { ISD::SHL, MVT::v32i8, { 6, 23,11,22 } }, // vpblendvb sequence.
1074 { ISD::SHL, MVT::v8i16, { 5, 18, 5,10 } }, // extend/vpsrlvd/pack sequence.
1075 { ISD::SHL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsrlvd/pack sequence.
1076
1077 { ISD::SRL, MVT::v16i8, { 6, 27,12,18 } }, // vpblendvb sequence.
1078 { ISD::SRL, MVT::v32i8, { 8, 30,12,24 } }, // vpblendvb sequence.
1079 { ISD::SRL, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsrlvd/pack sequence.
1080 { ISD::SRL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsrlvd/pack sequence.
1081
1082 { ISD::SRA, MVT::v16i8, { 17, 17,24,30 } }, // vpblendvb sequence.
1083 { ISD::SRA, MVT::v32i8, { 18, 20,24,43 } }, // vpblendvb sequence.
1084 { ISD::SRA, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsravd/pack sequence.
1085 { ISD::SRA, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsravd/pack sequence.
1086 { ISD::SRA, MVT::v2i64, { 4, 5, 5, 5 } }, // srl/xor/sub sequence.
1087 { ISD::SRA, MVT::v4i64, { 8, 8, 5, 9 } }, // srl/xor/sub sequence.
1088
1089 { ISD::SUB, MVT::v32i8, { 1, 1, 1, 2 } }, // psubb
1090 { ISD::ADD, MVT::v32i8, { 1, 1, 1, 2 } }, // paddb
1091 { ISD::SUB, MVT::v16i16, { 1, 1, 1, 2 } }, // psubw
1092 { ISD::ADD, MVT::v16i16, { 1, 1, 1, 2 } }, // paddw
1093 { ISD::SUB, MVT::v8i32, { 1, 1, 1, 2 } }, // psubd
1094 { ISD::ADD, MVT::v8i32, { 1, 1, 1, 2 } }, // paddd
1095 { ISD::SUB, MVT::v4i64, { 1, 1, 1, 2 } }, // psubq
1096 { ISD::ADD, MVT::v4i64, { 1, 1, 1, 2 } }, // paddq
1097
1098 { ISD::MUL, MVT::v16i8, { 5, 18, 6,12 } }, // extend/pmullw/pack
1099 { ISD::MUL, MVT::v32i8, { 6, 11,10,19 } }, // unpack/pmullw
1100 { ISD::MUL, MVT::v16i16, { 2, 5, 1, 2 } }, // pmullw
1101 { ISD::MUL, MVT::v8i32, { 4, 10, 1, 2 } }, // pmulld
1102 { ISD::MUL, MVT::v4i32, { 2, 10, 1, 2 } }, // pmulld
1103 { ISD::MUL, MVT::v4i64, { 6, 10, 8,13 } }, // 3*pmuludq/3*shift/2*add
1104 { ISD::MUL, MVT::v2i64, { 6, 10, 8, 8 } }, // 3*pmuludq/3*shift/2*add
1105
1106 { X86ISD::PMULUDQ, MVT::v4i64, { 1, 5, 1, 1 } },
1107
1108 { ISD::FNEG, MVT::v4f64, { 1, 1, 1, 2 } }, // vxorpd
1109 { ISD::FNEG, MVT::v8f32, { 1, 1, 1, 2 } }, // vxorps
1110
1111 { ISD::FADD, MVT::f64, { 1, 4, 1, 1 } }, // vaddsd
1112 { ISD::FADD, MVT::f32, { 1, 4, 1, 1 } }, // vaddss
1113 { ISD::FADD, MVT::v2f64, { 1, 4, 1, 1 } }, // vaddpd
1114 { ISD::FADD, MVT::v4f32, { 1, 4, 1, 1 } }, // vaddps
1115 { ISD::FADD, MVT::v4f64, { 1, 4, 1, 2 } }, // vaddpd
1116 { ISD::FADD, MVT::v8f32, { 1, 4, 1, 2 } }, // vaddps
1117
1118 { ISD::FSUB, MVT::f64, { 1, 4, 1, 1 } }, // vsubsd
1119 { ISD::FSUB, MVT::f32, { 1, 4, 1, 1 } }, // vsubss
1120 { ISD::FSUB, MVT::v2f64, { 1, 4, 1, 1 } }, // vsubpd
1121 { ISD::FSUB, MVT::v4f32, { 1, 4, 1, 1 } }, // vsubps
1122 { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 2 } }, // vsubpd
1123 { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 2 } }, // vsubps
1124
1125 { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // vmulsd
1126 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // vmulss
1127 { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // vmulpd
1128 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // vmulps
1129 { ISD::FMUL, MVT::v4f64, { 1, 5, 1, 2 } }, // vmulpd
1130 { ISD::FMUL, MVT::v8f32, { 1, 5, 1, 2 } }, // vmulps
1131
1132 { ISD::FDIV, MVT::f32, { 7, 13, 1, 1 } }, // vdivss
1133 { ISD::FDIV, MVT::v4f32, { 7, 13, 1, 1 } }, // vdivps
1134 { ISD::FDIV, MVT::v8f32, { 14, 21, 1, 3 } }, // vdivps
1135 { ISD::FDIV, MVT::f64, { 14, 20, 1, 1 } }, // vdivsd
1136 { ISD::FDIV, MVT::v2f64, { 14, 20, 1, 1 } }, // vdivpd
1137 { ISD::FDIV, MVT::v4f64, { 28, 35, 1, 3 } }, // vdivpd
1138 };
1139
1140 // Look for AVX2 lowering tricks for custom cases.
1141 if (ST->hasAVX2())
1142 if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
1143 if (auto KindCost = Entry->Cost[CostKind])
1144 return LT.first * *KindCost;
1145
1146 static const CostKindTblEntry AVX1CostTable[] = {
1147 // We don't have to scalarize unsupported ops. We can issue two half-sized
1148 // operations and we only need to extract the upper YMM half.
1149 // Two ops + 1 extract + 1 insert = 4.
1150 { ISD::MUL, MVT::v32i8, { 12, 13, 22, 23 } }, // unpack/pmullw + split
1151 { ISD::MUL, MVT::v16i16, { 4, 8, 5, 6 } }, // pmullw + split
1152 { ISD::MUL, MVT::v8i32, { 5, 8, 5, 10 } }, // pmulld + split
1153 { ISD::MUL, MVT::v4i32, { 2, 5, 1, 3 } }, // pmulld
1154 { ISD::MUL, MVT::v4i64, { 12, 15, 19, 20 } },
1155
1156 { ISD::AND, MVT::v32i8, { 1, 1, 1, 2 } }, // vandps
1157 { ISD::AND, MVT::v16i16, { 1, 1, 1, 2 } }, // vandps
1158 { ISD::AND, MVT::v8i32, { 1, 1, 1, 2 } }, // vandps
1159 { ISD::AND, MVT::v4i64, { 1, 1, 1, 2 } }, // vandps
1160
1161 { ISD::OR, MVT::v32i8, { 1, 1, 1, 2 } }, // vorps
1162 { ISD::OR, MVT::v16i16, { 1, 1, 1, 2 } }, // vorps
1163 { ISD::OR, MVT::v8i32, { 1, 1, 1, 2 } }, // vorps
1164 { ISD::OR, MVT::v4i64, { 1, 1, 1, 2 } }, // vorps
1165
1166 { ISD::XOR, MVT::v32i8, { 1, 1, 1, 2 } }, // vxorps
1167 { ISD::XOR, MVT::v16i16, { 1, 1, 1, 2 } }, // vxorps
1168 { ISD::XOR, MVT::v8i32, { 1, 1, 1, 2 } }, // vxorps
1169 { ISD::XOR, MVT::v4i64, { 1, 1, 1, 2 } }, // vxorps
1170
1171 { ISD::SUB, MVT::v32i8, { 4, 2, 5, 6 } }, // psubb + split
1172 { ISD::ADD, MVT::v32i8, { 4, 2, 5, 6 } }, // paddb + split
1173 { ISD::SUB, MVT::v16i16, { 4, 2, 5, 6 } }, // psubw + split
1174 { ISD::ADD, MVT::v16i16, { 4, 2, 5, 6 } }, // paddw + split
1175 { ISD::SUB, MVT::v8i32, { 4, 2, 5, 6 } }, // psubd + split
1176 { ISD::ADD, MVT::v8i32, { 4, 2, 5, 6 } }, // paddd + split
1177 { ISD::SUB, MVT::v4i64, { 4, 2, 5, 6 } }, // psubq + split
1178 { ISD::ADD, MVT::v4i64, { 4, 2, 5, 6 } }, // paddq + split
1179 { ISD::SUB, MVT::v2i64, { 1, 1, 1, 1 } }, // psubq
1180 { ISD::ADD, MVT::v2i64, { 1, 1, 1, 1 } }, // paddq
1181
1182 { ISD::SHL, MVT::v16i8, { 10, 21,11,17 } }, // pblendvb sequence.
1183 { ISD::SHL, MVT::v32i8, { 22, 22,27,40 } }, // pblendvb sequence + split.
1184 { ISD::SHL, MVT::v8i16, { 6, 9,11,11 } }, // pblendvb sequence.
1185 { ISD::SHL, MVT::v16i16, { 13, 16,24,25 } }, // pblendvb sequence + split.
1186 { ISD::SHL, MVT::v4i32, { 3, 11, 4, 6 } }, // pslld/paddd/cvttps2dq/pmulld
1187 { ISD::SHL, MVT::v8i32, { 9, 11,12,17 } }, // pslld/paddd/cvttps2dq/pmulld + split
1188 { ISD::SHL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
1189 { ISD::SHL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.
1190
1191 { ISD::SRL, MVT::v16i8, { 11, 27,12,18 } }, // pblendvb sequence.
1192 { ISD::SRL, MVT::v32i8, { 23, 23,30,43 } }, // pblendvb sequence + split.
1193 { ISD::SRL, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
1194 { ISD::SRL, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
1195 { ISD::SRL, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
1196 { ISD::SRL, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
1197 { ISD::SRL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
1198 { ISD::SRL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.
1199
1200 { ISD::SRA, MVT::v16i8, { 21, 22,24,36 } }, // pblendvb sequence.
1201 { ISD::SRA, MVT::v32i8, { 44, 45,51,76 } }, // pblendvb sequence + split.
1202 { ISD::SRA, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
1203 { ISD::SRA, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
1204 { ISD::SRA, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
1205 { ISD::SRA, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
1206 { ISD::SRA, MVT::v2i64, { 5, 6,10,14 } }, // Shift each lane + blend.
1207 { ISD::SRA, MVT::v4i64, { 12, 12,22,30 } }, // Shift each lane + blend + split.
1208
1209 { ISD::FNEG, MVT::v4f64, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
1210 { ISD::FNEG, MVT::v8f32, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
1211
1212 { ISD::FADD, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1213 { ISD::FADD, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1214 { ISD::FADD, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1215 { ISD::FADD, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1216 { ISD::FADD, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1217 { ISD::FADD, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1218
1219 { ISD::FSUB, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1220 { ISD::FSUB, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1221 { ISD::FSUB, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1222 { ISD::FSUB, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1223 { ISD::FSUB, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1224 { ISD::FSUB, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1225
1226 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1227 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1228 { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1229 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1230 { ISD::FMUL, MVT::v4f64, { 4, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
1231 { ISD::FMUL, MVT::v8f32, { 2, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
1232
1233 { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
1234 { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
1235 { ISD::FDIV, MVT::v8f32, { 28, 29, 1, 3 } }, // SNB from http://www.agner.org/
1236 { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
1237 { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
1238 { ISD::FDIV, MVT::v4f64, { 44, 45, 1, 3 } }, // SNB from http://www.agner.org/
1239 };
1240
1241 if (ST->hasAVX())
1242 if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
1243 if (auto KindCost = Entry->Cost[CostKind])
1244 return LT.first * *KindCost;
1245
1246 static const CostKindTblEntry SSE42CostTable[] = {
1247 { ISD::FADD, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1248 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1249 { ISD::FADD, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1250 { ISD::FADD, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1251
1252 { ISD::FSUB, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1253 { ISD::FSUB, MVT::f32 , { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1254 { ISD::FSUB, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1255 { ISD::FSUB, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1256
1257 { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1258 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1259 { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1260 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1261
1262 { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1263 { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1264 { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1265 { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1266
1267 { ISD::MUL, MVT::v2i64, { 6, 10,10,10 } } // 3*pmuludq/3*shift/2*add
1268 };
1269
1270 if (ST->hasSSE42())
1271 if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
1272 if (auto KindCost = Entry->Cost[CostKind])
1273 return LT.first * *KindCost;
1274
1275 static const CostKindTblEntry SSE41CostTable[] = {
1276 { ISD::SHL, MVT::v16i8, { 15, 24,17,22 } }, // pblendvb sequence.
1277 { ISD::SHL, MVT::v8i16, { 11, 14,11,11 } }, // pblendvb sequence.
1278 { ISD::SHL, MVT::v4i32, { 14, 20, 4,10 } }, // pslld/paddd/cvttps2dq/pmulld
1279
1280 { ISD::SRL, MVT::v16i8, { 16, 27,18,24 } }, // pblendvb sequence.
1281 { ISD::SRL, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1282 { ISD::SRL, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1283 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1284
1285 { ISD::SRA, MVT::v16i8, { 38, 41,30,36 } }, // pblendvb sequence.
1286 { ISD::SRA, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1287 { ISD::SRA, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1288 { ISD::SRA, MVT::v2i64, { 8, 17, 5, 7 } }, // splat+shuffle sequence.
1289
1290 { ISD::MUL, MVT::v16i8, { 5, 18,10,12 } }, // 2*unpack/2*pmullw/2*and/pack
1291 { ISD::MUL, MVT::v4i32, { 2, 11, 1, 1 } } // pmulld (Nehalem from agner.org)
1292 };
1293
1294 if (ST->hasSSE41())
1295 if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
1296 if (auto KindCost = Entry->Cost[CostKind])
1297 return LT.first * *KindCost;
1298
1299 static const CostKindTblEntry SSE2CostTable[] = {
1300 // We don't correctly identify costs of casts because they are marked as
1301 // custom.
1302 { ISD::SHL, MVT::v16i8, { 13, 21,26,28 } }, // cmpgtb sequence.
1303 { ISD::SHL, MVT::v8i16, { 24, 27,16,20 } }, // cmpgtw sequence.
1304 { ISD::SHL, MVT::v4i32, { 17, 19,10,12 } }, // pslld/paddd/cvttps2dq/pmuludq.
1305 { ISD::SHL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1306
1307 { ISD::SRL, MVT::v16i8, { 14, 28,27,30 } }, // cmpgtb sequence.
1308 { ISD::SRL, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1309 { ISD::SRL, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1310 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1311
1312 { ISD::SRA, MVT::v16i8, { 27, 30,54,54 } }, // unpacked cmpgtb sequence.
1313 { ISD::SRA, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1314 { ISD::SRA, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1315 { ISD::SRA, MVT::v2i64, { 8, 11,12,16 } }, // srl/xor/sub splat+shuffle sequence.
1316
1317 { ISD::AND, MVT::v16i8, { 1, 1, 1, 1 } }, // pand
1318 { ISD::AND, MVT::v8i16, { 1, 1, 1, 1 } }, // pand
1319 { ISD::AND, MVT::v4i32, { 1, 1, 1, 1 } }, // pand
1320 { ISD::AND, MVT::v2i64, { 1, 1, 1, 1 } }, // pand
1321
1322 { ISD::OR, MVT::v16i8, { 1, 1, 1, 1 } }, // por
1323 { ISD::OR, MVT::v8i16, { 1, 1, 1, 1 } }, // por
1324 { ISD::OR, MVT::v4i32, { 1, 1, 1, 1 } }, // por
1325 { ISD::OR, MVT::v2i64, { 1, 1, 1, 1 } }, // por
1326
1327 { ISD::XOR, MVT::v16i8, { 1, 1, 1, 1 } }, // pxor
1328 { ISD::XOR, MVT::v8i16, { 1, 1, 1, 1 } }, // pxor
1329 { ISD::XOR, MVT::v4i32, { 1, 1, 1, 1 } }, // pxor
1330 { ISD::XOR, MVT::v2i64, { 1, 1, 1, 1 } }, // pxor
1331
1332 { ISD::ADD, MVT::v2i64, { 1, 2, 1, 2 } }, // paddq
1333 { ISD::SUB, MVT::v2i64, { 1, 2, 1, 2 } }, // psubq
1334
1335 { ISD::MUL, MVT::v16i8, { 5, 18,12,12 } }, // 2*unpack/2*pmullw/2*and/pack
1336 { ISD::MUL, MVT::v8i16, { 1, 5, 1, 1 } }, // pmullw
1337 { ISD::MUL, MVT::v4i32, { 6, 8, 7, 7 } }, // 3*pmuludq/4*shuffle
1338 { ISD::MUL, MVT::v2i64, { 7, 10,10,10 } }, // 3*pmuludq/3*shift/2*add
1339
1340 { X86ISD::PMULUDQ, MVT::v2i64, { 1, 5, 1, 1 } },
1341
1342 { ISD::FDIV, MVT::f32, { 23, 23, 1, 1 } }, // Pentium IV from http://www.agner.org/
1343 { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 1 } }, // Pentium IV from http://www.agner.org/
1344 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // Pentium IV from http://www.agner.org/
1345 { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 1 } }, // Pentium IV from http://www.agner.org/
1346
1347 { ISD::FNEG, MVT::f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1348 { ISD::FNEG, MVT::f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1349 { ISD::FNEG, MVT::v4f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1350 { ISD::FNEG, MVT::v2f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1351
1352 { ISD::FADD, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1353 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1354 { ISD::FADD, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1355
1356 { ISD::FSUB, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1357 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1358 { ISD::FSUB, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1359
1360 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1361 { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1362 };
1363
1364 if (ST->hasSSE2())
1365 if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
1366 if (auto KindCost = Entry->Cost[CostKind])
1367 return LT.first * *KindCost;
1368
1369 static const CostKindTblEntry SSE1CostTable[] = {
1370 { ISD::FDIV, MVT::f32, { 17, 18, 1, 1 } }, // Pentium III from http://www.agner.org/
1371 { ISD::FDIV, MVT::v4f32, { 34, 48, 1, 1 } }, // Pentium III from http://www.agner.org/
1372
1373 { ISD::FNEG, MVT::f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1374 { ISD::FNEG, MVT::v4f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1375
1376 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1377 { ISD::FADD, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1378
1379 { ISD::FSUB, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1380 { ISD::FSUB, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1381
1382 { ISD::FMUL, MVT::f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1383 { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1384 };
1385
1386 if (ST->hasSSE1())
1387 if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
1388 if (auto KindCost = Entry->Cost[CostKind])
1389 return LT.first * *KindCost;
1390
1391 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
1392 { ISD::ADD, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1393 { ISD::SUB, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1394 { ISD::MUL, MVT::i64, { 2, 6, 1, 2 } },
1395 };
1396
1397 if (ST->is64Bit())
1398 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
1399 if (auto KindCost = Entry->Cost[CostKind])
1400 return LT.first * *KindCost;
1401
1402 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
1403 { ISD::ADD, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1404 { ISD::ADD, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1405 { ISD::ADD, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1406
1407 { ISD::SUB, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1408 { ISD::SUB, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1409 { ISD::SUB, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1410
1411 { ISD::MUL, MVT::i8, { 3, 4, 1, 1 } },
1412 { ISD::MUL, MVT::i16, { 2, 4, 1, 1 } },
1413 { ISD::MUL, MVT::i32, { 1, 4, 1, 1 } },
1414
1415 { ISD::FNEG, MVT::f64, { 2, 2, 1, 3 } }, // (x87)
1416 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1417 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1418 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // (x87)
1419 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // (x87)
1420 };
1421
1422 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
1423 if (auto KindCost = Entry->Cost[CostKind])
1424 return LT.first * *KindCost;
1425
1426 // It is not a good idea to vectorize division. We have to scalarize it and
1427 // in the process we will often end up having to spill regular
1428 // registers. The overhead of division is going to dominate most kernels
1429 // anyways so try hard to prevent vectorization of division - it is
1430 // generally a bad idea. Assume somewhat arbitrarily that we have to be able
1431 // to hide "20 cycles" for each lane.
1432 if (CostKind == TTI::TCK_RecipThroughput && LT.second.isVector() &&
1433 (ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
1434 ISD == ISD::UREM)) {
1435     InstructionCost ScalarCost =
1436         getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
1437                                Op1Info.getNoProps(), Op2Info.getNoProps());
1438 return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
1439 }
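  // Worked example of the heuristic above (illustrative): for a <4 x i32>
  // SDIV that fits in one legal register (LT.first == 1), the returned
  // throughput cost is 20 * 1 * 4 * ScalarCost, i.e. eighty times the cost
  // of one scalar sdiv, which effectively discourages the vectorizers from
  // forming vector divisions.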
1440
1441 // Handle some basic single instruction code size cases.
1442 if (CostKind == TTI::TCK_CodeSize) {
1443 switch (ISD) {
1444 case ISD::FADD:
1445 case ISD::FSUB:
1446 case ISD::FMUL:
1447 case ISD::FDIV:
1448 case ISD::FNEG:
1449 case ISD::AND:
1450 case ISD::OR:
1451 case ISD::XOR:
1452 return LT.first;
1453 break;
1454 }
1455 }
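  // For example (illustrative), a TCK_CodeSize query for an FADD on
  // x86_fp80 has no entry in any of the tables above, so the switch above
  // reports it as LT.first (a single instruction) instead of deferring to
  // the base implementation.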
1456
1457 // Fallback to the default implementation.
1458 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1459 Args, CxtI);
1460}
1461
1462 InstructionCost X86TTIImpl::getAltInstrCost(
1463     VectorType *VecTy, unsigned Opcode0,
1464     unsigned Opcode1, const SmallBitVector &OpcodeMask,
1465     TTI::TargetCostKind CostKind) const {
1466   if (isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask))
1467     return TTI::TCC_Basic;
1468   return InstructionCost::getInvalid();
1469 }
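// Note (illustrative): this alternating-opcode query is what the SLP
// vectorizer uses for patterns such as interleaved fadd/fsub lanes. When
// isLegalAltInstr() recognises the opcode pair and mask (e.g. a pattern
// that lowers to a single (v)addsubps/(v)addsubpd), the whole pattern is
// charged one basic cost; otherwise an invalid cost is returned so the
// caller falls back to its generic estimate.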
1470
1471 InstructionCost X86TTIImpl::getShuffleCost(
1472     TTI::ShuffleKind Kind, VectorType *BaseTp, ArrayRef<int> Mask,
1473     TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
1474     ArrayRef<const Value *> Args, const Instruction *CxtI) {
1475 // 64-bit packed float vectors (v2f32) are widened to type v4f32.
1476 // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
1477 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(BaseTp);
1478
1479 Kind = improveShuffleKindFromMask(Kind, Mask, BaseTp, Index, SubTp);
1480
1481 // Recognize a basic concat_vector shuffle.
1482 if (Kind == TTI::SK_PermuteTwoSrc &&
1483 Mask.size() == (2 * BaseTp->getElementCount().getKnownMinValue()) &&
1484       ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
1485     return getShuffleCost(TTI::SK_InsertSubvector,
1486                           VectorType::getDoubleElementsVectorType(BaseTp), Mask,
1487                           CostKind, Mask.size() / 2, BaseTp);
1488
1489 // Treat Transpose as 2-op shuffles - there's no difference in lowering.
1490 if (Kind == TTI::SK_Transpose)
1491 Kind = TTI::SK_PermuteTwoSrc;
1492
1493 // For Broadcasts we are splatting the first element from the first input
1494 // register, so only need to reference that input and all the output
1495 // registers are the same.
1496 if (Kind == TTI::SK_Broadcast)
1497 LT.first = 1;
1498
1499 // Treat <X x bfloat> shuffles as <X x half>.
1500 if (LT.second.isVector() && LT.second.getScalarType() == MVT::bf16)
1501 LT.second = LT.second.changeVectorElementType(MVT::f16);
1502
1503 // Subvector extractions are free if they start at the beginning of a
1504 // vector and cheap if the subvectors are aligned.
1505 if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
1506 int NumElts = LT.second.getVectorNumElements();
1507 if ((Index % NumElts) == 0)
1508 return 0;
1509 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1510 if (SubLT.second.isVector()) {
1511 int NumSubElts = SubLT.second.getVectorNumElements();
1512 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1513 return SubLT.first;
1514 // Handle some cases for widening legalization. For now we only handle
1515 // cases where the original subvector was naturally aligned and evenly
1516 // fit in its legalized subvector type.
1517 // FIXME: Remove some of the alignment restrictions.
1518 // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
1519 // vectors.
1520 int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
1521 if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
1522 (NumSubElts % OrigSubElts) == 0 &&
1523 LT.second.getVectorElementType() ==
1524 SubLT.second.getVectorElementType() &&
1525           LT.second.getVectorElementType().getSizeInBits() ==
1526               BaseTp->getElementType()->getPrimitiveSizeInBits()) {
1527         assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
1528 "Unexpected number of elements!");
1529 auto *VecTy = FixedVectorType::get(BaseTp->getElementType(),
1530 LT.second.getVectorNumElements());
1531 auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
1532 SubLT.second.getVectorNumElements());
1533 int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
1534 InstructionCost ExtractCost =
1535 getShuffleCost(TTI::SK_ExtractSubvector, VecTy, std::nullopt,
1536 CostKind, ExtractIndex, SubTy);
1537
1538 // If the original size is 32-bits or more, we can use pshufd. Otherwise
1539 // if we have SSSE3 we can use pshufb.
1540 if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
1541 return ExtractCost + 1; // pshufd or pshufb
1542
1543 assert(SubTp->getPrimitiveSizeInBits() == 16 &&
1544 "Unexpected vector size");
1545
1546 return ExtractCost + 2; // worst case pshufhw + pshufd
1547 }
1548 }
1549     // If the extract subvector is not optimal, treat it as single op shuffle.
1550     Kind = TTI::SK_PermuteSingleSrc;
1551   }
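  // Illustrative examples of the extract-subvector rules above: pulling a
  // <4 x float> out of a legal <8 x float> at index 0 is free (cost 0);
  // extracting it at index 4 is aligned to the subvector size and costs
  // SubLT.first (a single extract); an unaligned start index falls through
  // and is priced as a single-source permute instead.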
1552
1553 // Subvector insertions are cheap if the subvectors are aligned.
1554 // Note that in general, the insertion starting at the beginning of a vector
1555 // isn't free, because we need to preserve the rest of the wide vector.
1556 if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
1557 int NumElts = LT.second.getVectorNumElements();
1558 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1559 if (SubLT.second.isVector()) {
1560 int NumSubElts = SubLT.second.getVectorNumElements();
1561 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1562 return SubLT.first;
1563 }
1564
1565 // If the insertion isn't aligned, treat it like a 2-op shuffle.
1566 Kind = TTI::SK_PermuteTwoSrc;
1567 }
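  // Similarly for insertions (illustrative): inserting a <4 x float>
  // subvector into a <8 x float> vector at index 0 or 4 is aligned and
  // costs SubLT.first, while an insertion at index 2 is re-classified as a
  // two-source permute and priced by the shuffle tables below.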
1568
1569 // Handle some common (illegal) sub-vector types as they are often very cheap
1570 // to shuffle even on targets without PSHUFB.
1571 EVT VT = TLI->getValueType(DL, BaseTp);
1572 if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
1573 !ST->hasSSSE3()) {
1574 static const CostTblEntry SSE2SubVectorShuffleTbl[] = {
1575 {TTI::SK_Broadcast, MVT::v4i16, 1}, // pshuflw
1576 {TTI::SK_Broadcast, MVT::v2i16, 1}, // pshuflw
1577 {TTI::SK_Broadcast, MVT::v8i8, 2}, // punpck/pshuflw
1578 {TTI::SK_Broadcast, MVT::v4i8, 2}, // punpck/pshuflw
1579 {TTI::SK_Broadcast, MVT::v2i8, 1}, // punpck
1580
1581 {TTI::SK_Reverse, MVT::v4i16, 1}, // pshuflw
1582 {TTI::SK_Reverse, MVT::v2i16, 1}, // pshuflw
1583 {TTI::SK_Reverse, MVT::v4i8, 3}, // punpck/pshuflw/packus
1584 {TTI::SK_Reverse, MVT::v2i8, 1}, // punpck
1585
1586 {TTI::SK_Splice, MVT::v4i16, 2}, // punpck+psrldq
1587 {TTI::SK_Splice, MVT::v2i16, 2}, // punpck+psrldq
1588 {TTI::SK_Splice, MVT::v4i8, 2}, // punpck+psrldq
1589 {TTI::SK_Splice, MVT::v2i8, 2}, // punpck+psrldq
1590
1591 {TTI::SK_PermuteTwoSrc, MVT::v4i16, 2}, // punpck/pshuflw
1592 {TTI::SK_PermuteTwoSrc, MVT::v2i16, 2}, // punpck/pshuflw
1593 {TTI::SK_PermuteTwoSrc, MVT::v8i8, 7}, // punpck/pshuflw
1594 {TTI::SK_PermuteTwoSrc, MVT::v4i8, 4}, // punpck/pshuflw
1595 {TTI::SK_PermuteTwoSrc, MVT::v2i8, 2}, // punpck
1596
1597 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw
1598 {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw
1599 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 5}, // punpck/pshuflw
1600 {TTI::SK_PermuteSingleSrc, MVT::v4i8, 3}, // punpck/pshuflw
1601 {TTI::SK_PermuteSingleSrc, MVT::v2i8, 1}, // punpck
1602 };
1603
1604 if (ST->hasSSE2())
1605 if (const auto *Entry =
1606 CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
1607 return Entry->Cost;
1608 }
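  // For instance, splatting the first lane of an illegal <4 x i16> vector
  // on a pre-SSSE3 target is charged a single pshuflw from the table above,
  // even though the type itself is widened to v8i16 during legalization.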
1609
1610   // We are going to permute multiple sources and the result will be in
1611   // multiple destinations. We provide an accurate cost only for splits where
1612   // the element type remains the same.
1613 if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
1614 MVT LegalVT = LT.second;
1615 if (LegalVT.isVector() &&
1616         LegalVT.getVectorElementType().getSizeInBits() ==
1617             BaseTp->getElementType()->getPrimitiveSizeInBits() &&
1618         LegalVT.getVectorNumElements() <
1619 cast<FixedVectorType>(BaseTp)->getNumElements()) {
1620 unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
1621 unsigned LegalVTSize = LegalVT.getStoreSize();
1622 // Number of source vectors after legalization:
1623 unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
1624 // Number of destination vectors after legalization:
1625 InstructionCost NumOfDests = LT.first;
1626
1627 auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
1628 LegalVT.getVectorNumElements());
1629
1630 if (!Mask.empty() && NumOfDests.isValid()) {
1631 // Try to perform better estimation of the permutation.
1632 // 1. Split the source/destination vectors into real registers.
1633 // 2. Do the mask analysis to identify which real registers are
1634 // permuted. If more than 1 source registers are used for the
1635 // destination register building, the cost for this destination register
1636 // is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one
1637 // source register is used, build mask and calculate the cost as a cost
1638 // of PermuteSingleSrc.
1639 // Also, for the single register permute we try to identify if the
1640 // destination register is just a copy of the source register or the
1641 // copy of the previous destination register (the cost is
1642 // TTI::TCC_Basic). If the source register is just reused, the cost for
1643 // this operation is 0.
1644         NumOfDests =
1645             getTypeLegalizationCost(
1646                 FixedVectorType::get(BaseTp->getElementType(), Mask.size()))
1647 .first;
1648 unsigned E = *NumOfDests.getValue();
1649 unsigned NormalizedVF =
1650 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
1651 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
1652 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
1653 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
1654 copy(Mask, NormalizedMask.begin());
1655 unsigned PrevSrcReg = 0;
1656         ArrayRef<int> PrevRegMask;
1657         InstructionCost Cost = 0;
1658         processShuffleMasks(
1659             NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
1660 [this, SingleOpTy, CostKind, &PrevSrcReg, &PrevRegMask,
1661 &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
1662 if (!ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size())) {
1663 // Check if the previous register can be just copied to the next
1664 // one.
1665 if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
1666                     PrevRegMask != RegMask)
1667                   Cost += getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
1668                                          RegMask, CostKind, 0, nullptr);
1669                 else
1670                   // Just a copy of previous destination register.
1671                   Cost += TTI::TCC_Basic;
1672                 return;
1673 }
1674 if (SrcReg != DestReg &&
1675 any_of(RegMask, [](int I) { return I != PoisonMaskElem; })) {
1676                 // Just a copy of the source register.
1677                 Cost += TTI::TCC_Basic;
1678               }
1679 PrevSrcReg = SrcReg;
1680 PrevRegMask = RegMask;
1681 },
1682 [this, SingleOpTy, CostKind, &Cost](ArrayRef<int> RegMask,
1683 unsigned /*Unused*/,
1684 unsigned /*Unused*/) {
1685 Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask,
1686 CostKind, 0, nullptr);
1687 });
1688 return Cost;
1689 }
1690
1691 InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
1692 return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
1693 std::nullopt, CostKind, 0, nullptr);
1694 }
1695
1696 return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
1697 }
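  // A rough example of the split estimate above (illustrative): a
  // single-source permute of <16 x i32> on an AVX2 target legalizes to two
  // v8i32 registers (NumOfSrcs == NumOfDests == 2). With a full mask, the
  // per-register scan charges one legal-width single-source shuffle for
  // every destination register that is not an identity/copy of its source;
  // without a usable mask it falls back to (NumOfSrcs - 1) * NumOfDests
  // two-source shuffles.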
1698
1699 // For 2-input shuffles, we must account for splitting the 2 inputs into many.
1700 if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
1701 // We assume that source and destination have the same vector type.
1702 InstructionCost NumOfDests = LT.first;
1703 InstructionCost NumOfShufflesPerDest = LT.first * 2 - 1;
1704 LT.first = NumOfDests * NumOfShufflesPerDest;
1705 }
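  // Illustrative: a two-source permute of <16 x i32> with AVX2 has
  // LT.first == 2, so the estimate becomes 2 destinations * (2 * 2 - 1) = 6
  // legal-width shuffles, which is then multiplied by the per-register cost
  // from whichever shuffle table below matches v8i32.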
1706
1707 static const CostTblEntry AVX512VBMIShuffleTbl[] = {
1708 {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
1709 {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb
1710
1711 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
1712 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb
1713
1714 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b
1715 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b
1716 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 2} // vpermt2b
1717 };
1718
1719 if (ST->hasVBMI())
1720 if (const auto *Entry =
1721 CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
1722 return LT.first * Entry->Cost;
1723
1724 static const CostTblEntry AVX512BWShuffleTbl[] = {
1725 {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
1726 {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw
1727 {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
1728
1729 {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw
1730 {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw
1731 {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw
1732 {TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2
1733
1734 {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw
1735 {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw
1736 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw
1737 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw
1738 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16
1739
1740 {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w
1741 {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w
1742 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w
1743 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w
1744 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
1745
1746 {TTI::SK_Select, MVT::v32i16, 1}, // vblendmw
1747 {TTI::SK_Select, MVT::v64i8, 1}, // vblendmb
1748
1749 {TTI::SK_Splice, MVT::v32i16, 2}, // vshufi64x2 + palignr
1750 {TTI::SK_Splice, MVT::v32f16, 2}, // vshufi64x2 + palignr
1751 {TTI::SK_Splice, MVT::v64i8, 2}, // vshufi64x2 + palignr
1752 };
1753
1754 if (ST->hasBWI())
1755 if (const auto *Entry =
1756 CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
1757 return LT.first * Entry->Cost;
1758
1759 static const CostKindTblEntry AVX512ShuffleTbl[] = {
1760 {TTI::SK_Broadcast, MVT::v8f64, { 1, 1, 1, 1 } }, // vbroadcastsd
1761 {TTI::SK_Broadcast, MVT::v16f32, { 1, 1, 1, 1 } }, // vbroadcastss
1762 {TTI::SK_Broadcast, MVT::v8i64, { 1, 1, 1, 1 } }, // vpbroadcastq
1763 {TTI::SK_Broadcast, MVT::v16i32, { 1, 1, 1, 1 } }, // vpbroadcastd
1764 {TTI::SK_Broadcast, MVT::v32i16, { 1, 1, 1, 1 } }, // vpbroadcastw
1765 {TTI::SK_Broadcast, MVT::v32f16, { 1, 1, 1, 1 } }, // vpbroadcastw
1766 {TTI::SK_Broadcast, MVT::v64i8, { 1, 1, 1, 1 } }, // vpbroadcastb
1767
1768 {TTI::SK_Reverse, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
1769 {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1770 {TTI::SK_Reverse, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
1771 {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1772 {TTI::SK_Reverse, MVT::v32i16, { 7, 7, 7, 7 } }, // per mca
1773 {TTI::SK_Reverse, MVT::v32f16, { 7, 7, 7, 7 } }, // per mca
1774 {TTI::SK_Reverse, MVT::v64i8, { 7, 7, 7, 7 } }, // per mca
1775
1776 {TTI::SK_Splice, MVT::v8f64, { 1, 1, 1, 1 } }, // vpalignd
1777 {TTI::SK_Splice, MVT::v4f64, { 1, 1, 1, 1 } }, // vpalignd
1778 {TTI::SK_Splice, MVT::v16f32, { 1, 1, 1, 1 } }, // vpalignd
1779 {TTI::SK_Splice, MVT::v8f32, { 1, 1, 1, 1 } }, // vpalignd
1780 {TTI::SK_Splice, MVT::v8i64, { 1, 1, 1, 1 } }, // vpalignd
1781 {TTI::SK_Splice, MVT::v4i64, { 1, 1, 1, 1 } }, // vpalignd
1782 {TTI::SK_Splice, MVT::v16i32, { 1, 1, 1, 1 } }, // vpalignd
1783 {TTI::SK_Splice, MVT::v8i32, { 1, 1, 1, 1 } }, // vpalignd
1784 {TTI::SK_Splice, MVT::v32i16, { 4, 4, 4, 4 } }, // split + palignr
1785 {TTI::SK_Splice, MVT::v32f16, { 4, 4, 4, 4 } }, // split + palignr
1786 {TTI::SK_Splice, MVT::v64i8, { 4, 4, 4, 4 } }, // split + palignr
1787
1788 {TTI::SK_PermuteSingleSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
1789 {TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermpd
1790 {TTI::SK_PermuteSingleSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermpd
1791 {TTI::SK_PermuteSingleSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1792 {TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermps
1793 {TTI::SK_PermuteSingleSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermps
1794 {TTI::SK_PermuteSingleSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
1795 {TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermq
1796 {TTI::SK_PermuteSingleSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermq
1797 {TTI::SK_PermuteSingleSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1798 {TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermd
1799 {TTI::SK_PermuteSingleSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermd
1800 {TTI::SK_PermuteSingleSrc, MVT::v16i8, { 1, 3, 1, 1 } }, // pshufb
1801
1802 {TTI::SK_PermuteTwoSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermt2pd
1803 {TTI::SK_PermuteTwoSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermt2ps
1804 {TTI::SK_PermuteTwoSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermt2q
1805 {TTI::SK_PermuteTwoSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermt2d
1806 {TTI::SK_PermuteTwoSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermt2pd
1807 {TTI::SK_PermuteTwoSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermt2ps
1808 {TTI::SK_PermuteTwoSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermt2q
1809 {TTI::SK_PermuteTwoSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermt2d
1810 {TTI::SK_PermuteTwoSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermt2pd
1811 {TTI::SK_PermuteTwoSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermt2ps
1812 {TTI::SK_PermuteTwoSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermt2q
1813 {TTI::SK_PermuteTwoSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermt2d
1814
1815 // FIXME: This just applies the type legalization cost rules above
1816 // assuming these completely split.
1817 {TTI::SK_PermuteSingleSrc, MVT::v32i16, { 14, 14, 14, 14 } },
1818 {TTI::SK_PermuteSingleSrc, MVT::v32f16, { 14, 14, 14, 14 } },
1819 {TTI::SK_PermuteSingleSrc, MVT::v64i8, { 14, 14, 14, 14 } },
1820 {TTI::SK_PermuteTwoSrc, MVT::v32i16, { 42, 42, 42, 42 } },
1821 {TTI::SK_PermuteTwoSrc, MVT::v32f16, { 42, 42, 42, 42 } },
1822 {TTI::SK_PermuteTwoSrc, MVT::v64i8, { 42, 42, 42, 42 } },
1823
1824 {TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vpternlogq
1825 {TTI::SK_Select, MVT::v32f16, { 1, 1, 1, 1 } }, // vpternlogq
1826 {TTI::SK_Select, MVT::v64i8, { 1, 1, 1, 1 } }, // vpternlogq
1827 {TTI::SK_Select, MVT::v8f64, { 1, 1, 1, 1 } }, // vblendmpd
1828 {TTI::SK_Select, MVT::v16f32, { 1, 1, 1, 1 } }, // vblendmps
1829 {TTI::SK_Select, MVT::v8i64, { 1, 1, 1, 1 } }, // vblendmq
1830 {TTI::SK_Select, MVT::v16i32, { 1, 1, 1, 1 } }, // vblendmd
1831 };
1832
1833 if (ST->hasAVX512())
1834 if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
1835 if (auto KindCost = Entry->Cost[CostKind])
1836 return LT.first * *KindCost;
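  // Example (illustrative): reversing a <8 x double> vector with AVX512 hits
  // the vpermpd entry above, i.e. a throughput/code-size cost of 1 but a
  // latency of 3, reflecting the latency of the cross-lane permute.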
1837
1838 static const CostTblEntry AVX2ShuffleTbl[] = {
1839 {TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastpd
1840 {TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastps
1841 {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq
1842 {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd
1843 {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
1844 {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw
1845 {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb
1846
1847 {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd
1848 {TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps
1849 {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq
1850 {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd
1851 {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
1852 {TTI::SK_Reverse, MVT::v16f16, 2}, // vperm2i128 + pshufb
1853 {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb
1854
1855 {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
1856 {TTI::SK_Select, MVT::v16f16, 1}, // vpblendvb
1857 {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb
1858
1859 {TTI::SK_Splice, MVT::v8i32, 2}, // vperm2i128 + vpalignr
1860 {TTI::SK_Splice, MVT::v8f32, 2}, // vperm2i128 + vpalignr
1861 {TTI::SK_Splice, MVT::v16i16, 2}, // vperm2i128 + vpalignr
1862 {TTI::SK_Splice, MVT::v16f16, 2}, // vperm2i128 + vpalignr
1863 {TTI::SK_Splice, MVT::v32i8, 2}, // vperm2i128 + vpalignr
1864
1865 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
1866 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
1867 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
1868 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
1869 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
1870 // + vpblendvb
1871 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 4}, // vperm2i128 + 2*vpshufb
1872 // + vpblendvb
1873 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb
1874 // + vpblendvb
1875
1876 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd
1877 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3}, // 2*vpermps + vblendps
1878 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd
1879 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd
1880 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
1881 // + vpblendvb
1882 {TTI::SK_PermuteTwoSrc, MVT::v16f16, 7}, // 2*vperm2i128 + 4*vpshufb
1883 // + vpblendvb
1884 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb
1885 // + vpblendvb
1886 };
1887
1888 if (ST->hasAVX2())
1889 if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
1890 return LT.first * Entry->Cost;
1891
1892 static const CostTblEntry XOPShuffleTbl[] = {
1893 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd
1894 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps
1895 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd
1896 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps
1897 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
1898 // + vinsertf128
1899 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm
1900 // + vinsertf128
1901
1902 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
1903 // + vinsertf128
1904 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm
1905 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm
1906 // + vinsertf128
1907 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm
1908 };
1909
1910 if (ST->hasXOP())
1911 if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
1912 return LT.first * Entry->Cost;
1913
1914 static const CostTblEntry AVX1ShuffleTbl[] = {
1915 {TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1916 {TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1917 {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1918 {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1919 {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
1920 {TTI::SK_Broadcast, MVT::v16f16, 3}, // vpshuflw + vpshufd + vinsertf128
1921 {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128
1922
1923 {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1924 {TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1925 {TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1926 {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1927 {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
1928 // + vinsertf128
1929 {TTI::SK_Reverse, MVT::v16f16, 4}, // vextractf128 + 2*pshufb
1930 // + vinsertf128
1931 {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb
1932 // + vinsertf128
1933
1934 {TTI::SK_Select, MVT::v4i64, 1}, // vblendpd
1935 {TTI::SK_Select, MVT::v4f64, 1}, // vblendpd
1936 {TTI::SK_Select, MVT::v8i32, 1}, // vblendps
1937 {TTI::SK_Select, MVT::v8f32, 1}, // vblendps
1938 {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
1939 {TTI::SK_Select, MVT::v16f16, 3}, // vpand + vpandn + vpor
1940 {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor
1941
1942 {TTI::SK_Splice, MVT::v4i64, 2}, // vperm2f128 + shufpd
1943 {TTI::SK_Splice, MVT::v4f64, 2}, // vperm2f128 + shufpd
1944 {TTI::SK_Splice, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1945 {TTI::SK_Splice, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1946 {TTI::SK_Splice, MVT::v16i16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
1947 {TTI::SK_Splice, MVT::v16f16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
1948 {TTI::SK_Splice, MVT::v32i8, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
1949
1950 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd
1951 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd
1952 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1953 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1954 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
1955 // + 2*por + vinsertf128
1956 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 8}, // vextractf128 + 4*pshufb
1957 // + 2*por + vinsertf128
1958 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb
1959 // + 2*por + vinsertf128
1960
1961 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd
1962 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd
1963 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1964 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1965 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
1966 // + 4*por + vinsertf128
1967 {TTI::SK_PermuteTwoSrc, MVT::v16f16, 15}, // 2*vextractf128 + 8*pshufb
1968 // + 4*por + vinsertf128
1969 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb
1970 // + 4*por + vinsertf128
1971 };
1972
1973 if (ST->hasAVX())
1974 if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
1975 return LT.first * Entry->Cost;
1976
1977 static const CostTblEntry SSE41ShuffleTbl[] = {
1978 {TTI::SK_Select, MVT::v2i64, 1}, // pblendw
1979 {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1980 {TTI::SK_Select, MVT::v4i32, 1}, // pblendw
1981 {TTI::SK_Select, MVT::v4f32, 1}, // blendps
1982 {TTI::SK_Select, MVT::v8i16, 1}, // pblendw
1983 {TTI::SK_Select, MVT::v8f16, 1}, // pblendw
1984 {TTI::SK_Select, MVT::v16i8, 1} // pblendvb
1985 };
1986
1987 if (ST->hasSSE41())
1988 if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
1989 return LT.first * Entry->Cost;
1990
1991 static const CostTblEntry SSSE3ShuffleTbl[] = {
1992 {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
1993 {TTI::SK_Broadcast, MVT::v8f16, 1}, // pshufb
1994 {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb
1995
1996 {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
1997 {TTI::SK_Reverse, MVT::v8f16, 1}, // pshufb
1998 {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb
1999
2000 {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
2001 {TTI::SK_Select, MVT::v8f16, 3}, // 2*pshufb + por
2002 {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por
2003
2004 {TTI::SK_Splice, MVT::v4i32, 1}, // palignr
2005 {TTI::SK_Splice, MVT::v4f32, 1}, // palignr
2006 {TTI::SK_Splice, MVT::v8i16, 1}, // palignr
2007 {TTI::SK_Splice, MVT::v8f16, 1}, // palignr
2008 {TTI::SK_Splice, MVT::v16i8, 1}, // palignr
2009
2010 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
2011 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // pshufb
2012 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
2013
2014 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
2015 {TTI::SK_PermuteTwoSrc, MVT::v8f16, 3}, // 2*pshufb + por
2016 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
2017 };
2018
2019 if (ST->hasSSSE3())
2020 if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
2021 return LT.first * Entry->Cost;
2022
2023 static const CostTblEntry SSE2ShuffleTbl[] = {
2024 {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd
2025 {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
2026 {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
2027 {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
2028 {TTI::SK_Broadcast, MVT::v8f16, 2}, // pshuflw + pshufd
2029 {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd
2030
2031 {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
2032 {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
2033 {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
2034 {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
2035 {TTI::SK_Reverse, MVT::v8f16, 3}, // pshuflw + pshufhw + pshufd
2036 {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
2037 // + 2*pshufd + 2*unpck + packus
2038
2039 {TTI::SK_Select, MVT::v2i64, 1}, // movsd
2040 {TTI::SK_Select, MVT::v2f64, 1}, // movsd
2041 {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
2042 {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
2043 {TTI::SK_Select, MVT::v8f16, 3}, // pand + pandn + por
2044 {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por
2045
2046 {TTI::SK_Splice, MVT::v2i64, 1}, // shufpd
2047 {TTI::SK_Splice, MVT::v2f64, 1}, // shufpd
2048 {TTI::SK_Splice, MVT::v4i32, 2}, // 2*{unpck,movsd,pshufd}
2049 {TTI::SK_Splice, MVT::v8i16, 3}, // psrldq + psrlldq + por
2050 {TTI::SK_Splice, MVT::v8f16, 3}, // psrldq + psrlldq + por
2051 {TTI::SK_Splice, MVT::v16i8, 3}, // psrldq + psrlldq + por
2052
2053 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
2054 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd
2055 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
2056 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
2057 // + pshufd/unpck
2058 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 5}, // 2*pshuflw + 2*pshufhw
2059 // + pshufd/unpck
2060 { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
2061 // + 2*pshufd + 2*unpck + 2*packus
2062
2063 { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd
2064 { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd
2065 { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd}
2066 { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute
2067 { TTI::SK_PermuteTwoSrc, MVT::v8f16, 8 }, // blend+permute
2068 { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
2069 };
2070
2071 static const CostTblEntry SSE3BroadcastLoadTbl[] = {
2072 {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup
2073 };
2074
2075 if (ST->hasSSE2()) {
2076 bool IsLoad =
2077 llvm::any_of(Args, [](const auto &V) { return isa<LoadInst>(V); });
2078 if (ST->hasSSE3() && IsLoad)
2079 if (const auto *Entry =
2080               CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) {
2081         assert(isLegalBroadcastLoad(BaseTp->getElementType(),
2082                                     LT.second.getVectorElementCount()) &&
2083 "Table entry missing from isLegalBroadcastLoad()");
2084 return LT.first * Entry->Cost;
2085 }
2086
2087 if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
2088 return LT.first * Entry->Cost;
2089 }
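  // Note (illustrative): the SSE3 entry above models the case where the
  // broadcast source is a load, so the whole splat folds into a single
  // movddup load and the shuffle itself is reported as free (cost 0); a
  // non-load v2f64 broadcast still pays the SSE2 shufpd cost of 1.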
2090
2091 static const CostTblEntry SSE1ShuffleTbl[] = {
2092 { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
2093 { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
2094 { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps
2095 { TTI::SK_Splice, MVT::v4f32, 2 }, // 2*shufps
2096 { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
2097 { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
2098 };
2099
2100 if (ST->hasSSE1())
2101 if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
2102 return LT.first * Entry->Cost;
2103
2104 return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
2105}
2106
2107 InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
2108                                              Type *Src,
2109                                              TTI::CastContextHint CCH,
2110                                              TTI::TargetCostKind CostKind,
2111                                              const Instruction *I) {
2112 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2113 assert(ISD && "Invalid opcode");
2114
2115 // TODO: Allow non-throughput costs that aren't binary.
2116   auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
2117     if (CostKind != TTI::TCK_RecipThroughput)
2118       return Cost == 0 ? 0 : 1;
2119 return Cost;
2120 };
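  // Illustrative: with this adjustment, any cast that hits a conversion
  // table below answers code-size/latency/size-and-latency queries with
  // either 0 (free) or 1. For example, a TRUNCATE entry with a throughput
  // cost of 2 still reports 1 for TCK_CodeSize when routed through
  // AdjustCost; only TCK_RecipThroughput sees the raw table value.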
2121
2122 // The cost tables include both specific, custom (non-legal) src/dst type
2123 // conversions and generic, legalized types. We test for customs first, before
2124 // falling back to legalization.
2125 // FIXME: Need a better design of the cost table to handle non-simple types of
2126 // potential massive combinations (elem_num x src_type x dst_type).
2127 static const TypeConversionCostTblEntry AVX512BWConversionTbl[] {
2128 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
2129 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
2130
2131 // Mask sign extend has an instruction.
2132 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 },
2133 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, 1 },
2134 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 },
2135 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, 1 },
2136 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 },
2137 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, 1 },
2138 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 },
2139 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, 1 },
2140 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 },
2141 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, 1 },
2142 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 },
2143 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 },
2144 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
2145 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 },
2146 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, 1 },
2147 { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, 1 },
2148 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, 1 },
2149
2150 // Mask zero extend is a sext + shift.
2151 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 },
2152 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, 2 },
2153 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 },
2154 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, 2 },
2155 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 },
2156 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, 2 },
2157 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 },
2158 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, 2 },
2159 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 },
2160 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, 2 },
2161 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 },
2162 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 },
2163 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
2164 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 },
2165 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, 2 },
2166 { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, 2 },
2167 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, 2 },
2168
2169 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 },
2170 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, 2 },
2171 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 },
2172 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, 2 },
2173 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 },
2174 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, 2 },
2175 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 },
2176 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, 2 },
2177 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 },
2178 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 },
2179 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 },
2180 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 },
2181 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 },
2182 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 },
2183 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, 2 },
2184 { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, 2 },
2185 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, 2 },
2186
2187 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 2 },
2188 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // widen to zmm
2189 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // vpmovwb
2190 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // vpmovwb
2191 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // vpmovwb
2192 };
2193
2194 static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
2195 // Mask sign extend has an instruction.
2196 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 },
2197 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, 1 },
2198 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 },
2199 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 },
2200 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 },
2201 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i1, 1 },
2202 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 },
2203 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 },
2204
2205 // Mask zero extend is a sext + shift.
2206 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 },
2207 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, 2 },
2208 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 },
2209 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 },
2210 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 },
2211 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i1, 2 },
2212 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 },
2213 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 },
2214
2215 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 },
2216 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, 2 },
2217 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 },
2218 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 },
2219 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },
2220 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 },
2221 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 },
2222 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i64, 2 },
2223
2224 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
2225 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },
2226
2227 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
2228 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },
2229
2230 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, 1 },
2231 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, 1 },
2232
2233 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 },
2234 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 },
2235 };
2236
2237 // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
2238 // 256-bit wide vectors.
2239
2240 static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
2241 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 },
2242 { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 },
2243 { ISD::FP_EXTEND, MVT::v16f64, MVT::v16f32, 4 }, // 2*vcvtps2pd+vextractf64x4
2244 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 },
2245
2246 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
2247 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
2248 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
2249 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 3 }, // sext+vpslld+vptestmd
2250 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
2251 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
2252 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
2253 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 3 }, // sext+vpslld+vptestmd
2254 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // zmm vpslld+vptestmd
2255 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // zmm vpslld+vptestmd
2256 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // zmm vpslld+vptestmd
2257 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 }, // vpslld+vptestmd
2258 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // zmm vpsllq+vptestmq
2259 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // zmm vpsllq+vptestmq
2260 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, // vpsllq+vptestmq
2261 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 2 }, // vpmovdb
2262 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 2 }, // vpmovdb
2263 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 2 }, // vpmovdb
2264 { ISD::TRUNCATE, MVT::v32i8, MVT::v16i32, 2 }, // vpmovdb
2265 { ISD::TRUNCATE, MVT::v64i8, MVT::v16i32, 2 }, // vpmovdb
2266 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2 }, // vpmovdw
2267 { ISD::TRUNCATE, MVT::v32i16, MVT::v16i32, 2 }, // vpmovdw
2268 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 2 }, // vpmovqb
2269 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1 }, // vpshufb
2270 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 2 }, // vpmovqb
2271 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i64, 2 }, // vpmovqb
2272 { ISD::TRUNCATE, MVT::v32i8, MVT::v8i64, 2 }, // vpmovqb
2273 { ISD::TRUNCATE, MVT::v64i8, MVT::v8i64, 2 }, // vpmovqb
2274 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 2 }, // vpmovqw
2275 { ISD::TRUNCATE, MVT::v16i16, MVT::v8i64, 2 }, // vpmovqw
2276 { ISD::TRUNCATE, MVT::v32i16, MVT::v8i64, 2 }, // vpmovqw
2277 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 }, // vpmovqd
2278 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // zmm vpmovqd
2279 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 5 },// 2*vpmovqd+concat+vpmovdb
2280
2281 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, // extend to v16i32
2282 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 8 },
2283 { ISD::TRUNCATE, MVT::v64i8, MVT::v32i16, 8 },
2284
2285 // Sign extend is zmm vpternlogd+vptruncdb.
2286 // Zero extend is zmm broadcast load+vptruncdw.
2287 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 3 },
2288 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 4 },
2289 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 3 },
2290 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 4 },
2291 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 3 },
2292 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 4 },
2293 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 3 },
2294 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 4 },
2295
2296 // Sign extend is zmm vpternlogd+vptruncdw.
2297 // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
2298 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 3 },
2299 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 4 },
2300 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 3 },
2301 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 4 },
2302 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 3 },
2303 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 4 },
2304 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 3 },
2305 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
2306
2307 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // zmm vpternlogd
2308 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // zmm vpternlogd+psrld
2309 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // zmm vpternlogd
2310 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // zmm vpternlogd+psrld
2311 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // zmm vpternlogd
2312 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // zmm vpternlogd+psrld
2313 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // zmm vpternlogq
2314 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // zmm vpternlogq+psrlq
2315 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // zmm vpternlogq
2316 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // zmm vpternlogq+psrlq
2317
2318 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, // vpternlogd
2319 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, // vpternlogd+psrld
2320 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 }, // vpternlogq
2321 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 }, // vpternlogq+psrlq
2322
2323 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
2324 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
2325 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
2326 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
2327 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
2328 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
2329 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
2330 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
2331 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
2332 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
2333
2334 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
2335 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
2336
2337 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
2338 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
2339 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v16i8, 2 },
2340 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 1 },
2341 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
2342 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 1 },
2343 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
2344 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
2345
2346 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
2347 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
2348 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v16i8, 2 },
2349 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 1 },
2350 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
2351 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 1 },
2352 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
2353 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
2354 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 },
2355 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 5 },
2356
2357 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 2 },
2358 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f64, 7 },
2359 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f64,15 },
2360 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f32,11 },
2361 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f64,31 },
2362 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, 3 },
2363 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f64, 7 },
2364 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f32, 5 },
2365 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f64,15 },
2366 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 1 },
2367 { ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f64, 3 },
2368
2369 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 },
2370 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, 3 },
2371 { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, 3 },
2372 { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 },
2373 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 3 },
2374 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, 3 },
2375 };
2376
2377 static const TypeConversionCostTblEntry AVX512BWVLConversionTbl[] {
2378 // Mask sign extend has an instruction.
2379 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 },
2380 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, 1 },
2381 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 },
2382 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, 1 },
2383 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 },
2384 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, 1 },
2385 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 },
2386 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, 1 },
2387 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 },
2388 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, 1 },
2389 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 },
2390 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 },
2391 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
2392 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 },
2393 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1, 1 },
2394 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v64i1, 1 },
2395 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1, 1 },
2396
2397 // Mask zero extend is a sext + shift.
2398 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 },
2399 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, 2 },
2400 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 },
2401 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, 2 },
2402 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 },
2403 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, 2 },
2404 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 },
2405 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, 2 },
2406 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 },
2407 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, 2 },
2408 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 },
2409 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 },
2410 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
2411 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 },
2412 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1, 2 },
2413 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v64i1, 2 },
2414 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1, 2 },
2415
2416 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 },
2417 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, 2 },
2418 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 },
2419 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, 2 },
2420 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 },
2421 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, 2 },
2422 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 },
2423 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, 2 },
2424 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 },
2425 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 },
2426 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 },
2427 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 },
2428 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 },
2429 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 },
2430 { ISD::TRUNCATE, MVT::v32i1, MVT::v16i16, 2 },
2431 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i8, 2 },
2432 { ISD::TRUNCATE, MVT::v64i1, MVT::v16i16, 2 },
2433
2434 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 },
2435 };
2436
2437 static const TypeConversionCostTblEntry AVX512DQVLConversionTbl[] = {
2438 // Mask sign extend has an instruction.
2439 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 },
2440 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, 1 },
2441 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 },
2442 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i1, 1 },
2443 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 },
2444 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i1, 1 },
2445 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, 1 },
2446 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 },
2447
2448 // Mask zero extend is a sext + shift.
2449 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 },
2450 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, 2 },
2451 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 },
2452 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i1, 2 },
2453 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 },
2454 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i1, 2 },
2455 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, 2 },
2456 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 },
2457
2458 { ISD::TRUNCATE, MVT::v16i1, MVT::v4i64, 2 },
2459 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, 2 },
2460 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 },
2461 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, 2 },
2462 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 },
2463 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 },
2464 { ISD::TRUNCATE, MVT::v8i1, MVT::v4i64, 2 },
2465 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },
2466
2467 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
2468 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
2469 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
2470 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
2471
2472 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
2473 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
2474 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
2475 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
2476
2477 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v4f32, 1 },
2478 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, 1 },
2479 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
2480 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, 1 },
2481
2482 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v4f32, 1 },
2483 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 },
2484 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
2485 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 },
2486 };
2487
2488 static const TypeConversionCostTblEntry AVX512VLConversionTbl[] = {
2489 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
2490 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
2491 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
2492 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 8 }, // split+2*v8i8
2493 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
2494 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
2495 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
2496 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 8 }, // split+2*v8i16
2497 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // vpslld+vptestmd
2498 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // vpslld+vptestmd
2499 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // vpslld+vptestmd
2500 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, 2 }, // vpslld+vptestmd
2501 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // vpsllq+vptestmq
2502 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // vpsllq+vptestmq
2503 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // vpmovqd
2504 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 }, // vpmovqb
2505 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 }, // vpmovqw
2506 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 }, // vpmovwb
2507
2508 // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
2509 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
2510 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 5 },
2511 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 6 },
2512 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 5 },
2513 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 6 },
2514 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 5 },
2515 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 6 },
2516 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 10 },
2517 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 12 },
2518
2519 // sign extend is vpcmpeq+maskedmove+vpmovdw
2520 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
2521 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 4 },
2522 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 5 },
2523 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 4 },
2524 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 5 },
2525 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 4 },
2526 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 5 },
2527 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 10 },
2528 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 12 },
2529
2530 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // vpternlogd
2531 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // vpternlogd+psrld
2532 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // vpternlogd
2533 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // vpternlogd+psrld
2534 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // vpternlogd
2535 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // vpternlogd+psrld
2536 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, 1 }, // vpternlogd
2537 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, 2 }, // vpternlogd+psrld
2538
2539 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // vpternlogq
2540 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // vpternlogq+psrlq
2541 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // vpternlogq
2542 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // vpternlogq+psrlq
2543
2544 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 1 },
2545 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 1 },
2546 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 1 },
2547 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 1 },
2548 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
2549 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
2550 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 1 },
2551 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 1 },
2552 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
2553 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
2554 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
2555 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
2556
2557 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
2558 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 1 },
2559 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
2560 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 1 },
2561
2562 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 1 },
2563 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 1 },
2564 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
2565 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 1 },
2566 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
2567 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 1 },
2568 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
2569 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
2570 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
2571 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
2572 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 5 },
2573 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
2574 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 5 },
2575
2576 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, 2 },
2577 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 2 },
2578 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f32, 5 },
2579
2580 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 1 },
2581 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 1 },
2582 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
2583 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 1 },
2584 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 1 },
2585 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 },
2586 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 },
2587 };
2588
2589 static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
2590 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
2591 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
2592 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
2593 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
2594 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
2595 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
2596
2597 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 2 },
2598 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 2 },
2599 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 2 },
2600 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 2 },
2601 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
2602 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
2603 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 2 },
2604 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 2 },
2605 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
2606 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
2607 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
2608 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
2609 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
2610 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
2611
2612 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },
2613
2614 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 4 },
2615 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 4 },
2616 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 1 },
2617 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 1 },
2618 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 1 },
2619 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, 4 },
2620 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, 4 },
2621 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 1 },
2622 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 1 },
2623 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 5 },
2624 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 },
2625 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2 },
2626
2627 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 3 },
2628 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 3 },
2629
2630 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, 1 },
2631 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, 1 },
2632 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, 1 },
2633 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 3 },
2634
2635 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 3 },
2636 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 3 },
2637 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, 1 },
2638 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 3 },
2639 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
2640 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4 },
2641 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 3 },
2642 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, 4 },
2643
2644 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 2 },
2645 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 2 },
2646 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 2 },
2647 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
2648 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
2649 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
2650 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 3 },
2651
2652 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 2 },
2653 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 2 },
2654 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 2 },
2655 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
2656 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 },
2657 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
2658 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 2 },
2659 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
2660 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
2661 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 4 },
2662 };
2663
2664 static const TypeConversionCostTblEntry AVXConversionTbl[] = {
2665 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
2666 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
2667 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
2668 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
2669 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
2670 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
2671
2672 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 3 },
2673 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 3 },
2674 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 3 },
2675 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 3 },
2676 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
2677 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
2678 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 3 },
2679 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 3 },
2680 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
2681 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
2682 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 3 },
2683 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 3 },
2684
2685 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 4 },
2686 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 5 },
2687 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 4 },
2688 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 9 },
2689 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, 11 },
2690
2691 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 },
2692 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
2693 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // and+extract+packuswb
2694 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, 5 },
2695 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
2696 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, 5 },
2697 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 3 }, // and+extract+2*packusdw
2698 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 },
2699
2700 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
2701 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 },
2702 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 },
2703 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 4 },
2704 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v16i8, 2 },
2705 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
2706 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v8i16, 2 },
2707 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
2708 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
2709 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 4 },
2710 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 5 },
2711 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 8 },
2712
2713 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 7 },
2714 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, 7 },
2715 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, 6 },
2716 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 4 },
2717 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v16i8, 2 },
2718 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
2719 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v8i16, 2 },
2720 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 4 },
2721 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 4 },
2722 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
2723 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 },
2724 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 },
2725 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 10 },
2726 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 10 },
2727 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 18 },
2728 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
2729 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 10 },
2730
2731 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, 2 },
2732 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f64, 2 },
2733 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v8f32, 2 },
2734 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v4f64, 2 },
2735 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 2 },
2736 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f64, 2 },
2737 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, 2 },
2738 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v4f64, 2 },
2739 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, 2 },
2740 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, 2 },
2741 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 5 },
2742
2743 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v8f32, 2 },
2744 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f64, 2 },
2745 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v8f32, 2 },
2746 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v4f64, 2 },
2747 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 2 },
2748 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f64, 2 },
2749 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, 2 },
2750 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v4f64, 2 },
2751 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 3 },
2752 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
2753 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 6 },
2754 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 7 },
2755 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, 7 },
2756
2757 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 1 },
2758 { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 1 },
2759 };
2760
2761 static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
2762 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, 1 },
2763 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, 1 },
2764 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, 1 },
2765 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, 1 },
2766 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, 1 },
2767 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, 1 },
2768 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, 1 },
2769 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, 1 },
2770 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, 1 },
2771 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, 1 },
2772 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, 1 },
2773 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, 1 },
2774
2775 // These truncates end up widening elements.
2776 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 1 }, // PMOVZXBQ
2777 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 1 }, // PMOVZXWQ
2778 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 1 }, // PMOVZXBD
2779
2780 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 2 },
2781 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 2 },
2782 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 2 },
2783
2784 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 1 },
2785 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 1 },
2786 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 1 },
2787 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 1 },
2788 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 1 },
2789 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
2790 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 1 },
2791 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
2792 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
2793 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 1 },
2794 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
2795
2796 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 1 },
2797 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 1 },
2798 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 4 },
2799 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 4 },
2800 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 1 },
2801 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
2802 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 1 },
2803 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
2804 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 3 },
2805 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 3 },
2806 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 2 },
2807 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 12 },
2808 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 22 },
2809 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 4 },
2810
2811 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 1 },
2812 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 1 },
2813 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 1 },
2814 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 1 },
2815 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, 2 },
2816 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, 2 },
2817 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, 1 },
2818 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, 1 },
2819 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
2820 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, 1 },
2821
2822 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 1 },
2823 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 },
2824 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 1 },
2825 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 4 },
2826 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, 2 },
2827 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, 2 },
2828 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, 1 },
2829 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, 1 },
2830 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 4 },
2831 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
2832 };
2833
2834 static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
2835 // These are somewhat magic numbers justified by comparing the
2836 // output of llvm-mca for our various supported scheduler models
2837 // and basing it off the worst case scenario.
2838 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 3 },
2839 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 3 },
2840 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 3 },
2841 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 3 },
2842 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 3 },
2843 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 4 },
2844 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 3 },
2845 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 4 },
2846 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 3 },
2847 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4 },
2848 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 8 },
2849 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 8 },
2850
2851 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 3 },
2852 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 3 },
2853 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 8 },
2854 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 9 },
2855 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 4 },
2856 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 4 },
2857 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 4 },
2858 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 4 },
2859 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 7 },
2860 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 7 },
2861 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
2862 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 15 },
2863 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 18 },
2864
2865 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 4 },
2866 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 4 },
2867 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 4 },
2868 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 4 },
2869 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, 6 },
2870 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, 6 },
2871 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, 5 },
2872 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, 5 },
2873 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 4 },
2874 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, 4 },
2875
2876 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 4 },
2877 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 },
2878 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 4 },
2879 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 15 },
2880 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, 6 },
2881 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, 6 },
2882 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, 5 },
2883 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, 5 },
2884 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 8 },
2885 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 8 },
2886
2887 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, 4 },
2888 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, 4 },
2889 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, 2 },
2890 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, 3 },
2891 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, 1 },
2892 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, 2 },
2893 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, 2 },
2894 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, 3 },
2895 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, 1 },
2896 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, 2 },
2897 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, 1 },
2898 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, 2 },
2899
2900 // These truncates are really widening elements.
2901 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 1 }, // PSHUFD
2902 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // PUNPCKLWD+DQ
2903 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // PUNPCKLBW+WD+PSHUFD
2904 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 1 }, // PUNPCKLWD
2905 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // PUNPCKLBW+WD
2906 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 1 }, // PUNPCKLBW
2907
2908 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 2 }, // PAND+PACKUSWB
2909 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 },
2910 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 3 }, // PAND+2*PACKUSWB
2911 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 },
2912 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 1 },
2913 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 3 },
2914 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
2915 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32,10 },
2916 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 4 }, // PAND+3*PACKUSWB
2917 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 2 }, // PSHUFD+PSHUFLW
2918 { ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, 1 }, // PSHUFD
2919 };
2920
2921 // Attempt to map directly to (simple) MVT types to let us match custom entries.
2922 EVT SrcTy = TLI->getValueType(DL, Src);
2923 EVT DstTy = TLI->getValueType(DL, Dst);
2924
2925 // The function getSimpleVT only handles simple value types.
2926 if (SrcTy.isSimple() && DstTy.isSimple()) {
2927 MVT SimpleSrcTy = SrcTy.getSimpleVT();
2928 MVT SimpleDstTy = DstTy.getSimpleVT();
2929
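// The lookups below run from the most specific feature level down
// (AVX512BW/DQ -> AVX512 -> AVX2 -> AVX -> SSE4.1 -> SSE2), so a tighter
// cost from a newer ISA extension takes precedence over the generic
// fallback entries that follow it.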
2930 if (ST->useAVX512Regs()) {
2931 if (ST->hasBWI())
2932 if (const auto *Entry = ConvertCostTableLookup(
2933 AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2934 return AdjustCost(Entry->Cost);
2935
2936 if (ST->hasDQI())
2937 if (const auto *Entry = ConvertCostTableLookup(
2938 AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2939 return AdjustCost(Entry->Cost);
2940
2941 if (ST->hasAVX512())
2942 if (const auto *Entry = ConvertCostTableLookup(
2943 AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2944 return AdjustCost(Entry->Cost);
2945 }
2946
2947 if (ST->hasBWI())
2948 if (const auto *Entry = ConvertCostTableLookup(
2949 AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2950 return AdjustCost(Entry->Cost);
2951
2952 if (ST->hasDQI())
2953 if (const auto *Entry = ConvertCostTableLookup(
2954 AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2955 return AdjustCost(Entry->Cost);
2956
2957 if (ST->hasAVX512())
2958 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
2959 SimpleDstTy, SimpleSrcTy))
2960 return AdjustCost(Entry->Cost);
2961
2962 if (ST->hasAVX2()) {
2963 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
2964 SimpleDstTy, SimpleSrcTy))
2965 return AdjustCost(Entry->Cost);
2966 }
2967
2968 if (ST->hasAVX()) {
2969 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
2970 SimpleDstTy, SimpleSrcTy))
2971 return AdjustCost(Entry->Cost);
2972 }
2973
2974 if (ST->hasSSE41()) {
2975 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
2976 SimpleDstTy, SimpleSrcTy))
2977 return AdjustCost(Entry->Cost);
2978 }
2979
2980 if (ST->hasSSE2()) {
2981 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
2982 SimpleDstTy, SimpleSrcTy))
2983 return AdjustCost(Entry->Cost);
2984 }
2985 }
2986
2987 // Fall back to legalized types.
2988 std::pair<InstructionCost, MVT> LTSrc = getTypeLegalizationCost(Src);
2989 std::pair<InstructionCost, MVT> LTDest = getTypeLegalizationCost(Dst);
2990
2991 // If we're truncating to the same legalized type - just assume it's free.
2992 if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second)
2993 return TTI::TCC_Free;
2994
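// Where only the legalized types match an entry, the table cost is scaled by
// std::max(LTSrc.first, LTDest.first) - the larger of the two legalization
// split factors - since an illegal wide vector conversion is performed in
// that many legal-sized pieces.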
2995 if (ST->useAVX512Regs()) {
2996 if (ST->hasBWI())
2997 if (const auto *Entry = ConvertCostTableLookup(
2998 AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second))
2999 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3000
3001 if (ST->hasDQI())
3002 if (const auto *Entry = ConvertCostTableLookup(
3003 AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second))
3004 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3005
3006 if (ST->hasAVX512())
3007 if (const auto *Entry = ConvertCostTableLookup(
3008 AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second))
3009 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3010 }
3011
3012 if (ST->hasBWI())
3013 if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
3014 LTDest.second, LTSrc.second))
3015 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3016
3017 if (ST->hasDQI())
3018 if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
3019 LTDest.second, LTSrc.second))
3020 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3021
3022 if (ST->hasAVX512())
3023 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3024 LTDest.second, LTSrc.second))
3025 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3026
3027 if (ST->hasAVX2())
3028 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3029 LTDest.second, LTSrc.second))
3030 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3031
3032 if (ST->hasAVX())
3033 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3034 LTDest.second, LTSrc.second))
3035 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3036
3037 if (ST->hasSSE41())
3038 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3039 LTDest.second, LTSrc.second))
3040 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3041
3042 if (ST->hasSSE2())
3043 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3044 LTDest.second, LTSrc.second))
3045 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3046
3047 // Fallback: for i8/i16 sitofp/uitofp cases we need to extend the source to
3048 // i32 first and convert from that.
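// As a rough illustration: sitofp <4 x i16> -> <4 x float> is costed as
// (sext <4 x i16> -> <4 x i32>) + (sitofp <4 x i32> -> <4 x float>); for a
// scalar integer source fed directly by a load the extend is treated as free.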
3049 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3050 1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) {
3051 Type *ExtSrc = Src->getWithNewBitWidth(32);
3052 unsigned ExtOpc =
3053 (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt;
3054
3055 // For scalar loads the extend would be free.
3056 InstructionCost ExtCost = 0;
3057 if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0))))
3058 ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind);
3059
3060 return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc,
3061 TTI::CastContextHint::None, CostKind);
3062 }
3063
3064 // Fallback: for i8/i16 fptosi/fptoui cases we convert to i32 first and then
3065 // truncate to the destination type.
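// E.g. fptoui <4 x float> -> <4 x i8> is modelled as
// (fptosi <4 x float> -> <4 x i32>) + (trunc <4 x i32> -> <4 x i8>).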
3066 if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) &&
3067 1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) {
3068 Type *TruncDst = Dst->getWithNewBitWidth(32);
3069 return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) +
3070 getCastInstrCost(Instruction::Trunc, Dst, TruncDst,
3071 TTI::CastContextHint::None, CostKind);
3072 }
3073
3074 return AdjustCost(
3075 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3076}
3077
3078 InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
3079 Type *CondTy,
3080 CmpInst::Predicate VecPred,
3081 TTI::TargetCostKind CostKind,
3082 const Instruction *I) {
3083 // Early out if this type isn't scalar/vector integer/float.
3084 if (!(ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy()))
3085 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3086 I);
3087
3088 // Legalize the type.
3089 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3090
3091 MVT MTy = LT.second;
3092
3093 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3094 assert(ISD && "Invalid opcode");
3095
3096 InstructionCost ExtraCost = 0;
3097 if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
3098 // Some vector comparison predicates cost extra instructions.
3099 // TODO: Adjust ExtraCost based on CostKind?
3100 // TODO: Should we invert this and assume worst case cmp costs
3101 // and reduce for particular predicates?
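// As an example of the expansions costed below: without AVX512/XOP an
// ICMP_UGT of two <16 x i8> variables is lowered roughly as
// cmpgt(xor(x,signbit),xor(y,signbit)), i.e. the base SETCC cost plus an
// ExtraCost of 2 (1 when comparing against a constant, since one of the
// sign-bit flips folds away).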
3102 if (MTy.isVector() &&
3103 !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
3104 (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
3105 ST->hasBWI())) {
3106 // Fall back to I if a specific predicate wasn't specified.
3107 CmpInst::Predicate Pred = VecPred;
3108 if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE ||
3109 Pred == CmpInst::BAD_FCMP_PREDICATE))
3110 Pred = cast<CmpInst>(I)->getPredicate();
3111
3112 bool CmpWithConstant = false;
3113 if (auto *CmpInstr = dyn_cast_or_null<CmpInst>(I))
3114 CmpWithConstant = isa<Constant>(CmpInstr->getOperand(1));
3115
3116 switch (Pred) {
3117 case CmpInst::Predicate::ICMP_NE:
3118 // xor(cmpeq(x,y),-1)
3119 ExtraCost = CmpWithConstant ? 0 : 1;
3120 break;
3121 case CmpInst::Predicate::ICMP_SGE:
3122 case CmpInst::Predicate::ICMP_SLE:
3123 // xor(cmpgt(x,y),-1)
3124 ExtraCost = CmpWithConstant ? 0 : 1;
3125 break;
3126 case CmpInst::Predicate::ICMP_ULT:
3127 case CmpInst::Predicate::ICMP_UGT:
3128 // cmpgt(xor(x,signbit),xor(y,signbit))
3129 // xor(cmpeq(pmaxu(x,y),x),-1)
3130 ExtraCost = CmpWithConstant ? 1 : 2;
3131 break;
3132 case CmpInst::Predicate::ICMP_ULE:
3133 case CmpInst::Predicate::ICMP_UGE:
3134 if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
3135 (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
3136 // cmpeq(psubus(x,y),0)
3137 // cmpeq(pminu(x,y),x)
3138 ExtraCost = 1;
3139 } else {
3140 // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
3141 ExtraCost = CmpWithConstant ? 2 : 3;
3142 }
3143 break;
3144 case CmpInst::Predicate::FCMP_ONE:
3145 case CmpInst::Predicate::FCMP_UEQ:
3146 // Without AVX we need to expand FCMP_ONE/FCMP_UEQ cases.
3147 // Use FCMP_UEQ expansion - FCMP_ONE should be the same.
3148 if (CondTy && !ST->hasAVX())
3149 return getCmpSelInstrCost(Opcode, ValTy, CondTy,
3150 CmpInst::Predicate::FCMP_UNO, CostKind) +
3151 getCmpSelInstrCost(Opcode, ValTy, CondTy,
3152 CmpInst::Predicate::FCMP_OEQ, CostKind) +
3153 getArithmeticInstrCost(Instruction::Or, CondTy, CostKind);
3154
3155 break;
3156 case CmpInst::Predicate::BAD_ICMP_PREDICATE:
3157 case CmpInst::Predicate::BAD_FCMP_PREDICATE:
3158 // Assume worst case scenario and add the maximum extra cost.
3159 ExtraCost = 3;
3160 break;
3161 default:
3162 break;
3163 }
3164 }
3165 }
3166
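// In the CostKindTblEntry tables below, the four values of each entry are the
// { RecipThroughput, Latency, CodeSize, SizeAndLatency } costs, selected by
// the requested TargetCostKind.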
3167 static const CostKindTblEntry SLMCostTbl[] = {
3168 // slm pcmpeq/pcmpgt throughput is 2
3169 { ISD::SETCC, MVT::v2i64, { 2, 5, 1, 2 } },
3170 // slm pblendvb/blendvpd/blendvps throughput is 4
3171 { ISD::SELECT, MVT::v2f64, { 4, 4, 1, 3 } }, // vblendvpd
3172 { ISD::SELECT, MVT::v4f32, { 4, 4, 1, 3 } }, // vblendvps
3173 { ISD::SELECT, MVT::v2i64, { 4, 4, 1, 3 } }, // pblendvb
3174 { ISD::SELECT, MVT::v8i32, { 4, 4, 1, 3 } }, // pblendvb
3175 { ISD::SELECT, MVT::v8i16, { 4, 4, 1, 3 } }, // pblendvb
3176 { ISD::SELECT, MVT::v16i8, { 4, 4, 1, 3 } }, // pblendvb
3177 };
3178
3179 static const CostKindTblEntry AVX512BWCostTbl[] = {
3180 { ISD::SETCC, MVT::v32i16, { 1, 1, 1, 1 } },
3181 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 1 } },
3182 { ISD::SETCC, MVT::v64i8, { 1, 1, 1, 1 } },
3183 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 1 } },
3184
3185 { ISD::SELECT, MVT::v32i16, { 1, 1, 1, 1 } },
3186 { ISD::SELECT, MVT::v64i8, { 1, 1, 1, 1 } },
3187 };
3188
3189 static const CostKindTblEntry AVX512CostTbl[] = {
3190 { ISD::SETCC, MVT::v8f64, { 1, 4, 1, 1 } },
3191 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 1 } },
3192 { ISD::SETCC, MVT::v16f32, { 1, 4, 1, 1 } },
3193 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 1 } },
3194
3195 { ISD::SETCC, MVT::v8i64, { 1, 1, 1, 1 } },
3196 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 1 } },
3197 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3198 { ISD::SETCC, MVT::v16i32, { 1, 1, 1, 1 } },
3199 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 1 } },
3200 { ISD::SETCC, MVT::v32i16, { 3, 7, 5, 5 } },
3201 { ISD::SETCC, MVT::v64i8, { 3, 7, 5, 5 } },
3202
3203 { ISD::SELECT, MVT::v8i64, { 1, 1, 1, 1 } },
3204 { ISD::SELECT, MVT::v4i64, { 1, 1, 1, 1 } },
3205 { ISD::SELECT, MVT::v2i64, { 1, 1, 1, 1 } },
3206 { ISD::SELECT, MVT::v16i32, { 1, 1, 1, 1 } },
3207 { ISD::SELECT, MVT::v8i32, { 1, 1, 1, 1 } },
3208 { ISD::SELECT, MVT::v4i32, { 1, 1, 1, 1 } },
3209 { ISD::SELECT, MVT::v8f64, { 1, 1, 1, 1 } },
3210 { ISD::SELECT, MVT::v4f64, { 1, 1, 1, 1 } },
3211 { ISD::SELECT, MVT::v2f64, { 1, 1, 1, 1 } },
3212 { ISD::SELECT, MVT::f64, { 1, 1, 1, 1 } },
3213 { ISD::SELECT, MVT::v16f32, { 1, 1, 1, 1 } },
3214 { ISD::SELECT, MVT::v8f32 , { 1, 1, 1, 1 } },
3215 { ISD::SELECT, MVT::v4f32, { 1, 1, 1, 1 } },
3216 { ISD::SELECT, MVT::f32 , { 1, 1, 1, 1 } },
3217
3218 { ISD::SELECT, MVT::v32i16, { 2, 2, 4, 4 } },
3219 { ISD::SELECT, MVT::v16i16, { 1, 1, 1, 1 } },
3220 { ISD::SELECT, MVT::v8i16, { 1, 1, 1, 1 } },
3221 { ISD::SELECT, MVT::v64i8, { 2, 2, 4, 4 } },
3222 { ISD::SELECT, MVT::v32i8, { 1, 1, 1, 1 } },
3223 { ISD::SELECT, MVT::v16i8, { 1, 1, 1, 1 } },
3224 };
3225
3226 static const CostKindTblEntry AVX2CostTbl[] = {
3227 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 2 } },
3228 { ISD::SETCC, MVT::v2f64, { 1, 4, 1, 1 } },
3229 { ISD::SETCC, MVT::f64, { 1, 4, 1, 1 } },
3230 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 2 } },
3231 { ISD::SETCC, MVT::v4f32, { 1, 4, 1, 1 } },
3232 { ISD::SETCC, MVT::f32, { 1, 4, 1, 1 } },
3233
3234 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 2 } },
3235 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 2 } },
3236 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 2 } },
3237 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 2 } },
3238
3239 { ISD::SELECT, MVT::v4f64, { 2, 2, 1, 2 } }, // vblendvpd
3240 { ISD::SELECT, MVT::v8f32, { 2, 2, 1, 2 } }, // vblendvps
3241 { ISD::SELECT, MVT::v4i64, { 2, 2, 1, 2 } }, // pblendvb
3242 { ISD::SELECT, MVT::v8i32, { 2, 2, 1, 2 } }, // pblendvb
3243 { ISD::SELECT, MVT::v16i16, { 2, 2, 1, 2 } }, // pblendvb
3244 { ISD::SELECT, MVT::v32i8, { 2, 2, 1, 2 } }, // pblendvb
3245 };
3246
3247 static const CostKindTblEntry XOPCostTbl[] = {
3248 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3249 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3250 };
3251
3252 static const CostKindTblEntry AVX1CostTbl[] = {
3253 { ISD::SETCC, MVT::v4f64, { 2, 3, 1, 2 } },
3254 { ISD::SETCC, MVT::v2f64, { 1, 3, 1, 1 } },
3255 { ISD::SETCC, MVT::f64, { 1, 3, 1, 1 } },
3256 { ISD::SETCC, MVT::v8f32, { 2, 3, 1, 2 } },
3257 { ISD::SETCC, MVT::v4f32, { 1, 3, 1, 1 } },
3258 { ISD::SETCC, MVT::f32, { 1, 3, 1, 1 } },
3259
3260 // AVX1 does not support 8-wide integer compare.
3261 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3262 { ISD::SETCC, MVT::v8i32, { 4, 2, 5, 6 } },
3263 { ISD::SETCC, MVT::v16i16, { 4, 2, 5, 6 } },
3264 { ISD::SETCC, MVT::v32i8, { 4, 2, 5, 6 } },
3265
3266 { ISD::SELECT, MVT::v4f64, { 3, 3, 1, 2 } }, // vblendvpd
3267 { ISD::SELECT, MVT::v8f32, { 3, 3, 1, 2 } }, // vblendvps
3268 { ISD::SELECT, MVT::v4i64, { 3, 3, 1, 2 } }, // vblendvpd
3269 { ISD::SELECT, MVT::v8i32, { 3, 3, 1, 2 } }, // vblendvps
3270 { ISD::SELECT, MVT::v16i16, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3271 { ISD::SELECT, MVT::v32i8, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3272 };
3273
3274 static const CostKindTblEntry SSE42CostTbl[] = {
3275 { ISD::SETCC, MVT::v2i64, { 1, 2, 1, 2 } },
3276 };
3277
3278 static const CostKindTblEntry SSE41CostTbl[] = {
3279 { ISD::SETCC, MVT::v2f64, { 1, 5, 1, 1 } },
3280 { ISD::SETCC, MVT::v4f32, { 1, 5, 1, 1 } },
3281
3282 { ISD::SELECT, MVT::v2f64, { 2, 2, 1, 2 } }, // blendvpd
3283 { ISD::SELECT, MVT::f64, { 2, 2, 1, 2 } }, // blendvpd
3284 { ISD::SELECT, MVT::v4f32, { 2, 2, 1, 2 } }, // blendvps
3285 { ISD::SELECT, MVT::f32 , { 2, 2, 1, 2 } }, // blendvps
3286 { ISD::SELECT, MVT::v2i64, { 2, 2, 1, 2 } }, // pblendvb
3287 { ISD::SELECT, MVT::v4i32, { 2, 2, 1, 2 } }, // pblendvb
3288 { ISD::SELECT, MVT::v8i16, { 2, 2, 1, 2 } }, // pblendvb
3289 { ISD::SELECT, MVT::v16i8, { 2, 2, 1, 2 } }, // pblendvb
3290 };
3291
3292 static const CostKindTblEntry SSE2CostTbl[] = {
3293 { ISD::SETCC, MVT::v2f64, { 2, 5, 1, 1 } },
3294 { ISD::SETCC, MVT::f64, { 1, 5, 1, 1 } },
3295
3296 { ISD::SETCC, MVT::v2i64, { 5, 4, 5, 5 } }, // pcmpeqd/pcmpgtd expansion
3297 { ISD::SETCC, MVT::v4i32, { 1, 1, 1, 1 } },
3298 { ISD::SETCC, MVT::v8i16, { 1, 1, 1, 1 } },
3299 { ISD::SETCC, MVT::v16i8, { 1, 1, 1, 1 } },
3300
3301 { ISD::SELECT, MVT::v2f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3302 { ISD::SELECT, MVT::f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3303 { ISD::SELECT, MVT::v2i64, { 2, 2, 3, 3 } }, // pand + pandn + por
3304 { ISD::SELECT, MVT::v4i32, { 2, 2, 3, 3 } }, // pand + pandn + por
3305 { ISD::SELECT, MVT::v8i16, { 2, 2, 3, 3 } }, // pand + pandn + por
3306 { ISD::SELECT, MVT::v16i8, { 2, 2, 3, 3 } }, // pand + pandn + por
3307 };
3308
3309 static const CostKindTblEntry SSE1CostTbl[] = {
3310 { ISD::SETCC, MVT::v4f32, { 2, 5, 1, 1 } },
3311 { ISD::SETCC, MVT::f32, { 1, 5, 1, 1 } },
3312
3313 { ISD::SELECT, MVT::v4f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3314 { ISD::SELECT, MVT::f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3315 };
3316
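// Each lookup below returns LT.first * (ExtraCost + KindCost): the entry cost
// for the requested cost kind, plus any predicate-expansion overhead, scaled
// by the number of legal-sized pieces the type legalizes into.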
3317 if (ST->useSLMArithCosts())
3318 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
3319 if (auto KindCost = Entry->Cost[CostKind])
3320 return LT.first * (ExtraCost + *KindCost);
3321
3322 if (ST->hasBWI())
3323 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
3324 if (auto KindCost = Entry->Cost[CostKind])
3325 return LT.first * (ExtraCost + *KindCost);
3326
3327 if (ST->hasAVX512())
3328 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
3329 if (auto KindCost = Entry->Cost[CostKind])
3330 return LT.first * (ExtraCost + *KindCost);
3331
3332 if (ST->hasAVX2())
3333 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
3334 if (auto KindCost = Entry->Cost[CostKind])
3335 return LT.first * (ExtraCost + *KindCost);
3336
3337 if (ST->hasXOP())
3338 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
3339 if (auto KindCost = Entry->Cost[CostKind])
3340 return LT.first * (ExtraCost + *KindCost);
3341
3342 if (ST->hasAVX())
3343 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
3344 if (auto KindCost = Entry->Cost[CostKind])
3345 return LT.first * (ExtraCost + *KindCost);
3346
3347 if (ST->hasSSE42())
3348 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
3349 if (auto KindCost = Entry->Cost[CostKind])
3350 return LT.first * (ExtraCost + *KindCost);
3351
3352 if (ST->hasSSE41())
3353 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
3354 if (auto KindCost = Entry->Cost[CostKind])
3355 return LT.first * (ExtraCost + *KindCost);
3356
3357 if (ST->hasSSE2())
3358 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
3359 if (auto KindCost = Entry->Cost[CostKind])
3360 return LT.first * (ExtraCost + *KindCost);
3361
3362 if (ST->hasSSE1())
3363 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
3364 if (auto KindCost = Entry->Cost[CostKind])
3365 return LT.first * (ExtraCost + *KindCost);
3366
3367 // Assume a 3cy latency for fp select ops.
3368 if (CostKind == TTI::TCK_Latency && Opcode == Instruction::Select)
3369 if (ValTy->getScalarType()->isFloatingPointTy())
3370 return 3;
3371
3372 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
3373}
3374
3376
3377 InstructionCost
3378 X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
3379 TTI::TargetCostKind CostKind) {
3380 // Costs should match the codegen from:
3381 // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
3382 // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
3383 // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
3384 // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
3385 // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
3386
3387 // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
3388 // specialized in these tables yet.
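// The unit costs in the VBMI2 table below presumably correspond to the single
// vpshldv/vpshrdv-style funnel-shift instructions introduced by AVX512-VBMI2.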
3389 static const CostKindTblEntry AVX512VBMI2CostTbl[] = {
3390 { ISD::FSHL, MVT::v8i64, { 1, 1, 1, 1 } },
3391 { ISD::FSHL, MVT::v4i64, { 1, 1, 1, 1 } },
3392 { ISD::FSHL, MVT::v2i64, { 1, 1, 1, 1 } },
3393 { ISD::FSHL, MVT::v16i32, { 1, 1, 1, 1 } },
3394 { ISD::FSHL, MVT::v8i32, { 1, 1, 1, 1 } },
3395 { ISD::FSHL, MVT::v4i32, { 1, 1, 1, 1 } },
3396 { ISD::FSHL, MVT::v32i16, { 1, 1, 1, 1 } },
3397 { ISD::FSHL, MVT::v16i16, { 1, 1, 1, 1 } },
3398 { ISD::FSHL, MVT::v8i16, { 1, 1, 1, 1 } },
3399 { ISD::ROTL, MVT::v32i16, { 1, 1, 1, 1 } },
3400 { ISD::ROTL, MVT::v16i16, { 1, 1, 1, 1 } },
3401 { ISD::ROTL, MVT::v8i16, { 1, 1, 1, 1 } },
3402 { ISD::ROTR, MVT::v32i16, { 1, 1, 1, 1 } },
3403 { ISD::ROTR, MVT::v16i16, { 1, 1, 1, 1 } },
3404 { ISD::ROTR, MVT::v8i16, { 1, 1, 1, 1 } },
3405 { X86ISD::VROTLI, MVT::v32i16, { 1, 1, 1, 1 } },
3406 { X86ISD::VROTLI, MVT::v16i16, { 1, 1, 1, 1 } },
3407 { X86ISD::VROTLI, MVT::v8i16, { 1, 1, 1, 1 } },
3408 };
3409 static const CostKindTblEntry AVX512BITALGCostTbl[] = {
3410 { ISD::CTPOP, MVT::v32i16, { 1, 1, 1, 1 } },
3411 { ISD::CTPOP, MVT::v64i8, { 1, 1, 1, 1 } },
3412 { ISD::CTPOP, MVT::v16i16, { 1, 1, 1, 1 } },
3413 { ISD::CTPOP, MVT::v32i8, { 1, 1, 1, 1 } },
3414 { ISD::CTPOP, MVT::v8i16, { 1, 1, 1, 1 } },
3415 { ISD::CTPOP, MVT::v16i8, { 1, 1, 1, 1 } },
3416 };
3417 static const CostKindTblEntry AVX512VPOPCNTDQCostTbl[] = {
3418 { ISD::CTPOP, MVT::v8i64, { 1, 1, 1, 1 } },
3419 { ISD::CTPOP, MVT::v16i32, { 1, 1, 1, 1 } },
3420 { ISD::CTPOP, MVT::v4i64, { 1, 1, 1, 1 } },
3421 { ISD::CTPOP, MVT::v8i32, { 1, 1, 1, 1 } },
3422 { ISD::CTPOP, MVT::v2i64, { 1, 1, 1, 1 } },
3423 { ISD::CTPOP, MVT::v4i32, { 1, 1, 1, 1 } },
3424 };
3425 static const CostKindTblEntry AVX512CDCostTbl[] = {
3426 { ISD::CTLZ, MVT::v8i64, { 1, 5, 1, 1 } },
3427 { ISD::CTLZ, MVT::v16i32, { 1, 5, 1, 1 } },
3428 { ISD::CTLZ, MVT::v32i16, { 18, 27, 23, 27 } },
3429 { ISD::CTLZ, MVT::v64i8, { 3, 16, 9, 11 } },
3430 { ISD::CTLZ, MVT::v4i64, { 1, 5, 1, 1 } },
3431 { ISD::CTLZ, MVT::v8i32, { 1, 5, 1, 1 } },
3432 { ISD::CTLZ, MVT::v16i16, { 8, 19, 11, 13 } },
3433 { ISD::CTLZ, MVT::v32i8, { 2, 11, 9, 10 } },
3434 { ISD::CTLZ, MVT::v2i64, { 1, 5, 1, 1 } },
3435 { ISD::CTLZ, MVT::v4i32, { 1, 5, 1, 1 } },
3436 { ISD::CTLZ, MVT::v8i16, { 3, 15, 4, 6 } },
3437 { ISD::CTLZ, MVT::v16i8, { 2, 10, 9, 10 } },
3438
3439 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3440 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3441 { ISD::CTTZ, MVT::v4i64, { 1, 8, 6, 6 } },
3442 { ISD::CTTZ, MVT::v8i32, { 1, 8, 6, 6 } },
3443 { ISD::CTTZ, MVT::v2i64, { 1, 8, 6, 6 } },
3444 { ISD::CTTZ, MVT::v4i32, { 1, 8, 6, 6 } },
3445 };
3446 static const CostKindTblEntry AVX512BWCostTbl[] = {
3447 { ISD::ABS, MVT::v32i16, { 1, 1, 1, 1 } },
3448 { ISD::ABS, MVT::v64i8, { 1, 1, 1, 1 } },
3449 { ISD::BITREVERSE, MVT::v2i64, { 3, 10, 10, 11 } },
3450 { ISD::BITREVERSE, MVT::v4i64, { 3, 11, 10, 11 } },
3451 { ISD::BITREVERSE, MVT::v8i64, { 3, 12, 10, 14 } },
3452 { ISD::BITREVERSE, MVT::v4i32, { 3, 10, 10, 11 } },
3453 { ISD::BITREVERSE, MVT::v8i32, { 3, 11, 10, 11 } },
3454 { ISD::BITREVERSE, MVT::v16i32, { 3, 12, 10, 14 } },
3455 { ISD::BITREVERSE, MVT::v8i16, { 3, 10, 10, 11 } },
3456 { ISD::BITREVERSE, MVT::v16i16, { 3, 11, 10, 11 } },
3457 { ISD::BITREVERSE, MVT::v32i16, { 3, 12, 10, 14 } },
3458 { ISD::BITREVERSE, MVT::v16i8, { 2, 5, 9, 9 } },
3459 { ISD::BITREVERSE, MVT::v32i8, { 2, 5, 9, 9 } },
3460 { ISD::BITREVERSE, MVT::v64i8, { 2, 5, 9, 12 } },
3461 { ISD::BSWAP, MVT::v2i64, { 1, 1, 1, 2 } },
3462 { ISD::BSWAP, MVT::v4i64, { 1, 1, 1, 2 } },
3463 { ISD::BSWAP, MVT::v8i64, { 1, 1, 1, 2 } },
3464 { ISD::BSWAP, MVT::v4i32, { 1, 1, 1, 2 } },
3465 { ISD::BSWAP, MVT::v8i32, { 1, 1, 1, 2 } },
3466 { ISD::BSWAP, MVT::v16i32, { 1, 1, 1, 2 } },
3467 { ISD::BSWAP, MVT::v8i16, { 1, 1, 1, 2 } },
3468 { ISD::BSWAP, MVT::v16i16, { 1, 1, 1, 2 } },
3469 { ISD::BSWAP, MVT::v32i16, { 1, 1, 1, 2 } },
3470 { ISD::CTLZ, MVT::v8i64, { 8, 22, 23, 23 } },
3471 { ISD::CTLZ, MVT::v16i32, { 8, 23, 25, 25 } },
3472 { ISD::CTLZ, MVT::v32i16, { 4, 15, 15, 16 } },
3473 { ISD::CTLZ, MVT::v64i8, { 3, 12, 10, 9 } },
3474 { ISD::CTPOP, MVT::v2i64, { 3, 7, 10, 10 } },
3475 { ISD::CTPOP, MVT::v4i64, { 3, 7, 10, 10 } },
3476 { ISD::CTPOP, MVT::v8i64, { 3, 8, 10, 12 } },
3477 { ISD::CTPOP, MVT::v4i32, { 7, 11, 14, 14 } },
3478 { ISD::CTPOP, MVT::v8i32, { 7, 11, 14, 14 } },
3479 { ISD::CTPOP, MVT::v16i32, { 7, 12, 14, 16 } },
3480 { ISD::CTPOP, MVT::v8i16, { 2, 7, 11, 11 } },
3481 { ISD::CTPOP, MVT::v16i16, { 2, 7, 11, 11 } },
3482 { ISD::CTPOP, MVT::v32i16, { 3, 7, 11, 13 } },
3483 { ISD::CTPOP, MVT::v16i8, { 2, 4, 8, 8 } },
3484 { ISD::CTPOP, MVT::v32i8, { 2, 4, 8, 8 } },
3485 { ISD::CTPOP, MVT::v64i8, { 2, 5, 8, 10 } },
3486 { ISD::CTTZ, MVT::v8i16, { 3, 9, 14, 14 } },
3487 { ISD::CTTZ, MVT::v16i16, { 3, 9, 14, 14 } },
3488 { ISD::CTTZ, MVT::v32i16, { 3, 10, 14, 16 } },
3489 { ISD::CTTZ, MVT::v16i8, { 2, 6, 11, 11 } },
3490 { ISD::CTTZ, MVT::v32i8, { 2, 6, 11, 11 } },
3491 { ISD::CTTZ, MVT::v64i8, { 3, 7, 11, 13 } },
3492 { ISD::ROTL, MVT::v32i16, { 2, 8, 6, 8 } },
3493 { ISD::ROTL, MVT::v16i16, { 2, 8, 6, 7 } },
3494 { ISD::ROTL, MVT::v8i16, { 2, 7, 6, 7 } },
3495 { ISD::ROTL, MVT::v64i8, { 5, 6, 11, 12 } },
3496 { ISD::ROTL, MVT::v32i8, { 5, 15, 7, 10 } },
3497 { ISD::ROTL, MVT::v16i8, { 5, 15, 7, 10 } },
3498 { ISD::ROTR, MVT::v32i16, { 2, 8, 6, 8 } },
3499 { ISD::ROTR, MVT::v16i16, { 2, 8, 6, 7 } },
3500 { ISD::ROTR, MVT::v8i16, { 2, 7, 6, 7 } },
3501 { ISD::ROTR, MVT::v64i8, { 5, 6, 12, 14 } },
3502 { ISD::ROTR, MVT::v32i8, { 5, 14, 6, 9 } },
3503 { ISD::ROTR, MVT::v16i8, { 5, 14, 6, 9 } },
3504 { X86ISD::VROTLI, MVT::v32i16, { 2, 5, 3, 3 } },
3505 { X86ISD::VROTLI, MVT::v16i16, { 1, 5, 3, 3 } },
3506 { X86ISD::VROTLI, MVT::v8i16, { 1, 5, 3, 3 } },
3507 { X86ISD::VROTLI, MVT::v64i8, { 2, 9, 3, 4 } },
3508 { X86ISD::VROTLI, MVT::v32i8, { 1, 9, 3, 4 } },
3509 { X86ISD::VROTLI, MVT::v16i8, { 1, 8, 3, 4 } },
3510 { ISD::SADDSAT, MVT::v32i16, { 1 } },
3511 { ISD::SADDSAT, MVT::v64i8, { 1 } },
3512 { ISD::SMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3513 { ISD::SMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3514 { ISD::SMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3515 { ISD::SMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3516 { ISD::SSUBSAT, MVT::v32i16, { 1 } },
3517 { ISD::SSUBSAT, MVT::v64i8, { 1 } },
3518 { ISD::UADDSAT, MVT::v32i16, { 1 } },
3519 { ISD::UADDSAT, MVT::v64i8, { 1 } },
3520 { ISD::UMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3521 { ISD::UMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3522 { ISD::UMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3523 { ISD::UMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3524 { ISD::USUBSAT, MVT::v32i16, { 1 } },
3525 { ISD::USUBSAT, MVT::v64i8, { 1 } },
3526 };
3527 static const CostKindTblEntry AVX512CostTbl[] = {
3528 { ISD::ABS, MVT::v8i64, { 1, 1, 1, 1 } },
3529 { ISD::ABS, MVT::v4i64, { 1, 1, 1, 1 } },
3530 { ISD::ABS, MVT::v2i64, { 1, 1, 1, 1 } },
3531 { ISD::ABS, MVT::v16i32, { 1, 1, 1, 1 } },
3532 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 1 } },
3533 { ISD::ABS, MVT::v32i16, { 2, 7, 4, 4 } },
3534 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 1 } },
3535 { ISD::ABS, MVT::v64i8, { 2, 7, 4, 4 } },
3536 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 1 } },
3537 { ISD::BITREVERSE, MVT::v8i64, { 9, 13, 20, 20 } },
3538 { ISD::BITREVERSE, MVT::v16i32, { 9, 13, 20, 20 } },
3539 { ISD::BITREVERSE, MVT::v32i16, { 9, 13, 20, 20 } },
3540 { ISD::BITREVERSE, MVT::v64i8, { 6, 11, 17, 17 } },
3541 { ISD::BSWAP, MVT::v8i64, { 4, 7, 5, 5 } },
3542 { ISD::BSWAP, MVT::v16i32, { 4, 7, 5, 5 } },
3543 { ISD::BSWAP, MVT::v32i16, { 4, 7, 5, 5 } },
3544 { ISD::CTLZ, MVT::v8i64, { 10, 28, 32, 32 } },
3545 { ISD::CTLZ, MVT::v16i32, { 12, 30, 38, 38 } },
3546 { ISD::CTLZ, MVT::v32i16, { 8, 15, 29, 29 } },
3547 { ISD::CTLZ, MVT::v64i8, { 6, 11, 19, 19 } },
3548 { ISD::CTPOP, MVT::v8i64, { 16, 16, 19, 19 } },
3549 { ISD::CTPOP, MVT::v16i32, { 24, 19, 27, 27 } },
3550 { ISD::CTPOP, MVT::v32i16, { 18, 15, 22, 22 } },
3551 { ISD::CTPOP, MVT::v64i8, { 12, 11, 16, 16 } },
3552 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3553 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3554 { ISD::CTTZ, MVT::v32i16, { 7, 17, 27, 27 } },
3555 { ISD::CTTZ, MVT::v64i8, { 6, 13, 21, 21 } },
3556 { ISD::ROTL, MVT::v8i64, { 1, 1, 1, 1 } },
3557 { ISD::ROTL, MVT::v4i64, { 1, 1, 1, 1 } },
3558 { ISD::ROTL, MVT::v2i64, { 1, 1, 1, 1 } },
3559 { ISD::ROTL, MVT::v16i32, { 1, 1, 1, 1 } },
3560 { ISD::ROTL, MVT::v8i32, { 1, 1, 1, 1 } },
3561 { ISD::ROTL, MVT::v4i32, { 1, 1, 1, 1 } },
3562 { ISD::ROTR, MVT::v8i64, { 1, 1, 1, 1 } },
3563 { ISD::ROTR, MVT::v4i64, { 1, 1, 1, 1 } },
3564 { ISD::ROTR, MVT::v2i64, { 1, 1, 1, 1 } },
3565 { ISD::ROTR, MVT::v16i32, { 1, 1, 1, 1 } },
3566 { ISD::ROTR, MVT::v8i32, { 1, 1, 1, 1 } },
3567 { ISD::ROTR, MVT::v4i32, { 1, 1, 1, 1 } },
3568 { X86ISD::VROTLI, MVT::v8i64, { 1, 1, 1, 1 } },
3569 { X86ISD::VROTLI, MVT::v4i64, { 1, 1, 1, 1 } },
3570 { X86ISD::VROTLI, MVT::v2i64, { 1, 1, 1, 1 } },
3571 { X86ISD::VROTLI, MVT::v16i32, { 1, 1, 1, 1 } },
3572 { X86ISD::VROTLI, MVT::v8i32, { 1, 1, 1, 1 } },
3573 { X86ISD::VROTLI, MVT::v4i32, { 1, 1, 1, 1 } },
3574 { ISD::SMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3575 { ISD::SMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3576 { ISD::SMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3577 { ISD::SMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3578 { ISD::SMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3579 { ISD::SMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3580 { ISD::SMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3581 { ISD::SMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3582 { ISD::SMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3583 { ISD::SMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3584 { ISD::SMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3585 { ISD::SMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3586 { ISD::UMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3587 { ISD::UMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3588 { ISD::UMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3589 { ISD::UMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3590 { ISD::UMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3591 { ISD::UMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3592 { ISD::UMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3593 { ISD::UMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3594 { ISD::UMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3595 { ISD::UMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3596 { ISD::UMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3597 { ISD::UMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3598 { ISD::USUBSAT, MVT::v16i32, { 2 } }, // pmaxud + psubd
3599 { ISD::USUBSAT, MVT::v2i64, { 2 } }, // pmaxuq + psubq
3600 { ISD::USUBSAT, MVT::v4i64, { 2 } }, // pmaxuq + psubq
3601 { ISD::USUBSAT, MVT::v8i64, { 2 } }, // pmaxuq + psubq
3602 { ISD::UADDSAT, MVT::v16i32, { 3 } }, // not + pminud + paddd
3603 { ISD::UADDSAT, MVT::v2i64, { 3 } }, // not + pminuq + paddq
3604 { ISD::UADDSAT, MVT::v4i64, { 3 } }, // not + pminuq + paddq
3605 { ISD::UADDSAT, MVT::v8i64, { 3 } }, // not + pminuq + paddq
3606 { ISD::SADDSAT, MVT::v32i16, { 2 } },
3607 { ISD::SADDSAT, MVT::v64i8, { 2 } },
3608 { ISD::SSUBSAT, MVT::v32i16, { 2 } },
3609 { ISD::SSUBSAT, MVT::v64i8, { 2 } },
3610 { ISD::UADDSAT, MVT::v32i16, { 2 } },
3611 { ISD::UADDSAT, MVT::v64i8, { 2 } },
3612 { ISD::USUBSAT, MVT::v32i16, { 2 } },
3613 { ISD::USUBSAT, MVT::v64i8, { 2 } },
3614 { ISD::FMAXNUM, MVT::f32, { 2, 2, 3, 3 } },
3615 { ISD::FMAXNUM, MVT::v4f32, { 1, 1, 3, 3 } },
3616 { ISD::FMAXNUM, MVT::v8f32, { 2, 2, 3, 3 } },
3617 { ISD::FMAXNUM, MVT::v16f32, { 4, 4, 3, 3 } },
3618 { ISD::FMAXNUM, MVT::f64, { 2, 2, 3, 3 } },
3619 { ISD::FMAXNUM, MVT::v2f64, { 1, 1, 3, 3 } },
3620 { ISD::FMAXNUM, MVT::v4f64, { 2, 2, 3, 3 } },
3621 { ISD::FMAXNUM, MVT::v8f64, { 3, 3, 3, 3 } },
3622 { ISD::FSQRT, MVT::f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3623 { ISD::FSQRT, MVT::v4f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3624 { ISD::FSQRT, MVT::v8f32, { 6, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3625 { ISD::FSQRT, MVT::v16f32, { 12, 20, 1, 3 } }, // Skylake from http://www.agner.org/
3626 { ISD::FSQRT, MVT::f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3627 { ISD::FSQRT, MVT::v2f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3628 { ISD::FSQRT, MVT::v4f64, { 12, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3629 { ISD::FSQRT, MVT::v8f64, { 24, 32, 1, 3 } }, // Skylake from http://www.agner.org/
3630 };
3631 static const CostKindTblEntry XOPCostTbl[] = {
3632 { ISD::BITREVERSE, MVT::v4i64, { 3, 6, 5, 6 } },
3633 { ISD::BITREVERSE, MVT::v8i32, { 3, 6, 5, 6 } },
3634 { ISD::BITREVERSE, MVT::v16i16, { 3, 6, 5, 6 } },
3635 { ISD::BITREVERSE, MVT::v32i8, { 3, 6, 5, 6 } },
3636 { ISD::BITREVERSE, MVT::v2i64, { 2, 7, 1, 1 } },
3637 { ISD::BITREVERSE, MVT::v4i32, { 2, 7, 1, 1 } },
3638 { ISD::BITREVERSE, MVT::v8i16, { 2, 7, 1, 1 } },
3639 { ISD::BITREVERSE, MVT::v16i8, { 2, 7, 1, 1 } },
3640 { ISD::BITREVERSE, MVT::i64, { 2, 2, 3, 4 } },
3641 { ISD::BITREVERSE, MVT::i32, { 2, 2, 3, 4 } },
3642 { ISD::BITREVERSE, MVT::i16, { 2, 2, 3, 4 } },
3643 { ISD::BITREVERSE, MVT::i8, { 2, 2, 3, 4 } },
3644 // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
3645 { ISD::ROTL, MVT::v4i64, { 4, 7, 5, 6 } },
3646 { ISD::ROTL, MVT::v8i32, { 4, 7, 5, 6 } },
3647 { ISD::ROTL, MVT::v16i16, { 4, 7, 5, 6 } },
3648 { ISD::ROTL, MVT::v32i8, { 4, 7, 5, 6 } },
3649 { ISD::ROTL, MVT::v2i64, { 1, 3, 1, 1 } },
3650 { ISD::ROTL, MVT::v4i32, { 1, 3, 1, 1 } },
3651 { ISD::ROTL, MVT::v8i16, { 1, 3, 1, 1 } },
3652 { ISD::ROTL, MVT::v16i8, { 1, 3, 1, 1 } },
3653 { ISD::ROTR, MVT::v4i64, { 4, 7, 8, 9 } },
3654 { ISD::ROTR, MVT::v8i32, { 4, 7, 8, 9 } },
3655 { ISD::ROTR, MVT::v16i16, { 4, 7, 8, 9 } },
3656 { ISD::ROTR, MVT::v32i8, { 4, 7, 8, 9 } },
3657 { ISD::ROTR, MVT::v2i64, { 1, 3, 3, 3 } },
3658 { ISD::ROTR, MVT::v4i32, { 1, 3, 3, 3 } },
3659 { ISD::ROTR, MVT::v8i16, { 1, 3, 3, 3 } },
3660 { ISD::ROTR, MVT::v16i8, { 1, 3, 3, 3 } },
3661 { X86ISD::VROTLI, MVT::v4i64, { 4, 7, 5, 6 } },
3662 { X86ISD::VROTLI, MVT::v8i32, { 4, 7, 5, 6 } },
3663 { X86ISD::VROTLI, MVT::v16i16, { 4, 7, 5, 6 } },
3664 { X86ISD::VROTLI, MVT::v32i8, { 4, 7, 5, 6 } },
3665 { X86ISD::VROTLI, MVT::v2i64, { 1, 3, 1, 1 } },
3666 { X86ISD::VROTLI, MVT::v4i32, { 1, 3, 1, 1 } },
3667 { X86ISD::VROTLI, MVT::v8i16, { 1, 3, 1, 1 } },
3668 { X86ISD::VROTLI, MVT::v16i8, { 1, 3, 1, 1 } },
3669 };
3670 static const CostKindTblEntry AVX2CostTbl[] = {
3671 { ISD::ABS, MVT::v2i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3672 { ISD::ABS, MVT::v4i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3673 { ISD::ABS, MVT::v4i32, { 1, 1, 1, 1 } },
3674 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 2 } },
3675 { ISD::ABS, MVT::v8i16, { 1, 1, 1, 1 } },
3676 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 2 } },
3677 { ISD::ABS, MVT::v16i8, { 1, 1, 1, 1 } },
3678 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 2 } },
3679 { ISD::BITREVERSE, MVT::v2i64, { 3, 11, 10, 11 } },
3680 { ISD::BITREVERSE, MVT::v4i64, { 5, 11, 10, 17 } },
3681 { ISD::BITREVERSE, MVT::v4i32, { 3, 11, 10, 11 } },
3682 { ISD::BITREVERSE, MVT::v8i32, { 5, 11, 10, 17 } },
3683 { ISD::BITREVERSE, MVT::v8i16, { 3, 11, 10, 11 } },
3684 { ISD::BITREVERSE, MVT::v16i16, { 5, 11, 10, 17 } },
3685 { ISD::BITREVERSE, MVT::v16i8, { 3, 6, 9, 9 } },
3686 { ISD::BITREVERSE, MVT::v32i8, { 4, 5, 9, 15 } },
3687 { ISD::BSWAP, MVT::v2i64, { 1, 2, 1, 2 } },
3688 { ISD::BSWAP, MVT::v4i64, { 1, 3, 1, 2 } },
3689 { ISD::BSWAP, MVT::v4i32, { 1, 2, 1, 2 } },
3690 { ISD::BSWAP, MVT::v8i32, { 1, 3, 1, 2 } },
3691 { ISD::BSWAP, MVT::v8i16, { 1, 2, 1, 2 } },
3692 { ISD::BSWAP, MVT::v16i16, { 1, 3, 1, 2 } },
3693 { ISD::CTLZ, MVT::v2i64, { 7, 18, 24, 25 } },
3694 { ISD::CTLZ, MVT::v4i64, { 14, 18, 24, 44 } },
3695 { ISD::CTLZ, MVT::v4i32, { 5, 16, 19, 20 } },
3696 { ISD::CTLZ, MVT::v8i32, { 10, 16, 19, 34 } },
3697 { ISD::CTLZ, MVT::v8i16, { 4, 13, 14, 15 } },
3698 { ISD::CTLZ, MVT::v16i16, { 6, 14, 14, 24 } },
3699 { ISD::CTLZ, MVT::v16i8, { 3, 12, 9, 10 } },
3700 { ISD::CTLZ, MVT::v32i8, { 4, 12, 9, 14 } },
3701 { ISD::CTPOP, MVT::v2i64, { 3, 9, 10, 10 } },
3702 { ISD::CTPOP, MVT::v4i64, { 4, 9, 10, 14 } },
3703 { ISD::CTPOP, MVT::v4i32, { 7, 12, 14, 14 } },
3704 { ISD::CTPOP, MVT::v8i32, { 7, 12, 14, 18 } },
3705 { ISD::CTPOP, MVT::v8i16, { 3, 7, 11, 11 } },
3706 { ISD::CTPOP, MVT::v16i16, { 6, 8, 11, 18 } },
3707 { ISD::CTPOP, MVT::v16i8, { 2, 5, 8, 8 } },
3708 { ISD::CTPOP, MVT::v32i8, { 3, 5, 8, 12 } },
3709 { ISD::CTTZ, MVT::v2i64, { 4, 11, 13, 13 } },
3710 { ISD::CTTZ, MVT::v4i64, { 5, 11, 13, 20 } },
3711 { ISD::CTTZ, MVT::v4i32, { 7, 14, 17, 17 } },
3712 { ISD::CTTZ, MVT::v8i32, { 7, 15, 17, 24 } },
3713 { ISD::CTTZ, MVT::v8i16, { 4, 9, 14, 14 } },
3714 { ISD::CTTZ, MVT::v16i16, { 6, 9, 14, 24 } },
3715 { ISD::CTTZ, MVT::v16i8, { 3, 7, 11, 11 } },
3716 { ISD::CTTZ, MVT::v32i8, { 5, 7, 11, 18 } },
3717 { ISD::SADDSAT, MVT::v16i16, { 1 } },
3718 { ISD::SADDSAT, MVT::v32i8, { 1 } },
3719 { ISD::SMAX, MVT::v2i64, { 2, 7, 2, 3 } },
3720 { ISD::SMAX, MVT::v4i64, { 2, 7, 2, 3 } },
3721 { ISD::SMAX, MVT::v8i32, { 1, 1, 1, 2 } },
3722 { ISD::SMAX, MVT::v16i16, { 1, 1, 1, 2 } },
3723 { ISD::SMAX, MVT::v32i8, { 1, 1, 1, 2 } },
3724 { ISD::SMIN, MVT::v2i64, { 2, 7, 2, 3 } },
3725 { ISD::SMIN, MVT::v4i64, { 2, 7, 2, 3 } },
3726 { ISD::SMIN, MVT::v8i32, { 1, 1, 1, 2 } },
3727 { ISD::SMIN, MVT::v16i16, { 1, 1, 1, 2 } },
3728 { ISD::SMIN, MVT::v32i8, { 1, 1, 1, 2 } },
3729 { ISD::SSUBSAT, MVT::v16i16, { 1 } },
3730 { ISD::SSUBSAT, MVT::v32i8, { 1 } },
3731 { ISD::UADDSAT, MVT::v16i16, { 1 } },
3732 { ISD::UADDSAT, MVT::v32i8, { 1 } },
3733 { ISD::UADDSAT, MVT::v8i32, { 3 } }, // not + pminud + paddd
3734 { ISD::UMAX, MVT::v2i64, { 2, 8, 5, 6 } },
3735 { ISD::UMAX, MVT::v4i64, { 2, 8, 5, 8 } },
3736 { ISD::UMAX, MVT::v8i32, { 1, 1, 1, 2 } },
3737 { ISD::UMAX, MVT::v16i16, { 1, 1, 1, 2 } },
3738 { ISD::UMAX, MVT::v32i8, { 1, 1, 1, 2 } },
3739 { ISD::UMIN, MVT::v2i64, { 2, 8, 5, 6 } },
3740 { ISD::UMIN, MVT::v4i64, { 2, 8, 5, 8 } },
3741 { ISD::UMIN, MVT::v8i32, { 1, 1, 1, 2 } },
3742 { ISD::UMIN, MVT::v16i16, { 1, 1, 1, 2 } },
3743 { ISD::UMIN, MVT::v32i8, { 1, 1, 1, 2 } },
3744 { ISD::USUBSAT, MVT::v16i16, { 1 } },
3745 { ISD::USUBSAT, MVT::v32i8, { 1 } },
3746 { ISD::USUBSAT, MVT::v8i32, { 2 } }, // pmaxud + psubd
3747 { ISD::FMAXNUM, MVT::f32, { 2, 7, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
3748 { ISD::FMAXNUM, MVT::v4f32, { 2, 7, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3749 { ISD::FMAXNUM, MVT::v8f32, { 3, 7, 3, 6 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3750 { ISD::FMAXNUM, MVT::f64, { 2, 7, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
3751 { ISD::FMAXNUM, MVT::v2f64, { 2, 7, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3752 { ISD::FMAXNUM, MVT::v4f64, { 3, 7, 3, 6 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3753 { ISD::FSQRT, MVT::f32, { 7, 15, 1, 1 } }, // vsqrtss
3754 { ISD::FSQRT, MVT::v4f32, { 7, 15, 1, 1 } }, // vsqrtps
3755 { ISD::FSQRT, MVT::v8f32, { 14, 21, 1, 3 } }, // vsqrtps
3756 { ISD::FSQRT, MVT::f64, { 14, 21, 1, 1 } }, // vsqrtsd
3757 { ISD::FSQRT, MVT::v2f64, { 14, 21, 1, 1 } }, // vsqrtpd
3758 { ISD::FSQRT, MVT::v4f64, { 28, 35, 1, 3 } }, // vsqrtpd
3759 };
3760 static const CostKindTblEntry AVX1CostTbl[] = {
3761 { ISD::ABS, MVT::v4i64, { 6, 8, 6, 12 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3762 { ISD::ABS, MVT::v8i32, { 3, 6, 4, 5 } },
3763 { ISD::ABS, MVT::v16i16, { 3, 6, 4, 5 } },
3764 { ISD::ABS, MVT::v32i8, { 3, 6, 4, 5 } },
3765 { ISD::BITREVERSE, MVT::v4i64, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
3766 { ISD::BITREVERSE, MVT::v2i64, { 8, 13, 10, 16 } },
3767 { ISD::BITREVERSE, MVT::v8i32, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
3768 { ISD::BITREVERSE, MVT::v4i32, { 8, 13, 10, 16 } },
3769 { ISD::BITREVERSE, MVT::v16i16, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
3770 { ISD::BITREVERSE, MVT::v8i16, { 8, 13, 10, 16 } },
3771 { ISD::BITREVERSE, MVT::v32i8, { 13, 15, 17, 26 } }, // 2 x 128-bit Op + extract/insert
3772 { ISD::BITREVERSE, MVT::v16i8, { 7, 7, 9, 13 } },
3773 { ISD::BSWAP, MVT::v4i64, { 5, 6, 5, 10 } },
3774 { ISD::BSWAP, MVT::v2i64, { 2, 2, 1, 3 } },
3775 { ISD::BSWAP, MVT::v8i32, { 5, 6, 5, 10 } },
3776 { ISD::BSWAP, MVT::v4i32, { 2, 2, 1, 3 } },
3777 { ISD::BSWAP, MVT::v16i16, { 5, 6, 5, 10 } },
3778 { ISD::BSWAP, MVT::v8i16, { 2, 2, 1, 3 } },
3779 { ISD::CTLZ, MVT::v4i64, { 29, 33, 49, 58 } }, // 2 x 128-bit Op + extract/insert
3780 { ISD::CTLZ, MVT::v2i64, { 14, 24, 24, 28 } },
3781 { ISD::CTLZ, MVT::v8i32, { 24, 28, 39, 48 } }, // 2 x 128-bit Op + extract/insert
3782 { ISD::CTLZ, MVT::v4i32, { 12, 20, 19, 23 } },
3783 { ISD::CTLZ, MVT::v16i16, { 19, 22, 29, 38 } }, // 2 x 128-bit Op + extract/insert
3784 { ISD::CTLZ, MVT::v8i16, { 9, 16, 14, 18 } },
3785 { ISD::CTLZ, MVT::v32i8, { 14, 15, 19, 28 } }, // 2 x 128-bit Op + extract/insert
3786 { ISD::CTLZ, MVT::v16i8, { 7, 12, 9, 13 } },
3787 { ISD::CTPOP, MVT::v4i64, { 14, 18, 19, 28 } }, // 2 x 128-bit Op + extract/insert
3788 { ISD::CTPOP, MVT::v2i64, { 7, 14, 10, 14 } },
3789 { ISD::CTPOP, MVT::v8i32, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
3790 { ISD::CTPOP, MVT::v4i32, { 9, 20, 14, 18 } },
3791 { ISD::CTPOP, MVT::v16i16, { 16, 21, 22, 31 } }, // 2 x 128-bit Op + extract/insert
3792 { ISD::CTPOP, MVT::v8i16, { 8, 18, 11, 15 } },
3793 { ISD::CTPOP, MVT::v32i8, { 13, 15, 16, 25 } }, // 2 x 128-bit Op + extract/insert
3794 { ISD::CTPOP, MVT::v16i8, { 6, 12, 8, 12 } },
3795 { ISD::CTTZ, MVT::v4i64, { 17, 22, 24, 33 } }, // 2 x 128-bit Op + extract/insert
3796 { ISD::CTTZ, MVT::v2i64, { 9, 19, 13, 17 } },
3797 { ISD::CTTZ, MVT::v8i32, { 21, 27, 32, 41 } }, // 2 x 128-bit Op + extract/insert
3798 { ISD::CTTZ, MVT::v4i32, { 11, 24, 17, 21 } },
3799 { ISD::CTTZ, MVT::v16i16, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
3800 { ISD::CTTZ, MVT::v8i16, { 9, 21, 14, 18 } },
3801 { ISD::CTTZ, MVT::v32i8, { 15, 18, 21, 30 } }, // 2 x 128-bit Op + extract/insert
3802 { ISD::CTTZ, MVT::v16i8, { 8, 16, 11, 15 } },
3803 { ISD::SADDSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
3804 { ISD::SADDSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
3805 { ISD::SMAX, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
3806 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 4 } },
3807 { ISD::SMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3808 { ISD::SMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3809 { ISD::SMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3810 { ISD::SMIN, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
3811 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
3812 { ISD::SMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3813 { ISD::SMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3814 { ISD::SMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3815 { ISD::SSUBSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
3816 { ISD::SSUBSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
3817 { ISD::UADDSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
3818 { ISD::UADDSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
3819 { ISD::UADDSAT, MVT::v8i32, { 8 } }, // 2 x 128-bit Op + extract/insert
3820 { ISD::UMAX, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
3821 { ISD::UMAX, MVT::v2i64, { 4, 8, 5, 7 } },
3822 { ISD::UMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3823 { ISD::UMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3824 { ISD::UMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3825 { ISD::UMIN, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
3826 { ISD::UMIN, MVT::v2i64, { 4, 8, 5, 7 } },
3827 { ISD::UMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3828 { ISD::UMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3829 { ISD::UMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3830 { ISD::USUBSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
3831 { ISD::USUBSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
3832 { ISD::USUBSAT, MVT::v8i32, { 6 } }, // 2 x 128-bit Op + extract/insert
3833 { ISD::FMAXNUM, MVT::f32, { 3, 6, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
3834 { ISD::FMAXNUM, MVT::v4f32, { 3, 6, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3835 { ISD::FMAXNUM, MVT::v8f32, { 5, 7, 3, 10 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3836 { ISD::FMAXNUM, MVT::f64, { 3, 6, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
3837 { ISD::FMAXNUM, MVT::v2f64, { 3, 6, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3838 { ISD::FMAXNUM, MVT::v4f64, { 5, 7, 3, 10 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3839 { ISD::FSQRT, MVT::f32, { 21, 21, 1, 1 } }, // vsqrtss
3840 { ISD::FSQRT, MVT::v4f32, { 21, 21, 1, 1 } }, // vsqrtps
3841 { ISD::FSQRT, MVT::v8f32, { 42, 42, 1, 3 } }, // vsqrtps
3842 { ISD::FSQRT, MVT::f64, { 27, 27, 1, 1 } }, // vsqrtsd
3843 { ISD::FSQRT, MVT::v2f64, { 27, 27, 1, 1 } }, // vsqrtpd
3844 { ISD::FSQRT, MVT::v4f64, { 54, 54, 1, 3 } }, // vsqrtpd
3845 };
3846 static const CostKindTblEntry GFNICostTbl[] = {
3847 { ISD::BITREVERSE, MVT::i8, { 3, 3, 3, 4 } }, // gf2p8affineqb
3848 { ISD::BITREVERSE, MVT::i16, { 3, 3, 4, 6 } }, // gf2p8affineqb
3849 { ISD::BITREVERSE, MVT::i32, { 3, 3, 4, 5 } }, // gf2p8affineqb
3850 { ISD::BITREVERSE, MVT::i64, { 3, 3, 4, 6 } }, // gf2p8affineqb
3851 { ISD::BITREVERSE, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
3852 { ISD::BITREVERSE, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
3853 { ISD::BITREVERSE, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
3854 { ISD::BITREVERSE, MVT::v8i16, { 1, 8, 2, 4 } }, // gf2p8affineqb
3855 { ISD::BITREVERSE, MVT::v16i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
3856 { ISD::BITREVERSE, MVT::v32i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
3857 { ISD::BITREVERSE, MVT::v4i32, { 1, 8, 2, 4 } }, // gf2p8affineqb
3858 { ISD::BITREVERSE, MVT::v8i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
3859 { ISD::BITREVERSE, MVT::v16i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
3860 { ISD::BITREVERSE, MVT::v2i64, { 1, 8, 2, 4 } }, // gf2p8affineqb
3861 { ISD::BITREVERSE, MVT::v4i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
3862 { ISD::BITREVERSE, MVT::v8i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
3863 };
3864 static const CostKindTblEntry GLMCostTbl[] = {
3865 { ISD::FSQRT, MVT::f32, { 19, 20, 1, 1 } }, // sqrtss
3866 { ISD::FSQRT, MVT::v4f32, { 37, 41, 1, 5 } }, // sqrtps
3867 { ISD::FSQRT, MVT::f64, { 34, 35, 1, 1 } }, // sqrtsd
3868 { ISD::FSQRT, MVT::v2f64, { 67, 71, 1, 5 } }, // sqrtpd
3869 };
3870 static const CostKindTblEntry SLMCostTbl[] = {
3871 { ISD::BSWAP, MVT::v2i64, { 5, 5, 1, 5 } },
3872 { ISD::BSWAP, MVT::v4i32, { 5, 5, 1, 5 } },
3873 { ISD::BSWAP, MVT::v8i16, { 5, 5, 1, 5 } },
3874 { ISD::FSQRT, MVT::f32, { 20, 20, 1, 1 } }, // sqrtss
3875 { ISD::FSQRT, MVT::v4f32, { 40, 41, 1, 5 } }, // sqrtps
3876 { ISD::FSQRT, MVT::f64, { 35, 35, 1, 1 } }, // sqrtsd
3877 { ISD::FSQRT, MVT::v2f64, { 70, 71, 1, 5 } }, // sqrtpd
3878 };
3879 static const CostKindTblEntry SSE42CostTbl[] = {
3880 { ISD::USUBSAT, MVT::v4i32, { 2 } }, // pmaxud + psubd
3881 { ISD::UADDSAT, MVT::v4i32, { 3 } }, // not + pminud + paddd
3882 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } }, // MAXSS + CMPUNORDSS + BLENDVPS
3883 { ISD::FMAXNUM, MVT::v4f32, { 4, 4, 4, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3884 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } }, // MAXSD + CMPUNORDSD + BLENDVPD
3885 { ISD::FMAXNUM, MVT::v2f64, { 4, 4, 4, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3886 { ISD::FSQRT, MVT::f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
3887 { ISD::FSQRT, MVT::v4f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
3888 };
3889 static const CostKindTblEntry SSE41CostTbl[] = {
3890 { ISD::ABS, MVT::v2i64, { 3, 4, 3, 5 } }, // BLENDVPD(X,PSUBQ(0,X),X)
3891 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 3 } },
3892 { ISD::SMAX, MVT::v4i32, { 1, 1, 1, 1 } },
3893 { ISD::SMAX, MVT::v16i8, { 1, 1, 1, 1 } },
3894 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
3895 { ISD::SMIN, MVT::v4i32, { 1, 1, 1, 1 } },
3896 { ISD::SMIN, MVT::v16i8, { 1, 1, 1, 1 } },
3897 { ISD::UMAX, MVT::v2i64, { 2, 11, 6, 7 } },
3898 { ISD::UMAX, MVT::v4i32, { 1, 1, 1, 1 } },
3899 { ISD::UMAX, MVT::v8i16, { 1, 1, 1, 1 } },
3900 { ISD::UMIN, MVT::v2i64, { 2, 11, 6, 7 } },
3901 { ISD::UMIN, MVT::v4i32, { 1, 1, 1, 1 } },
3902 { ISD::UMIN, MVT::v8i16, { 1, 1, 1, 1 } },
3903 };
3904 static const CostKindTblEntry SSSE3CostTbl[] = {
3905 { ISD::ABS, MVT::v4i32, { 1, 2, 1, 1 } },
3906 { ISD::ABS, MVT::v8i16, { 1, 2, 1, 1 } },
3907 { ISD::ABS, MVT::v16i8, { 1, 2, 1, 1 } },
3908 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 11, 21 } },
3909 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 11, 21 } },
3910 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 11, 21 } },
3911 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 10, 16 } },
3912 { ISD::BSWAP, MVT::v2i64, { 2, 3, 1, 5 } },
3913 { ISD::BSWAP, MVT::v4i32, { 2, 3, 1, 5 } },
3914 { ISD::BSWAP, MVT::v8i16, { 2, 3, 1, 5 } },
3915 { ISD::CTLZ, MVT::v2i64, { 18, 28, 28, 35 } },
3916 { ISD::CTLZ, MVT::v4i32, { 15, 20, 22, 28 } },
3917 { ISD::CTLZ, MVT::v8i16, { 13, 17, 16, 22 } },
3918 { ISD::CTLZ, MVT::v16i8, { 11, 15, 10, 16 } },
3919 { ISD::CTPOP, MVT::v2i64, { 13, 19, 12, 18 } },
3920 { ISD::CTPOP, MVT::v4i32, { 18, 24, 16, 22 } },
3921 { ISD::CTPOP, MVT::v8i16, { 13, 18, 14, 20 } },
3922 { ISD::CTPOP, MVT::v16i8, { 11, 12, 10, 16 } },
3923 { ISD::CTTZ, MVT::v2i64, { 13, 25, 15, 22 } },
3924 { ISD::CTTZ, MVT::v4i32, { 18, 26, 19, 25 } },
3925 { ISD::CTTZ, MVT::v8i16, { 13, 20, 17, 23 } },
3926 { ISD::CTTZ, MVT::v16i8, { 11, 16, 13, 19 } }
3927 };
3928 static const CostKindTblEntry SSE2CostTbl[] = {
3929 { ISD::ABS, MVT::v2i64, { 3, 6, 5, 5 } },
3930 { ISD::ABS, MVT::v4i32, { 1, 4, 4, 4 } },
3931 { ISD::ABS, MVT::v8i16, { 1, 2, 3, 3 } },
3932 { ISD::ABS, MVT::v16i8, { 1, 2, 3, 3 } },
3933 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 32, 32 } },
3934 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 30, 30 } },
3935 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 25, 25 } },
3936 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 21, 21 } },
3937 { ISD::BSWAP, MVT::v2i64, { 5, 6, 11, 11 } },
3938 { ISD::BSWAP, MVT::v4i32, { 5, 5, 9, 9 } },
3939 { ISD::BSWAP, MVT::v8i16, { 5, 5, 4, 5 } },
3940 { ISD::CTLZ, MVT::v2i64, { 10, 45, 36, 38 } },
3941 { ISD::CTLZ, MVT::v4i32, { 10, 45, 38, 40 } },
3942 { ISD::CTLZ, MVT::v8i16, { 9, 38, 32, 34 } },
3943 { ISD::CTLZ, MVT::v16i8, { 8, 39, 29, 32 } },
3944 { ISD::CTPOP, MVT::v2i64, { 12, 26, 16, 18 } },
3945 { ISD::CTPOP, MVT::v4i32, { 15, 29, 21, 23 } },
3946 { ISD::CTPOP, MVT::v8i16, { 13, 25, 18, 20 } },
3947 { ISD::CTPOP, MVT::v16i8, { 10, 21, 14, 16 } },
3948 { ISD::CTTZ, MVT::v2i64, { 14, 28, 19, 21 } },
3949 { ISD::CTTZ, MVT::v4i32, { 18, 31, 24, 26 } },
3950 { ISD::CTTZ, MVT::v8i16, { 16, 27, 21, 23 } },
3951 { ISD::CTTZ, MVT::v16i8, { 13, 23, 17, 19 } },
3952 { ISD::SADDSAT, MVT::v8i16, { 1 } },
3953 { ISD::SADDSAT, MVT::v16i8, { 1 } },
3954 { ISD::SMAX, MVT::v2i64, { 4, 8, 15, 15 } },
3955 { ISD::SMAX, MVT::v4i32, { 2, 4, 5, 5 } },
3956 { ISD::SMAX, MVT::v8i16, { 1, 1, 1, 1 } },
3957 { ISD::SMAX, MVT::v16i8, { 2, 4, 5, 5 } },
3958 { ISD::SMIN, MVT::v2i64, { 4, 8, 15, 15 } },
3959 { ISD::SMIN, MVT::v4i32, { 2, 4, 5, 5 } },
3960 { ISD::SMIN, MVT::v8i16, { 1, 1, 1, 1 } },
3961 { ISD::SMIN, MVT::v16i8, { 2, 4, 5, 5 } },
3962 { ISD::SSUBSAT, MVT::v8i16, { 1 } },
3963 { ISD::SSUBSAT, MVT::v16i8, { 1 } },
3964 { ISD::UADDSAT, MVT::v8i16, { 1 } },
3965 { ISD::UADDSAT, MVT::v16i8, { 1 } },
3966 { ISD::UMAX, MVT::v2i64, { 4, 8, 15, 15 } },
3967 { ISD::UMAX, MVT::v4i32, { 2, 5, 8, 8 } },
3968 { ISD::UMAX, MVT::v8i16, { 1, 3, 3, 3 } },
3969 { ISD::UMAX, MVT::v16i8, { 1, 1, 1, 1 } },
3970 { ISD::UMIN, MVT::v2i64, { 4, 8, 15, 15 } },
3971 { ISD::UMIN, MVT::v4i32, { 2, 5, 8, 8 } },
3972 { ISD::UMIN, MVT::v8i16, { 1, 3, 3, 3 } },
3973 { ISD::UMIN, MVT::v16i8, { 1, 1, 1, 1 } },
3974 { ISD::USUBSAT, MVT::v8i16, { 1 } },
3975 { ISD::USUBSAT, MVT::v16i8, { 1 } },
3976 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } },
3977 { ISD::FMAXNUM, MVT::v2f64, { 4, 6, 6, 6 } },
3978 { ISD::FSQRT, MVT::f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
3979 { ISD::FSQRT, MVT::v2f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
3980 };
3981 static const CostKindTblEntry SSE1CostTbl[] = {
3982 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } },
3983 { ISD::FMAXNUM, MVT::v4f32, { 4, 6, 6, 6 } },
3984 { ISD::FSQRT, MVT::f32, { 28, 30, 1, 2 } }, // Pentium III from http://www.agner.org/
3985 { ISD::FSQRT, MVT::v4f32, { 56, 56, 1, 2 } }, // Pentium III from http://www.agner.org/
3986 };
3987 static const CostKindTblEntry BMI64CostTbl[] = { // 64-bit targets
3988 { ISD::CTTZ, MVT::i64, { 1 } },
3989 };
3990 static const CostKindTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
3991 { ISD::CTTZ, MVT::i32, { 1 } },
3992 { ISD::CTTZ, MVT::i16, { 1 } },
3993 { ISD::CTTZ, MVT::i8, { 1 } },
3994 };
3995 static const CostKindTblEntry LZCNT64CostTbl[] = { // 64-bit targets
3996 { ISD::CTLZ, MVT::i64, { 1 } },
3997 };
3998 static const CostKindTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
3999 { ISD::CTLZ, MVT::i32, { 1 } },
4000 { ISD::CTLZ, MVT::i16, { 2 } },
4001 { ISD::CTLZ, MVT::i8, { 2 } },
4002 };
4003 static const CostKindTblEntry POPCNT64CostTbl[] = { // 64-bit targets
4004 { ISD::CTPOP, MVT::i64, { 1, 1, 1, 1 } }, // popcnt
4005 };
4006 static const CostKindTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
4007 { ISD::CTPOP, MVT::i32, { 1, 1, 1, 1 } }, // popcnt
4008 { ISD::CTPOP, MVT::i16, { 1, 1, 2, 2 } }, // popcnt(zext())
4009 { ISD::CTPOP, MVT::i8, { 1, 1, 2, 2 } }, // popcnt(zext())
4010 };
4011 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
4012 { ISD::ABS, MVT::i64, { 1, 2, 3, 4 } }, // SUB+CMOV
4013 { ISD::BITREVERSE, MVT::i64, { 10, 12, 20, 22 } },
4014 { ISD::BSWAP, MVT::i64, { 1, 2, 1, 2 } },
4015 { ISD::CTLZ, MVT::i64, { 4 } }, // BSR+XOR or BSR+XOR+CMOV
4016 { ISD::CTLZ_ZERO_UNDEF, MVT::i64,{ 1, 1, 1, 1 } }, // BSR+XOR
4017 { ISD::CTTZ, MVT::i64, { 3 } }, // TEST+BSF+CMOV/BRANCH
4018 { ISD::CTTZ_ZERO_UNDEF, MVT::i64,{ 1, 1, 1, 1 } }, // BSR
4019 { ISD::CTPOP, MVT::i64, { 10, 6, 19, 19 } },
4020 { ISD::ROTL, MVT::i64, { 2, 3, 1, 3 } },
4021 { ISD::ROTR, MVT::i64, { 2, 3, 1, 3 } },
4022 { X86ISD::VROTLI, MVT::i64, { 1, 1, 1, 1 } },
4023 { ISD::FSHL, MVT::i64, { 4, 4, 1, 4 } },
4024 { ISD::SMAX, MVT::i64, { 1, 3, 2, 3 } },
4025 { ISD::SMIN, MVT::i64, { 1, 3, 2, 3 } },
4026 { ISD::UMAX, MVT::i64, { 1, 3, 2, 3 } },
4027 { ISD::UMIN, MVT::i64, { 1, 3, 2, 3 } },
4028 { ISD::SADDO, MVT::i64, { 1 } },
4029 { ISD::UADDO, MVT::i64, { 1 } },
4030 { ISD::UMULO, MVT::i64, { 2 } }, // mulq + seto
4031 };
4032 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
4033 { ISD::ABS, MVT::i32, { 1, 2, 3, 4 } }, // SUB+XOR+SRA or SUB+CMOV
4034 { ISD::ABS, MVT::i16, { 2, 2, 3, 4 } }, // SUB+XOR+SRA or SUB+CMOV
4035 { ISD::ABS, MVT::i8, { 2, 4, 4, 4 } }, // SUB+XOR+SRA
4036 { ISD::BITREVERSE, MVT::i32, { 9, 12, 17, 19 } },
4037 { ISD::BITREVERSE, MVT::i16, { 9, 12, 17, 19 } },
4038 { ISD::BITREVERSE, MVT::i8, { 7, 9, 13, 14 } },
4039 { ISD::BSWAP, MVT::i32, { 1, 1, 1, 1 } },
4040 { ISD::BSWAP, MVT::i16, { 1, 2, 1, 2 } }, // ROL
4041 { ISD::CTLZ, MVT::i32, { 4 } }, // BSR+XOR or BSR+XOR+CMOV
4042 { ISD::CTLZ, MVT::i16, { 4 } }, // BSR+XOR or BSR+XOR+CMOV
4043 { ISD::CTLZ, MVT::i8, { 4 } }, // BSR+XOR or BSR+XOR+CMOV
4044 { ISD::CTLZ_ZERO_UNDEF, MVT::i32,{ 1, 1, 1, 1 } }, // BSR+XOR
4045 { ISD::CTLZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 3, 3 } }, // BSR+XOR
4046 { ISD::CTLZ_ZERO_UNDEF, MVT::i8, { 2, 2, 3, 3 } }, // BSR+XOR
4047 { ISD::CTTZ, MVT::i32, { 3 } }, // TEST+BSF+CMOV/BRANCH
4048 { ISD::CTTZ, MVT::i16, { 3 } }, // TEST+BSF+CMOV/BRANCH
4049 { ISD::CTTZ, MVT::i8, { 3 } }, // TEST+BSF+CMOV/BRANCH
4050 { ISD::CTTZ_ZERO_UNDEF, MVT::i32,{ 1, 1, 1, 1 } }, // BSF
4051 { ISD::CTTZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 1, 1 } }, // BSF
4052 { ISD::CTTZ_ZERO_UNDEF, MVT::i8, { 2, 2, 1, 1 } }, // BSF
4053 { ISD::CTPOP, MVT::i32, { 8, 7, 15, 15 } },
4054 { ISD::CTPOP, MVT::i16, { 9, 8, 17, 17 } },
4055 { ISD::CTPOP, MVT::i8, { 7, 6, 6, 6 } },
4056 { ISD::ROTL, MVT::i32, { 2, 3, 1, 3 } },
4057 { ISD::ROTL, MVT::i16, { 2, 3, 1, 3 } },
4058 { ISD::ROTL, MVT::i8, { 2, 3, 1, 3 } },
4059 { ISD::ROTR, MVT::i32, { 2, 3, 1, 3 } },
4060 { ISD::ROTR, MVT::i16, { 2, 3, 1, 3 } },
4061 { ISD::ROTR, MVT::i8, { 2, 3, 1, 3 } },
4062 { X86ISD::VROTLI, MVT::i32, { 1, 1, 1, 1 } },
4063 { X86ISD::VROTLI, MVT::i16, { 1, 1, 1, 1 } },
4064 { X86ISD::VROTLI, MVT::i8, { 1, 1, 1, 1 } },
4065 { ISD::FSHL, MVT::i32, { 4, 4, 1, 4 } },
4066 { ISD::FSHL, MVT::i16, { 4, 4, 2, 5 } },
4067 { ISD::FSHL, MVT::i8, { 4, 4, 2, 5 } },
4068 { ISD::SMAX, MVT::i32, { 1, 2, 2, 3 } },
4069 { ISD::SMAX, MVT::i16, { 1, 4, 2, 4 } },
4070 { ISD::SMAX, MVT::i8, { 1, 4, 2, 4 } },
4071 { ISD::SMIN, MVT::i32, { 1, 2, 2, 3 } },
4072 { ISD::SMIN, MVT::i16, { 1, 4, 2, 4 } },
4073 { ISD::SMIN, MVT::i8, { 1, 4, 2, 4 } },
4074 { ISD::UMAX, MVT::i32, { 1, 2, 2, 3 } },
4075 { ISD::UMAX, MVT::i16, { 1, 4, 2, 4 } },
4076 { ISD::UMAX, MVT::i8, { 1, 4, 2, 4 } },
4077 { ISD::UMIN, MVT::i32, { 1, 2, 2, 3 } },
4078 { ISD::UMIN, MVT::i16, { 1, 4, 2, 4 } },
4079 { ISD::UMIN, MVT::i8, { 1, 4, 2, 4 } },
4080 { ISD::SADDO, MVT::i32, { 1 } },
4081 { ISD::SADDO, MVT::i16, { 1 } },
4082 { ISD::SADDO, MVT::i8, { 1 } },
4083 { ISD::UADDO, MVT::i32, { 1 } },
4084 { ISD::UADDO, MVT::i16, { 1 } },
4085 { ISD::UADDO, MVT::i8, { 1 } },
4086 { ISD::UMULO, MVT::i32, { 2 } }, // mul + seto
4087 { ISD::UMULO, MVT::i16, { 2 } },
4088 { ISD::UMULO, MVT::i8, { 2 } },
4089 };
4090
4091 Type *RetTy = ICA.getReturnType();
4092 Type *OpTy = RetTy;
4093 Intrinsic::ID IID = ICA.getID();
4094 unsigned ISD = ISD::DELETED_NODE;
4095 switch (IID) {
4096 default:
4097 break;
4098 case Intrinsic::abs:
4099 ISD = ISD::ABS;
4100 break;
4101 case Intrinsic::bitreverse:
4102 ISD = ISD::BITREVERSE;
4103 break;
4104 case Intrinsic::bswap:
4105 ISD = ISD::BSWAP;
4106 break;
4107 case Intrinsic::ctlz:
4108 ISD = ISD::CTLZ;
4109 break;
4110 case Intrinsic::ctpop:
4111 ISD = ISD::CTPOP;
4112 break;
4113 case Intrinsic::cttz:
4114 ISD = ISD::CTTZ;
4115 break;
4116 case Intrinsic::fshl:
4117 ISD = ISD::FSHL;
4118 if (!ICA.isTypeBasedOnly()) {
4119 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4120 if (Args[0] == Args[1]) {
4121 ISD = ISD::ROTL;
4122 // Handle uniform constant rotation amounts.
4123 // TODO: Handle funnel-shift cases.
4124 const APInt *Amt;
4125 if (Args[2] &&
4126 PatternMatch::match(Args[2], PatternMatch::m_APIntAllowPoison(Amt)))
4127 ISD = X86ISD::VROTLI;
4128 }
4129 }
4130 break;
4131 case Intrinsic::fshr:
4132 // FSHR has same costs so don't duplicate.
4133 ISD = ISD::FSHL;
4134 if (!ICA.isTypeBasedOnly()) {
4135 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4136 if (Args[0] == Args[1]) {
4137 ISD = ISD::ROTR;
4138 // Handle uniform constant rotation amount.
4139 // TODO: Handle funnel-shift cases.
4140 const APInt *Amt;
4141 if (Args[2] &&
4142 PatternMatch::match(Args[2], PatternMatch::m_APIntAllowPoison(Amt)))
4143 ISD = X86ISD::VROTLI;
4144 }
4145 }
4146 break;
4147 case Intrinsic::maxnum:
4148 case Intrinsic::minnum:
4149 // FMINNUM has same costs so don't duplicate.
4150 ISD = ISD::FMAXNUM;
4151 break;
4152 case Intrinsic::sadd_sat:
4153 ISD = ISD::SADDSAT;
4154 break;
4155 case Intrinsic::smax:
4156 ISD = ISD::SMAX;
4157 break;
4158 case Intrinsic::smin:
4159 ISD = ISD::SMIN;
4160 break;
4161 case Intrinsic::ssub_sat:
4162 ISD = ISD::SSUBSAT;
4163 break;
4164 case Intrinsic::uadd_sat:
4165 ISD = ISD::UADDSAT;
4166 break;
4167 case Intrinsic::umax:
4168 ISD = ISD::UMAX;
4169 break;
4170 case Intrinsic::umin:
4171 ISD = ISD::UMIN;
4172 break;
4173 case Intrinsic::usub_sat:
4174 ISD = ISD::USUBSAT;
4175 break;
4176 case Intrinsic::sqrt:
4177 ISD = ISD::FSQRT;
4178 break;
4179 case Intrinsic::sadd_with_overflow:
4180 case Intrinsic::ssub_with_overflow:
4181 // SSUBO has same costs so don't duplicate.
4182 ISD = ISD::SADDO;
4183 OpTy = RetTy->getContainedType(0);
4184 break;
4185 case Intrinsic::uadd_with_overflow:
4186 case Intrinsic::usub_with_overflow:
4187 // USUBO has same costs so don't duplicate.
4188 ISD = ISD::UADDO;
4189 OpTy = RetTy->getContainedType(0);
4190 break;
4191 case Intrinsic::umul_with_overflow:
4192 case Intrinsic::smul_with_overflow:
4193 // SMULO has same costs so don't duplicate.
4194 ISD = ISD::UMULO;
4195 OpTy = RetTy->getContainedType(0);
4196 break;
4197 }
4198
4199 if (ISD != ISD::DELETED_NODE) {
4200 // Legalize the type.
4201 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(OpTy);
4202 MVT MTy = LT.second;
4203
4204 // Without BMI/LZCNT see if we're only looking for a *_ZERO_UNDEF cost.
4205 if (((ISD == ISD::CTTZ && !ST->hasBMI()) ||
4206 (ISD == ISD::CTLZ && !ST->hasLZCNT())) &&
4207 !MTy.isVector() && !ICA.isTypeBasedOnly()) {
4208 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4209 if (auto *Cst = dyn_cast<ConstantInt>(Args[1]))
4210 if (Cst->isAllOnesValue())
4211 ISD = ISD == ISD::CTTZ ? ISD::CTTZ_ZERO_UNDEF : ISD::CTLZ_ZERO_UNDEF;
4212 }
4213
4214 // FSQRT is a single instruction.
4215 if (ISD == ISD::FSQRT && CostKind == TTI::TCK_CodeSize)
4216 return LT.first;
4217
4218 auto adjustTableCost = [](int ISD, unsigned Cost,
4219 InstructionCost LegalizationCost,
4220 FastMathFlags FMF) {
4221 // If there are no NANs to deal with, then these are reduced to a
4222 // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we
4223 // assume is used in the non-fast case.
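// e.g. with nnan set, llvm.maxnum on <8 x float> for an SSE-only target
// costs just the legalization count (2): one MAXPS per legalized
// <4 x float> half.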
4224 if (ISD == ISD::FMAXNUM || ISD == ISD::FMINNUM) {
4225 if (FMF.noNaNs())
4226 return LegalizationCost * 1;
4227 }
4228 return LegalizationCost * (int)Cost;
4229 };
4230
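// The tables are consulted from the most specific subtarget feature down to
// the baseline, so the first hit reflects the cheapest sequence the target
// can actually emit; a miss falls through to the next, more generic table.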
4231 if (ST->useGLMDivSqrtCosts())
4232 if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
4233 if (auto KindCost = Entry->Cost[CostKind])
4234 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4235 ICA.getFlags());
4236
4237 if (ST->useSLMArithCosts())
4238 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
4239 if (auto KindCost = Entry->Cost[CostKind])
4240 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4241 ICA.getFlags());
4242
4243 if (ST->hasVBMI2())
4244 if (const auto *Entry = CostTableLookup(AVX512VBMI2CostTbl, ISD, MTy))
4245 if (auto KindCost = Entry->Cost[CostKind])
4246 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4247 ICA.getFlags());
4248
4249 if (ST->hasBITALG())
4250 if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy))
4251 if (auto KindCost = Entry->Cost[CostKind])
4252 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4253 ICA.getFlags());
4254
4255 if (ST->hasVPOPCNTDQ())
4256 if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy))
4257 if (auto KindCost = Entry->Cost[CostKind])
4258 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4259 ICA.getFlags());
4260
4261 if (ST->hasGFNI())
4262 if (const auto *Entry = CostTableLookup(GFNICostTbl, ISD, MTy))
4263 if (auto KindCost = Entry->Cost[CostKind])
4264 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4265 ICA.getFlags());
4266
4267 if (ST->hasCDI())
4268 if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
4269 if (auto KindCost = Entry->Cost[CostKind])
4270 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4271 ICA.getFlags());
4272
4273 if (ST->hasBWI())
4274 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
4275 if (auto KindCost = Entry->Cost[CostKind])
4276 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4277 ICA.getFlags());
4278
4279 if (ST->hasAVX512())
4280 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
4281 if (auto KindCost = Entry->Cost[CostKind])
4282 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4283 ICA.getFlags());
4284
4285 if (ST->hasXOP())
4286 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
4287 if (auto KindCost = Entry->Cost[CostKind])
4288 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4289 ICA.getFlags());
4290
4291 if (ST->hasAVX2())
4292 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
4293 if (auto KindCost = Entry->Cost[CostKind])
4294 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4295 ICA.getFlags());
4296
4297 if (ST->hasAVX())
4298 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
4299 if (auto KindCost = Entry->Cost[CostKind])
4300 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4301 ICA.getFlags());
4302
4303 if (ST->hasSSE42())
4304 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
4305 if (auto KindCost = Entry->Cost[CostKind])
4306 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4307 ICA.getFlags());
4308
4309 if (ST->hasSSE41())
4310 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
4311 if (auto KindCost = Entry->Cost[CostKind])
4312 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4313 ICA.getFlags());
4314
4315 if (ST->hasSSSE3())
4316 if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
4317 if (auto KindCost = Entry->Cost[CostKind])
4318 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4319 ICA.getFlags());
4320
4321 if (ST->hasSSE2())
4322 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
4323 if (auto KindCost = Entry->Cost[CostKind])
4324 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4325 ICA.getFlags());
4326
4327 if (ST->hasSSE1())
4328 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
4329 if (auto KindCost = Entry->Cost[CostKind])
4330 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4331 ICA.getFlags());
4332
4333 if (ST->hasBMI()) {
4334 if (ST->is64Bit())
4335 if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
4336 if (auto KindCost = Entry->Cost[CostKind])
4337 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4338 ICA.getFlags());
4339
4340 if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
4341 if (auto KindCost = Entry->Cost[CostKind])
4342 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4343 ICA.getFlags());
4344 }
4345
4346 if (ST->hasLZCNT()) {
4347 if (ST->is64Bit())
4348 if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
4349 if (auto KindCost = Entry->Cost[CostKind])
4350 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4351 ICA.getFlags());
4352
4353 if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
4354 if (auto KindCost = Entry->Cost[CostKind])
4355 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4356 ICA.getFlags());
4357 }
4358
4359 if (ST->hasPOPCNT()) {
4360 if (ST->is64Bit())
4361 if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
4362 if (auto KindCost = Entry->Cost[CostKind])
4363 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4364 ICA.getFlags());
4365
4366 if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
4367 if (auto KindCost = Entry->Cost[CostKind])
4368 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4369 ICA.getFlags());
4370 }
4371
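// With a fast MOVBE, a bswap whose only use is a store (or whose operand is
// a single-use load) folds into the MOVBE memory access, so it is free.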
4372 if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) {
4373 if (const Instruction *II = ICA.getInst()) {
4374 if (II->hasOneUse() && isa<StoreInst>(II->user_back()))
4375 return TTI::TCC_Free;
4376 if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) {
4377 if (LI->hasOneUse())
4378 return TTI::TCC_Free;
4379 }
4380 }
4381 }
4382
4383 if (ST->is64Bit())
4384 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
4385 if (auto KindCost = Entry->Cost[CostKind])
4386 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4387 ICA.getFlags());
4388
4389 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
4390 if (auto KindCost = Entry->Cost[CostKind])
4391 return adjustTableCost(Entry->ISD, *KindCost, LT.first, ICA.getFlags());
4392 }
4393
4394 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
4395 }
4396
4397 InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
4398 TTI::TargetCostKind CostKind,
4399 unsigned Index, Value *Op0,
4400 Value *Op1) {
4401 static const CostTblEntry SLMCostTbl[] = {
4402 { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 },
4403 { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 },
4404 { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 },
4405 { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 7 }
4406 };
4407
4408 assert(Val->isVectorTy() && "This must be a vector type");
4409 Type *ScalarType = Val->getScalarType();
4410 InstructionCost RegisterFileMoveCost = 0;
4411
4412 // Non-immediate extraction/insertion can be handled as a sequence of
4413 // aliased loads+stores via the stack.
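// e.g. an extractelement with an unknown index is modelled as spilling the
// whole vector to a stack slot and reloading a single scalar from it.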
4414 if (Index == -1U && (Opcode == Instruction::ExtractElement ||
4415 Opcode == Instruction::InsertElement)) {
4416 // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns:
4417 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
4418
4419 // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling.
4420 assert(isa<FixedVectorType>(Val) && "Fixed vector type expected");
4421 Align VecAlign = DL.getPrefTypeAlign(Val);
4422 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
4423
4424 // Extract - store vector to stack, load scalar.
4425 if (Opcode == Instruction::ExtractElement) {
4426 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4427 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
4428 CostKind);
4429 }
4430 // Insert - store vector to stack, store scalar, load vector.
4431 if (Opcode == Instruction::InsertElement) {
4432 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4433 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
4434 CostKind) +
4435 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind);
4436 }
4437 }
4438
4439 if (Index != -1U && (Opcode == Instruction::ExtractElement ||
4440 Opcode == Instruction::InsertElement)) {
4441 // Extraction of vXi1 elements is now efficiently handled by MOVMSK.
4442 if (Opcode == Instruction::ExtractElement &&
4443 ScalarType->getScalarSizeInBits() == 1 &&
4444 cast<FixedVectorType>(Val)->getNumElements() > 1)
4445 return 1;
4446
4447 // Legalize the type.
4448 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
4449
4450 // This type is legalized to a scalar type.
4451 if (!LT.second.isVector())
4452 return 0;
4453
4454 // The type may be split. Normalize the index to the new type.
4455 unsigned SizeInBits = LT.second.getSizeInBits();
4456 unsigned NumElts = LT.second.getVectorNumElements();
4457 unsigned SubNumElts = NumElts;
4458 Index = Index % NumElts;
4459
4460 // For >128-bit vectors, we need to extract higher 128-bit subvectors.
4461 // For inserts, we also need to insert the subvector back.
4462 if (SizeInBits > 128) {
4463 assert((SizeInBits % 128) == 0 && "Illegal vector");
4464 unsigned NumSubVecs = SizeInBits / 128;
4465 SubNumElts = NumElts / NumSubVecs;
4466 if (SubNumElts <= Index) {
4467 RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
4468 Index %= SubNumElts;
4469 }
4470 }
4471
4472 MVT MScalarTy = LT.second.getScalarType();
4473 auto IsCheapPInsrPExtrInsertPS = [&]() {
4474 // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
4475 // Also, assume insertps is relatively cheap on all >= SSE41 targets.
4476 return (MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4477 (MScalarTy.isInteger() && ST->hasSSE41()) ||
4478 (MScalarTy == MVT::f32 && ST->hasSSE41() &&
4479 Opcode == Instruction::InsertElement);
4480 };
4481
4482 if (Index == 0) {
4483 // Floating point scalars are already located in index #0.
4484 // Many insertions to #0 can fold away for scalar fp-ops, so let's assume
4485 // true for all.
4486 if (ScalarType->isFloatingPointTy() &&
4487 (Opcode != Instruction::InsertElement || !Op0 ||
4488 isa<UndefValue>(Op0)))
4489 return RegisterFileMoveCost;
4490
4491 if (Opcode == Instruction::InsertElement &&
4492 isa_and_nonnull<UndefValue>(Op0)) {
4493 // Consider the gather cost to be cheap.
4494 if (isa_and_nonnull<LoadInst>(Op1))
4495 return RegisterFileMoveCost;
4496 if (!IsCheapPInsrPExtrInsertPS()) {
4497 // mov constant-to-GPR + movd/movq GPR -> XMM.
4498 if (isa_and_nonnull<Constant>(Op1) && Op1->getType()->isIntegerTy())
4499 return 2 + RegisterFileMoveCost;
4500 // Assume movd/movq GPR -> XMM is relatively cheap on all targets.
4501 return 1 + RegisterFileMoveCost;
4502 }
4503 }
4504
4505 // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
4506 if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
4507 return 1 + RegisterFileMoveCost;
4508 }
4509
4510 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4511 assert(ISD && "Unexpected vector opcode");
4512 if (ST->useSLMArithCosts())
4513 if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
4514 return Entry->Cost + RegisterFileMoveCost;
4515
4516 // Consider cheap cases.
4517 if (IsCheapPInsrPExtrInsertPS())
4518 return 1 + RegisterFileMoveCost;
4519
4520 // For extractions we just need to shuffle the element to index 0, which
4521 // should be very cheap (assume cost = 1). For insertions we need to shuffle
4522 // the element to its destination. In both cases we must handle the
4523 // subvector move(s).
4524 // If the vector type is already less than 128-bits then don't reduce it.
4525 // TODO: Under what circumstances should we shuffle using the full width?
4526 InstructionCost ShuffleCost = 1;
4527 if (Opcode == Instruction::InsertElement) {
4528 auto *SubTy = cast<VectorType>(Val);
4529 EVT VT = TLI->getValueType(DL, Val);
4530 if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
4531 SubTy = FixedVectorType::get(ScalarType, SubNumElts);
4532 ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, std::nullopt,
4533 CostKind, 0, SubTy);
4534 }
4535 int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
4536 return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
4537 }
4538
4539 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1) +
4540 RegisterFileMoveCost;
4541}
4542 
4543 InstructionCost
4544 X86TTIImpl::getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts,
4545 bool Insert, bool Extract,
4546 TTI::TargetCostKind CostKind) {
4547 assert(DemandedElts.getBitWidth() ==
4548 cast<FixedVectorType>(Ty)->getNumElements() &&
4549 "Vector size mismatch");
4550
4551 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4552 MVT MScalarTy = LT.second.getScalarType();
4553 unsigned LegalVectorBitWidth = LT.second.getSizeInBits();
4554 InstructionCost Cost = 0;
4555 
4556 constexpr unsigned LaneBitWidth = 128;
4557 assert((LegalVectorBitWidth < LaneBitWidth ||
4558 (LegalVectorBitWidth % LaneBitWidth) == 0) &&
4559 "Illegal vector");
4560
4561 const int NumLegalVectors = *LT.first.getValue();
4562 assert(NumLegalVectors >= 0 && "Negative cost!");
4563
4564 // For insertions, an ISD::BUILD_VECTOR style vector initialization can be much
4565 // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT.
4566 if (Insert) {
4567 if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4568 (MScalarTy.isInteger() && ST->hasSSE41()) ||
4569 (MScalarTy == MVT::f32 && ST->hasSSE41())) {
4570 // For types we can insert directly, insertion into 128-bit sub vectors is
4571 // cheap, followed by a cheap chain of concatenations.
4572 if (LegalVectorBitWidth <= LaneBitWidth) {
4573 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert,
4574 /*Extract*/ false, CostKind);
4575 } else {
4576 // In each 128-lane, if at least one index is demanded but not all
4577 // indices are demanded and this 128-lane is not the first 128-lane of
4578 // the legalized vector, then this 128-lane needs an extracti128; if in
4579 // each 128-lane there is at least one demanded index, this 128-lane
4580 // needs an inserti128.
4581
4582 // The following cases will help you build a better understanding:
4583 // Assume we insert several elements into a v8i32 vector in avx2,
4584 // Case#1: inserting into the 1st index needs vpinsrd + inserti128.
4585 // Case#2: inserting into 5th index needs extracti128 + vpinsrd +
4586 // inserti128.
4587 // Case#3: inserting into 4,5,6,7 index needs 4*vpinsrd + inserti128.
4588 assert((LegalVectorBitWidth % LaneBitWidth) == 0 && "Illegal vector");
4589 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
4590 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
4591 unsigned NumLegalElts =
4592 LT.second.getVectorNumElements() * NumLegalVectors;
4593 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
4594 "Vector has been legalized to smaller element count");
4595 assert((NumLegalElts % NumLanesTotal) == 0 &&
4596 "Unexpected elts per lane");
4597 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
4598
4599 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
4600 auto *LaneTy =
4601 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
4602
4603 for (unsigned I = 0; I != NumLanesTotal; ++I) {
4604 APInt LaneEltMask = WidenedDemandedElts.extractBits(
4605 NumEltsPerLane, NumEltsPerLane * I);
4606 if (LaneEltMask.isZero())
4607 continue;
4608 // FIXME: we don't need to extract if all non-demanded elements
4609 // are legalization-inserted padding.
4610 if (!LaneEltMask.isAllOnes())
4611 Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
4612 CostKind, I * NumEltsPerLane, LaneTy);
4613 Cost += BaseT::getScalarizationOverhead(LaneTy, LaneEltMask, Insert,
4614 /*Extract*/ false, CostKind);
4615 }
4616
4617 APInt AffectedLanes =
4618 APIntOps::ScaleBitMask(WidenedDemandedElts, NumLanesTotal);
4619 APInt FullyAffectedLegalVectors = APIntOps::ScaleBitMask(
4620 AffectedLanes, NumLegalVectors, /*MatchAllBits=*/true);
4621 for (int LegalVec = 0; LegalVec != NumLegalVectors; ++LegalVec) {
4622 for (unsigned Lane = 0; Lane != NumLegalLanes; ++Lane) {
4623 unsigned I = NumLegalLanes * LegalVec + Lane;
4624 // No need to insert unaffected lane; or lane 0 of each legal vector
4625 // iff ALL lanes of that vector were affected and will be inserted.
4626 if (!AffectedLanes[I] ||
4627 (Lane == 0 && FullyAffectedLegalVectors[LegalVec]))
4628 continue;
4629 Cost += getShuffleCost(TTI::SK_InsertSubvector, Ty, std::nullopt,
4630 CostKind, I * NumEltsPerLane, LaneTy);
4631 }
4632 }
4633 }
4634 } else if (LT.second.isVector()) {
4635 // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
4636 // integer element as a SCALAR_TO_VECTOR, then we build the vector as a
4637 // series of UNPCK followed by CONCAT_VECTORS - all of these can be
4638 // considered cheap.
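// e.g. building <4 x i32> from four scalars with only SSE2 is costed as
// 4 x MOVD (one per demanded element) plus 3 unpack/concat steps.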
4639 if (Ty->isIntOrIntVectorTy())
4640 Cost += DemandedElts.popcount();
4641
4642 // Get the smaller of the legalized or original pow2-extended number of
4643 // vector elements, which represents the number of unpacks we'll end up
4644 // performing.
4645 unsigned NumElts = LT.second.getVectorNumElements();
4646 unsigned Pow2Elts =
4647 PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements());
4648 Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
4649 }
4650 }
4651
4652 if (Extract) {
4653 // vXi1 can be efficiently extracted with MOVMSK.
4654 // TODO: AVX512 predicate mask handling.
4655 // NOTE: This doesn't work well for roundtrip scalarization.
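// e.g. extracting all the bits of a <64 x i1> on AVX2 takes
// ceil(64 / 32) == 2 VPMOVMSKB instructions.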
4656 if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) {
4657 unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
4658 unsigned MaxElts = ST->hasAVX2() ? 32 : 16;
4659 unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts;
4660 return MOVMSKCost;
4661 }
4662
4663 if (LT.second.isVector()) {
4664 unsigned NumLegalElts =
4665 LT.second.getVectorNumElements() * NumLegalVectors;
4666 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
4667 "Vector has been legalized to smaller element count");
4668
4669 // If we're extracting elements from a 128-bit subvector lane,
4670 // we only need to extract each lane once, not for every element.
4671 if (LegalVectorBitWidth > LaneBitWidth) {
4672 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
4673 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
4674 assert((NumLegalElts % NumLanesTotal) == 0 &&
4675 "Unexpected elts per lane");
4676 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
4677
4678 // Add cost for each demanded 128-bit subvector extraction.
4679 // Luckily this is a lot easier than for insertion.
4680 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
4681 auto *LaneTy =
4682 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
4683
4684 for (unsigned I = 0; I != NumLanesTotal; ++I) {
4685 APInt LaneEltMask = WidenedDemandedElts.extractBits(
4686 NumEltsPerLane, I * NumEltsPerLane);
4687 if (LaneEltMask.isZero())
4688 continue;
4689 Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
4690 CostKind, I * NumEltsPerLane, LaneTy);
4691 Cost += BaseT::getScalarizationOverhead(
4692 LaneTy, LaneEltMask, /*Insert*/ false, Extract, CostKind);
4693 }
4694
4695 return Cost;
4696 }
4697 }
4698
4699 // Fallback to default extraction.
4700 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ false,
4701 Extract, CostKind);
4702 }
4703
4704 return Cost;
4705}
4706 
4707 InstructionCost
4708 X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
4709 int VF, const APInt &DemandedDstElts,
4710 TTI::TargetCostKind CostKind) {
4711 const unsigned EltTyBits = DL.getTypeSizeInBits(EltTy);
4712 // We don't differentiate element types here, only element bit width.
4713 EltTy = IntegerType::getIntNTy(EltTy->getContext(), EltTyBits);
4714
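// A replication shuffle repeats each of the VF source elements
// ReplicationFactor times; e.g. VF = 4 with ReplicationFactor = 2 yields the
// mask <0,0,1,1,2,2,3,3>.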
4715 auto bailout = [&]() {
4716 return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF,
4717 DemandedDstElts, CostKind);
4718 };
4719
4720 // For now, only deal with AVX512 cases.
4721 if (!ST->hasAVX512())
4722 return bailout();
4723
4724 // Do we have a native shuffle for this element type, or should we promote?
4725 unsigned PromEltTyBits = EltTyBits;
4726 switch (EltTyBits) {
4727 case 32:
4728 case 64:
4729 break; // AVX512F.
4730 case 16:
4731 if (!ST->hasBWI())
4732 PromEltTyBits = 32; // promote to i32, AVX512F.
4733 break; // AVX512BW
4734 case 8:
4735 if (!ST->hasVBMI())
4736 PromEltTyBits = 32; // promote to i32, AVX512F.
4737 break; // AVX512VBMI
4738 case 1:
4739 // There is no support for shuffling i1 elements. We *must* promote.
4740 if (ST->hasBWI()) {
4741 if (ST->hasVBMI())
4742 PromEltTyBits = 8; // promote to i8, AVX512VBMI.
4743 else
4744 PromEltTyBits = 16; // promote to i16, AVX512BW.
4745 break;
4746 }
4747 PromEltTyBits = 32; // promote to i32, AVX512F.
4748 break;
4749 default:
4750 return bailout();
4751 }
4752 auto *PromEltTy = IntegerType::getIntNTy(EltTy->getContext(), PromEltTyBits);
4753
4754 auto *SrcVecTy = FixedVectorType::get(EltTy, VF);
4755 auto *PromSrcVecTy = FixedVectorType::get(PromEltTy, VF);
4756
4757 int NumDstElements = VF * ReplicationFactor;
4758 auto *PromDstVecTy = FixedVectorType::get(PromEltTy, NumDstElements);
4759 auto *DstVecTy = FixedVectorType::get(EltTy, NumDstElements);
4760
4761 // Legalize the types.
4762 MVT LegalSrcVecTy = getTypeLegalizationCost(SrcVecTy).second;
4763 MVT LegalPromSrcVecTy = getTypeLegalizationCost(PromSrcVecTy).second;
4764 MVT LegalPromDstVecTy = getTypeLegalizationCost(PromDstVecTy).second;
4765 MVT LegalDstVecTy = getTypeLegalizationCost(DstVecTy).second;
4766 // They should have legalized into vector types.
4767 if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() ||
4768 !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector())
4769 return bailout();
4770
4771 if (PromEltTyBits != EltTyBits) {
4772 // If we have to perform the shuffle with wider elt type than our data type,
4773 // then we will first need to anyext (we don't care about the new bits)
4774 // the source elements, and then truncate Dst elements.
4775 InstructionCost PromotionCost;
4776 PromotionCost += getCastInstrCost(
4777 Instruction::SExt, /*Dst=*/PromSrcVecTy, /*Src=*/SrcVecTy,
4778 TTI::CastContextHint::None, CostKind);
4779 PromotionCost +=
4780 getCastInstrCost(Instruction::Trunc, /*Dst=*/DstVecTy,
4781 /*Src=*/PromDstVecTy,
4782 TTI::CastContextHint::None, CostKind);
4783 return PromotionCost + getReplicationShuffleCost(PromEltTy,
4784 ReplicationFactor, VF,
4785 DemandedDstElts, CostKind);
4786 }
4787
4788 assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits &&
4789 LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() &&
4790 "We expect that the legalization doesn't affect the element width, "
4791 "doesn't coalesce/split elements.");
4792
4793 unsigned NumEltsPerDstVec = LegalDstVecTy.getVectorNumElements();
4794 unsigned NumDstVectors =
4795 divideCeil(DstVecTy->getNumElements(), NumEltsPerDstVec);
4796
4797 auto *SingleDstVecTy = FixedVectorType::get(EltTy, NumEltsPerDstVec);
4798
4799 // Not all the produced Dst elements may be demanded. In our case,
4800 // given that a single Dst vector is formed by a single shuffle,
4801 // if all elements that will form a single Dst vector aren't demanded,
4802 // then we won't need to do that shuffle, so adjust the cost accordingly.
4803 APInt DemandedDstVectors = APIntOps::ScaleBitMask(
4804 DemandedDstElts.zext(NumDstVectors * NumEltsPerDstVec), NumDstVectors);
4805 unsigned NumDstVectorsDemanded = DemandedDstVectors.popcount();
4806
4807 InstructionCost SingleShuffleCost = getShuffleCost(
4808 TTI::SK_PermuteSingleSrc, SingleDstVecTy, /*Mask=*/std::nullopt, CostKind,
4809 /*Index=*/0, /*SubTp=*/nullptr);
4810 return NumDstVectorsDemanded * SingleShuffleCost;
4811}
4812
4813 InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
4814 MaybeAlign Alignment,
4815 unsigned AddressSpace,
4816 TTI::TargetCostKind CostKind,
4817 TTI::OperandValueInfo OpInfo,
4818 const Instruction *I) {
4819 // TODO: Handle other cost kinds.
4820 if (CostKind != TTI::TCK_RecipThroughput) {
4821 if (auto *SI = dyn_cast_or_null<StoreInst>(I)) {
4822 // Store instruction with index and scale costs 2 Uops.
4823 // Check the preceding GEP to identify non-const indices.
4824 if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) {
4825 if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
4826 return TTI::TCC_Basic * 2;
4827 }
4828 }
4829 return TTI::TCC_Basic;
4830 }
4831
4832 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
4833 "Invalid Opcode");
4834 // Type legalization can't handle structs
4835 if (TLI->getValueType(DL, Src, true) == MVT::Other)
4836 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
4837 CostKind);
4838
4839 // Legalize the type.
4840 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
4841
4842 auto *VTy = dyn_cast<FixedVectorType>(Src);
4843
4844 InstructionCost Cost = 0;
4845 
4846 // Add a cost for constant load to vector.
4847 if (Opcode == Instruction::Store && OpInfo.isConstant())
4848 Cost += getMemoryOpCost(Instruction::Load, Src, DL.getABITypeAlign(Src),
4849 /*AddressSpace=*/0, CostKind);
4850
4851 // Handle the simple case of non-vectors.
4852 // NOTE: this assumes that legalization never creates a vector from scalars!
4853 if (!VTy || !LT.second.isVector()) {
4854 // Each load/store unit costs 1.
4855 return (LT.second.isFloatingPoint() ? Cost : 0) + LT.first * 1;
4856 }
4857
4858 bool IsLoad = Opcode == Instruction::Load;
4859
4860 Type *EltTy = VTy->getElementType();
4861
4862 const int EltTyBits = DL.getTypeSizeInBits(EltTy);
4863
4864 // Source of truth: how many elements were there in the original IR vector?
4865 const unsigned SrcNumElt = VTy->getNumElements();
4866
4867 // How far have we gotten?
4868 int NumEltRemaining = SrcNumElt;
4869 // Note that we intentionally capture by-reference, NumEltRemaining changes.
4870 auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; };
4871
4872 const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8);
4873
4874 // Note that even if we can store 64 bits of an XMM, we still operate on XMM.
4875 const unsigned XMMBits = 128;
4876 if (XMMBits % EltTyBits != 0)
4877 // Vector size must be a multiple of the element size. I.e. no padding.
4878 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
4879 CostKind);
4880 const int NumEltPerXMM = XMMBits / EltTyBits;
4881
4882 auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM);
4883
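// Walk from the widest legal operation width down to smaller power-of-two
// widths, charging one memory op per chunk plus any subvector insert/extract
// needed to stitch chunks into (or out of) the legalized registers.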
4884 for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0;
4885 NumEltRemaining > 0; CurrOpSizeBytes /= 2) {
4886 // How many elements would a single op deal with at once?
4887 if ((8 * CurrOpSizeBytes) % EltTyBits != 0)
4888 // Vector size must be a multiple of the element size. I.e. no padding.
4889 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
4890 CostKind);
4891 int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits;
4892
4893 assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?");
4894 assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||
4895 (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&
4896 "Unless we haven't halved the op size yet, "
4897 "we have less than two op's sized units of work left.");
4898
4899 auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM
4900 ? FixedVectorType::get(EltTy, CurrNumEltPerOp)
4901 : XMMVecTy;
4902
4903 assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&
4904 "After halving sizes, the vector elt count is no longer a multiple "
4905 "of number of elements per operation?");
4906 auto *CoalescedVecTy =
4907 CurrNumEltPerOp == 1
4908 ? CurrVecTy
4909 : FixedVectorType::get(
4910 IntegerType::get(Src->getContext(),
4911 EltTyBits * CurrNumEltPerOp),
4912 CurrVecTy->getNumElements() / CurrNumEltPerOp);
4913 assert(DL.getTypeSizeInBits(CoalescedVecTy) ==
4914 DL.getTypeSizeInBits(CurrVecTy) &&
4915 "coalescing elements doesn't change vector width.");
4916
4917 while (NumEltRemaining > 0) {
4918 assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumption?");
4919
4920 // Can we use this vector size, as per the remaining element count?
4921 // Iff the vector is naturally aligned, we can do a wide load regardless.
4922 if (NumEltRemaining < CurrNumEltPerOp &&
4923 (!IsLoad || Alignment.valueOrOne() < CurrOpSizeBytes) &&
4924 CurrOpSizeBytes != 1)
4925 break; // Try a smaller vector size.
4926
4927 bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;
4928
4929 // If we have fully processed the previous reg, we need to replenish it.
4930 if (SubVecEltsLeft == 0) {
4931 SubVecEltsLeft += CurrVecTy->getNumElements();
4932 // And that's free only for the 0'th subvector of a legalized vector.
4933 if (!Is0thSubVec)
4934 Cost += getShuffleCost(IsLoad ? TTI::ShuffleKind::SK_InsertSubvector
4935 : TTI::ShuffleKind::SK_ExtractSubvector,
4936 VTy, std::nullopt, CostKind, NumEltDone(),
4937 CurrVecTy);
4938 }
4939
4940 // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
4941 // for smaller widths (32/16/8) we have to insert/extract them separately.
4942 // Again, it's free for the 0'th subreg (if op is 32/64 bit wide,
4943 // but let's pretend that it is also true for 16/8 bit wide ops...)
4944 if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) {
4945 int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM;
4946 assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "");
4947 int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp;
4948 APInt DemandedElts =
4949 APInt::getBitsSet(CoalescedVecTy->getNumElements(),
4950 CoalescedVecEltIdx, CoalescedVecEltIdx + 1);
4951 assert(DemandedElts.popcount() == 1 && "Inserting single value");
4952 Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad,
4953 !IsLoad, CostKind);
4954 }
4955
4956 // This isn't exactly right. We're using slow unaligned 32-byte accesses
4957 // as a proxy for a double-pumped AVX memory interface such as on
4958 // Sandybridge.
4959 // Sub-32-bit loads/stores will be slower, either via PINSR*/PEXTR* or
4960 // via scalarization.
4961 if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
4962 Cost += 2;
4963 else if (CurrOpSizeBytes < 4)
4964 Cost += 2;
4965 else
4966 Cost += 1;
4967
4968 SubVecEltsLeft -= CurrNumEltPerOp;
4969 NumEltRemaining -= CurrNumEltPerOp;
4970 Alignment = commonAlignment(Alignment.valueOrOne(), CurrOpSizeBytes);
4971 }
4972 }
4973
4974 assert(NumEltRemaining <= 0 && "Should have processed all the elements.");
4975
4976 return Cost;
4977}
4978
4980X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment,
4981 unsigned AddressSpace,
4983 bool IsLoad = (Instruction::Load == Opcode);
4984 bool IsStore = (Instruction::Store == Opcode);
4985
4986 auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy);
4987 if (!SrcVTy)
4988 // To calculate scalar take the regular cost, without mask
4989 return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind);
4990
4991 unsigned NumElem = SrcVTy->getNumElements();
4992 auto *MaskTy =
4993 FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
4994 if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) ||
4995 (IsStore && !isLegalMaskedStore(SrcVTy, Alignment))) {
4996 // Scalarization
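// Model a fully scalarized masked op: for each element, extract the mask
// bit, compare and branch on it, and issue a scalar load/store, plus the
// cost of unpacking the value vector itself.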
4997 APInt DemandedElts = APInt::getAllOnes(NumElem);
4998 InstructionCost MaskSplitCost = getScalarizationOverhead(
4999 MaskTy, DemandedElts, /*Insert*/ false, /*Extract*/ true, CostKind);
5000 InstructionCost ScalarCompareCost = getCmpSelInstrCost(
5001 Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
5002 CmpInst::BAD_ICMP_PREDICATE, CostKind);
5003 InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
5004 InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
5005 InstructionCost ValueSplitCost = getScalarizationOverhead(
5006 SrcVTy, DemandedElts, IsLoad, IsStore, CostKind);
5007 InstructionCost MemopCost =
5008 NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
5009 Alignment, AddressSpace, CostKind);
5010 return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
5011 }
5012
5013 // Legalize the type.
5014 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcVTy);
5015 auto VT = TLI->getValueType(DL, SrcVTy);
5016 InstructionCost Cost = 0;
5017 if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
5018 LT.second.getVectorNumElements() == NumElem)
5019 // Promotion requires extend/truncate for data and a shuffle for mask.
5020 Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, std::nullopt,
5021 CostKind, 0, nullptr) +
5022 getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, std::nullopt,
5023 CostKind, 0, nullptr);
5024
5025 else if (LT.first * LT.second.getVectorNumElements() > NumElem) {
5026 auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
5027 LT.second.getVectorNumElements());
5028 // Expanding requires filling the mask with zeroes.
5029 Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, std::nullopt,
5030 CostKind, 0, MaskTy);
5031 }
5032
5033 // Pre-AVX512 - each maskmov load costs 2 + store costs ~8.
5034 if (!ST->hasAVX512())
5035 return Cost + LT.first * (IsLoad ? 2 : 8);
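// For example (illustrative, not part of the upstream comment): a masked load
// of a type that legalizes to two registers (LT.first == 2) is costed at
// Cost + 4 by the pre-AVX-512 return above, and at Cost + 2 by the AVX-512
// return below.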
5036
5037 // AVX-512 masked load/store is cheaper
5038 return Cost + LT.first;
5039}
5040
5041 InstructionCost
5042 X86TTIImpl::getPointersChainCost(ArrayRef<const Value *> Ptrs,
5043 const Value *Base,
5044 const TTI::PointersChainInfo &Info,
5045 Type *AccessTy, TTI::TargetCostKind CostKind) {
5046 if (Info.isSameBase() && Info.isKnownStride()) {
5047 // If all the pointers have known stride all the differences are translated
5048 // into constants. X86 memory addressing allows encoding it into
5049 // displacement. So we just need to take the base GEP cost.
5050 if (const auto *BaseGEP = dyn_cast<GetElementPtrInst>(Base)) {
5051 SmallVector<const Value *> Indices(BaseGEP->indices());
5052 return getGEPCost(BaseGEP->getSourceElementType(),
5053 BaseGEP->getPointerOperand(), Indices, nullptr,
5054 CostKind);
5055 }
5056 return TTI::TCC_Free;
5057 }
5058 return BaseT::getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind);
5059}
5060
5061 InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty,
5062 ScalarEvolution *SE,
5063 const SCEV *Ptr) {
5064 // Address computations in vectorized code with non-consecutive addresses will
5065 // likely result in more instructions compared to scalar code where the
5066 // computation can more often be merged into the index mode. The resulting
5067 // extra micro-ops can significantly decrease throughput.
5068 const unsigned NumVectorInstToHideOverhead = 10;
5069
5070 // Cost modeling of Strided Access Computation is hidden by the indexing
5071 // modes of X86 regardless of the stride value. We don't believe that there
5072 // is a difference between constant strided access in general and constant
5073 // strided value which is less than or equal to 64.
5074 // Even in the case of (loop invariant) stride whose value is not known at
5075 // compile time, the address computation will not incur more than one extra
5076 // ADD instruction.
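// For example (illustrative): without AVX2, a vector address computation that
// cannot be proven strided is costed at NumVectorInstToHideOverhead (10) to
// discourage vectorization, while a strided access whose step is only known
// at runtime is costed as a single extra ADD.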
5077 if (Ty->isVectorTy() && SE && !ST->hasAVX2()) {
5078 // TODO: AVX2 is the current cut-off because we don't have correct
5079 // interleaving costs for prior ISA's.
5080 if (!BaseT::isStridedAccess(Ptr))
5081 return NumVectorInstToHideOverhead;
5082 if (!BaseT::getConstantStrideStep(SE, Ptr))
5083 return 1;
5084 }
5085
5086 return BaseT::getAddressComputationCost(Ty, SE, Ptr);
5087}
5088
5089 InstructionCost
5090 X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
5091 std::optional<FastMathFlags> FMF,
5092 TTI::TargetCostKind CostKind) {
5093 if (TTI::requiresOrderedReduction(FMF))
5094 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5095
5096 // We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
5097 // and use that as the cost.
5098
5099 static const CostTblEntry SLMCostTbl[] = {
5100 { ISD::FADD, MVT::v2f64, 3 },
5101 { ISD::ADD, MVT::v2i64, 5 },
5102 };
5103
5104 static const CostTblEntry SSE2CostTbl[] = {
5105 { ISD::FADD, MVT::v2f64, 2 },
5106 { ISD::FADD, MVT::v2f32, 2 },
5107 { ISD::FADD, MVT::v4f32, 4 },
5108 { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
5109 { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
5110 { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
5111 { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3".
5112 { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3".
5113 { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
5114 { ISD::ADD, MVT::v2i8, 2 },
5115 { ISD::ADD, MVT::v4i8, 2 },
5116 { ISD::ADD, MVT::v8i8, 2 },
5117 { ISD::ADD, MVT::v16i8, 3 },
5118 };
5119
5120 static const CostTblEntry AVX1CostTbl[] = {
5121 { ISD::FADD, MVT::v4f64, 3 },
5122 { ISD::FADD, MVT::v4f32, 3 },
5123 { ISD::FADD, MVT::v8f32, 4 },
5124 { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
5125 { ISD::ADD, MVT::v4i64, 3 },
5126 { ISD::ADD, MVT::v8i32, 5 },
5127 { ISD::ADD, MVT::v16i16, 5 },
5128 { ISD::ADD, MVT::v32i8, 4 },
5129 };
5130
5131 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5132 assert(ISD && "Invalid opcode");
5133
5134 // Before legalizing the type, give a chance to look up illegal narrow types
5135 // in the table.
5136 // FIXME: Is there a better way to do this?
5137 EVT VT = TLI->getValueType(DL, ValTy);
5138 if (VT.isSimple()) {
5139 MVT MTy = VT.getSimpleVT();
5140 if (ST->useSLMArithCosts())
5141 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5142 return Entry->Cost;
5143
5144 if (ST->hasAVX())
5145 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5146 return Entry->Cost;
5147
5148 if (ST->hasSSE2())
5149 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5150 return Entry->Cost;
5151 }
5152
5153 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5154
5155 MVT MTy = LT.second;
5156
5157 auto *ValVTy = cast<FixedVectorType>(ValTy);
5158
5159 // Special case: vXi8 mul reductions are performed as vXi16.
5160 if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) {
5161 auto *WideSclTy = IntegerType::get(ValVTy->getContext(), 16);
5162 auto *WideVecTy = FixedVectorType::get(WideSclTy, ValVTy->getNumElements());
5163 return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy,
5164 TargetTransformInfo::CastContextHint::None,
5165 CostKind) +
5166 getArithmeticReductionCost(Opcode, WideVecTy, FMF, CostKind);
5167 }
5168
5169 InstructionCost ArithmeticCost = 0;
5170 if (LT.first != 1 && MTy.isVector() &&
5171 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5172 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5173 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5174 MTy.getVectorNumElements());
5175 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5176 ArithmeticCost *= LT.first - 1;
5177 }
5178
5179 if (ST->useSLMArithCosts())
5180 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5181 return ArithmeticCost + Entry->Cost;
5182
5183 if (ST->hasAVX())
5184 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5185 return ArithmeticCost + Entry->Cost;
5186
5187 if (ST->hasSSE2())
5188 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5189 return ArithmeticCost + Entry->Cost;
5190
5191 // FIXME: These assume a naive kshift+binop lowering, which is probably
5192 // conservative in most cases.
5193 static const CostTblEntry AVX512BoolReduction[] = {
5194 { ISD::AND, MVT::v2i1, 3 },
5195 { ISD::AND, MVT::v4i1, 5 },
5196 { ISD::AND, MVT::v8i1, 7 },
5197 { ISD::AND, MVT::v16i1, 9 },
5198 { ISD::AND, MVT::v32i1, 11 },
5199 { ISD::AND, MVT::v64i1, 13 },
5200 { ISD::OR, MVT::v2i1, 3 },
5201 { ISD::OR, MVT::v4i1, 5 },
5202 { ISD::OR, MVT::v8i1, 7 },
5203 { ISD::OR, MVT::v16i1, 9 },
5204 { ISD::OR, MVT::v32i1, 11 },
5205 { ISD::OR, MVT::v64i1, 13 },
5206 };
5207
5208 static const CostTblEntry AVX2BoolReduction[] = {
5209 { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp
5210 { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp
5211 { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp
5212 { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp
5213 };
5214
5215 static const CostTblEntry AVX1BoolReduction[] = {
5216 { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp
5217 { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp
5218 { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5219 { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5220 { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp
5221 { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp
5222 { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5223 { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5224 };
5225
5226 static const CostTblEntry SSE2BoolReduction[] = {
5227 { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp
5228 { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp
5229 { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp
5230 { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp
5231 { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp
5232 { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp
5233 { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp
5234 { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp
5235 };
5236
5237 // Handle bool allof/anyof patterns.
5238 if (ValVTy->getElementType()->isIntegerTy(1)) {
5239 InstructionCost ArithmeticCost = 0;
5240 if (LT.first != 1 && MTy.isVector() &&
5241 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5242 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5243 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5244 MTy.getVectorNumElements());
5245 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5246 ArithmeticCost *= LT.first - 1;
5247 }
5248
5249 if (ST->hasAVX512())
5250 if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy))
5251 return ArithmeticCost + Entry->Cost;
5252 if (ST->hasAVX2())
5253 if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
5254 return ArithmeticCost + Entry->Cost;
5255 if (ST->hasAVX())
5256 if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
5257 return ArithmeticCost + Entry->Cost;
5258 if (ST->hasSSE2())
5259 if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
5260 return ArithmeticCost + Entry->Cost;
5261
5262 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5263 }
5264
5265 unsigned NumVecElts = ValVTy->getNumElements();
5266 unsigned ScalarSize = ValVTy->getScalarSizeInBits();
5267
5268 // Special case power of 2 reductions where the scalar type isn't changed
5269 // by type legalization.
5270 if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
5271 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5272
5273 InstructionCost ReductionCost = 0;
5274
5275 auto *Ty = ValVTy;
5276 if (LT.first != 1 && MTy.isVector() &&
5277 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5278 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5279 Ty = FixedVectorType::get(ValVTy->getElementType(),
5280 MTy.getVectorNumElements());
5281 ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5282 ReductionCost *= LT.first - 1;
5283 NumVecElts = MTy.getVectorNumElements();
5284 }
5285
5286 // Now handle reduction with the legal type, taking into account size changes
5287 // at each level.
5288 while (NumVecElts > 1) {
5289 // Determine the size of the remaining vector we need to reduce.
5290 unsigned Size = NumVecElts * ScalarSize;
5291 NumVecElts /= 2;
5292 // If we're reducing from 256/512 bits, use an extract_subvector.
5293 if (Size > 128) {
5294 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5295 ReductionCost +=
5296 getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt, CostKind,
5297 NumVecElts, SubTy);
5298 Ty = SubTy;
5299 } else if (Size == 128) {
5300 // Reducing from 128 bits is a permute of v2f64/v2i64.
5301 FixedVectorType *ShufTy;
5302 if (ValVTy->isFloatingPointTy())
5303 ShufTy =
5304 FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2);
5305 else
5306 ShufTy =
5307 FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
5308 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5309 std::nullopt, CostKind, 0, nullptr);
5310 } else if (Size == 64) {
5311 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5312 FixedVectorType *ShufTy;
5313 if (ValVTy->isFloatingPointTy())
5314 ShufTy =
5315 FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4);
5316 else
5317 ShufTy =
5318 FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
5319 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5320 std::nullopt, CostKind, 0, nullptr);
5321 } else {
5322 // Reducing from smaller size is a shift by immediate.
5323 auto *ShiftTy = FixedVectorType::get(
5324 Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size);
5325 ReductionCost += getArithmeticInstrCost(
5326 Instruction::LShr, ShiftTy, CostKind,
5327 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
5328 {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None});
5329 }
5330
5331 // Add the arithmetic op for this level.
5332 ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind);
5333 }
5334
5335 // Add the final extract element to the cost.
5336 return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5337 CostKind, 0, nullptr, nullptr);
5338}
5339
5340 InstructionCost X86TTIImpl::getMinMaxCost(Intrinsic::ID IID, Type *Ty,
5341 TTI::TargetCostKind CostKind,
5342 FastMathFlags FMF) {
5343 IntrinsicCostAttributes ICA(IID, Ty, {Ty, Ty}, FMF);
5344 return getIntrinsicInstrCost(ICA, CostKind);
5345}
5346
5347 InstructionCost
5348 X86TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *ValTy,
5349 FastMathFlags FMF,
5350 TTI::TargetCostKind CostKind) {
5351 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5352
5353 MVT MTy = LT.second;
5354
5355 int ISD;
5356 if (ValTy->isIntOrIntVectorTy()) {
5357 ISD = (IID == Intrinsic::umin || IID == Intrinsic::umax) ? ISD::UMIN
5358 : ISD::SMIN;
5359 } else {
5360 assert(ValTy->isFPOrFPVectorTy() &&
5361 "Expected floating point or integer vector type.");
5362 ISD = (IID == Intrinsic::minnum || IID == Intrinsic::maxnum)
5363 ? ISD::FMINNUM
5364 : ISD::FMINIMUM;
5365 }
5366
5367 // We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
5368 // and use that as the cost.
5369
5370 static const CostTblEntry SSE2CostTbl[] = {
5371 {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw
5372 {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw
5373 {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw
5374 };
5375
5376 static const CostTblEntry SSE41CostTbl[] = {
5377 {ISD::SMIN, MVT::v2i16, 3}, // same as sse2
5378 {ISD::SMIN, MVT::v4i16, 5}, // same as sse2
5379 {ISD::UMIN, MVT::v2i16, 5}, // same as sse2
5380 {ISD::UMIN, MVT::v4i16, 7}, // same as sse2
5381 {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor
5382 {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax
5383 {ISD::SMIN, MVT::v2i8, 3}, // pminsb
5384 {ISD::SMIN, MVT::v4i8, 5}, // pminsb
5385 {ISD::SMIN, MVT::v8i8, 7}, // pminsb
5386 {ISD::SMIN, MVT::v16i8, 6},
5387 {ISD::UMIN, MVT::v2i8, 3}, // same as sse2
5388 {ISD::UMIN, MVT::v4i8, 5}, // same as sse2
5389 {ISD::UMIN, MVT::v8i8, 7}, // same as sse2
5390 {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax
5391 };
5392
5393 static const CostTblEntry AVX1CostTbl[] = {
5394 {ISD::SMIN, MVT::v16i16, 6},
5395 {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax
5396 {ISD::SMIN, MVT::v32i8, 8},
5397 {ISD::UMIN, MVT::v32i8, 8},
5398 };
5399
5400 static const CostTblEntry AVX512BWCostTbl[] = {
5401 {ISD::SMIN, MVT::v32i16, 8},
5402 {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax
5403 {ISD::SMIN, MVT::v64i8, 10},
5404 {ISD::UMIN, MVT::v64i8, 10},
5405 };
5406
5407 // Before legalizing the type, give a chance to look up illegal narrow types
5408 // in the table.
5409 // FIXME: Is there a better way to do this?
5410 EVT VT = TLI->getValueType(DL, ValTy);
5411 if (VT.isSimple()) {
5412 MVT MTy = VT.getSimpleVT();
5413 if (ST->hasBWI())
5414 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5415 return Entry->Cost;
5416
5417 if (ST->hasAVX())
5418 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5419 return Entry->Cost;
5420
5421 if (ST->hasSSE41())
5422 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5423 return Entry->Cost;
5424
5425 if (ST->hasSSE2())
5426 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5427 return Entry->Cost;
5428 }
5429
5430 auto *ValVTy = cast<FixedVectorType>(ValTy);
5431 unsigned NumVecElts = ValVTy->getNumElements();
5432
5433 auto *Ty = ValVTy;
5434 InstructionCost MinMaxCost = 0;
5435 if (LT.first != 1 && MTy.isVector() &&
5436 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5437 // Type needs to be split. We need LT.first - 1 operations.
5438 Ty = FixedVectorType::get(ValVTy->getElementType(),
5439 MTy.getVectorNumElements());
5440 MinMaxCost = getMinMaxCost(IID, Ty, CostKind, FMF);
5441 MinMaxCost *= LT.first - 1;
5442 NumVecElts = MTy.getVectorNumElements();
5443 }
5444
5445 if (ST->hasBWI())
5446 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5447 return MinMaxCost + Entry->Cost;
5448
5449 if (ST->hasAVX())
5450 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5451 return MinMaxCost + Entry->Cost;
5452
5453 if (ST->hasSSE41())
5454 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5455 return MinMaxCost + Entry->Cost;
5456
5457 if (ST->hasSSE2())
5458 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5459 return MinMaxCost + Entry->Cost;
5460
5461 unsigned ScalarSize = ValTy->getScalarSizeInBits();
5462
5463 // Special case power of 2 reductions where the scalar type isn't changed
5464 // by type legalization.
5465 if (!isPowerOf2_32(ValVTy->getNumElements()) ||
5466 ScalarSize != MTy.getScalarSizeInBits())
5467 return BaseT::getMinMaxReductionCost(IID, ValTy, FMF, CostKind);
5468
5469 // Now handle reduction with the legal type, taking into account size changes
5470 // at each level.
5471 while (NumVecElts > 1) {
5472 // Determine the size of the remaining vector we need to reduce.
5473 unsigned Size = NumVecElts * ScalarSize;
5474 NumVecElts /= 2;
5475 // If we're reducing from 256/512 bits, use an extract_subvector.
5476 if (Size > 128) {
5477 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5478 MinMaxCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
5479 CostKind, NumVecElts, SubTy);
5480 Ty = SubTy;
5481 } else if (Size == 128) {
5482 // Reducing from 128 bits is a permute of v2f64/v2i64.
5483 VectorType *ShufTy;
5484 if (ValTy->isFloatingPointTy())
5485 ShufTy =
5486 FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2);
5487 else
5488 ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
5489 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5490 std::nullopt, CostKind, 0, nullptr);
5491 } else if (Size == 64) {
5492 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5493 FixedVectorType *ShufTy;
5494 if (ValTy->isFloatingPointTy())
5495 ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4);
5496 else
5497 ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
5498 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5499 std::nullopt, CostKind, 0, nullptr);
5500 } else {
5501 // Reducing from smaller size is a shift by immediate.
5502 auto *ShiftTy = FixedVectorType::get(
5503 Type::getIntNTy(ValTy->getContext(), Size), 128 / Size);
5504 MinMaxCost += getArithmeticInstrCost(
5505 Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput,
5506 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
5507 {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None});
5508 }
5509
5510 // Add the arithmetic op for this level.
5511 MinMaxCost += getMinMaxCost(IID, Ty, CostKind, FMF);
5512 }
5513
5514 // Add the final extract element to the cost.
5515 return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5516 CostKind, 0, nullptr, nullptr);
5517}
5518
5519/// Calculate the cost of materializing a 64-bit value. This helper
5520/// method might only calculate a fraction of a larger immediate. Therefore it
5521/// is valid to return a cost of ZERO.
5522 InstructionCost X86TTIImpl::getIntImmCost(int64_t Val) {
5523 if (Val == 0)
5524 return TTI::TCC_Free;
5525
5526 if (isInt<32>(Val))
5527 return TTI::TCC_Basic;
5528
5529 return 2 * TTI::TCC_Basic;
5530}
5531
5532 InstructionCost X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
5533 TTI::TargetCostKind CostKind) {
5534 assert(Ty->isIntegerTy());
5535
5536 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5537 if (BitSize == 0)
5538 return ~0U;
5539
5540 // Never hoist constants larger than 128bit, because this might lead to
5541 // incorrect code generation or assertions in codegen.
5542 // Fixme: Create a cost model for types larger than i128 once the codegen
5543 // issues have been fixed.
5544 if (BitSize > 128)
5545 return TTI::TCC_Free;
5546
5547 if (Imm == 0)
5548 return TTI::TCC_Free;
5549
5550 // Sign-extend all constants to a multiple of 64-bit.
5551 APInt ImmVal = Imm;
5552 if (BitSize % 64 != 0)
5553 ImmVal = Imm.sext(alignTo(BitSize, 64));
5554
5555 // Split the constant into 64-bit chunks and calculate the cost for each
5556 // chunk.
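// Illustrative example (not from the upstream comment): the i128 immediate
// 0x00000001'00000000'00000000'00000002 splits into two 64-bit chunks; the
// high chunk does not fit in 32 bits (2 * TCC_Basic) while the low chunk does
// (TCC_Basic), for a total cost of 3.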
5557 InstructionCost Cost = 0;
5558 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
5559 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
5560 int64_t Val = Tmp.getSExtValue();
5561 Cost += getIntImmCost(Val);
5562 }
5563 // We need at least one instruction to materialize the constant.
5564 return std::max<InstructionCost>(1, Cost);
5565}
5566
5567 InstructionCost X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
5568 const APInt &Imm, Type *Ty,
5569 TTI::TargetCostKind CostKind,
5570 Instruction *Inst) {
5571 assert(Ty->isIntegerTy());
5572
5573 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5574 // There is no cost model for constants with a bit size of 0. Return TCC_Free
5575 // here, so that constant hoisting will ignore this constant.
5576 if (BitSize == 0)
5577 return TTI::TCC_Free;
5578
5579 unsigned ImmIdx = ~0U;
5580 switch (Opcode) {
5581 default:
5582 return TTI::TCC_Free;
5583 case Instruction::GetElementPtr:
5584 // Always hoist the base address of a GetElementPtr. This prevents the
5585 // creation of new constants for every base constant that gets constant
5586 // folded with the offset.
5587 if (Idx == 0)
5588 return 2 * TTI::TCC_Basic;
5589 return TTI::TCC_Free;
5590 case Instruction::Store:
5591 ImmIdx = 0;
5592 break;
5593 case Instruction::ICmp:
5594 // This is an imperfect hack to prevent constant hoisting of
5595 // compares that might be trying to check if a 64-bit value fits in
5596 // 32-bits. The backend can optimize these cases using a right shift by 32.
5597 // Ideally we would check the compare predicate here. There are also other
5598 // similar immediates the backend can use shifts for.
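// For example (illustrative): in `icmp ult i64 %x, 4294967296` the immediate
// 0x100000000 is reported as TCC_Free below, so constant hoisting leaves it
// in place and the backend can turn the compare into a right shift by 32.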
5599 if (Idx == 1 && Imm.getBitWidth() == 64) {
5600 uint64_t ImmVal = Imm.getZExtValue();
5601 if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
5602 return TTI::TCC_Free;
5603 }
5604 ImmIdx = 1;
5605 break;
5606 case Instruction::And:
5607 // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
5608 // by using a 32-bit operation with implicit zero extension. Detect such
5609 // immediates here as the normal path expects bit 31 to be sign extended.
5610 if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.isIntN(32))
5611 return TTI::TCC_Free;
5612 ImmIdx = 1;
5613 break;
5614 case Instruction::Add:
5615 case Instruction::Sub:
5616 // For add/sub, we can use the opposite instruction for INT32_MIN.
5617 if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.getZExtValue() == 0x80000000)
5618 return TTI::TCC_Free;
5619 ImmIdx = 1;
5620 break;
5621 case Instruction::UDiv:
5622 case Instruction::SDiv:
5623 case Instruction::URem:
5624 case Instruction::SRem:
5625 // Division by constant is typically expanded later into a different
5626 // instruction sequence. This completely changes the constants.
5627 // Report them as "free" to stop ConstantHoist from marking them as opaque.
5628 return TTI::TCC_Free;
5629 case Instruction::Mul:
5630 case Instruction::Or:
5631 case Instruction::Xor:
5632 ImmIdx = 1;
5633 break;
5634 // Always return TCC_Free for the shift value of a shift instruction.
5635 case Instruction::Shl:
5636 case Instruction::LShr:
5637 case Instruction::AShr:
5638 if (Idx == 1)
5639 return TTI::TCC_Free;
5640 break;
5641 case Instruction::Trunc:
5642 case Instruction::ZExt:
5643 case Instruction::SExt:
5644 case Instruction::IntToPtr:
5645 case Instruction::PtrToInt:
5646 case Instruction::BitCast:
5647 case Instruction::PHI:
5648 case Instruction::Call:
5649 case Instruction::Select:
5650 case Instruction::Ret:
5651 case Instruction::Load:
5652 break;
5653 }
5654
5655 if (Idx == ImmIdx) {
5656 uint64_t NumConstants = divideCeil(BitSize, 64);
5657 InstructionCost Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
5658 return (Cost <= NumConstants * TTI::TCC_Basic)
5659 ? static_cast<int>(TTI::TCC_Free)
5660 : Cost;
5661 }
5662
5663 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
5664}
5665
5666 InstructionCost X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
5667 const APInt &Imm, Type *Ty,
5668 TTI::TargetCostKind CostKind) {
5669 assert(Ty->isIntegerTy());
5670
5671 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5672 // There is no cost model for constants with a bit size of 0. Return TCC_Free
5673 // here, so that constant hoisting will ignore this constant.
5674 if (BitSize == 0)
5675 return TTI::TCC_Free;
5676
5677 switch (IID) {
5678 default:
5679 return TTI::TCC_Free;
5680 case Intrinsic::sadd_with_overflow:
5681 case Intrinsic::uadd_with_overflow:
5682 case Intrinsic::ssub_with_overflow:
5683 case Intrinsic::usub_with_overflow:
5684 case Intrinsic::smul_with_overflow:
5685 case Intrinsic::umul_with_overflow:
5686 if ((Idx == 1) && Imm.getBitWidth() <= 64 && Imm.isSignedIntN(32))
5687 return TTI::TCC_Free;
5688 break;
5689 case Intrinsic::experimental_stackmap:
5690 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
5691 return TTI::TCC_Free;
5692 break;
5693 case Intrinsic::experimental_patchpoint_void:
5694 case Intrinsic::experimental_patchpoint:
5695 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
5696 return TTI::TCC_Free;
5697 break;
5698 }
5699 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
5700}
5701
5702 InstructionCost X86TTIImpl::getCFInstrCost(unsigned Opcode,
5703 TTI::TargetCostKind CostKind,
5704 const Instruction *I) {
5705 if (CostKind != TTI::TCK_RecipThroughput)
5706 return Opcode == Instruction::PHI ? 0 : 1;
5707 // Branches are assumed to be predicted.
5708 return 0;
5709}
5710
5711int X86TTIImpl::getGatherOverhead() const {
5712 // Some CPUs have more overhead for gather. The specified overhead is relative
5713 // to the Load operation. "2" is the number provided by Intel architects. This
5714 // parameter is used for cost estimation of Gather Op and comparison with
5715 // other alternatives.
5716 // TODO: Remove the explicit hasAVX512()? That would mean we would only
5717 // enable gather with a -march.
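// For example (illustrative): Skylake-AVX512, or an AVX2 CPU with the
// fast-gather tuning, models each gather as costing 2x a regular load,
// whereas other subtargets return 1024, which effectively disables gather
// vectorization for them.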
5718 if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather()))
5719 return 2;
5720
5721 return 1024;
5722}
5723
5724int X86TTIImpl::getScatterOverhead() const {
5725 if (ST->hasAVX512())
5726 return 2;
5727
5728 return 1024;
5729}
5730
5731// Return an average cost of Gather / Scatter instruction, maybe improved later.
5732// FIXME: Add TargetCostKind support.
5733InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode,
5734 TTI::TargetCostKind CostKind,
5735 Type *SrcVTy, const Value *Ptr,
5736 Align Alignment,
5737 unsigned AddressSpace) {
5738
5739 assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
5740 unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
5741
5742 // Try to reduce index size from 64 bit (default for GEP)
5743 // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
5744 // operation will use 16 x 64 indices which do not fit in a zmm and needs
5745 // to split. Also check that the base pointer is the same for all lanes,
5746 // and that there's at most one variable index.
5747 auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) {
5748 unsigned IndexSize = DL.getPointerSizeInBits();
5749 const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
5750 if (IndexSize < 64 || !GEP)
5751 return IndexSize;
5752
5753 unsigned NumOfVarIndices = 0;
5754 const Value *Ptrs = GEP->getPointerOperand();
5755 if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
5756 return IndexSize;
5757 for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I) {
5758 if (isa<Constant>(GEP->getOperand(I)))
5759 continue;
5760 Type *IndxTy = GEP->getOperand(I)->getType();
5761 if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy))
5762 IndxTy = IndexVTy->getElementType();
5763 if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
5764 !isa<SExtInst>(GEP->getOperand(I))) ||
5765 ++NumOfVarIndices > 1)
5766 return IndexSize; // 64
5767 }
5768 return (unsigned)32;
5769 };
5770
5771 // Trying to reduce IndexSize to 32 bits for vector 16.
5772 // By default the IndexSize is equal to pointer size.
5773 unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
5774 ? getIndexSizeInBits(Ptr, DL)
5775 : DL.getPointerSizeInBits();
5776
5777 auto *IndexVTy = FixedVectorType::get(
5778 IntegerType::get(SrcVTy->getContext(), IndexSize), VF);
5779 std::pair<InstructionCost, MVT> IdxsLT = getTypeLegalizationCost(IndexVTy);
5780 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(SrcVTy);
5781 InstructionCost::CostType SplitFactor =
5782 *std::max(IdxsLT.first, SrcLT.first).getValue();
5783 if (SplitFactor > 1) {
5784 // Handle splitting of vector of pointers
5785 auto *SplitSrcTy =
5786 FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
5787 return SplitFactor * getGSVectorCost(Opcode, CostKind, SplitSrcTy, Ptr,
5788 Alignment, AddressSpace);
5789 }
5790
5791 // The gather / scatter cost is given by Intel architects. It is a rough
5792 // number since we are looking at one instruction at a time.
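// For example (illustrative): a v8f32 gather on a fast-gather subtarget is
// modeled as GSOverhead (2) plus 8 scalar f32 load costs by the formula below.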
5793 const int GSOverhead = (Opcode == Instruction::Load)
5794 ? getGatherOverhead()
5795 : getScatterOverhead();
5796 return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
5797 MaybeAlign(Alignment), AddressSpace,
5798 CostKind);
5799}
5800
5801/// Return the cost of full scalarization of gather / scatter operation.
5802///
5803/// Opcode - Load or Store instruction.
5804/// SrcVTy - The type of the data vector that should be gathered or scattered.
5805/// VariableMask - The mask is non-constant at compile time.
5806/// Alignment - Alignment for one element.
5807/// AddressSpace - pointer[s] address space.
5808/// TODO: Remove this and use getCommonMaskedMemoryOpCost directly.
5809InstructionCost X86TTIImpl::getGSScalarCost(unsigned Opcode,
5810 TTI::TargetCostKind CostKind,
5811 Type *SrcVTy, bool VariableMask,
5812 Align Alignment,
5813 unsigned AddressSpace) {
5814 Type *ScalarTy = SrcVTy->getScalarType();
5815 unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
5816 APInt DemandedElts = APInt::getAllOnes(VF);
5817
5818 InstructionCost MaskUnpackCost = 0;
5819 if (VariableMask) {
5820 auto *MaskTy =
5821 FixedVectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
5822 MaskUnpackCost = getScalarizationOverhead(
5823 MaskTy, DemandedElts, /*Insert=*/false, /*Extract=*/true, CostKind);
5824 InstructionCost ScalarCompareCost = getCmpSelInstrCost(
5825 Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()), nullptr,
5826 CmpInst::BAD_ICMP_PREDICATE, CostKind);
5827 InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
5828 MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
5829 }
5830
5831 InstructionCost AddressUnpackCost = getScalarizationOverhead(
5832 FixedVectorType::get(PointerType::getUnqual(ScalarTy->getContext()), VF),
5833 DemandedElts, /*Insert=*/false, /*Extract=*/true, CostKind);
5834
5835 // The cost of the scalar loads/stores.
5836 InstructionCost MemoryOpCost =
5837 VF * getMemoryOpCost(Opcode, ScalarTy, MaybeAlign(Alignment),
5838 AddressSpace, CostKind);
5839
5840 // The cost of forming the vector from loaded scalars/
5841 // scalarizing the vector to perform scalar stores.
5842 InstructionCost InsertExtractCost = getScalarizationOverhead(
5843 cast<FixedVectorType>(SrcVTy), DemandedElts,
5844 /*Insert=*/Opcode == Instruction::Load,
5845 /*Extract=*/Opcode == Instruction::Store, CostKind);
5846
5847 return AddressUnpackCost + MemoryOpCost + MaskUnpackCost + InsertExtractCost;
5848}
5849
5850/// Calculate the cost of Gather / Scatter operation
5851 InstructionCost X86TTIImpl::getGatherScatterOpCost(
5852 unsigned Opcode, Type *SrcVTy, const Value *Ptr, bool VariableMask,
5853 Align Alignment, TTI::TargetCostKind CostKind,
5854 const Instruction *I = nullptr) {
5855 if (CostKind != TTI::TCK_RecipThroughput) {
5856 if ((Opcode == Instruction::Load &&
5857 isLegalMaskedGather(SrcVTy, Align(Alignment)) &&
5858 !forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
5859 Align(Alignment))) ||
5860 (Opcode == Instruction::Store &&
5861 isLegalMaskedScatter(SrcVTy, Align(Alignment)) &&
5862 !forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
5863 Align(Alignment))))
5864 return 1;
5865 return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask,
5866 Alignment, CostKind, I);
5867 }
5868
5869 assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
5870 PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
5871 if (!PtrTy && Ptr->getType()->isVectorTy())
5872 PtrTy = dyn_cast<PointerType>(
5873 cast<VectorType>(Ptr->getType())->getElementType());
5874 assert(PtrTy && "Unexpected type for Ptr argument");
5875 unsigned AddressSpace = PtrTy->getAddressSpace();
5876
5877 if ((Opcode == Instruction::Load &&
5878 (!isLegalMaskedGather(SrcVTy, Align(Alignment)) ||
5879 forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
5880 Align(Alignment)))) ||
5881 (Opcode == Instruction::Store &&
5882 (!isLegalMaskedScatter(SrcVTy, Align(Alignment)) ||
5883 forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
5884 Align(Alignment)))))
5885 return getGSScalarCost(Opcode, CostKind, SrcVTy, VariableMask, Alignment,
5886 AddressSpace);
5887
5888 return getGSVectorCost(Opcode, CostKind, SrcVTy, Ptr, Alignment,
5889 AddressSpace);
5890}
5891
5892 bool X86TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
5893 const TargetTransformInfo::LSRCost &C2) {
5894 // X86 specific here are "instruction number 1st priority".
5895 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
5896 C1.NumIVMuls, C1.NumBaseAdds,
5897 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
5898 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
5899 C2.NumIVMuls, C2.NumBaseAdds,
5900 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
5901}
5902
5903 bool X86TTIImpl::canMacroFuseCmp() {
5904 return ST->hasMacroFusion() || ST->hasBranchFusion();
5905}
5906
5907bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
5908 if (!ST->hasAVX())
5909 return false;
5910
5911 // The backend can't handle a single element vector.
5912 if (isa<VectorType>(DataTy) &&
5913 cast<FixedVectorType>(DataTy)->getNumElements() == 1)
5914 return false;
5915 Type *ScalarTy = DataTy->getScalarType();
5916
5917 if (ScalarTy->isPointerTy())
5918 return true;
5919
5920 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
5921 return true;
5922
5923 if (ScalarTy->isHalfTy() && ST->hasBWI())
5924 return true;
5925
5926 if (ScalarTy->isBFloatTy() && ST->hasBF16())
5927 return true;
5928
5929 if (!ScalarTy->isIntegerTy())
5930 return false;
5931
5932 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
5933 return IntWidth == 32 || IntWidth == 64 ||
5934 ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
5935}
5936
5937bool X86TTIImpl::isLegalMaskedStore(Type *DataType, Align Alignment) {
5938 return isLegalMaskedLoad(DataType, Alignment);
5939}
5940
5941bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) {
5942 unsigned DataSize = DL.getTypeStoreSize(DataType);
5943 // The only supported nontemporal loads are for aligned vectors of 16 or 32
5944 // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2
5945 // (the equivalent stores only require AVX).
5946 if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
5947 return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2();
5948
5949 return false;
5950}
5951
5952bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) {
5953 unsigned DataSize = DL.getTypeStoreSize(DataType);
5954
5955 // SSE4A supports nontemporal stores of float and double at arbitrary
5956 // alignment.
5957 if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
5958 return true;
5959
5960 // Besides the SSE4A subtarget exception above, only aligned stores are
5961 // available nontemporally on any other subtarget. And only stores with a size
5962 // of 4..32 bytes (powers of 2, only) are permitted.
5963 if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
5964 !isPowerOf2_32(DataSize))
5965 return false;
5966
5967 // 32-byte vector nontemporal stores are supported by AVX (the equivalent
5968 // loads require AVX2).
5969 if (DataSize == 32)
5970 return ST->hasAVX();
5971 if (DataSize == 16)
5972 return ST->hasSSE1();
5973 return true;
5974}
5975
5976 bool X86TTIImpl::isLegalBroadcastLoad(Type *ElementTy,
5977 ElementCount NumElements) const {
5978 // movddup
5979 return ST->hasSSE3() && !NumElements.isScalable() &&
5980 NumElements.getFixedValue() == 2 &&
5981 ElementTy == Type::getDoubleTy(ElementTy->getContext());
5982}
5983
5984 bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy, Align Alignment) {
5985 if (!isa<VectorType>(DataTy))
5986 return false;
5987
5988 if (!ST->hasAVX512())
5989 return false;
5990
5991 // The backend can't handle a single element vector.
5992 if (cast<FixedVectorType>(DataTy)->getNumElements() == 1)
5993 return false;
5994
5995 Type *ScalarTy = cast<VectorType>(DataTy)->getElementType();
5996
5997 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
5998 return true;
5999
6000 if (!ScalarTy->isIntegerTy())
6001 return false;
6002
6003 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6004 return IntWidth == 32 || IntWidth == 64 ||
6005 ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
6006}
6007
6008 bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy, Align Alignment) {
6009 return isLegalMaskedExpandLoad(DataTy, Alignment);
6010}
6011
6012bool X86TTIImpl::supportsGather() const {
6013 // Some CPUs have better gather performance than others.
6014 // TODO: Remove the explicit ST->hasAVX512()? That would mean we would only
6015 // enable gather with a -march.
6016 return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2());
6017}
6018
6019 bool X86TTIImpl::forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) {
6020 // Gather / Scatter for vector 2 is not profitable on KNL / SKX
6021 // Vector-4 of gather/scatter instruction does not exist on KNL. We can extend
6022 // it to 8 elements, but zeroing upper bits of the mask vector will add more
6023 // instructions. Right now we give the scalar cost of vector-4 for KNL. TODO:
6024 // Check, maybe the gather/scatter instruction is better in the VariableMask
6025 // case.
6026 unsigned NumElts = cast<FixedVectorType>(VTy)->getNumElements();
6027 return NumElts == 1 ||
6028 (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX())));
6029}
6030
6031 bool X86TTIImpl::isLegalMaskedGatherScatter(Type *DataTy, Align Alignment) {
6032 Type *ScalarTy = DataTy->getScalarType();
6033 if (ScalarTy->isPointerTy())
6034 return true;
6035
6036 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6037 return true;
6038
6039 if (!ScalarTy->isIntegerTy())
6040 return false;
6041
6042 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6043 return IntWidth == 32 || IntWidth == 64;
6044}
6045
6046 bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
6047 if (!supportsGather() || !ST->preferGather())
6048 return false;
6049 return isLegalMaskedGatherScatter(DataTy, Alignment);
6050}
6051
6052bool X86TTIImpl::isLegalAltInstr(VectorType *VecTy, unsigned Opcode0,
6053 unsigned Opcode1,
6054 const SmallBitVector &OpcodeMask) const {
6055 // ADDSUBPS 4xf32 SSE3
6056 // VADDSUBPS 4xf32 AVX
6057 // VADDSUBPS 8xf32 AVX2
6058 // ADDSUBPD 2xf64 SSE3
6059 // VADDSUBPD 2xf64 AVX
6060 // VADDSUBPD 4xf64 AVX2
6061
6062 unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
6063 assert(OpcodeMask.size() == NumElements && "Mask and VecTy are incompatible");
6064 if (!isPowerOf2_32(NumElements))
6065 return false;
6066 // Check the opcode pattern. We apply the mask on the opcode arguments and
6067 // then check if it is what we expect.
6068 for (int Lane : seq<int>(0, NumElements)) {
6069 unsigned Opc = OpcodeMask.test(Lane) ? Opcode1 : Opcode0;
6070 // We expect FSub for even lanes and FAdd for odd lanes.
6071 if (Lane % 2 == 0 && Opc != Instruction::FSub)
6072 return false;
6073 if (Lane % 2 == 1 && Opc != Instruction::FAdd)
6074 return false;
6075 }
6076 // Now check that the pattern is supported by the target ISA.
6077 Type *ElemTy = cast<VectorType>(VecTy)->getElementType();
6078 if (ElemTy->isFloatTy())
6079 return ST->hasSSE3() && NumElements % 4 == 0;
6080 if (ElemTy->isDoubleTy())
6081 return ST->hasSSE3() && NumElements % 2 == 0;
6082 return false;
6083}
6084
6085bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) {
6086 // AVX2 doesn't support scatter
6087 if (!ST->hasAVX512() || !ST->preferScatter())
6088 return false;
6089 return isLegalMaskedGatherScatter(DataType, Alignment);
6090}
6091
6092bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
6093 EVT VT = TLI->getValueType(DL, DataType);
6094 return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
6095}
6096
6097 bool X86TTIImpl::isExpensiveToSpeculativelyExecute(const Instruction* I) {
6098 // FDIV is always expensive, even if it has a very low uop count.
6099 // TODO: Still necessary for recent CPUs with low latency/throughput fdiv?
6100 if (I->getOpcode() == Instruction::FDiv)
6101 return true;
6102
6103 return BaseT::isExpensiveToSpeculativelyExecute(I);
6104 }
6105
6106 bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
6107 return false;
6108}
6109
6110 bool X86TTIImpl::areInlineCompatible(const Function *Caller,
6111 const Function *Callee) const {
6112 const TargetMachine &TM = getTLI()->getTargetMachine();
6113
6114 // Work this as a subsetting of subtarget features.
6115 const FeatureBitset &CallerBits =
6116 TM.getSubtargetImpl(*Caller)->getFeatureBits();
6117 const FeatureBitset &CalleeBits =
6118 TM.getSubtargetImpl(*Callee)->getFeatureBits();
6119
6120 // Check whether features are the same (apart from the ignore list).
6121 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
6122 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
6123 if (RealCallerBits == RealCalleeBits)
6124 return true;
6125
6126 // If the features are a subset, we need to additionally check for calls
6127 // that may become ABI-incompatible as a result of inlining.
6128 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
6129 return false;
6130
6131 for (const Instruction &I : instructions(Callee)) {
6132 if (const auto *CB = dyn_cast<CallBase>(&I)) {
6133 // Having more target features is fine for inline ASM.
6134 if (CB->isInlineAsm())
6135 continue;
6136
6137 SmallVector<Type *, 8> Types;
6138 for (Value *Arg : CB->args())
6139 Types.push_back(Arg->getType());
6140 if (!CB->getType()->isVoidTy())
6141 Types.push_back(CB->getType());
6142
6143 // Simple types are always ABI compatible.
6144 auto IsSimpleTy = [](Type *Ty) {
6145 return !Ty->isVectorTy() && !Ty->isAggregateType();
6146 };
6147 if (all_of(Types, IsSimpleTy))
6148 continue;
6149
6150 if (Function *NestedCallee = CB->getCalledFunction()) {
6151 // Assume that intrinsics are always ABI compatible.
6152 if (NestedCallee->isIntrinsic())
6153 continue;
6154
6155 // Do a precise compatibility check.
6156 if (!areTypesABICompatible(Caller, NestedCallee, Types))
6157 return false;
6158 } else {
6159 // We don't know the target features of the callee,
6160 // assume it is incompatible.
6161 return false;
6162 }
6163 }
6164 }
6165 return true;
6166}
6167
6168 bool X86TTIImpl::areTypesABICompatible(const Function *Caller,
6169 const Function *Callee,
6170 const ArrayRef<Type *> &Types) const {
6171 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
6172 return false;
6173
6174 // If we get here, we know the target features match. If one function
6175 // considers 512-bit vectors legal and the other does not, consider them
6176 // incompatible.
6177 const TargetMachine &TM = getTLI()->getTargetMachine();
6178
6179 if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
6180 TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs())
6181 return true;
6182
6183 // Consider the arguments compatible if they aren't vectors or aggregates.
6184 // FIXME: Look at the size of vectors.
6185 // FIXME: Look at the element types of aggregates to see if there are vectors.
6186 return llvm::none_of(Types,
6187 [](Type *T) { return T->isVectorTy() || T->isAggregateType(); });
6188}
6189
6190 X86TTIImpl::TTI::MemCmpExpansionOptions
6191 X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
6192 TTI::MemCmpExpansionOptions Options;
6193 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
6194 Options.NumLoadsPerBlock = 2;
6195 // All GPR and vector loads can be unaligned.
6196 Options.AllowOverlappingLoads = true;
6197 if (IsZeroCmp) {
6198 // Only enable vector loads for equality comparison. Right now the vector
6199 // version is not as fast for three way compare (see #33329).
6200 const unsigned PreferredWidth = ST->getPreferVectorWidth();
6201 if (PreferredWidth >= 512 && ST->hasAVX512() && ST->hasEVEX512())
6202 Options.LoadSizes.push_back(64);
6203 if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32);
6204 if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
6205 }
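// For example (illustrative): on a 64-bit AVX2 subtarget with a preferred
// vector width of at least 256, an equality memcmp expansion is offered load
// sizes {32, 16, 8, 4, 2, 1}, with up to two loads per block and overlapping
// loads allowed.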
6206 if (ST->is64Bit()) {
6207 Options.LoadSizes.push_back(8);
6208 }
6209 Options.LoadSizes.push_back(4);
6210 Options.LoadSizes.push_back(2);
6211 Options.LoadSizes.push_back(1);
6212 return Options;
6213}
6214
6215 bool X86TTIImpl::prefersVectorizedAddressing() const {
6216 return supportsGather();
6217}
6218
6219 bool X86TTIImpl::supportsEfficientVectorElementLoadStore() const {
6220 return false;
6221}
6222
6223 bool X86TTIImpl::enableInterleavedAccessVectorization() {
6224 // TODO: We expect this to be beneficial regardless of arch,
6225 // but there are currently some unexplained performance artifacts on Atom.
6226 // As a temporary solution, disable on Atom.
6227 return !(ST->isAtom());
6228}
6229
6230// Get estimation for interleaved load/store operations and strided load.
6231// \p Indices contains indices for strided load.
6232// \p Factor - the factor of interleaving.
6233// AVX-512 provides 3-src shuffles that significantly reduces the cost.
6234 InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
6235 unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
6236 ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
6237 TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
6238 // VecTy for interleave memop is <VF*Factor x Elt>.
6239 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6240 // VecTy = <12 x i32>.
6241
6242 // Calculate the number of memory operations (NumOfMemOps), required
6243 // for load/store the VecTy.
6244 MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6245 unsigned VecTySize = DL.getTypeStoreSize(VecTy);
6246 unsigned LegalVTSize = LegalVT.getStoreSize();
6247 unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
6248
6249 // Get the cost of one memory operation.
6250 auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
6251 LegalVT.getVectorNumElements());
6252 InstructionCost MemOpCost;
6253 bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps;
6254 if (UseMaskedMemOp)
6255 MemOpCost = getMaskedMemoryOpCost(Opcode, SingleMemOpTy, Alignment,
6256 AddressSpace, CostKind);
6257 else
6258 MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, MaybeAlign(Alignment),
6259 AddressSpace, CostKind);
6260
6261 unsigned VF = VecTy->getNumElements() / Factor;
6262 MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);
6263
6264 InstructionCost MaskCost;
6265 if (UseMaskedMemOp) {
6266 APInt DemandedLoadStoreElts = APInt::getZero(VecTy->getNumElements());
6267 for (unsigned Index : Indices) {
6268 assert(Index < Factor && "Invalid index for interleaved memory op");
6269 for (unsigned Elm = 0; Elm < VF; Elm++)
6270 DemandedLoadStoreElts.setBit(Index + Elm * Factor);
6271 }
6272
6273 Type *I1Type = Type::getInt1Ty(VecTy->getContext());
6274
6275 MaskCost = getReplicationShuffleCost(
6276 I1Type, Factor, VF,
6277 UseMaskForGaps ? DemandedLoadStoreElts
6278 : APInt::getAllOnes(VecTy->getNumElements()),
6279 CostKind);
6280
6281 // The Gaps mask is invariant and created outside the loop, therefore the
6282 // cost of creating it is not accounted for here. However if we have both
6283 // a MaskForGaps and some other mask that guards the execution of the
6284 // memory access, we need to account for the cost of And-ing the two masks
6285 // inside the loop.
6286 if (UseMaskForGaps) {
6287 auto *MaskVT = FixedVectorType::get(I1Type, VecTy->getNumElements());
6288 MaskCost += getArithmeticInstrCost(BinaryOperator::And, MaskVT, CostKind);
6289 }
6290 }
6291
6292 if (Opcode == Instruction::Load) {
6293 // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
6294 // contain the cost of the optimized shuffle sequence that the
6295 // X86InterleavedAccess pass will generate.
6296 // The cost of loads and stores are computed separately from the table.
6297
6298 // X86InterleavedAccess support only the following interleaved-access group.
6299 static const CostTblEntry AVX512InterleavedLoadTbl[] = {
6300 {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8
6301 {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8
6302 {3, MVT::v64i8, 22}, //(load 96i8 and) deinterleave into 3 x 32i8
6303 };
6304
6305 if (const auto *Entry =
6306 CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
6307 return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6308 // If an entry does not exist, fall back to the default implementation.
6309
6310 // Kind of shuffle depends on number of loaded values.
6311 // If we load the entire data in one register, we can use a 1-src shuffle.
6312 // Otherwise, we'll merge 2 sources in each operation.
6313 TTI::ShuffleKind ShuffleKind =
6314 (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
6315
6316 InstructionCost ShuffleCost = getShuffleCost(
6317 ShuffleKind, SingleMemOpTy, std::nullopt, CostKind, 0, nullptr);
6318
6319 unsigned NumOfLoadsInInterleaveGrp =
6320 Indices.size() ? Indices.size() : Factor;
6321 auto *ResultTy = FixedVectorType::get(VecTy->getElementType(),
6322 VecTy->getNumElements() / Factor);
6323 InstructionCost NumOfResults =
6324 getTypeLegalizationCost(ResultTy).first * NumOfLoadsInInterleaveGrp;
6325
6326 // About half of the loads may be folded in shuffles when we have only
6327 // one result. If we have more than one result, or the loads are masked,
6328 // we do not fold loads at all.
6329 unsigned NumOfUnfoldedLoads =
6330 UseMaskedMemOp || NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
6331
6332 // Get a number of shuffle operations per result.
6333 unsigned NumOfShufflesPerResult =
6334 std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
6335
6336 // The SK_PermuteTwoSrc shuffle clobbers one of the source operands.
6337 // When we have more than one destination, we need additional instructions
6338 // to keep sources.
6339 InstructionCost NumOfMoves = 0;
6340 if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
6341 NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
6342
6343 InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
6344 MaskCost + NumOfUnfoldedLoads * MemOpCost +
6345 NumOfMoves;
6346
6347 return Cost;
6348 }
6349
6350 // Store.
6351 assert(Opcode == Instruction::Store &&
6352 "Expected Store Instruction at this point");
6353 // X86InterleavedAccess support only the following interleaved-access group.
6354 static const CostTblEntry AVX512InterleavedStoreTbl[] = {
6355 {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
6356 {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
6357 {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 96i8 (and store)
6358
6359 {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store)
6360 {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
6361 {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
6362 {4, MVT::v64i8, 24} // interleave 4 x 32i8 into 256i8 (and store)
6363 };
6364
6365 if (const auto *Entry =
6366 CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
6367 return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6368 // If an entry does not exist, fall back to the default implementation.
6369
6370 // There are no strided stores at the moment, and a store can't be folded
6371 // into a shuffle.
6372 unsigned NumOfSources = Factor; // The number of values to be merged.
6373 InstructionCost ShuffleCost = getShuffleCost(
6374 TTI::SK_PermuteTwoSrc, SingleMemOpTy, std::nullopt, CostKind, 0, nullptr);
6375 unsigned NumOfShufflesPerStore = NumOfSources - 1;
6376
6377 // The SK_PermuteTwoSrc shuffle clobbers one of the source operands.
6378 // We need additional instructions to keep sources.
6379 unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
6380 InstructionCost Cost =
6381 MaskCost +
6382 NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
6383 NumOfMoves;
6384 return Cost;
6385}
6386
6387 InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
6388 unsigned Opcode, Type *BaseTy, unsigned Factor, ArrayRef<unsigned> Indices,
6389 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
6390 bool UseMaskForCond, bool UseMaskForGaps) {
6391 auto *VecTy = cast<FixedVectorType>(BaseTy);
6392
6393 auto isSupportedOnAVX512 = [&](Type *VecTy) {
6394 Type *EltTy = cast<VectorType>(VecTy)->getElementType();
6395 if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
6396 EltTy->isIntegerTy(32) || EltTy->isPointerTy())
6397 return true;
6398 if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy())
6399 return ST->hasBWI();
6400 if (EltTy->isBFloatTy())
6401 return ST->hasBF16();
6402 return false;
6403 };
6404 if (ST->hasAVX512() && isSupportedOnAVX512(VecTy))
6405 return getInterleavedMemoryOpCostAVX512(
6406 Opcode, VecTy, Factor, Indices, Alignment,
6407 AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
6408
6409 if (UseMaskForCond || UseMaskForGaps)
6410 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6411 Alignment, AddressSpace, CostKind,
6412 UseMaskForCond, UseMaskForGaps);
6413
6414 // Get estimation for interleaved load/store operations for SSE-AVX2.
6415 // As opposed to AVX-512, SSE-AVX2 do not have generic shuffles that allow
6416 // computing the cost using a generic formula as a function of generic
6417 // shuffles. We therefore use a lookup table instead, filled according to
6418 // the instruction sequences that codegen currently generates.
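// For example (illustrative): a stride-2 deinterleaving load of v8i32 on AVX2
// uses the {2, MVT::v8i32, 4} entry below, i.e. 4 shuffles on top of the
// separately-computed memory operation cost.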
6419
6420 // VecTy for interleave memop is <VF*Factor x Elt>.
6421 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6422 // VecTy = <12 x i32>.
6423 MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6424
6425 // This function can be called with VecTy=<6xi128>, Factor=3, in which case
6426 // the VF=2, while v2i128 is an unsupported MVT vector type
6427 // (see MachineValueType.h::getVectorVT()).
6428 if (!LegalVT.isVector())
6429 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6430 Alignment, AddressSpace, CostKind);
6431
6432 unsigned VF = VecTy->getNumElements() / Factor;
6433 Type *ScalarTy = VecTy->getElementType();
6434 // Deduplicate entries, model floats/pointers as appropriately-sized integers.
6435 if (!ScalarTy->isIntegerTy())
6436 ScalarTy =
6437 Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy));
6438
6439 // Get the cost of all the memory operations.
6440 // FIXME: discount dead loads.
6441 InstructionCost MemOpCosts = getMemoryOpCost(
6442 Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, CostKind);
6443
6444 auto *VT = FixedVectorType::get(ScalarTy, VF);
6445 EVT ETy = TLI->getValueType(DL, VT);
6446 if (!ETy.isSimple())
6447 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6448 Alignment, AddressSpace, CostKind);
6449
6450 // TODO: Complete for other data-types and strides.
6451 // Each combination of Stride, element bit width and VF results in a different
6452 // sequence; The cost tables are therefore accessed with:
6453 // Factor (stride) and VectorType=VFxiN.
6454 // The Cost accounts only for the shuffle sequence;
6455 // The cost of the loads/stores is accounted for separately.
6456 //
6457 static const CostTblEntry AVX2InterleavedLoadTbl[] = {
6458 {2, MVT::v2i8, 2}, // (load 4i8 and) deinterleave into 2 x 2i8
6459 {2, MVT::v4i8, 2}, // (load 8i8 and) deinterleave into 2 x 4i8
6460 {2, MVT::v8i8, 2}, // (load 16i8 and) deinterleave into 2 x 8i8
6461 {2, MVT::v16i8, 4}, // (load 32i8 and) deinterleave into 2 x 16i8
6462 {2, MVT::v32i8, 6}, // (load 64i8 and) deinterleave into 2 x 32i8
6463
6464 {2, MVT::v8i16, 6}, // (load 16i16 and) deinterleave into 2 x 8i16
6465 {2, MVT::v16i16, 9}, // (load 32i16 and) deinterleave into 2 x 16i16
6466 {2, MVT::v32i16, 18}, // (load 64i16 and) deinterleave into 2 x 32i16
6467
6468 {2, MVT::v8i32, 4}, // (load 16i32 and) deinterleave into 2 x 8i32
6469 {2, MVT::v16i32, 8}, // (load 32i32 and) deinterleave into 2 x 16i32
6470 {2, MVT::v32i32, 16}, // (load 64i32 and) deinterleave into 2 x 32i32
6471
6472 {2, MVT::v4i64, 4}, // (load 8i64 and) deinterleave into 2 x 4i64
6473 {2, MVT::v8i64, 8}, // (load 16i64 and) deinterleave into 2 x 8i64
6474 {2, MVT::v16i64, 16}, // (load 32i64 and) deinterleave into 2 x 16i64
6475 {2, MVT::v32i64, 32}, // (load 64i64 and) deinterleave into 2 x 32i64
6476
6477 {3, MVT::v2i8, 3}, // (load 6i8 and) deinterleave into 3 x 2i8
6478 {3, MVT::v4i8, 3}, // (load 12i8 and) deinterleave into 3 x 4i8
6479 {3, MVT::v8i8, 6}, // (load 24i8 and) deinterleave into 3 x 8i8
6480 {3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8
6481 {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8
6482
6483 {3, MVT::v2i16, 5}, // (load 6i16 and) deinterleave into 3 x 2i16
6484 {3, MVT::v4i16, 7}, // (load 12i16 and) deinterleave into 3 x 4i16
6485 {3, MVT::v8i16, 9}, // (load 24i16 and) deinterleave into 3 x 8i16
6486 {3, MVT::v16i16, 28}, // (load 48i16 and) deinterleave into 3 x 16i16
6487 {3, MVT::v32i16, 56}, // (load 96i16 and) deinterleave into 3 x 32i16
6488
6489 {3, MVT::v2i32, 3}, // (load 6i32 and) deinterleave into 3 x 2i32
6490 {3, MVT::v4i32, 3}, // (load 12i32 and) deinterleave into 3 x 4i32
6491 {3, MVT::v8i32, 7}, // (load 24i32 and) deinterleave into 3 x 8i32
6492 {3, MVT::v16i32, 14}, // (load 48i32 and) deinterleave into 3 x 16i32
6493 {3, MVT::v32i32, 32}, // (load 96i32 and) deinterleave into 3 x 32i32
6494
6495 {3, MVT::v2i64, 1}, // (load 6i64 and) deinterleave into 3 x 2i64
6496 {3, MVT::v4i64, 5}, // (load 12i64 and) deinterleave into 3 x 4i64
6497 {3, MVT::v8i64, 10}, // (load 24i64 and) deinterleave into 3 x 8i64
6498 {3, MVT::v16i64, 20}, // (load 48i64 and) deinterleave into 3 x 16i64
6499
6500 {4, MVT::v2i8, 4}, // (load 8i8 and) deinterleave into 4 x 2i8
6501 {4, MVT::v4i8, 4}, // (load 16i8 and) deinterleave into 4 x 4i8
6502 {4, MVT::v8i8, 12}, // (load 32i8 and) deinterleave into 4 x 8i8
6503 {4, MVT::v16i8, 24}, // (load 64i8 and) deinterleave into 4 x 16i8
6504 {4, MVT::v32i8, 56}, // (load 128i8 and) deinterleave into 4 x 32i8
6505
6506 {4, MVT::v2i16, 6}, // (load 8i16 and) deinterleave into 4 x 2i16
6507 {4, MVT::v4i16, 17}, // (load 16i16 and) deinterleave into 4 x 4i16
6508 {4, MVT::v8i16, 33}, // (load 32i16 and) deinterleave into 4 x 8i16
6509 {4, MVT::v16i16, 75}, // (load 64i16 and) deinterleave into 4 x 16i16
6510 {4, MVT::v32i16, 150}, // (load 128i16 and) deinterleave into 4 x 32i16
6511
6512 {4, MVT::v2i32, 4}, // (load 8i32 and) deinterleave into 4 x 2i32
6513 {4, MVT::v4i32, 8}, // (load 16i32 and) deinterleave into 4 x 4i32
6514 {4, MVT::v8i32, 16}, // (load 32i32 and) deinterleave into 4 x 8i32
6515 {4, MVT::v16i32, 32}, // (load 64i32 and) deinterleave into 4 x 16i32
6516 {4, MVT::v32i32, 68}, // (load 128i32 and) deinterleave into 4 x 32i32
6517
6518 {4, MVT::v2i64, 6}, // (load 8i64 and) deinterleave into 4 x 2i64
6519 {4, MVT::v4i64, 8}, // (load 16i64 and) deinterleave into 4 x 4i64
6520 {4, MVT::v8i64, 20}, // (load 32i64 and) deinterleave into 4 x 8i64
6521 {4, MVT::v16i64, 40}, // (load 64i64 and) deinterleave into 4 x 16i64
6522
6523 {6, MVT::v2i8, 6}, // (load 12i8 and) deinterleave into 6 x 2i8
6524 {6, MVT::v4i8, 14}, // (load 24i8 and) deinterleave into 6 x 4i8
6525 {6, MVT::v8i8, 18}, // (load 48i8 and) deinterleave into 6 x 8i8
6526 {6, MVT::v16i8, 43}, // (load 96i8 and) deinterleave into 6 x 16i8
6527 {6, MVT::v32i8, 82}, // (load 192i8 and) deinterleave into 6 x 32i8
6528
6529 {6, MVT::v2i16, 13}, // (load 12i16 and) deinterleave into 6 x 2i16
6530 {6, MVT::v4i16, 9}, // (load 24i16 and) deinterleave into 6 x 4i16
6531 {6, MVT::v8i16, 39}, // (load 48i16 and) deinterleave into 6 x 8i16
6532 {6, MVT::v16i16, 106}, // (load 96i16 and) deinterleave into 6 x 16i16
6533 {6, MVT::v32i16, 212}, // (load 192i16 and) deinterleave into 6 x 32i16
6534
6535 {6, MVT::v2i32, 6}, // (load 12i32 and) deinterleave into 6 x 2i32
6536 {6, MVT::v4i32, 15}, // (load 24i32 and) deinterleave into 6 x 4i32
6537 {6, MVT::v8i32, 31}, // (load 48i32 and) deinterleave into 6 x 8i32
6538 {6, MVT::v16i32, 64}, // (load 96i32 and) deinterleave into 6 x 16i32
6539
6540 {6, MVT::v2i64, 6}, // (load 12i64 and) deinterleave into 6 x 2i64
6541 {6, MVT::v4i64, 18}, // (load 24i64 and) deinterleave into 6 x 4i64
6542 {6, MVT::v8i64, 36}, // (load 48i64 and) deinterleave into 6 x 8i64
6543
6544 {8, MVT::v8i32, 40} // (load 64i32 and) deinterleave into 8 x 8i32
6545 };
6546
6547 static const CostTblEntry SSSE3InterleavedLoadTbl[] = {
6548 {2, MVT::v4i16, 2}, // (load 8i16 and) deinterleave into 2 x 4i16
6549 };
6550
6551 static const CostTblEntry SSE2InterleavedLoadTbl[] = {
6552 {2, MVT::v2i16, 2}, // (load 4i16 and) deinterleave into 2 x 2i16
6553 {2, MVT::v4i16, 7}, // (load 8i16 and) deinterleave into 2 x 4i16
6554
6555 {2, MVT::v2i32, 2}, // (load 4i32 and) deinterleave into 2 x 2i32
6556 {2, MVT::v4i32, 2}, // (load 8i32 and) deinterleave into 2 x 4i32
6557
6558 {2, MVT::v2i64, 2}, // (load 4i64 and) deinterleave into 2 x 2i64
6559 };
6560
6561 static const CostTblEntry AVX2InterleavedStoreTbl[] = {
6562 {2, MVT::v16i8, 3}, // interleave 2 x 16i8 into 32i8 (and store)
6563 {2, MVT::v32i8, 4}, // interleave 2 x 32i8 into 64i8 (and store)
6564
6565 {2, MVT::v8i16, 3}, // interleave 2 x 8i16 into 16i16 (and store)
6566 {2, MVT::v16i16, 4}, // interleave 2 x 16i16 into 32i16 (and store)
6567 {2, MVT::v32i16, 8}, // interleave 2 x 32i16 into 64i16 (and store)
6568
6569 {2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32 (and store)
6570 {2, MVT::v8i32, 4}, // interleave 2 x 8i32 into 16i32 (and store)
6571 {2, MVT::v16i32, 8}, // interleave 2 x 16i32 into 32i32 (and store)
6572 {2, MVT::v32i32, 16}, // interleave 2 x 32i32 into 64i32 (and store)
6573
6574 {2, MVT::v2i64, 2}, // interleave 2 x 2i64 into 4i64 (and store)
6575 {2, MVT::v4i64, 4}, // interleave 2 x 4i64 into 8i64 (and store)
6576 {2, MVT::v8i64, 8}, // interleave 2 x 8i64 into 16i64 (and store)
6577 {2, MVT::v16i64, 16}, // interleave 2 x 16i64 into 32i64 (and store)
6578 {2, MVT::v32i64, 32}, // interleave 2 x 32i64 into 64i64 (and store)
6579
6580 {3, MVT::v2i8, 4}, // interleave 3 x 2i8 into 6i8 (and store)
6581 {3, MVT::v4i8, 4}, // interleave 3 x 4i8 into 12i8 (and store)
6582 {3, MVT::v8i8, 6}, // interleave 3 x 8i8 into 24i8 (and store)
6583 {3, MVT::v16i8, 11}, // interleave 3 x 16i8 into 48i8 (and store)
6584 {3, MVT::v32i8, 13}, // interleave 3 x 32i8 into 96i8 (and store)
6585
6586 {3, MVT::v2i16, 4}, // interleave 3 x 2i16 into 6i16 (and store)
6587 {3, MVT::v4i16, 6}, // interleave 3 x 4i16 into 12i16 (and store)
6588 {3, MVT::v8i16, 12}, // interleave 3 x 8i16 into 24i16 (and store)
6589 {3, MVT::v16i16, 27}, // interleave 3 x 16i16 into 48i16 (and store)
6590 {3, MVT::v32i16, 54}, // interleave 3 x 32i16 into 96i16 (and store)
6591
6592 {3, MVT::v2i32, 4}, // interleave 3 x 2i32 into 6i32 (and store)
6593 {3, MVT::v4i32, 5}, // interleave 3 x 4i32 into 12i32 (and store)
6594 {3, MVT::v8i32, 11}, // interleave 3 x 8i32 into 24i32 (and store)
6595 {3, MVT::v16i32, 22}, // interleave 3 x 16i32 into 48i32 (and store)
6596 {3, MVT::v32i32, 48}, // interleave 3 x 32i32 into 96i32 (and store)
6597
6598 {3, MVT::v2i64, 4}, // interleave 3 x 2i64 into 6i64 (and store)
6599 {3, MVT::v4i64, 6}, // interleave 3 x 4i64 into 12i64 (and store)
6600 {3, MVT::v8i64, 12}, // interleave 3 x 8i64 into 24i64 (and store)
6601 {3, MVT::v16i64, 24}, // interleave 3 x 16i64 into 48i64 (and store)
6602
6603 {4, MVT::v2i8, 4}, // interleave 4 x 2i8 into 8i8 (and store)
6604 {4, MVT::v4i8, 4}, // interleave 4 x 4i8 into 16i8 (and store)
6605 {4, MVT::v8i8, 4}, // interleave 4 x 8i8 into 32i8 (and store)
6606 {4, MVT::v16i8, 8}, // interleave 4 x 16i8 into 64i8 (and store)
6607 {4, MVT::v32i8, 12}, // interleave 4 x 32i8 into 128i8 (and store)
6608
6609 {4, MVT::v2i16, 2}, // interleave 4 x 2i16 into 8i16 (and store)
6610 {4, MVT::v4i16, 6}, // interleave 4 x 4i16 into 16i16 (and store)
6611 {4, MVT::v8i16, 10}, // interleave 4 x 8i16 into 32i16 (and store)
6612 {4, MVT::v16i16, 32}, // interleave 4 x 16i16 into 64i16 (and store)
6613 {4, MVT::v32i16, 64}, // interleave 4 x 32i16 into 128i16 (and store)
6614
6615 {4, MVT::v2i32, 5}, // interleave 4 x 2i32 into 8i32 (and store)
6616 {4, MVT::v4i32, 6}, // interleave 4 x 4i32 into 16i32 (and store)
6617 {4, MVT::v8i32, 16}, // interleave 4 x 8i32 into 32i32 (and store)
6618 {4, MVT::v16i32, 32}, // interleave 4 x 16i32 into 64i32 (and store)
6619 {4, MVT::v32i32, 64}, // interleave 4 x 32i32 into 128i32 (and store)
6620
6621 {4, MVT::v2i64, 6}, // interleave 4 x 2i64 into 8i64 (and store)
6622 {4, MVT::v4i64, 8}, // interleave 4 x 4i64 into 16i64 (and store)
6623 {4, MVT::v8i64, 20}, // interleave 4 x 8i64 into 32i64 (and store)
6624 {4, MVT::v16i64, 40}, // interleave 4 x 16i64 into 64i64 (and store)
6625
6626 {6, MVT::v2i8, 7}, // interleave 6 x 2i8 into 12i8 (and store)
6627 {6, MVT::v4i8, 9}, // interleave 6 x 4i8 into 24i8 (and store)
6628 {6, MVT::v8i8, 16}, // interleave 6 x 8i8 into 48i8 (and store)
6629 {6, MVT::v16i8, 27}, // interleave 6 x 16i8 into 96i8 (and store)
6630 {6, MVT::v32i8, 90}, // interleave 6 x 32i8 into 192i8 (and store)
6631
6632 {6, MVT::v2i16, 10}, // interleave 6 x 2i16 into 12i16 (and store)
6633 {6, MVT::v4i16, 15}, // interleave 6 x 4i16 into 24i16 (and store)
6634 {6, MVT::v8i16, 21}, // interleave 6 x 8i16 into 48i16 (and store)
6635 {6, MVT::v16i16, 58}, // interleave 6 x 16i16 into 96i16 (and store)
6636 {6, MVT::v32i16, 90}, // interleave 6 x 32i16 into 192i16 (and store)
6637
6638 {6, MVT::v2i32, 9}, // interleave 6 x 2i32 into 12i32 (and store)
6639 {6, MVT::v4i32, 12}, // interleave 6 x 4i32 into 24i32 (and store)
6640 {6, MVT::v8i32, 33}, // interleave 6 x 8i32 into 48i32 (and store)
6641 {6, MVT::v16i32, 66}, // interleave 6 x 16i32 into 96i32 (and store)
6642
6643 {6, MVT::v2i64, 8}, // interleave 6 x 2i64 into 12i64 (and store)
6644 {6, MVT::v4i64, 15}, // interleave 6 x 4i64 into 24i64 (and store)
6645 {6, MVT::v8i64, 30}, // interleave 6 x 8i64 into 48i64 (and store)
6646 };
6647
6648 static const CostTblEntry SSE2InterleavedStoreTbl[] = {
6649 {2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8 (and store)
6650 {2, MVT::v4i8, 1}, // interleave 2 x 4i8 into 8i8 (and store)
6651 {2, MVT::v8i8, 1}, // interleave 2 x 8i8 into 16i8 (and store)
6652
6653 {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16 (and store)
6654 {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16 (and store)
6655
6656 {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32 (and store)
6657 };
6658
6659 if (Opcode == Instruction::Load) {
6660 auto GetDiscountedCost = [Factor, NumMembers = Indices.size(),
6661 MemOpCosts](const CostTblEntry *Entry) {
6662      // NOTE: this is just an approximation!
6663      // It can over- or under-estimate the cost!
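      // E.g. with Factor = 3, two requested members and a table cost of 7, the
      // shuffle part is charged as ceil(2 * 7 / 3) = 5 on top of MemOpCosts.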
6664 return MemOpCosts + divideCeil(NumMembers * Entry->Cost, Factor);
6665 };
6666
6667 if (ST->hasAVX2())
6668 if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor,
6669 ETy.getSimpleVT()))
6670 return GetDiscountedCost(Entry);
6671
6672 if (ST->hasSSSE3())
6673 if (const auto *Entry = CostTableLookup(SSSE3InterleavedLoadTbl, Factor,
6674 ETy.getSimpleVT()))
6675 return GetDiscountedCost(Entry);
6676
6677 if (ST->hasSSE2())
6678 if (const auto *Entry = CostTableLookup(SSE2InterleavedLoadTbl, Factor,
6679 ETy.getSimpleVT()))
6680 return GetDiscountedCost(Entry);
6681 } else {
6682 assert(Opcode == Instruction::Store &&
6683 "Expected Store Instruction at this point");
6684 assert((!Indices.size() || Indices.size() == Factor) &&
6685 "Interleaved store only supports fully-interleaved groups.");
6686 if (ST->hasAVX2())
6687 if (const auto *Entry = CostTableLookup(AVX2InterleavedStoreTbl, Factor,
6688 ETy.getSimpleVT()))
6689 return MemOpCosts + Entry->Cost;
6690
6691 if (ST->hasSSE2())
6692 if (const auto *Entry = CostTableLookup(SSE2InterleavedStoreTbl, Factor,
6693 ETy.getSimpleVT()))
6694 return MemOpCosts + Entry->Cost;
6695 }
6696
6697 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6698 Alignment, AddressSpace, CostKind,
6699 UseMaskForCond, UseMaskForGaps);
6700}
6701
6702InstructionCost X86TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
6703                                                 int64_t BaseOffset,
6704 bool HasBaseReg, int64_t Scale,
6705 unsigned AddrSpace) const {
6706 // Scaling factors are not free at all.
6707 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
6709 // will take 2 allocations in the out-of-order engine instead of 1
6709 // for plain addressing mode, i.e. inst (reg1).
6710 // E.g.,
6711 // vaddps (%rsi,%rdx), %ymm0, %ymm1
6712 // Requires two allocations (one for the load, one for the computation)
6713 // whereas:
6714 // vaddps (%rsi), %ymm0, %ymm1
6715 // Requires just 1 allocation, i.e., freeing allocations for other operations
6716 // and having fewer micro-operations to execute.
6717 //
6718 // For some X86 architectures, this is even worse because, for instance, for
6719 // stores the complex addressing mode forces the instruction to use the
6720 // "load" ports instead of the dedicated "store" port.
6721 // E.g., on Haswell:
6722 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
6723 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
6724  TargetLoweringBase::AddrMode AM;
6725  AM.BaseGV = BaseGV;
6726 AM.BaseOffs = BaseOffset;
6727 AM.HasBaseReg = HasBaseReg;
6728 AM.Scale = Scale;
6729 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
6730    // Scale represents reg2 * scale, so charge a cost of 1
6731    // as soon as we use a second register.
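    // E.g. (%rsi) has Scale == 0 and is considered free here, while
    // (%rsi,%rdx,4) has Scale == 4 and is charged 1.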
6732 return AM.Scale != 0;
6733 return -1;
6734}