X86TargetTransformInfo.cpp
1//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements a TargetTransformInfo analysis pass specific to the
10/// X86 target machine. It uses the target's detailed information to provide
11/// more precise answers to certain TTI queries, while letting the target
12/// independent and default TTI implementations handle the rest.
13///
14//===----------------------------------------------------------------------===//
15/// A note about the cost model numbers used below: they correspond to a
16/// "generic" X86 CPU for each feature level rather than to a specific CPU
17/// model. Usually the numbers are those of the CPU where the feature first
18/// appeared. For example, if a lookup below is guarded by Subtarget.hasSSE42(),
19/// the cost is based on Nehalem, as that was the first CPU to support that
20/// feature level and thus most likely has the worst case cost, although we
21/// may discard an outlying worst cost from one CPU (e.g. Atom).
22///
23/// Some examples of other technologies/CPUs:
24/// SSE 3 - Pentium4 / Athlon64
25/// SSE 4.1 - Penryn
26/// SSE 4.2 - Nehalem / Silvermont
27/// AVX - Sandy Bridge / Jaguar / Bulldozer
28/// AVX2 - Haswell / Ryzen
29/// AVX-512 - Xeon Phi / Skylake
30///
31/// And some examples of target-dependent instruction costs (latency):
32/// divss sqrtss rsqrtss
33/// AMD K7 11-16 19 3
34/// Piledriver 9-24 13-15 5
35/// Jaguar 14 16 2
36/// Pentium II,III 18 30 2
37/// Nehalem 7-14 7-18 3
38/// Haswell 10-13 11 5
39///
40/// Interpreting the 4 TargetCostKind types:
41/// TCK_RecipThroughput and TCK_Latency should try to match the worst case
42/// values reported by the CPU scheduler models (and llvm-mca).
43/// TCK_CodeSize should match the instruction count (e.g. divss = 1), NOT the
44/// actual encoding size of the instruction.
45/// TCK_SizeAndLatency should match the worst case micro-op counts reported
46/// by the CPU scheduler models (and llvm-mca), to ensure that they are
47/// compatible with the MicroOpBufferSize and LoopMicroOpBufferSize values,
48/// which are often used as the cost thresholds where TCK_SizeAndLatency is requested.
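///
/// As an illustration of how the per-kind cost tables below are read: an
/// entry such as { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } } lists its costs in
/// the order { RecipThroughput, Latency, CodeSize, SizeAndLatency }, matching
/// the CostKindCosts fields. Entries that list fewer values leave the
/// remaining cost kinds unspecified, so for those kinds the lookup falls
/// through to later tables or to the base implementation.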
49//===----------------------------------------------------------------------===//
50
51#include "X86TargetTransformInfo.h"
52#include "llvm/Analysis/TargetTransformInfo.h"
53#include "llvm/CodeGen/BasicTTIImpl.h"
54#include "llvm/CodeGen/CostTable.h"
55#include "llvm/CodeGen/TargetLowering.h"
56#include "llvm/IR/InstIterator.h"
57#include "llvm/IR/IntrinsicInst.h"
58#include "llvm/Support/Debug.h"
59#include <optional>
60
61using namespace llvm;
62
63#define DEBUG_TYPE "x86tti"
64
65//===----------------------------------------------------------------------===//
66//
67// X86 cost model.
68//
69//===----------------------------------------------------------------------===//
70
71// Helper struct to store/access costs for each cost kind.
72// TODO: Move this to allow other targets to use it?
73struct CostKindCosts {
74 unsigned RecipThroughputCost = ~0U;
75 unsigned LatencyCost = ~0U;
76 unsigned CodeSizeCost = ~0U;
77 unsigned SizeAndLatencyCost = ~0U;
78
79 std::optional<unsigned>
80 operator[](TargetTransformInfo::TargetCostKind Kind) const {
81 unsigned Cost = ~0U;
82 switch (Kind) {
83 case TargetTransformInfo::TCK_RecipThroughput:
84 Cost = RecipThroughputCost;
85 break;
86 case TargetTransformInfo::TCK_Latency:
87 Cost = LatencyCost;
88 break;
89 case TargetTransformInfo::TCK_CodeSize:
90 Cost = CodeSizeCost;
91 break;
92 case TargetTransformInfo::TCK_SizeAndLatency:
93 Cost = SizeAndLatencyCost;
94 break;
95 }
96 if (Cost == ~0U)
97 return std::nullopt;
98 return Cost;
99 }
100};
101using CostKindTblEntry = CostTblEntryT<CostKindCosts>;
102
103TargetTransformInfo::PopcntSupportKind
104X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
105 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
106 // TODO: Currently the __builtin_popcount() implementation using SSE3
107 // instructions is inefficient. Once the problem is fixed, we should
108 // call ST->hasSSE3() instead of ST->hasPOPCNT().
109 return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
110}
111
112std::optional<unsigned> X86TTIImpl::getCacheSize(
113 TargetTransformInfo::CacheLevel Level) const {
114 switch (Level) {
115 case TargetTransformInfo::CacheLevel::L1D:
116 // - Penryn
117 // - Nehalem
118 // - Westmere
119 // - Sandy Bridge
120 // - Ivy Bridge
121 // - Haswell
122 // - Broadwell
123 // - Skylake
124 // - Kabylake
125 return 32 * 1024; // 32 KByte
126 case TargetTransformInfo::CacheLevel::L2D:
127 // - Penryn
128 // - Nehalem
129 // - Westmere
130 // - Sandy Bridge
131 // - Ivy Bridge
132 // - Haswell
133 // - Broadwell
134 // - Skylake
135 // - Kabylake
136 return 256 * 1024; // 256 KByte
137 }
138
139 llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
140}
141
142std::optional<unsigned> X86TTIImpl::getCacheAssociativity(
143 TargetTransformInfo::CacheLevel Level) const {
144 // - Penryn
145 // - Nehalem
146 // - Westmere
147 // - Sandy Bridge
148 // - Ivy Bridge
149 // - Haswell
150 // - Broadwell
151 // - Skylake
152 // - Kabylake
153 switch (Level) {
154 case TargetTransformInfo::CacheLevel::L1D:
155 [[fallthrough]];
156 case TargetTransformInfo::CacheLevel::L2D:
157 return 8;
158 }
159
160 llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
161}
162
163unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
164 bool Vector = (ClassID == 1);
165 if (Vector && !ST->hasSSE1())
166 return 0;
167
168 if (ST->is64Bit()) {
169 if (Vector && ST->hasAVX512())
170 return 32;
171 return 16;
172 }
173 return 8;
174}
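// E.g. in 64-bit mode with AVX-512 this reports 32 vector registers
// (zmm0..zmm31); without AVX-512 it is 16, and 32-bit mode is limited to 8.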
175
176TypeSize
177X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
178 unsigned PreferVectorWidth = ST->getPreferVectorWidth();
179 switch (K) {
180 case TargetTransformInfo::RGK_Scalar:
181 return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
182 case TargetTransformInfo::RGK_FixedWidthVector:
183 if (ST->hasAVX512() && ST->hasEVEX512() && PreferVectorWidth >= 512)
184 return TypeSize::getFixed(512);
185 if (ST->hasAVX() && PreferVectorWidth >= 256)
186 return TypeSize::getFixed(256);
187 if (ST->hasSSE1() && PreferVectorWidth >= 128)
188 return TypeSize::getFixed(128);
189 return TypeSize::getFixed(0);
190 case TargetTransformInfo::RGK_ScalableVector:
191 return TypeSize::getScalable(0);
192 }
193
194 llvm_unreachable("Unsupported register kind");
195}
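// E.g. an AVX2-only target (no AVX-512) with the default preferred vector
// width reports 256 bits for RGK_FixedWidthVector; -mprefer-vector-width=128
// would drop that to 128, which is roughly how the vectorizers bound their
// maximum VF.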
196
197unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
198 return getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
199 .getFixedValue();
200}
201
202unsigned X86TTIImpl::getMaxInterleaveFactor(ElementCount VF) {
203 // If the loop will not be vectorized, don't interleave the loop.
204 // Let the regular unroller handle it instead, which saves the overflow
205 // check and memory check cost.
206 if (VF.isScalar())
207 return 1;
208
209 if (ST->isAtom())
210 return 1;
211
212 // Sandybridge and Haswell have multiple execution ports and pipelined
213 // vector units.
214 if (ST->hasAVX())
215 return 4;
216
217 return 2;
218}
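// E.g. a loop vectorized for an AVX2 target may be interleaved up to 4x,
// while Atom targets and scalar (VF=1) loops are never interleaved here.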
219
220InstructionCost X86TTIImpl::getArithmeticInstrCost(
221 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
222 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
223 ArrayRef<const Value *> Args,
224 const Instruction *CxtI) {
225
226 // vXi8 multiplications are always promoted to vXi16.
227 // Sub-128-bit types can be extended/packed more efficiently.
228 if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
229 Ty->getPrimitiveSizeInBits() <= 64 && Ty->getScalarSizeInBits() == 8) {
230 Type *WideVecTy =
231 VectorType::getExtendedElementVectorType(cast<VectorType>(Ty));
232 return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
233 TTI::CastContextHint::None,
234 CostKind) +
235 getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
236 TTI::CastContextHint::None,
237 CostKind) +
238 getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info);
239 }
240
241 // Legalize the type.
242 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
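// LT.first is the number of legalized ops required (the split factor) and
// LT.second is the legalized type; e.g. on an SSE2-only target a v16i32
// shift legalizes to 4 x v4i32, so the per-type table costs below are
// scaled by LT.first = 4.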
243
244 int ISD = TLI->InstructionOpcodeToISD(Opcode);
245 assert(ISD && "Invalid opcode");
246
247 if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() &&
248 (LT.second.getScalarType() == MVT::i32 ||
249 LT.second.getScalarType() == MVT::i64)) {
250 // Check if the operands can be represented as a smaller datatype.
251 bool Op1Signed = false, Op2Signed = false;
252 unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
253 unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
254 unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
255 bool SignedMode = Op1Signed || Op2Signed;
256
257 // If both vXi32 are representable as i15 and at least one is constant,
258 // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we
259 // can treat this as PMADDWD which has the same costs as a vXi16 multiply.
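 // E.g. a v4i32 multiply where both operands are known to fit in 15 bits
 // (say, both zero-extended from v4i16) is costed as a v8i16 multiply here.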
260 if (OpMinSize <= 15 && !ST->isPMADDWDSlow() &&
261 LT.second.getScalarType() == MVT::i32) {
262 bool Op1Constant =
263 isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]);
264 bool Op2Constant =
265 isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]);
266 bool Op1Sext = isa<SExtInst>(Args[0]) &&
267 (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41()));
268 bool Op2Sext = isa<SExtInst>(Args[1]) &&
269 (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41()));
270
271 bool IsZeroExtended = !Op1Signed || !Op2Signed;
272 bool IsConstant = Op1Constant || Op2Constant;
273 bool IsSext = Op1Sext || Op2Sext;
274 if (IsConstant || IsZeroExtended || IsSext)
275 LT.second =
276 MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements());
277 }
278
279 // Check if the vXi32 operands can be shrunk into a smaller datatype.
280 // This should match the codegen from reduceVMULWidth.
281 // TODO: Make this generic (!ST->SSE41 || ST->isPMULLDSlow()).
282 if (ST->useSLMArithCosts() && LT.second == MVT::v4i32) {
283 if (OpMinSize <= 7)
284 return LT.first * 3; // pmullw/sext
285 if (!SignedMode && OpMinSize <= 8)
286 return LT.first * 3; // pmullw/zext
287 if (OpMinSize <= 15)
288 return LT.first * 5; // pmullw/pmulhw/pshuf
289 if (!SignedMode && OpMinSize <= 16)
290 return LT.first * 5; // pmullw/pmulhw/pshuf
291 }
292
293 // If both vXi64 are representable as (unsigned) i32, then we can perform
294 // the multiply with a single PMULUDQ instruction.
295 // TODO: Add (SSE41+) PMULDQ handling for signed extensions.
296 if (!SignedMode && OpMinSize <= 32 && LT.second.getScalarType() == MVT::i64)
297 ISD = X86ISD::PMULUDQ;
298 }
299
300 // Vector multiply by pow2 will be simplified to shifts.
301 // Vector multiply by -pow2 will be simplified to shifts/negates.
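 // E.g. mul X, 8 becomes shl X, 3 and mul X, -8 becomes sub 0, (shl X, 3),
 // hence the shift cost plus an optional subtract below.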
302 if (ISD == ISD::MUL && Op2Info.isConstant() &&
303 (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2())) {
304 InstructionCost Cost =
305 getArithmeticInstrCost(Instruction::Shl, Ty, CostKind,
306 Op1Info.getNoProps(), Op2Info.getNoProps());
307 if (Op2Info.isNegatedPowerOf2())
308 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
309 return Cost;
310 }
311
312 // On X86, vector signed division by a constant power-of-two is
313 // normally expanded to the sequence SRA + SRL + ADD + SRA.
314 // The OperandValue properties may not be the same as that of the previous
315 // operation; conservatively assume OP_None.
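 // E.g. sdiv X, 4 is costed as two arithmetic shifts, one logical shift and
 // one add per legalized vector (sign, bias, add, shift).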
316 if ((ISD == ISD::SDIV || ISD == ISD::SREM) &&
317 Op2Info.isConstant() && Op2Info.isPowerOf2()) {
318 InstructionCost Cost =
319 2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
320 Op1Info.getNoProps(), Op2Info.getNoProps());
321 Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
322 Op1Info.getNoProps(), Op2Info.getNoProps());
323 Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
324 Op1Info.getNoProps(), Op2Info.getNoProps());
325
326 if (ISD == ISD::SREM) {
327 // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
328 Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(),
329 Op2Info.getNoProps());
330 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info.getNoProps(),
331 Op2Info.getNoProps());
332 }
333
334 return Cost;
335 }
336
337 // Vector unsigned division/remainder will be simplified to shifts/masks.
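 // E.g. udiv X, 16 becomes lshr X, 4 and urem X, 16 becomes and X, 15.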
338 if ((ISD == ISD::UDIV || ISD == ISD::UREM) &&
339 Op2Info.isConstant() && Op2Info.isPowerOf2()) {
340 if (ISD == ISD::UDIV)
341 return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
342 Op1Info.getNoProps(), Op2Info.getNoProps());
343 // UREM
344 return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
345 Op1Info.getNoProps(), Op2Info.getNoProps());
346 }
347
348 static const CostKindTblEntry AVX512BWUniformConstCostTable[] = {
349 { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
350 { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
351 { ISD::SRA, MVT::v16i8, { 1, 8, 4, 5 } }, // psrlw, pand, pxor, psubb.
352 { ISD::SHL, MVT::v32i8, { 1, 8, 2, 3 } }, // psllw + pand.
353 { ISD::SRL, MVT::v32i8, { 1, 8, 2, 3 } }, // psrlw + pand.
354 { ISD::SRA, MVT::v32i8, { 1, 9, 4, 5 } }, // psrlw, pand, pxor, psubb.
355 { ISD::SHL, MVT::v64i8, { 1, 8, 2, 3 } }, // psllw + pand.
356 { ISD::SRL, MVT::v64i8, { 1, 8, 2, 3 } }, // psrlw + pand.
357 { ISD::SRA, MVT::v64i8, { 1, 9, 4, 6 } }, // psrlw, pand, pxor, psubb.
358
359 { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // psllw
360 { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw
361 { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // psraw
362 { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // psllw
363 { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw
364 { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // psraw
365 };
366
367 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasBWI())
368 if (const auto *Entry =
369 CostTableLookup(AVX512BWUniformConstCostTable, ISD, LT.second))
370 if (auto KindCost = Entry->Cost[CostKind])
371 return LT.first * *KindCost;
372
373 static const CostKindTblEntry AVX512UniformConstCostTable[] = {
374 { ISD::SHL, MVT::v64i8, { 2, 12, 5, 6 } }, // psllw + pand.
375 { ISD::SRL, MVT::v64i8, { 2, 12, 5, 6 } }, // psrlw + pand.
376 { ISD::SRA, MVT::v64i8, { 3, 10, 12, 12 } }, // psrlw, pand, pxor, psubb.
377
378 { ISD::SHL, MVT::v16i16, { 2, 7, 4, 4 } }, // psllw + split.
379 { ISD::SRL, MVT::v16i16, { 2, 7, 4, 4 } }, // psrlw + split.
380 { ISD::SRA, MVT::v16i16, { 2, 7, 4, 4 } }, // psraw + split.
381
382 { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } }, // pslld
383 { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } }, // psrld
384 { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } }, // psrad
385 { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } }, // pslld
386 { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } }, // psrld
387 { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } }, // psrad
388
389 { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } }, // psraq
390 { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } }, // psllq
391 { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } }, // psrlq
392 { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } }, // psraq
393 { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } }, // psllq
394 { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } }, // psrlq
395 { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } }, // psraq
396
397 { ISD::SDIV, MVT::v16i32, { 6 } }, // pmuludq sequence
398 { ISD::SREM, MVT::v16i32, { 8 } }, // pmuludq+mul+sub sequence
399 { ISD::UDIV, MVT::v16i32, { 5 } }, // pmuludq sequence
400 { ISD::UREM, MVT::v16i32, { 7 } }, // pmuludq+mul+sub sequence
401 };
402
403 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX512())
404 if (const auto *Entry =
405 CostTableLookup(AVX512UniformConstCostTable, ISD, LT.second))
406 if (auto KindCost = Entry->Cost[CostKind])
407 return LT.first * *KindCost;
408
409 static const CostKindTblEntry AVX2UniformConstCostTable[] = {
410 { ISD::SHL, MVT::v16i8, { 1, 8, 2, 3 } }, // psllw + pand.
411 { ISD::SRL, MVT::v16i8, { 1, 8, 2, 3 } }, // psrlw + pand.
412 { ISD::SRA, MVT::v16i8, { 2, 10, 5, 6 } }, // psrlw, pand, pxor, psubb.
413 { ISD::SHL, MVT::v32i8, { 2, 8, 2, 4 } }, // psllw + pand.
414 { ISD::SRL, MVT::v32i8, { 2, 8, 2, 4 } }, // psrlw + pand.
415 { ISD::SRA, MVT::v32i8, { 3, 10, 5, 9 } }, // psrlw, pand, pxor, psubb.
416
417 { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw
418 { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw
419 { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw
420 { ISD::SHL, MVT::v16i16,{ 2, 2, 1, 2 } }, // psllw
421 { ISD::SRL, MVT::v16i16,{ 2, 2, 1, 2 } }, // psrlw
422 { ISD::SRA, MVT::v16i16,{ 2, 2, 1, 2 } }, // psraw
423
424 { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
425 { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld
426 { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad
427 { ISD::SHL, MVT::v8i32, { 2, 2, 1, 2 } }, // pslld
428 { ISD::SRL, MVT::v8i32, { 2, 2, 1, 2 } }, // psrld
429 { ISD::SRA, MVT::v8i32, { 2, 2, 1, 2 } }, // psrad
430
431 { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq
432 { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq
433 { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle.
434 { ISD::SHL, MVT::v4i64, { 2, 2, 1, 2 } }, // psllq
435 { ISD::SRL, MVT::v4i64, { 2, 2, 1, 2 } }, // psrlq
436 { ISD::SRA, MVT::v4i64, { 4, 4, 3, 6 } }, // psrad + shuffle + split.
437
438 { ISD::SDIV, MVT::v8i32, { 6 } }, // pmuludq sequence
439 { ISD::SREM, MVT::v8i32, { 8 } }, // pmuludq+mul+sub sequence
440 { ISD::UDIV, MVT::v8i32, { 5 } }, // pmuludq sequence
441 { ISD::UREM, MVT::v8i32, { 7 } }, // pmuludq+mul+sub sequence
442 };
443
444 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX2())
445 if (const auto *Entry =
446 CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second))
447 if (auto KindCost = Entry->Cost[CostKind])
448 return LT.first * *KindCost;
449
450 static const CostKindTblEntry AVXUniformConstCostTable[] = {
451 { ISD::SHL, MVT::v16i8, { 2, 7, 2, 3 } }, // psllw + pand.
452 { ISD::SRL, MVT::v16i8, { 2, 7, 2, 3 } }, // psrlw + pand.
453 { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.
454 { ISD::SHL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psllw + pand) + split.
455 { ISD::SRL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psrlw + pand) + split.
456 { ISD::SRA, MVT::v32i8, { 7, 7, 12, 13 } }, // 2*(psrlw, pand, pxor, psubb) + split.
457
458 { ISD::SHL, MVT::v8i16, { 1, 2, 1, 1 } }, // psllw.
459 { ISD::SRL, MVT::v8i16, { 1, 2, 1, 1 } }, // psrlw.
460 { ISD::SRA, MVT::v8i16, { 1, 2, 1, 1 } }, // psraw.
461 { ISD::SHL, MVT::v16i16,{ 3, 6, 4, 5 } }, // psllw + split.
462 { ISD::SRL, MVT::v16i16,{ 3, 6, 4, 5 } }, // psrlw + split.
463 { ISD::SRA, MVT::v16i16,{ 3, 6, 4, 5 } }, // psraw + split.
464
465 { ISD::SHL, MVT::v4i32, { 1, 2, 1, 1 } }, // pslld.
466 { ISD::SRL, MVT::v4i32, { 1, 2, 1, 1 } }, // psrld.
467 { ISD::SRA, MVT::v4i32, { 1, 2, 1, 1 } }, // psrad.
468 { ISD::SHL, MVT::v8i32, { 3, 6, 4, 5 } }, // pslld + split.
469 { ISD::SRL, MVT::v8i32, { 3, 6, 4, 5 } }, // psrld + split.
470 { ISD::SRA, MVT::v8i32, { 3, 6, 4, 5 } }, // psrad + split.
471
472 { ISD::SHL, MVT::v2i64, { 1, 2, 1, 1 } }, // psllq.
473 { ISD::SRL, MVT::v2i64, { 1, 2, 1, 1 } }, // psrlq.
474 { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle.
475 { ISD::SHL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psllq + split.
476 { ISD::SRL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psrlq + split.
477 { ISD::SRA, MVT::v4i64, { 5, 7, 8, 9 } }, // 2 x psrad + shuffle + split.
478
479 { ISD::SDIV, MVT::v8i32, { 14 } }, // 2*pmuludq sequence + split.
480 { ISD::SREM, MVT::v8i32, { 18 } }, // 2*pmuludq+mul+sub sequence + split.
481 { ISD::UDIV, MVT::v8i32, { 12 } }, // 2*pmuludq sequence + split.
482 { ISD::UREM, MVT::v8i32, { 16 } }, // 2*pmuludq+mul+sub sequence + split.
483 };
484
485 // XOP has faster vXi8 shifts.
486 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX() &&
487 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
488 if (const auto *Entry =
489 CostTableLookup(AVXUniformConstCostTable, ISD, LT.second))
490 if (auto KindCost = Entry->Cost[CostKind])
491 return LT.first * *KindCost;
492
493 static const CostKindTblEntry SSE2UniformConstCostTable[] = {
494 { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
495 { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
496 { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.
497
498 { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw.
499 { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw.
500 { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw.
501
502 { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
503 { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld.
504 { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad.
505
506 { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq.
507 { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq.
508 { ISD::SRA, MVT::v2i64, { 3, 5, 6, 6 } }, // 2 x psrad + shuffle.
509
510 { ISD::SDIV, MVT::v4i32, { 6 } }, // pmuludq sequence
511 { ISD::SREM, MVT::v4i32, { 8 } }, // pmuludq+mul+sub sequence
512 { ISD::UDIV, MVT::v4i32, { 5 } }, // pmuludq sequence
513 { ISD::UREM, MVT::v4i32, { 7 } }, // pmuludq+mul+sub sequence
514 };
515
516 // XOP has faster vXi8 shifts.
517 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasSSE2() &&
518 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
519 if (const auto *Entry =
520 CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
521 if (auto KindCost = Entry->Cost[CostKind])
522 return LT.first * *KindCost;
523
524 static const CostKindTblEntry AVX512BWConstCostTable[] = {
525 { ISD::SDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence
526 { ISD::SREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
527 { ISD::UDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence
528 { ISD::UREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
529
530 { ISD::SDIV, MVT::v32i16, { 6 } }, // vpmulhw sequence
531 { ISD::SREM, MVT::v32i16, { 8 } }, // vpmulhw+mul+sub sequence
532 { ISD::UDIV, MVT::v32i16, { 6 } }, // vpmulhuw sequence
533 { ISD::UREM, MVT::v32i16, { 8 } }, // vpmulhuw+mul+sub sequence
534 };
535
536 if (Op2Info.isConstant() && ST->hasBWI())
537 if (const auto *Entry =
538 CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
539 if (auto KindCost = Entry->Cost[CostKind])
540 return LT.first * *KindCost;
541
542 static const CostKindTblEntry AVX512ConstCostTable[] = {
543 { ISD::SDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence
544 { ISD::SREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
545 { ISD::UDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence
546 { ISD::UREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
547
548 { ISD::SDIV, MVT::v32i16, { 12 } }, // 2*vpmulhw sequence
549 { ISD::SREM, MVT::v32i16, { 16 } }, // 2*vpmulhw+mul+sub sequence
550 { ISD::UDIV, MVT::v32i16, { 12 } }, // 2*vpmulhuw sequence
551 { ISD::UREM, MVT::v32i16, { 16 } }, // 2*vpmulhuw+mul+sub sequence
552
553 { ISD::SDIV, MVT::v16i32, { 15 } }, // vpmuldq sequence
554 { ISD::SREM, MVT::v16i32, { 17 } }, // vpmuldq+mul+sub sequence
555 { ISD::UDIV, MVT::v16i32, { 15 } }, // vpmuludq sequence
556 { ISD::UREM, MVT::v16i32, { 17 } }, // vpmuludq+mul+sub sequence
557 };
558
559 if (Op2Info.isConstant() && ST->hasAVX512())
560 if (const auto *Entry =
561 CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
562 if (auto KindCost = Entry->Cost[CostKind])
563 return LT.first * *KindCost;
564
565 static const CostKindTblEntry AVX2ConstCostTable[] = {
566 { ISD::SDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence
567 { ISD::SREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
568 { ISD::UDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence
569 { ISD::UREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
570
571 { ISD::SDIV, MVT::v16i16, { 6 } }, // vpmulhw sequence
572 { ISD::SREM, MVT::v16i16, { 8 } }, // vpmulhw+mul+sub sequence
573 { ISD::UDIV, MVT::v16i16, { 6 } }, // vpmulhuw sequence
574 { ISD::UREM, MVT::v16i16, { 8 } }, // vpmulhuw+mul+sub sequence
575
576 { ISD::SDIV, MVT::v8i32, { 15 } }, // vpmuldq sequence
577 { ISD::SREM, MVT::v8i32, { 19 } }, // vpmuldq+mul+sub sequence
578 { ISD::UDIV, MVT::v8i32, { 15 } }, // vpmuludq sequence
579 { ISD::UREM, MVT::v8i32, { 19 } }, // vpmuludq+mul+sub sequence
580 };
581
582 if (Op2Info.isConstant() && ST->hasAVX2())
583 if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
584 if (auto KindCost = Entry->Cost[CostKind])
585 return LT.first * *KindCost;
586
587 static const CostKindTblEntry AVXConstCostTable[] = {
588 { ISD::SDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split.
589 { ISD::SREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
590 { ISD::UDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split.
591 { ISD::UREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
592
593 { ISD::SDIV, MVT::v16i16, { 14 } }, // 2*pmulhw sequence + split.
594 { ISD::SREM, MVT::v16i16, { 18 } }, // 2*pmulhw+mul+sub sequence + split.
595 { ISD::UDIV, MVT::v16i16, { 14 } }, // 2*pmulhuw sequence + split.
596 { ISD::UREM, MVT::v16i16, { 18 } }, // 2*pmulhuw+mul+sub sequence + split.
597
598 { ISD::SDIV, MVT::v8i32, { 32 } }, // vpmuludq sequence
599 { ISD::SREM, MVT::v8i32, { 38 } }, // vpmuludq+mul+sub sequence
600 { ISD::UDIV, MVT::v8i32, { 32 } }, // 2*pmuludq sequence + split.
601 { ISD::UREM, MVT::v8i32, { 42 } }, // 2*pmuludq+mul+sub sequence + split.
602 };
603
604 if (Op2Info.isConstant() && ST->hasAVX())
605 if (const auto *Entry = CostTableLookup(AVXConstCostTable, ISD, LT.second))
606 if (auto KindCost = Entry->Cost[CostKind])
607 return LT.first * *KindCost;
608
609 static const CostKindTblEntry SSE41ConstCostTable[] = {
610 { ISD::SDIV, MVT::v4i32, { 15 } }, // vpmuludq sequence
611 { ISD::SREM, MVT::v4i32, { 20 } }, // vpmuludq+mul+sub sequence
612 };
613
614 if (Op2Info.isConstant() && ST->hasSSE41())
615 if (const auto *Entry =
616 CostTableLookup(SSE41ConstCostTable, ISD, LT.second))
617 if (auto KindCost = Entry->Cost[CostKind])
618 return LT.first * *KindCost;
619
620 static const CostKindTblEntry SSE2ConstCostTable[] = {
621 { ISD::SDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
622 { ISD::SREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
623 { ISD::UDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
624 { ISD::UREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
625
626 { ISD::SDIV, MVT::v8i16, { 6 } }, // pmulhw sequence
627 { ISD::SREM, MVT::v8i16, { 8 } }, // pmulhw+mul+sub sequence
628 { ISD::UDIV, MVT::v8i16, { 6 } }, // pmulhuw sequence
629 { ISD::UREM, MVT::v8i16, { 8 } }, // pmulhuw+mul+sub sequence
630
631 { ISD::SDIV, MVT::v4i32, { 19 } }, // pmuludq sequence
632 { ISD::SREM, MVT::v4i32, { 24 } }, // pmuludq+mul+sub sequence
633 { ISD::UDIV, MVT::v4i32, { 15 } }, // pmuludq sequence
634 { ISD::UREM, MVT::v4i32, { 20 } }, // pmuludq+mul+sub sequence
635 };
636
637 if (Op2Info.isConstant() && ST->hasSSE2())
638 if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
639 if (auto KindCost = Entry->Cost[CostKind])
640 return LT.first * *KindCost;
641
642 static const CostKindTblEntry AVX512BWUniformCostTable[] = {
643 { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand.
644 { ISD::SRL, MVT::v16i8, { 3,10, 5, 8 } }, // psrlw + pand.
645 { ISD::SRA, MVT::v16i8, { 4,12, 8,12 } }, // psrlw, pand, pxor, psubb.
646 { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand.
647 { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand.
648 { ISD::SRA, MVT::v32i8, { 5,10,10,13 } }, // psrlw, pand, pxor, psubb.
649 { ISD::SHL, MVT::v64i8, { 4, 7, 6, 8 } }, // psllw + pand.
650 { ISD::SRL, MVT::v64i8, { 4, 8, 7,10 } }, // psrlw + pand.
651 { ISD::SRA, MVT::v64i8, { 5,10,10,15 } }, // psrlw, pand, pxor, psubb.
652
653 { ISD::SHL, MVT::v32i16, { 2, 4, 2, 3 } }, // psllw
654 { ISD::SRL, MVT::v32i16, { 2, 4, 2, 3 } }, // psrlw
655 { ISD::SRA, MVT::v32i16, { 2, 4, 2, 3 } }, // psraw
656 };
657
658 if (ST->hasBWI() && Op2Info.isUniform())
659 if (const auto *Entry =
660 CostTableLookup(AVX512BWUniformCostTable, ISD, LT.second))
661 if (auto KindCost = Entry->Cost[CostKind])
662 return LT.first * *KindCost;
663
664 static const CostKindTblEntry AVX512UniformCostTable[] = {
665 { ISD::SHL, MVT::v32i16, { 5,10, 5, 7 } }, // psllw + split.
666 { ISD::SRL, MVT::v32i16, { 5,10, 5, 7 } }, // psrlw + split.
667 { ISD::SRA, MVT::v32i16, { 5,10, 5, 7 } }, // psraw + split.
668
669 { ISD::SHL, MVT::v16i32, { 2, 4, 2, 3 } }, // pslld
670 { ISD::SRL, MVT::v16i32, { 2, 4, 2, 3 } }, // psrld
671 { ISD::SRA, MVT::v16i32, { 2, 4, 2, 3 } }, // psrad
672
673 { ISD::SRA, MVT::v2i64, { 1, 2, 1, 2 } }, // psraq
674 { ISD::SHL, MVT::v4i64, { 1, 4, 1, 2 } }, // psllq
675 { ISD::SRL, MVT::v4i64, { 1, 4, 1, 2 } }, // psrlq
676 { ISD::SRA, MVT::v4i64, { 1, 4, 1, 2 } }, // psraq
677 { ISD::SHL, MVT::v8i64, { 1, 4, 1, 2 } }, // psllq
678 { ISD::SRL, MVT::v8i64, { 1, 4, 1, 2 } }, // psrlq
679 { ISD::SRA, MVT::v8i64, { 1, 4, 1, 2 } }, // psraq
680 };
681
682 if (ST->hasAVX512() && Op2Info.isUniform())
683 if (const auto *Entry =
684 CostTableLookup(AVX512UniformCostTable, ISD, LT.second))
685 if (auto KindCost = Entry->Cost[CostKind])
686 return LT.first * *KindCost;
687
688 static const CostKindTblEntry AVX2UniformCostTable[] = {
689 // Uniform splats are cheaper for the following instructions.
690 { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand.
691 { ISD::SRL, MVT::v16i8, { 3, 9, 5, 8 } }, // psrlw + pand.
692 { ISD::SRA, MVT::v16i8, { 4, 5, 9,13 } }, // psrlw, pand, pxor, psubb.
693 { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand.
694 { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand.
695 { ISD::SRA, MVT::v32i8, { 6, 9,11,16 } }, // psrlw, pand, pxor, psubb.
696
697 { ISD::SHL, MVT::v8i16, { 1, 2, 1, 2 } }, // psllw.
698 { ISD::SRL, MVT::v8i16, { 1, 2, 1, 2 } }, // psrlw.
699 { ISD::SRA, MVT::v8i16, { 1, 2, 1, 2 } }, // psraw.
700 { ISD::SHL, MVT::v16i16, { 2, 4, 2, 3 } }, // psllw.
701 { ISD::SRL, MVT::v16i16, { 2, 4, 2, 3 } }, // psrlw.
702 { ISD::SRA, MVT::v16i16, { 2, 4, 2, 3 } }, // psraw.
703
704 { ISD::SHL, MVT::v4i32, { 1, 2, 1, 2 } }, // pslld
705 { ISD::SRL, MVT::v4i32, { 1, 2, 1, 2 } }, // psrld
706 { ISD::SRA, MVT::v4i32, { 1, 2, 1, 2 } }, // psrad
707 { ISD::SHL, MVT::v8i32, { 2, 4, 2, 3 } }, // pslld
708 { ISD::SRL, MVT::v8i32, { 2, 4, 2, 3 } }, // psrld
709 { ISD::SRA, MVT::v8i32, { 2, 4, 2, 3 } }, // psrad
710
711 { ISD::SHL, MVT::v2i64, { 1, 2, 1, 2 } }, // psllq
712 { ISD::SRL, MVT::v2i64, { 1, 2, 1, 2 } }, // psrlq
713 { ISD::SRA, MVT::v2i64, { 2, 4, 5, 7 } }, // 2 x psrad + shuffle.
714 { ISD::SHL, MVT::v4i64, { 2, 4, 1, 2 } }, // psllq
715 { ISD::SRL, MVT::v4i64, { 2, 4, 1, 2 } }, // psrlq
716 { ISD::SRA, MVT::v4i64, { 4, 6, 5, 9 } }, // 2 x psrad + shuffle.
717 };
718
719 if (ST->hasAVX2() && Op2Info.isUniform())
720 if (const auto *Entry =
721 CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
722 if (auto KindCost = Entry->Cost[CostKind])
723 return LT.first * *KindCost;
724
725 static const CostKindTblEntry AVXUniformCostTable[] = {
726 { ISD::SHL, MVT::v16i8, { 4, 4, 6, 8 } }, // psllw + pand.
727 { ISD::SRL, MVT::v16i8, { 4, 8, 5, 8 } }, // psrlw + pand.
728 { ISD::SRA, MVT::v16i8, { 6, 6, 9,13 } }, // psrlw, pand, pxor, psubb.
729 { ISD::SHL, MVT::v32i8, { 7, 8,11,14 } }, // psllw + pand + split.
730 { ISD::SRL, MVT::v32i8, { 7, 9,10,14 } }, // psrlw + pand + split.
731 { ISD::SRA, MVT::v32i8, { 10,11,16,21 } }, // psrlw, pand, pxor, psubb + split.
732
733 { ISD::SHL, MVT::v8i16, { 1, 3, 1, 2 } }, // psllw.
734 { ISD::SRL, MVT::v8i16, { 1, 3, 1, 2 } }, // psrlw.
735 { ISD::SRA, MVT::v8i16, { 1, 3, 1, 2 } }, // psraw.
736 { ISD::SHL, MVT::v16i16, { 3, 7, 5, 7 } }, // psllw + split.
737 { ISD::SRL, MVT::v16i16, { 3, 7, 5, 7 } }, // psrlw + split.
738 { ISD::SRA, MVT::v16i16, { 3, 7, 5, 7 } }, // psraw + split.
739
740 { ISD::SHL, MVT::v4i32, { 1, 3, 1, 2 } }, // pslld.
741 { ISD::SRL, MVT::v4i32, { 1, 3, 1, 2 } }, // psrld.
742 { ISD::SRA, MVT::v4i32, { 1, 3, 1, 2 } }, // psrad.
743 { ISD::SHL, MVT::v8i32, { 3, 7, 5, 7 } }, // pslld + split.
744 { ISD::SRL, MVT::v8i32, { 3, 7, 5, 7 } }, // psrld + split.
745 { ISD::SRA, MVT::v8i32, { 3, 7, 5, 7 } }, // psrad + split.
746
747 { ISD::SHL, MVT::v2i64, { 1, 3, 1, 2 } }, // psllq.
748 { ISD::SRL, MVT::v2i64, { 1, 3, 1, 2 } }, // psrlq.
749 { ISD::SRA, MVT::v2i64, { 3, 4, 5, 7 } }, // 2 x psrad + shuffle.
750 { ISD::SHL, MVT::v4i64, { 3, 7, 4, 6 } }, // psllq + split.
751 { ISD::SRL, MVT::v4i64, { 3, 7, 4, 6 } }, // psrlq + split.
752 { ISD::SRA, MVT::v4i64, { 6, 7,10,13 } }, // 2 x (2 x psrad + shuffle) + split.
753 };
754
755 // XOP has faster vXi8 shifts.
756 if (ST->hasAVX() && Op2Info.isUniform() &&
757 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
758 if (const auto *Entry =
759 CostTableLookup(AVXUniformCostTable, ISD, LT.second))
760 if (auto KindCost = Entry->Cost[CostKind])
761 return LT.first * *KindCost;
762
763 static const CostKindTblEntry SSE2UniformCostTable[] = {
764 // Uniform splats are cheaper for the following instructions.
765 { ISD::SHL, MVT::v16i8, { 9, 10, 6, 9 } }, // psllw + pand.
766 { ISD::SRL, MVT::v16i8, { 9, 13, 5, 9 } }, // psrlw + pand.
767 { ISD::SRA, MVT::v16i8, { 11, 15, 9,13 } }, // pcmpgtb sequence.
768
769 { ISD::SHL, MVT::v8i16, { 2, 2, 1, 2 } }, // psllw.
770 { ISD::SRL, MVT::v8i16, { 2, 2, 1, 2 } }, // psrlw.
771 { ISD::SRA, MVT::v8i16, { 2, 2, 1, 2 } }, // psraw.
772
773 { ISD::SHL, MVT::v4i32, { 2, 2, 1, 2 } }, // pslld
774 { ISD::SRL, MVT::v4i32, { 2, 2, 1, 2 } }, // psrld.
775 { ISD::SRA, MVT::v4i32, { 2, 2, 1, 2 } }, // psrad.
776
777 { ISD::SHL, MVT::v2i64, { 2, 2, 1, 2 } }, // psllq.
778 { ISD::SRL, MVT::v2i64, { 2, 2, 1, 2 } }, // psrlq.
779 { ISD::SRA, MVT::v2i64, { 5, 9, 5, 7 } }, // 2*psrlq + xor + sub.
780 };
781
782 if (ST->hasSSE2() && Op2Info.isUniform() &&
783 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
784 if (const auto *Entry =
785 CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
786 if (auto KindCost = Entry->Cost[CostKind])
787 return LT.first * *KindCost;
788
789 static const CostKindTblEntry AVX512DQCostTable[] = {
790 { ISD::MUL, MVT::v2i64, { 2, 15, 1, 3 } }, // pmullq
791 { ISD::MUL, MVT::v4i64, { 2, 15, 1, 3 } }, // pmullq
792 { ISD::MUL, MVT::v8i64, { 3, 15, 1, 3 } } // pmullq
793 };
794
795 // Look for AVX512DQ lowering tricks for custom cases.
796 if (ST->hasDQI())
797 if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
798 if (auto KindCost = Entry->Cost[CostKind])
799 return LT.first * *KindCost;
800
801 static const CostKindTblEntry AVX512BWCostTable[] = {
802 { ISD::SHL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsllvw/pack sequence.
803 { ISD::SRL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsrlvw/pack sequence.
804 { ISD::SRA, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsravw/pack sequence.
805 { ISD::SHL, MVT::v32i8, { 4, 23,11,16 } }, // extend/vpsllvw/pack sequence.
806 { ISD::SRL, MVT::v32i8, { 4, 30,12,18 } }, // extend/vpsrlvw/pack sequence.
807 { ISD::SRA, MVT::v32i8, { 6, 13,24,30 } }, // extend/vpsravw/pack sequence.
808 { ISD::SHL, MVT::v64i8, { 6, 19,13,15 } }, // extend/vpsllvw/pack sequence.
809 { ISD::SRL, MVT::v64i8, { 7, 27,15,18 } }, // extend/vpsrlvw/pack sequence.
810 { ISD::SRA, MVT::v64i8, { 15, 15,30,30 } }, // extend/vpsravw/pack sequence.
811
812 { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsllvw
813 { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsrlvw
814 { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsravw
815 { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsllvw
816 { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsrlvw
817 { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsravw
818 { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsllvw
819 { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsrlvw
820 { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsravw
821
822 { ISD::ADD, MVT::v64i8, { 1, 1, 1, 1 } }, // paddb
823 { ISD::ADD, MVT::v32i16, { 1, 1, 1, 1 } }, // paddw
824
825 { ISD::ADD, MVT::v32i8, { 1, 1, 1, 1 } }, // paddb
826 { ISD::ADD, MVT::v16i16, { 1, 1, 1, 1 } }, // paddw
827 { ISD::ADD, MVT::v8i32, { 1, 1, 1, 1 } }, // paddd
828 { ISD::ADD, MVT::v4i64, { 1, 1, 1, 1 } }, // paddq
829
830 { ISD::SUB, MVT::v64i8, { 1, 1, 1, 1 } }, // psubb
831 { ISD::SUB, MVT::v32i16, { 1, 1, 1, 1 } }, // psubw
832
833 { ISD::MUL, MVT::v64i8, { 5, 10,10,11 } },
834 { ISD::MUL, MVT::v32i16, { 1, 5, 1, 1 } }, // pmullw
835
836 { ISD::SUB, MVT::v32i8, { 1, 1, 1, 1 } }, // psubb
837 { ISD::SUB, MVT::v16i16, { 1, 1, 1, 1 } }, // psubw
838 { ISD::SUB, MVT::v8i32, { 1, 1, 1, 1 } }, // psubd
839 { ISD::SUB, MVT::v4i64, { 1, 1, 1, 1 } }, // psubq
840 };
841
842 // Look for AVX512BW lowering tricks for custom cases.
843 if (ST->hasBWI())
844 if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
845 if (auto KindCost = Entry->Cost[CostKind])
846 return LT.first * *KindCost;
847
848 static const CostKindTblEntry AVX512CostTable[] = {
849 { ISD::SHL, MVT::v64i8, { 15, 19,27,33 } }, // vpblendv+split sequence.
850 { ISD::SRL, MVT::v64i8, { 15, 19,30,36 } }, // vpblendv+split sequence.
851 { ISD::SRA, MVT::v64i8, { 37, 37,51,63 } }, // vpblendv+split sequence.
852
853 { ISD::SHL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence.
854 { ISD::SRL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence.
855 { ISD::SRA, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsravd/pack sequence.
856
857 { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } },
858 { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } },
859 { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } },
860 { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } },
861 { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } },
862 { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } },
863 { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } },
864 { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } },
865 { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } },
866
867 { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } },
868 { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } },
869 { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } },
870 { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } },
871 { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } },
872 { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } },
873 { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } },
874 { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } },
875 { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } },
876
877 { ISD::ADD, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*paddb + split
878 { ISD::ADD, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*paddw + split
879
880 { ISD::SUB, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*psubb + split
881 { ISD::SUB, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*psubw + split
882
883 { ISD::AND, MVT::v32i8, { 1, 1, 1, 1 } },
884 { ISD::AND, MVT::v16i16, { 1, 1, 1, 1 } },
885 { ISD::AND, MVT::v8i32, { 1, 1, 1, 1 } },
886 { ISD::AND, MVT::v4i64, { 1, 1, 1, 1 } },
887
888 { ISD::OR, MVT::v32i8, { 1, 1, 1, 1 } },
889 { ISD::OR, MVT::v16i16, { 1, 1, 1, 1 } },
890 { ISD::OR, MVT::v8i32, { 1, 1, 1, 1 } },
891 { ISD::OR, MVT::v4i64, { 1, 1, 1, 1 } },
892
893 { ISD::XOR, MVT::v32i8, { 1, 1, 1, 1 } },
894 { ISD::XOR, MVT::v16i16, { 1, 1, 1, 1 } },
895 { ISD::XOR, MVT::v8i32, { 1, 1, 1, 1 } },
896 { ISD::XOR, MVT::v4i64, { 1, 1, 1, 1 } },
897
898 { ISD::MUL, MVT::v16i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
899 { ISD::MUL, MVT::v8i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
900 { ISD::MUL, MVT::v4i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
901 { ISD::MUL, MVT::v8i64, { 6, 9, 8, 8 } }, // 3*pmuludq/3*shift/2*add
902 { ISD::MUL, MVT::i64, { 1 } }, // Skylake from http://www.agner.org/
903
904 { X86ISD::PMULUDQ, MVT::v8i64, { 1, 5, 1, 1 } },
905
906 { ISD::FNEG, MVT::v8f64, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
907 { ISD::FADD, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
908 { ISD::FADD, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
909 { ISD::FSUB, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
910 { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
911 { ISD::FMUL, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
912 { ISD::FMUL, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
913 { ISD::FMUL, MVT::v2f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
914 { ISD::FMUL, MVT::f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
915
916 { ISD::FDIV, MVT::f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
917 { ISD::FDIV, MVT::v2f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
918 { ISD::FDIV, MVT::v4f64, { 8, 14, 1, 1 } }, // Skylake from http://www.agner.org/
919 { ISD::FDIV, MVT::v8f64, { 16, 23, 1, 3 } }, // Skylake from http://www.agner.org/
920
921 { ISD::FNEG, MVT::v16f32, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
922 { ISD::FADD, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
923 { ISD::FADD, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
924 { ISD::FSUB, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
925 { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
926 { ISD::FMUL, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
927 { ISD::FMUL, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
928 { ISD::FMUL, MVT::v4f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
929 { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
930
931 { ISD::FDIV, MVT::f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
932 { ISD::FDIV, MVT::v4f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
933 { ISD::FDIV, MVT::v8f32, { 5, 11, 1, 1 } }, // Skylake from http://www.agner.org/
934 { ISD::FDIV, MVT::v16f32, { 10, 18, 1, 3 } }, // Skylake from http://www.agner.org/
935 };
936
937 if (ST->hasAVX512())
938 if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
939 if (auto KindCost = Entry->Cost[CostKind])
940 return LT.first * *KindCost;
941
942 static const CostKindTblEntry AVX2ShiftCostTable[] = {
943 // Shifts on vXi64/vXi32 on AVX2 are legal even though we declare them as
944 // custom, so that we can detect the cases where the shift amount is a scalar.
945 { ISD::SHL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsllvd (Haswell from agner.org)
946 { ISD::SRL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
947 { ISD::SRA, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsravd (Haswell from agner.org)
948 { ISD::SHL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsllvd (Haswell from agner.org)
949 { ISD::SRL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
950 { ISD::SRA, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsravd (Haswell from agner.org)
951 { ISD::SHL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsllvq (Haswell from agner.org)
952 { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsrlvq (Haswell from agner.org)
953 { ISD::SHL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsllvq (Haswell from agner.org)
954 { ISD::SRL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsrlvq (Haswell from agner.org)
955 };
956
957 if (ST->hasAVX512()) {
958 if (ISD == ISD::SHL && LT.second == MVT::v32i16 && Op2Info.isConstant())
959 // On AVX512, a packed v32i16 shift left by a constant build_vector
960 // is lowered into a vector multiply (vpmullw).
961 return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
962 Op1Info.getNoProps(), Op2Info.getNoProps());
963 }
964
965 // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
966 if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
967 if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
968 Op2Info.isConstant())
969 // On AVX2, a packed v16i16 shift left by a constant build_vector
970 // is lowered into a vector multiply (vpmullw).
971 return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
972 Op1Info.getNoProps(), Op2Info.getNoProps());
973
974 if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
975 if (auto KindCost = Entry->Cost[CostKind])
976 return LT.first * *KindCost;
977 }
978
979 static const CostKindTblEntry XOPShiftCostTable[] = {
980 // 128bit shifts take 1cy, but right shifts require negation beforehand.
981 { ISD::SHL, MVT::v16i8, { 1, 3, 1, 1 } },
982 { ISD::SRL, MVT::v16i8, { 2, 3, 1, 1 } },
983 { ISD::SRA, MVT::v16i8, { 2, 3, 1, 1 } },
984 { ISD::SHL, MVT::v8i16, { 1, 3, 1, 1 } },
985 { ISD::SRL, MVT::v8i16, { 2, 3, 1, 1 } },
986 { ISD::SRA, MVT::v8i16, { 2, 3, 1, 1 } },
987 { ISD::SHL, MVT::v4i32, { 1, 3, 1, 1 } },
988 { ISD::SRL, MVT::v4i32, { 2, 3, 1, 1 } },
989 { ISD::SRA, MVT::v4i32, { 2, 3, 1, 1 } },
990 { ISD::SHL, MVT::v2i64, { 1, 3, 1, 1 } },
991 { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } },
992 { ISD::SRA, MVT::v2i64, { 2, 3, 1, 1 } },
993 // 256bit shifts require splitting if AVX2 didn't catch them above.
994 { ISD::SHL, MVT::v32i8, { 4, 7, 5, 6 } },
995 { ISD::SRL, MVT::v32i8, { 6, 7, 5, 6 } },
996 { ISD::SRA, MVT::v32i8, { 6, 7, 5, 6 } },
997 { ISD::SHL, MVT::v16i16, { 4, 7, 5, 6 } },
998 { ISD::SRL, MVT::v16i16, { 6, 7, 5, 6 } },
999 { ISD::SRA, MVT::v16i16, { 6, 7, 5, 6 } },
1000 { ISD::SHL, MVT::v8i32, { 4, 7, 5, 6 } },
1001 { ISD::SRL, MVT::v8i32, { 6, 7, 5, 6 } },
1002 { ISD::SRA, MVT::v8i32, { 6, 7, 5, 6 } },
1003 { ISD::SHL, MVT::v4i64, { 4, 7, 5, 6 } },
1004 { ISD::SRL, MVT::v4i64, { 6, 7, 5, 6 } },
1005 { ISD::SRA, MVT::v4i64, { 6, 7, 5, 6 } },
1006 };
1007
1008 // Look for XOP lowering tricks.
1009 if (ST->hasXOP()) {
1010 // If the right shift is constant then we'll fold the negation so
1011 // it's as cheap as a left shift.
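 // (XOP right shifts are done as variable left shifts by a negated amount,
 // so a constant shift-amount vector can be negated at compile time.)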
1012 int ShiftISD = ISD;
1013 if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) && Op2Info.isConstant())
1014 ShiftISD = ISD::SHL;
1015 if (const auto *Entry =
1016 CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
1017 if (auto KindCost = Entry->Cost[CostKind])
1018 return LT.first * *KindCost;
1019 }
1020
1021 if (ISD == ISD::SHL && !Op2Info.isUniform() && Op2Info.isConstant()) {
1022 MVT VT = LT.second;
1023 // Vector shift left by a non-uniform constant can be lowered
1024 // into a vector multiply.
1025 if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
1026 ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
1027 ISD = ISD::MUL;
1028 }
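 // E.g. shl <8 x i16> %x, <i16 1, i16 2, ...> is costed below as the
 // equivalent mul %x, <i16 2, i16 4, ...>.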
1029
1030 static const CostKindTblEntry GLMCostTable[] = {
1031 { ISD::FDIV, MVT::f32, { 18, 19, 1, 1 } }, // divss
1032 { ISD::FDIV, MVT::v4f32, { 35, 36, 1, 1 } }, // divps
1033 { ISD::FDIV, MVT::f64, { 33, 34, 1, 1 } }, // divsd
1034 { ISD::FDIV, MVT::v2f64, { 65, 66, 1, 1 } }, // divpd
1035 };
1036
1037 if (ST->useGLMDivSqrtCosts())
1038 if (const auto *Entry = CostTableLookup(GLMCostTable, ISD, LT.second))
1039 if (auto KindCost = Entry->Cost[CostKind])
1040 return LT.first * *KindCost;
1041
1042 static const CostKindTblEntry SLMCostTable[] = {
1043 { ISD::MUL, MVT::v4i32, { 11, 11, 1, 7 } }, // pmulld
1044 { ISD::MUL, MVT::v8i16, { 2, 5, 1, 1 } }, // pmullw
1045 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // mulsd
1046 { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // mulss
1047 { ISD::FMUL, MVT::v2f64, { 4, 7, 1, 1 } }, // mulpd
1048 { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // mulps
1049 { ISD::FDIV, MVT::f32, { 17, 19, 1, 1 } }, // divss
1050 { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 6 } }, // divps
1051 { ISD::FDIV, MVT::f64, { 32, 34, 1, 1 } }, // divsd
1052 { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 6 } }, // divpd
1053 { ISD::FADD, MVT::v2f64, { 2, 4, 1, 1 } }, // addpd
1054 { ISD::FSUB, MVT::v2f64, { 2, 4, 1, 1 } }, // subpd
1055 // v2i64/v4i64 mul is custom lowered as a series of long
1056 // multiplies(3), shifts(3) and adds(2).
1057 // slm pmuludq throughput is 2 and addq throughput is 4,
1058 // thus: 3x2 (pmuludq throughput) + 3x1 (shift throughput) +
1059 // 2x4 (addq throughput) = 17
1060 { ISD::MUL, MVT::v2i64, { 17, 22, 9, 9 } },
1061 // slm addq\subq throughput is 4
1062 { ISD::ADD, MVT::v2i64, { 4, 2, 1, 2 } },
1063 { ISD::SUB, MVT::v2i64, { 4, 2, 1, 2 } },
1064 };
1065
1066 if (ST->useSLMArithCosts())
1067 if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, LT.second))
1068 if (auto KindCost = Entry->Cost[CostKind])
1069 return LT.first * *KindCost;
1070
1071 static const CostKindTblEntry AVX2CostTable[] = {
1072 { ISD::SHL, MVT::v16i8, { 6, 21,11,16 } }, // vpblendvb sequence.
1073 { ISD::SHL, MVT::v32i8, { 6, 23,11,22 } }, // vpblendvb sequence.
1074 { ISD::SHL, MVT::v8i16, { 5, 18, 5,10 } }, // extend/vpsrlvd/pack sequence.
1075 { ISD::SHL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsrlvd/pack sequence.
1076
1077 { ISD::SRL, MVT::v16i8, { 6, 27,12,18 } }, // vpblendvb sequence.
1078 { ISD::SRL, MVT::v32i8, { 8, 30,12,24 } }, // vpblendvb sequence.
1079 { ISD::SRL, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsrlvd/pack sequence.
1080 { ISD::SRL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsrlvd/pack sequence.
1081
1082 { ISD::SRA, MVT::v16i8, { 17, 17,24,30 } }, // vpblendvb sequence.
1083 { ISD::SRA, MVT::v32i8, { 18, 20,24,43 } }, // vpblendvb sequence.
1084 { ISD::SRA, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsravd/pack sequence.
1085 { ISD::SRA, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsravd/pack sequence.
1086 { ISD::SRA, MVT::v2i64, { 4, 5, 5, 5 } }, // srl/xor/sub sequence.
1087 { ISD::SRA, MVT::v4i64, { 8, 8, 5, 9 } }, // srl/xor/sub sequence.
1088
1089 { ISD::SUB, MVT::v32i8, { 1, 1, 1, 2 } }, // psubb
1090 { ISD::ADD, MVT::v32i8, { 1, 1, 1, 2 } }, // paddb
1091 { ISD::SUB, MVT::v16i16, { 1, 1, 1, 2 } }, // psubw
1092 { ISD::ADD, MVT::v16i16, { 1, 1, 1, 2 } }, // paddw
1093 { ISD::SUB, MVT::v8i32, { 1, 1, 1, 2 } }, // psubd
1094 { ISD::ADD, MVT::v8i32, { 1, 1, 1, 2 } }, // paddd
1095 { ISD::SUB, MVT::v4i64, { 1, 1, 1, 2 } }, // psubq
1096 { ISD::ADD, MVT::v4i64, { 1, 1, 1, 2 } }, // paddq
1097
1098 { ISD::MUL, MVT::v16i8, { 5, 18, 6,12 } }, // extend/pmullw/pack
1099 { ISD::MUL, MVT::v32i8, { 6, 11,10,19 } }, // unpack/pmullw
1100 { ISD::MUL, MVT::v16i16, { 2, 5, 1, 2 } }, // pmullw
1101 { ISD::MUL, MVT::v8i32, { 4, 10, 1, 2 } }, // pmulld
1102 { ISD::MUL, MVT::v4i32, { 2, 10, 1, 2 } }, // pmulld
1103 { ISD::MUL, MVT::v4i64, { 6, 10, 8,13 } }, // 3*pmuludq/3*shift/2*add
1104 { ISD::MUL, MVT::v2i64, { 6, 10, 8, 8 } }, // 3*pmuludq/3*shift/2*add
1105
1106 { X86ISD::PMULUDQ, MVT::v4i64, { 1, 5, 1, 1 } },
1107
1108 { ISD::FNEG, MVT::v4f64, { 1, 1, 1, 2 } }, // vxorpd
1109 { ISD::FNEG, MVT::v8f32, { 1, 1, 1, 2 } }, // vxorps
1110
1111 { ISD::FADD, MVT::f64, { 1, 4, 1, 1 } }, // vaddsd
1112 { ISD::FADD, MVT::f32, { 1, 4, 1, 1 } }, // vaddss
1113 { ISD::FADD, MVT::v2f64, { 1, 4, 1, 1 } }, // vaddpd
1114 { ISD::FADD, MVT::v4f32, { 1, 4, 1, 1 } }, // vaddps
1115 { ISD::FADD, MVT::v4f64, { 1, 4, 1, 2 } }, // vaddpd
1116 { ISD::FADD, MVT::v8f32, { 1, 4, 1, 2 } }, // vaddps
1117
1118 { ISD::FSUB, MVT::f64, { 1, 4, 1, 1 } }, // vsubsd
1119 { ISD::FSUB, MVT::f32, { 1, 4, 1, 1 } }, // vsubss
1120 { ISD::FSUB, MVT::v2f64, { 1, 4, 1, 1 } }, // vsubpd
1121 { ISD::FSUB, MVT::v4f32, { 1, 4, 1, 1 } }, // vsubps
1122 { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 2 } }, // vsubpd
1123 { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 2 } }, // vsubps
1124
1125 { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // vmulsd
1126 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // vmulss
1127 { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // vmulpd
1128 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // vmulps
1129 { ISD::FMUL, MVT::v4f64, { 1, 5, 1, 2 } }, // vmulpd
1130 { ISD::FMUL, MVT::v8f32, { 1, 5, 1, 2 } }, // vmulps
1131
1132 { ISD::FDIV, MVT::f32, { 7, 13, 1, 1 } }, // vdivss
1133 { ISD::FDIV, MVT::v4f32, { 7, 13, 1, 1 } }, // vdivps
1134 { ISD::FDIV, MVT::v8f32, { 14, 21, 1, 3 } }, // vdivps
1135 { ISD::FDIV, MVT::f64, { 14, 20, 1, 1 } }, // vdivsd
1136 { ISD::FDIV, MVT::v2f64, { 14, 20, 1, 1 } }, // vdivpd
1137 { ISD::FDIV, MVT::v4f64, { 28, 35, 1, 3 } }, // vdivpd
1138 };
1139
1140 // Look for AVX2 lowering tricks for custom cases.
1141 if (ST->hasAVX2())
1142 if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
1143 if (auto KindCost = Entry->Cost[CostKind])
1144 return LT.first * *KindCost;
1145
1146 static const CostKindTblEntry AVX1CostTable[] = {
1147 // We don't have to scalarize unsupported ops. We can issue two half-sized
1148 // operations and we only need to extract the upper YMM half.
1149 // Two ops + 1 extract + 1 insert = 4.
1150 { ISD::MUL, MVT::v32i8, { 12, 13, 22, 23 } }, // unpack/pmullw + split
1151 { ISD::MUL, MVT::v16i16, { 4, 8, 5, 6 } }, // pmullw + split
1152 { ISD::MUL, MVT::v8i32, { 5, 8, 5, 10 } }, // pmulld + split
1153 { ISD::MUL, MVT::v4i32, { 2, 5, 1, 3 } }, // pmulld
1154 { ISD::MUL, MVT::v4i64, { 12, 15, 19, 20 } },
1155
1156 { ISD::AND, MVT::v32i8, { 1, 1, 1, 2 } }, // vandps
1157 { ISD::AND, MVT::v16i16, { 1, 1, 1, 2 } }, // vandps
1158 { ISD::AND, MVT::v8i32, { 1, 1, 1, 2 } }, // vandps
1159 { ISD::AND, MVT::v4i64, { 1, 1, 1, 2 } }, // vandps
1160
1161 { ISD::OR, MVT::v32i8, { 1, 1, 1, 2 } }, // vorps
1162 { ISD::OR, MVT::v16i16, { 1, 1, 1, 2 } }, // vorps
1163 { ISD::OR, MVT::v8i32, { 1, 1, 1, 2 } }, // vorps
1164 { ISD::OR, MVT::v4i64, { 1, 1, 1, 2 } }, // vorps
1165
1166 { ISD::XOR, MVT::v32i8, { 1, 1, 1, 2 } }, // vxorps
1167 { ISD::XOR, MVT::v16i16, { 1, 1, 1, 2 } }, // vxorps
1168 { ISD::XOR, MVT::v8i32, { 1, 1, 1, 2 } }, // vxorps
1169 { ISD::XOR, MVT::v4i64, { 1, 1, 1, 2 } }, // vxorps
1170
1171 { ISD::SUB, MVT::v32i8, { 4, 2, 5, 6 } }, // psubb + split
1172 { ISD::ADD, MVT::v32i8, { 4, 2, 5, 6 } }, // paddb + split
1173 { ISD::SUB, MVT::v16i16, { 4, 2, 5, 6 } }, // psubw + split
1174 { ISD::ADD, MVT::v16i16, { 4, 2, 5, 6 } }, // paddw + split
1175 { ISD::SUB, MVT::v8i32, { 4, 2, 5, 6 } }, // psubd + split
1176 { ISD::ADD, MVT::v8i32, { 4, 2, 5, 6 } }, // paddd + split
1177 { ISD::SUB, MVT::v4i64, { 4, 2, 5, 6 } }, // psubq + split
1178 { ISD::ADD, MVT::v4i64, { 4, 2, 5, 6 } }, // paddq + split
1179 { ISD::SUB, MVT::v2i64, { 1, 1, 1, 1 } }, // psubq
1180 { ISD::ADD, MVT::v2i64, { 1, 1, 1, 1 } }, // paddq
1181
1182 { ISD::SHL, MVT::v16i8, { 10, 21,11,17 } }, // pblendvb sequence.
1183 { ISD::SHL, MVT::v32i8, { 22, 22,27,40 } }, // pblendvb sequence + split.
1184 { ISD::SHL, MVT::v8i16, { 6, 9,11,11 } }, // pblendvb sequence.
1185 { ISD::SHL, MVT::v16i16, { 13, 16,24,25 } }, // pblendvb sequence + split.
1186 { ISD::SHL, MVT::v4i32, { 3, 11, 4, 6 } }, // pslld/paddd/cvttps2dq/pmulld
1187 { ISD::SHL, MVT::v8i32, { 9, 11,12,17 } }, // pslld/paddd/cvttps2dq/pmulld + split
1188 { ISD::SHL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
1189 { ISD::SHL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.
1190
1191 { ISD::SRL, MVT::v16i8, { 11, 27,12,18 } }, // pblendvb sequence.
1192 { ISD::SRL, MVT::v32i8, { 23, 23,30,43 } }, // pblendvb sequence + split.
1193 { ISD::SRL, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
1194 { ISD::SRL, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
1195 { ISD::SRL, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
1196 { ISD::SRL, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
1197 { ISD::SRL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
1198 { ISD::SRL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.
1199
1200 { ISD::SRA, MVT::v16i8, { 21, 22,24,36 } }, // pblendvb sequence.
1201 { ISD::SRA, MVT::v32i8, { 44, 45,51,76 } }, // pblendvb sequence + split.
1202 { ISD::SRA, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
1203 { ISD::SRA, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
1204 { ISD::SRA, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
1205 { ISD::SRA, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
1206 { ISD::SRA, MVT::v2i64, { 5, 6,10,14 } }, // Shift each lane + blend.
1207 { ISD::SRA, MVT::v4i64, { 12, 12,22,30 } }, // Shift each lane + blend + split.
1208
1209 { ISD::FNEG, MVT::v4f64, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
1210 { ISD::FNEG, MVT::v8f32, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
1211
1212 { ISD::FADD, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1213 { ISD::FADD, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1214 { ISD::FADD, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1215 { ISD::FADD, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1216 { ISD::FADD, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1217 { ISD::FADD, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1218
1219 { ISD::FSUB, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1220 { ISD::FSUB, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1221 { ISD::FSUB, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1222 { ISD::FSUB, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1223 { ISD::FSUB, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1224 { ISD::FSUB, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1225
1226 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1227 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1228 { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1229 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1230 { ISD::FMUL, MVT::v4f64, { 4, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
1231 { ISD::FMUL, MVT::v8f32, { 2, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
1232
1233 { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
1234 { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
1235 { ISD::FDIV, MVT::v8f32, { 28, 29, 1, 3 } }, // SNB from http://www.agner.org/
1236 { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
1237 { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
1238 { ISD::FDIV, MVT::v4f64, { 44, 45, 1, 3 } }, // SNB from http://www.agner.org/
1239 };
1240
1241 if (ST->hasAVX())
1242 if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
1243 if (auto KindCost = Entry->Cost[CostKind])
1244 return LT.first * *KindCost;
1245
1246 static const CostKindTblEntry SSE42CostTable[] = {
1247 { ISD::FADD, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1248 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1249 { ISD::FADD, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1250 { ISD::FADD, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1251
1252 { ISD::FSUB, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1253 { ISD::FSUB, MVT::f32 , { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1254 { ISD::FSUB, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1255 { ISD::FSUB, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1256
1257 { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1258 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1259 { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1260 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1261
1262 { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1263 { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1264 { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1265 { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1266
1267 { ISD::MUL, MVT::v2i64, { 6, 10,10,10 } } // 3*pmuludq/3*shift/2*add
1268 };
1269
1270 if (ST->hasSSE42())
1271 if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
1272 if (auto KindCost = Entry->Cost[CostKind])
1273 return LT.first * *KindCost;
1274
1275 static const CostKindTblEntry SSE41CostTable[] = {
1276 { ISD::SHL, MVT::v16i8, { 15, 24,17,22 } }, // pblendvb sequence.
1277 { ISD::SHL, MVT::v8i16, { 11, 14,11,11 } }, // pblendvb sequence.
1278 { ISD::SHL, MVT::v4i32, { 14, 20, 4,10 } }, // pslld/paddd/cvttps2dq/pmulld
1279
1280 { ISD::SRL, MVT::v16i8, { 16, 27,18,24 } }, // pblendvb sequence.
1281 { ISD::SRL, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1282 { ISD::SRL, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1283 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1284
1285 { ISD::SRA, MVT::v16i8, { 38, 41,30,36 } }, // pblendvb sequence.
1286 { ISD::SRA, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1287 { ISD::SRA, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1288 { ISD::SRA, MVT::v2i64, { 8, 17, 5, 7 } }, // splat+shuffle sequence.
1289
1290 { ISD::MUL, MVT::v16i8, { 5, 18,10,12 } }, // 2*unpack/2*pmullw/2*and/pack
1291 { ISD::MUL, MVT::v4i32, { 2, 11, 1, 1 } } // pmulld (Nehalem from agner.org)
1292 };
1293
1294 if (ST->hasSSE41())
1295 if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
1296 if (auto KindCost = Entry->Cost[CostKind])
1297 return LT.first * *KindCost;
1298
1299 static const CostKindTblEntry SSE2CostTable[] = {
1300 // We don't correctly identify costs of casts because they are marked as
1301 // custom.
1302 { ISD::SHL, MVT::v16i8, { 13, 21,26,28 } }, // cmpgtb sequence.
1303 { ISD::SHL, MVT::v8i16, { 24, 27,16,20 } }, // cmpgtw sequence.
1304 { ISD::SHL, MVT::v4i32, { 17, 19,10,12 } }, // pslld/paddd/cvttps2dq/pmuludq.
1305 { ISD::SHL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1306
1307 { ISD::SRL, MVT::v16i8, { 14, 28,27,30 } }, // cmpgtb sequence.
1308 { ISD::SRL, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1309 { ISD::SRL, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1310 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1311
1312 { ISD::SRA, MVT::v16i8, { 27, 30,54,54 } }, // unpacked cmpgtb sequence.
1313 { ISD::SRA, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1314 { ISD::SRA, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1315 { ISD::SRA, MVT::v2i64, { 8, 11,12,16 } }, // srl/xor/sub splat+shuffle sequence.
1316
1317 { ISD::AND, MVT::v16i8, { 1, 1, 1, 1 } }, // pand
1318 { ISD::AND, MVT::v8i16, { 1, 1, 1, 1 } }, // pand
1319 { ISD::AND, MVT::v4i32, { 1, 1, 1, 1 } }, // pand
1320 { ISD::AND, MVT::v2i64, { 1, 1, 1, 1 } }, // pand
1321
1322 { ISD::OR, MVT::v16i8, { 1, 1, 1, 1 } }, // por
1323 { ISD::OR, MVT::v8i16, { 1, 1, 1, 1 } }, // por
1324 { ISD::OR, MVT::v4i32, { 1, 1, 1, 1 } }, // por
1325 { ISD::OR, MVT::v2i64, { 1, 1, 1, 1 } }, // por
1326
1327 { ISD::XOR, MVT::v16i8, { 1, 1, 1, 1 } }, // pxor
1328 { ISD::XOR, MVT::v8i16, { 1, 1, 1, 1 } }, // pxor
1329 { ISD::XOR, MVT::v4i32, { 1, 1, 1, 1 } }, // pxor
1330 { ISD::XOR, MVT::v2i64, { 1, 1, 1, 1 } }, // pxor
1331
1332 { ISD::ADD, MVT::v2i64, { 1, 2, 1, 2 } }, // paddq
1333 { ISD::SUB, MVT::v2i64, { 1, 2, 1, 2 } }, // psubq
1334
1335 { ISD::MUL, MVT::v16i8, { 5, 18,12,12 } }, // 2*unpack/2*pmullw/2*and/pack
1336 { ISD::MUL, MVT::v8i16, { 1, 5, 1, 1 } }, // pmullw
1337 { ISD::MUL, MVT::v4i32, { 6, 8, 7, 7 } }, // 3*pmuludq/4*shuffle
1338 { ISD::MUL, MVT::v2i64, { 7, 10,10,10 } }, // 3*pmuludq/3*shift/2*add
1339
1340 { X86ISD::PMULUDQ, MVT::v2i64, { 1, 5, 1, 1 } },
1341
1342 { ISD::FDIV, MVT::f32, { 23, 23, 1, 1 } }, // Pentium IV from http://www.agner.org/
1343 { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 1 } }, // Pentium IV from http://www.agner.org/
1344 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // Pentium IV from http://www.agner.org/
1345 { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 1 } }, // Pentium IV from http://www.agner.org/
1346
1347 { ISD::FNEG, MVT::f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1348 { ISD::FNEG, MVT::f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1349 { ISD::FNEG, MVT::v4f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1350 { ISD::FNEG, MVT::v2f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1351
1352 { ISD::FADD, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1353 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1354 { ISD::FADD, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1355
1356 { ISD::FSUB, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1357 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1358 { ISD::FSUB, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1359
1360 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1361 { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1362 };
1363
1364 if (ST->hasSSE2())
1365 if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
1366 if (auto KindCost = Entry->Cost[CostKind])
1367 return LT.first * *KindCost;
1368
1369 static const CostKindTblEntry SSE1CostTable[] = {
1370 { ISD::FDIV, MVT::f32, { 17, 18, 1, 1 } }, // Pentium III from http://www.agner.org/
1371 { ISD::FDIV, MVT::v4f32, { 34, 48, 1, 1 } }, // Pentium III from http://www.agner.org/
1372
1373 { ISD::FNEG, MVT::f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1374 { ISD::FNEG, MVT::v4f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1375
1376 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1377 { ISD::FADD, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1378
1379 { ISD::FSUB, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1380 { ISD::FSUB, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1381
1382 { ISD::FMUL, MVT::f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1383 { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1384 };
1385
1386 if (ST->hasSSE1())
1387 if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
1388 if (auto KindCost = Entry->Cost[CostKind])
1389 return LT.first * *KindCost;
1390
1391 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
1392 { ISD::ADD, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1393 { ISD::SUB, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1394 { ISD::MUL, MVT::i64, { 2, 6, 1, 2 } },
1395 };
1396
1397 if (ST->is64Bit())
1398 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
1399 if (auto KindCost = Entry->Cost[CostKind])
1400 return LT.first * *KindCost;
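  // Note (illustrative, not in the source): entries such as
  // { ISD::ADD, MVT::i64, { 1 } } only initialize the TCK_RecipThroughput
  // slot; the remaining slots keep their ~0U default, so Entry->Cost[CostKind]
  // returns std::nullopt for the other cost kinds and the query falls through
  // to the later tables or the BaseT fallback.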
1401
1402 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
1403 { ISD::ADD, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1404 { ISD::ADD, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1405 { ISD::ADD, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1406
1407 { ISD::SUB, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1408 { ISD::SUB, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1409 { ISD::SUB, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1410
1411 { ISD::MUL, MVT::i8, { 3, 4, 1, 1 } },
1412 { ISD::MUL, MVT::i16, { 2, 4, 1, 1 } },
1413 { ISD::MUL, MVT::i32, { 1, 4, 1, 1 } },
1414
1415 { ISD::FNEG, MVT::f64, { 2, 2, 1, 3 } }, // (x87)
1416 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1417 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1418 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // (x87)
1419 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // (x87)
1420 };
1421
1422 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
1423 if (auto KindCost = Entry->Cost[CostKind])
1424 return LT.first * *KindCost;
1425
1426   // It is not a good idea to vectorize division. We have to scalarize it and
1427   // in the process we will often end up having to spill regular
1428   // registers. The overhead of division is going to dominate most kernels
1429   // anyway, so try hard to prevent vectorization of division - it is
1430   // generally a bad idea. Assume somewhat arbitrarily that we have to be able
1431   // to hide "20 cycles" for each lane.
1432 if (CostKind == TTI::TCK_RecipThroughput && LT.second.isVector() &&
1433 (ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
1434 ISD == ISD::UREM)) {
1435     InstructionCost ScalarCost =
1436         getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
1437                                Op1Info.getNoProps(), Op2Info.getNoProps());
1438 return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
1439 }
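  // Illustrative arithmetic (editorial, not in the source): for a <4 x i32>
  // sdiv queried with TCK_RecipThroughput, LT.first == 1 and the type has 4
  // lanes, so this returns 20 * 1 * 4 * ScalarCost(i32 sdiv) - i.e. the scalar
  // cost inflated by 20 per lane, which makes vectorized division look
  // prohibitively expensive to the vectorizers.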
1440
1441 // Handle some basic single instruction code size cases.
1442 if (CostKind == TTI::TCK_CodeSize) {
1443 switch (ISD) {
1444 case ISD::FADD:
1445 case ISD::FSUB:
1446 case ISD::FMUL:
1447 case ISD::FDIV:
1448 case ISD::FNEG:
1449 case ISD::AND:
1450 case ISD::OR:
1451 case ISD::XOR:
1452 return LT.first;
1453 break;
1454 }
1455 }
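  // Illustrative note (not in the source): this catches simple ops that missed
  // every table above, e.g. a scalar i32 'and' under TCK_CodeSize returns
  // LT.first == 1, i.e. one instruction per legalized piece.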
1456
1457 // Fallback to the default implementation.
1458 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1459 Args, CxtI);
1460}
1461 
1462 InstructionCost
1463 X86TTIImpl::getAltInstrCost(VectorType *VecTy, unsigned Opcode0,
1464                             unsigned Opcode1, const SmallBitVector &OpcodeMask,
1465                             TTI::TargetCostKind CostKind) const {
1466   if (isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask))
1467     return TTI::TCC_Basic;
1468   return InstructionCost::getInvalid();
1469 }
1470 
1471 InstructionCost X86TTIImpl::getShuffleCost(
1472     TTI::ShuffleKind Kind, VectorType *BaseTp, ArrayRef<int> Mask,
1473     TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
1474     ArrayRef<const Value *> Args, const Instruction *CxtI) {
1475 // 64-bit packed float vectors (v2f32) are widened to type v4f32.
1476 // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
1477 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(BaseTp);
1478
1479 Kind = improveShuffleKindFromMask(Kind, Mask, BaseTp, Index, SubTp);
1480
1481 // Recognize a basic concat_vector shuffle.
1482 if (Kind == TTI::SK_PermuteTwoSrc &&
1483 Mask.size() == (2 * BaseTp->getElementCount().getKnownMinValue()) &&
1484       ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
1485     return getShuffleCost(TTI::SK_InsertSubvector,
1486                           VectorType::getDoubleElementsVectorType(BaseTp), Mask,
1487                           CostKind, Mask.size() / 2, BaseTp);
1488
1489 // Treat Transpose as 2-op shuffles - there's no difference in lowering.
1490 if (Kind == TTI::SK_Transpose)
1491 Kind = TTI::SK_PermuteTwoSrc;
1492
1493 if (Kind == TTI::SK_Broadcast) {
1494     // For Broadcasts we are splatting the first element from the first input
1495     // register, so we only need to reference that input and all the output
1496     // registers are the same.
1497 LT.first = 1;
1498
1499 // If we're broadcasting a load then AVX/AVX2 can do this for free.
1500 using namespace PatternMatch;
1501 if (!Args.empty() && match(Args[0], m_OneUse(m_Load(m_Value()))) &&
1502 (ST->hasAVX2() ||
1503 (ST->hasAVX() && LT.second.getScalarSizeInBits() >= 32)))
1504 return TTI::TCC_Free;
1505 }
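  // Illustrative note (not in the source): the free case above corresponds to
  // splatting a one-use scalar load, which AVX2 (any element size) or AVX
  // (32-bit or wider elements) can fold into a single memory-operand
  // vbroadcastss/vbroadcastsd/vpbroadcast*, so the shuffle itself costs
  // nothing extra.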
1506
1507 // Treat <X x bfloat> shuffles as <X x half>.
1508 if (LT.second.isVector() && LT.second.getScalarType() == MVT::bf16)
1509 LT.second = LT.second.changeVectorElementType(MVT::f16);
1510
1511 // Subvector extractions are free if they start at the beginning of a
1512 // vector and cheap if the subvectors are aligned.
1513 if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
1514 int NumElts = LT.second.getVectorNumElements();
1515 if ((Index % NumElts) == 0)
1516 return 0;
1517 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1518 if (SubLT.second.isVector()) {
1519 int NumSubElts = SubLT.second.getVectorNumElements();
1520 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1521 return SubLT.first;
1522 // Handle some cases for widening legalization. For now we only handle
1523 // cases where the original subvector was naturally aligned and evenly
1524 // fit in its legalized subvector type.
1525 // FIXME: Remove some of the alignment restrictions.
1526 // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
1527 // vectors.
1528 int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
1529 if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
1530 (NumSubElts % OrigSubElts) == 0 &&
1531 LT.second.getVectorElementType() ==
1532 SubLT.second.getVectorElementType() &&
1533         LT.second.getVectorElementType().getSizeInBits() ==
1534             BaseTp->getElementType()->getPrimitiveSizeInBits()) {
1535       assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
1536 "Unexpected number of elements!");
1537 auto *VecTy = FixedVectorType::get(BaseTp->getElementType(),
1538 LT.second.getVectorNumElements());
1539 auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
1540 SubLT.second.getVectorNumElements());
1541 int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
1542 InstructionCost ExtractCost =
1543 getShuffleCost(TTI::SK_ExtractSubvector, VecTy, std::nullopt,
1544 CostKind, ExtractIndex, SubTy);
1545
1546 // If the original size is 32-bits or more, we can use pshufd. Otherwise
1547 // if we have SSSE3 we can use pshufb.
1548 if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
1549 return ExtractCost + 1; // pshufd or pshufb
1550
1551 assert(SubTp->getPrimitiveSizeInBits() == 16 &&
1552 "Unexpected vector size");
1553
1554 return ExtractCost + 2; // worst case pshufhw + pshufd
1555 }
1556 }
1557     // If the extract subvector is not optimal, treat it as single op shuffle.
1558     Kind = TTI::SK_PermuteSingleSrc;
1559   }
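  // Worked example (illustrative, not in the source): extracting the upper
  // <4 x float> half (Index == 4) of a <8 x float> on AVX gives NumElts == 8,
  // so Index % NumElts != 0, but the subvector legalizes to v4f32 with
  // NumSubElts == 4; Index % 4 == 0 and 8 % 4 == 0, so the cost is
  // SubLT.first == 1 (roughly a single vextractf128). Extracting at Index == 0
  // is free.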
1560
1561 // Subvector insertions are cheap if the subvectors are aligned.
1562 // Note that in general, the insertion starting at the beginning of a vector
1563 // isn't free, because we need to preserve the rest of the wide vector.
1564 if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
1565 int NumElts = LT.second.getVectorNumElements();
1566 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1567 if (SubLT.second.isVector()) {
1568 int NumSubElts = SubLT.second.getVectorNumElements();
1569 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1570 return SubLT.first;
1571 }
1572
1573 // If the insertion isn't aligned, treat it like a 2-op shuffle.
1574 Kind = TTI::SK_PermuteTwoSrc;
1575 }
1576
1577 // Handle some common (illegal) sub-vector types as they are often very cheap
1578 // to shuffle even on targets without PSHUFB.
1579 EVT VT = TLI->getValueType(DL, BaseTp);
1580 if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
1581 !ST->hasSSSE3()) {
1582 static const CostTblEntry SSE2SubVectorShuffleTbl[] = {
1583 {TTI::SK_Broadcast, MVT::v4i16, 1}, // pshuflw
1584 {TTI::SK_Broadcast, MVT::v2i16, 1}, // pshuflw
1585 {TTI::SK_Broadcast, MVT::v8i8, 2}, // punpck/pshuflw
1586 {TTI::SK_Broadcast, MVT::v4i8, 2}, // punpck/pshuflw
1587 {TTI::SK_Broadcast, MVT::v2i8, 1}, // punpck
1588
1589 {TTI::SK_Reverse, MVT::v4i16, 1}, // pshuflw
1590 {TTI::SK_Reverse, MVT::v2i16, 1}, // pshuflw
1591 {TTI::SK_Reverse, MVT::v4i8, 3}, // punpck/pshuflw/packus
1592 {TTI::SK_Reverse, MVT::v2i8, 1}, // punpck
1593
1594 {TTI::SK_Splice, MVT::v4i16, 2}, // punpck+psrldq
1595 {TTI::SK_Splice, MVT::v2i16, 2}, // punpck+psrldq
1596 {TTI::SK_Splice, MVT::v4i8, 2}, // punpck+psrldq
1597 {TTI::SK_Splice, MVT::v2i8, 2}, // punpck+psrldq
1598
1599 {TTI::SK_PermuteTwoSrc, MVT::v4i16, 2}, // punpck/pshuflw
1600 {TTI::SK_PermuteTwoSrc, MVT::v2i16, 2}, // punpck/pshuflw
1601 {TTI::SK_PermuteTwoSrc, MVT::v8i8, 7}, // punpck/pshuflw
1602 {TTI::SK_PermuteTwoSrc, MVT::v4i8, 4}, // punpck/pshuflw
1603 {TTI::SK_PermuteTwoSrc, MVT::v2i8, 2}, // punpck
1604
1605 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw
1606 {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw
1607 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 5}, // punpck/pshuflw
1608 {TTI::SK_PermuteSingleSrc, MVT::v4i8, 3}, // punpck/pshuflw
1609 {TTI::SK_PermuteSingleSrc, MVT::v2i8, 1}, // punpck
1610 };
1611
1612 if (ST->hasSSE2())
1613 if (const auto *Entry =
1614 CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
1615 return Entry->Cost;
1616 }
1617
1618   // We are going to permute multiple sources and the result will be in multiple
1619   // destinations. We provide an accurate cost only for splits where the element
1620   // type remains the same.
1621 if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
1622 MVT LegalVT = LT.second;
1623 if (LegalVT.isVector() &&
1624         LegalVT.getVectorElementType().getSizeInBits() ==
1625             BaseTp->getElementType()->getPrimitiveSizeInBits() &&
1626         LegalVT.getVectorNumElements() <
1627 cast<FixedVectorType>(BaseTp)->getNumElements()) {
1628 unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
1629 unsigned LegalVTSize = LegalVT.getStoreSize();
1630 // Number of source vectors after legalization:
1631 unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
1632 // Number of destination vectors after legalization:
1633 InstructionCost NumOfDests = LT.first;
1634
1635 auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
1636 LegalVT.getVectorNumElements());
1637
1638 if (!Mask.empty() && NumOfDests.isValid()) {
1639         // Try to perform a better estimation of the permutation.
1640         // 1. Split the source/destination vectors into real registers.
1641         // 2. Do the mask analysis to identify which real registers are
1642         // permuted. If more than one source register is used to build a
1643         // destination register, the cost for this destination register
1644         // is (Number_of_source_registers - 1) * Cost_PermuteTwoSrc. If only one
1645         // source register is used, build the mask and calculate the cost as a
1646         // cost of PermuteSingleSrc.
1647         // Also, for the single register permute we try to identify if the
1648         // destination register is just a copy of the source register or a
1649         // copy of the previous destination register (the cost is
1650         // TTI::TCC_Basic). If the source register is just reused, the cost for
1651         // this operation is 0.
1652         NumOfDests =
1653             getTypeLegalizationCost(
1654                 FixedVectorType::get(BaseTp->getElementType(), Mask.size()))
1655                 .first;
1656 unsigned E = *NumOfDests.getValue();
1657 unsigned NormalizedVF =
1658 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
1659 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
1660 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
1661 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
1662 copy(Mask, NormalizedMask.begin());
1663 unsigned PrevSrcReg = 0;
1664     ArrayRef<int> PrevRegMask;
1665     InstructionCost Cost = 0;
1666     processShuffleMasks(
1667         NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
1668 [this, SingleOpTy, CostKind, &PrevSrcReg, &PrevRegMask,
1669 &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
1670 if (!ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size())) {
1671 // Check if the previous register can be just copied to the next
1672 // one.
1673 if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
1674                   PrevRegMask != RegMask)
1675                 Cost += getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
1676                                        RegMask, CostKind, 0, nullptr);
1677               else
1678                 // Just a copy of previous destination register.
1679                 Cost += TTI::TCC_Basic;
1680               return;
1681 }
1682 if (SrcReg != DestReg &&
1683 any_of(RegMask, [](int I) { return I != PoisonMaskElem; })) {
1684               // Just a copy of the source register.
1685               Cost += TTI::TCC_Basic;
1686             }
1687 PrevSrcReg = SrcReg;
1688 PrevRegMask = RegMask;
1689 },
1690 [this, SingleOpTy, CostKind, &Cost](ArrayRef<int> RegMask,
1691 unsigned /*Unused*/,
1692 unsigned /*Unused*/) {
1693 Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask,
1694 CostKind, 0, nullptr);
1695 });
1696 return Cost;
1697 }
1698
1699 InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
1700 return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
1701 std::nullopt, CostKind, 0, nullptr);
1702 }
1703
1704 return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
1705 }
1706
1707 // For 2-input shuffles, we must account for splitting the 2 inputs into many.
1708 if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
1709 // We assume that source and destination have the same vector type.
1710 InstructionCost NumOfDests = LT.first;
1711 InstructionCost NumOfShufflesPerDest = LT.first * 2 - 1;
1712 LT.first = NumOfDests * NumOfShufflesPerDest;
1713 }
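  // Illustrative arithmetic (not in the source): a two-source permute of
  // <16 x i32> on an AVX2-only target legalizes to two v8i32 registers, so
  // LT.first == 2, NumOfShufflesPerDest == 2 * 2 - 1 == 3 and LT.first becomes
  // 6; the AVX2 v8i32 SK_PermuteTwoSrc entry (3) below then gives 6 * 3 == 18.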
1714
1715 static const CostTblEntry AVX512VBMIShuffleTbl[] = {
1716 {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
1717 {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb
1718
1719 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
1720 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb
1721
1722 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b
1723 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b
1724 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 2} // vpermt2b
1725 };
1726
1727 if (ST->hasVBMI())
1728 if (const auto *Entry =
1729 CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
1730 return LT.first * Entry->Cost;
1731
1732 static const CostTblEntry AVX512BWShuffleTbl[] = {
1733 {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
1734 {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw
1735 {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
1736
1737 {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw
1738 {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw
1739 {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw
1740 {TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2
1741
1742 {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw
1743 {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw
1744 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw
1745 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw
1746 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16
1747
1748 {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w
1749 {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w
1750 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w
1751 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w
1752 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
1753
1754 {TTI::SK_Select, MVT::v32i16, 1}, // vblendmw
1755 {TTI::SK_Select, MVT::v64i8, 1}, // vblendmb
1756
1757 {TTI::SK_Splice, MVT::v32i16, 2}, // vshufi64x2 + palignr
1758 {TTI::SK_Splice, MVT::v32f16, 2}, // vshufi64x2 + palignr
1759 {TTI::SK_Splice, MVT::v64i8, 2}, // vshufi64x2 + palignr
1760 };
1761
1762 if (ST->hasBWI())
1763 if (const auto *Entry =
1764 CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
1765 return LT.first * Entry->Cost;
1766
1767 static const CostKindTblEntry AVX512ShuffleTbl[] = {
1768 {TTI::SK_Broadcast, MVT::v8f64, { 1, 1, 1, 1 } }, // vbroadcastsd
1769 {TTI::SK_Broadcast, MVT::v16f32, { 1, 1, 1, 1 } }, // vbroadcastss
1770 {TTI::SK_Broadcast, MVT::v8i64, { 1, 1, 1, 1 } }, // vpbroadcastq
1771 {TTI::SK_Broadcast, MVT::v16i32, { 1, 1, 1, 1 } }, // vpbroadcastd
1772 {TTI::SK_Broadcast, MVT::v32i16, { 1, 1, 1, 1 } }, // vpbroadcastw
1773 {TTI::SK_Broadcast, MVT::v32f16, { 1, 1, 1, 1 } }, // vpbroadcastw
1774 {TTI::SK_Broadcast, MVT::v64i8, { 1, 1, 1, 1 } }, // vpbroadcastb
1775
1776 {TTI::SK_Reverse, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
1777 {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1778 {TTI::SK_Reverse, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
1779 {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1780 {TTI::SK_Reverse, MVT::v32i16, { 7, 7, 7, 7 } }, // per mca
1781 {TTI::SK_Reverse, MVT::v32f16, { 7, 7, 7, 7 } }, // per mca
1782 {TTI::SK_Reverse, MVT::v64i8, { 7, 7, 7, 7 } }, // per mca
1783
1784 {TTI::SK_Splice, MVT::v8f64, { 1, 1, 1, 1 } }, // vpalignd
1785 {TTI::SK_Splice, MVT::v4f64, { 1, 1, 1, 1 } }, // vpalignd
1786 {TTI::SK_Splice, MVT::v16f32, { 1, 1, 1, 1 } }, // vpalignd
1787 {TTI::SK_Splice, MVT::v8f32, { 1, 1, 1, 1 } }, // vpalignd
1788 {TTI::SK_Splice, MVT::v8i64, { 1, 1, 1, 1 } }, // vpalignd
1789 {TTI::SK_Splice, MVT::v4i64, { 1, 1, 1, 1 } }, // vpalignd
1790 {TTI::SK_Splice, MVT::v16i32, { 1, 1, 1, 1 } }, // vpalignd
1791 {TTI::SK_Splice, MVT::v8i32, { 1, 1, 1, 1 } }, // vpalignd
1792 {TTI::SK_Splice, MVT::v32i16, { 4, 4, 4, 4 } }, // split + palignr
1793 {TTI::SK_Splice, MVT::v32f16, { 4, 4, 4, 4 } }, // split + palignr
1794 {TTI::SK_Splice, MVT::v64i8, { 4, 4, 4, 4 } }, // split + palignr
1795
1796 {TTI::SK_PermuteSingleSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
1797 {TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermpd
1798 {TTI::SK_PermuteSingleSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermpd
1799 {TTI::SK_PermuteSingleSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1800 {TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermps
1801 {TTI::SK_PermuteSingleSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermps
1802 {TTI::SK_PermuteSingleSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
1803 {TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermq
1804 {TTI::SK_PermuteSingleSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermq
1805 {TTI::SK_PermuteSingleSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1806 {TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermd
1807 {TTI::SK_PermuteSingleSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermd
1808 {TTI::SK_PermuteSingleSrc, MVT::v16i8, { 1, 3, 1, 1 } }, // pshufb
1809
1810 {TTI::SK_PermuteTwoSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermt2pd
1811 {TTI::SK_PermuteTwoSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermt2ps
1812 {TTI::SK_PermuteTwoSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermt2q
1813 {TTI::SK_PermuteTwoSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermt2d
1814 {TTI::SK_PermuteTwoSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermt2pd
1815 {TTI::SK_PermuteTwoSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermt2ps
1816 {TTI::SK_PermuteTwoSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermt2q
1817 {TTI::SK_PermuteTwoSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermt2d
1818 {TTI::SK_PermuteTwoSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermt2pd
1819 {TTI::SK_PermuteTwoSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermt2ps
1820 {TTI::SK_PermuteTwoSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermt2q
1821 {TTI::SK_PermuteTwoSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermt2d
1822
1823 // FIXME: This just applies the type legalization cost rules above
1824 // assuming these completely split.
1825 {TTI::SK_PermuteSingleSrc, MVT::v32i16, { 14, 14, 14, 14 } },
1826 {TTI::SK_PermuteSingleSrc, MVT::v32f16, { 14, 14, 14, 14 } },
1827 {TTI::SK_PermuteSingleSrc, MVT::v64i8, { 14, 14, 14, 14 } },
1828 {TTI::SK_PermuteTwoSrc, MVT::v32i16, { 42, 42, 42, 42 } },
1829 {TTI::SK_PermuteTwoSrc, MVT::v32f16, { 42, 42, 42, 42 } },
1830 {TTI::SK_PermuteTwoSrc, MVT::v64i8, { 42, 42, 42, 42 } },
1831
1832 {TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vpternlogq
1833 {TTI::SK_Select, MVT::v32f16, { 1, 1, 1, 1 } }, // vpternlogq
1834 {TTI::SK_Select, MVT::v64i8, { 1, 1, 1, 1 } }, // vpternlogq
1835 {TTI::SK_Select, MVT::v8f64, { 1, 1, 1, 1 } }, // vblendmpd
1836 {TTI::SK_Select, MVT::v16f32, { 1, 1, 1, 1 } }, // vblendmps
1837 {TTI::SK_Select, MVT::v8i64, { 1, 1, 1, 1 } }, // vblendmq
1838 {TTI::SK_Select, MVT::v16i32, { 1, 1, 1, 1 } }, // vblendmd
1839 };
1840
1841 if (ST->hasAVX512())
1842 if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
1843 if (auto KindCost = Entry->Cost[CostKind])
1844 return LT.first * *KindCost;
1845
1846 static const CostTblEntry AVX2ShuffleTbl[] = {
1847 {TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastpd
1848 {TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastps
1849 {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq
1850 {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd
1851 {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
1852 {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw
1853 {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb
1854
1855 {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd
1856 {TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps
1857 {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq
1858 {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd
1859 {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
1860 {TTI::SK_Reverse, MVT::v16f16, 2}, // vperm2i128 + pshufb
1861 {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb
1862
1863 {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
1864 {TTI::SK_Select, MVT::v16f16, 1}, // vpblendvb
1865 {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb
1866
1867 {TTI::SK_Splice, MVT::v8i32, 2}, // vperm2i128 + vpalignr
1868 {TTI::SK_Splice, MVT::v8f32, 2}, // vperm2i128 + vpalignr
1869 {TTI::SK_Splice, MVT::v16i16, 2}, // vperm2i128 + vpalignr
1870 {TTI::SK_Splice, MVT::v16f16, 2}, // vperm2i128 + vpalignr
1871 {TTI::SK_Splice, MVT::v32i8, 2}, // vperm2i128 + vpalignr
1872
1873 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
1874 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
1875 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
1876 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
1877 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
1878 // + vpblendvb
1879 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 4}, // vperm2i128 + 2*vpshufb
1880 // + vpblendvb
1881 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb
1882 // + vpblendvb
1883
1884 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd
1885 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3}, // 2*vpermps + vblendps
1886 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd
1887 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd
1888 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
1889 // + vpblendvb
1890 {TTI::SK_PermuteTwoSrc, MVT::v16f16, 7}, // 2*vperm2i128 + 4*vpshufb
1891 // + vpblendvb
1892 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb
1893 // + vpblendvb
1894 };
1895
1896 if (ST->hasAVX2())
1897 if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
1898 return LT.first * Entry->Cost;
1899
1900 static const CostTblEntry XOPShuffleTbl[] = {
1901 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd
1902 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps
1903 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd
1904 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps
1905 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
1906 // + vinsertf128
1907 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm
1908 // + vinsertf128
1909
1910 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
1911 // + vinsertf128
1912 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm
1913 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm
1914 // + vinsertf128
1915 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm
1916 };
1917
1918 if (ST->hasXOP())
1919 if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
1920 return LT.first * Entry->Cost;
1921
1922 static const CostTblEntry AVX1ShuffleTbl[] = {
1923 {TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1924 {TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1925 {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1926 {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1927 {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
1928 {TTI::SK_Broadcast, MVT::v16f16, 3}, // vpshuflw + vpshufd + vinsertf128
1929 {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128
1930
1931 {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1932 {TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1933 {TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1934 {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1935 {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
1936 // + vinsertf128
1937 {TTI::SK_Reverse, MVT::v16f16, 4}, // vextractf128 + 2*pshufb
1938 // + vinsertf128
1939 {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb
1940 // + vinsertf128
1941
1942 {TTI::SK_Select, MVT::v4i64, 1}, // vblendpd
1943 {TTI::SK_Select, MVT::v4f64, 1}, // vblendpd
1944 {TTI::SK_Select, MVT::v8i32, 1}, // vblendps
1945 {TTI::SK_Select, MVT::v8f32, 1}, // vblendps
1946 {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
1947 {TTI::SK_Select, MVT::v16f16, 3}, // vpand + vpandn + vpor
1948 {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor
1949
1950 {TTI::SK_Splice, MVT::v4i64, 2}, // vperm2f128 + shufpd
1951 {TTI::SK_Splice, MVT::v4f64, 2}, // vperm2f128 + shufpd
1952 {TTI::SK_Splice, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1953 {TTI::SK_Splice, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1954 {TTI::SK_Splice, MVT::v16i16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
1955 {TTI::SK_Splice, MVT::v16f16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
1956 {TTI::SK_Splice, MVT::v32i8, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
1957
1958 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd
1959 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd
1960 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1961 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1962 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
1963 // + 2*por + vinsertf128
1964 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 8}, // vextractf128 + 4*pshufb
1965 // + 2*por + vinsertf128
1966 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb
1967 // + 2*por + vinsertf128
1968
1969 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd
1970 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd
1971 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1972 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1973 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
1974 // + 4*por + vinsertf128
1975 {TTI::SK_PermuteTwoSrc, MVT::v16f16, 15}, // 2*vextractf128 + 8*pshufb
1976 // + 4*por + vinsertf128
1977 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb
1978 // + 4*por + vinsertf128
1979 };
1980
1981 if (ST->hasAVX())
1982 if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
1983 return LT.first * Entry->Cost;
1984
1985 static const CostTblEntry SSE41ShuffleTbl[] = {
1986 {TTI::SK_Select, MVT::v2i64, 1}, // pblendw
1987 {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1988 {TTI::SK_Select, MVT::v4i32, 1}, // pblendw
1989 {TTI::SK_Select, MVT::v4f32, 1}, // blendps
1990 {TTI::SK_Select, MVT::v8i16, 1}, // pblendw
1991 {TTI::SK_Select, MVT::v8f16, 1}, // pblendw
1992 {TTI::SK_Select, MVT::v16i8, 1} // pblendvb
1993 };
1994
1995 if (ST->hasSSE41())
1996 if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
1997 return LT.first * Entry->Cost;
1998
1999 static const CostTblEntry SSSE3ShuffleTbl[] = {
2000 {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
2001 {TTI::SK_Broadcast, MVT::v8f16, 1}, // pshufb
2002 {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb
2003
2004 {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
2005 {TTI::SK_Reverse, MVT::v8f16, 1}, // pshufb
2006 {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb
2007
2008 {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
2009 {TTI::SK_Select, MVT::v8f16, 3}, // 2*pshufb + por
2010 {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por
2011
2012 {TTI::SK_Splice, MVT::v4i32, 1}, // palignr
2013 {TTI::SK_Splice, MVT::v4f32, 1}, // palignr
2014 {TTI::SK_Splice, MVT::v8i16, 1}, // palignr
2015 {TTI::SK_Splice, MVT::v8f16, 1}, // palignr
2016 {TTI::SK_Splice, MVT::v16i8, 1}, // palignr
2017
2018 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
2019 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // pshufb
2020 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
2021
2022 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
2023 {TTI::SK_PermuteTwoSrc, MVT::v8f16, 3}, // 2*pshufb + por
2024 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
2025 };
2026
2027 if (ST->hasSSSE3())
2028 if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
2029 return LT.first * Entry->Cost;
2030
2031 static const CostTblEntry SSE2ShuffleTbl[] = {
2032 {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd
2033 {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
2034 {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
2035 {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
2036 {TTI::SK_Broadcast, MVT::v8f16, 2}, // pshuflw + pshufd
2037 {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd
2038
2039 {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
2040 {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
2041 {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
2042 {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
2043 {TTI::SK_Reverse, MVT::v8f16, 3}, // pshuflw + pshufhw + pshufd
2044 {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
2045 // + 2*pshufd + 2*unpck + packus
2046
2047 {TTI::SK_Select, MVT::v2i64, 1}, // movsd
2048 {TTI::SK_Select, MVT::v2f64, 1}, // movsd
2049 {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
2050 {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
2051 {TTI::SK_Select, MVT::v8f16, 3}, // pand + pandn + por
2052 {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por
2053
2054 {TTI::SK_Splice, MVT::v2i64, 1}, // shufpd
2055 {TTI::SK_Splice, MVT::v2f64, 1}, // shufpd
2056 {TTI::SK_Splice, MVT::v4i32, 2}, // 2*{unpck,movsd,pshufd}
2057 {TTI::SK_Splice, MVT::v8i16, 3}, // psrldq + psrlldq + por
2058 {TTI::SK_Splice, MVT::v8f16, 3}, // psrldq + psrlldq + por
2059 {TTI::SK_Splice, MVT::v16i8, 3}, // psrldq + psrlldq + por
2060
2061 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
2062 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd
2063 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
2064 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
2065 // + pshufd/unpck
2066 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 5}, // 2*pshuflw + 2*pshufhw
2067 // + pshufd/unpck
2068 { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
2069 // + 2*pshufd + 2*unpck + 2*packus
2070
2071 { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd
2072 { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd
2073 { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd}
2074 { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute
2075 { TTI::SK_PermuteTwoSrc, MVT::v8f16, 8 }, // blend+permute
2076 { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
2077 };
2078
2079 static const CostTblEntry SSE3BroadcastLoadTbl[] = {
2080 {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup
2081 };
2082
2083 if (ST->hasSSE2()) {
2084 bool IsLoad =
2085 llvm::any_of(Args, [](const auto &V) { return isa<LoadInst>(V); });
2086 if (ST->hasSSE3() && IsLoad)
2087 if (const auto *Entry =
2088             CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) {
2089         assert(isLegalBroadcastLoad(BaseTp->getElementType(),
2090                                     LT.second.getVectorElementCount()) &&
2091                "Table entry missing from isLegalBroadcastLoad()");
2092 return LT.first * Entry->Cost;
2093 }
2094
2095 if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
2096 return LT.first * Entry->Cost;
2097 }
2098
2099 static const CostTblEntry SSE1ShuffleTbl[] = {
2100 { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
2101 { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
2102 { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps
2103 { TTI::SK_Splice, MVT::v4f32, 2 }, // 2*shufps
2104 { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
2105 { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
2106 };
2107
2108 if (ST->hasSSE1())
2109 if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
2110 return LT.first * Entry->Cost;
2111
2112 return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
2113}
2114 
2115 InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
2116                                              Type *Src,
2117                                              TTI::CastContextHint CCH,
2118                                              TTI::TargetCostKind CostKind,
2119                                              const Instruction *I) {
2120 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2121 assert(ISD && "Invalid opcode");
2122
2123 // TODO: Allow non-throughput costs that aren't binary.
2124   auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
2125     if (CostKind != TTI::TCK_RecipThroughput)
2126       return Cost == 0 ? 0 : 1;
2127 return Cost;
2128 };
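  // Illustrative note (not in the source): the conversion tables below were
  // tuned for throughput, so for any other cost kind AdjustCost collapses the
  // result to 0/1 - free conversions stay free, everything else is counted as
  // a single instruction.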
2129
2130 // The cost tables include both specific, custom (non-legal) src/dst type
2131 // conversions and generic, legalized types. We test for customs first, before
2132 // falling back to legalization.
2133   // FIXME: Need a better design of the cost table to handle non-simple types
2134   // and the potentially massive number of combinations (elem_num x src_type x dst_type).
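  // Illustrative note (not in the source): "custom" entries use types that are
  // not legal by themselves, e.g. the { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 }
  // rows below describe conversions matched against the original IR types before
  // legalization, while rows such as { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 1 }
  // match the already-legalized types.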
2135 static const TypeConversionCostTblEntry AVX512BWConversionTbl[] {
2136 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
2137 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
2138
2139 // Mask sign extend has an instruction.
2140 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 },
2141 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, 1 },
2142 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 },
2143 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, 1 },
2144 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 },
2145 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, 1 },
2146 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 },
2147 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, 1 },
2148 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 },
2149 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, 1 },
2150 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 },
2151 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 },
2152 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
2153 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 },
2154 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, 1 },
2155 { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, 1 },
2156 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, 1 },
2157
2158 // Mask zero extend is a sext + shift.
2159 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 },
2160 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, 2 },
2161 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 },
2162 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, 2 },
2163 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 },
2164 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, 2 },
2165 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 },
2166 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, 2 },
2167 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 },
2168 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, 2 },
2169 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 },
2170 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 },
2171 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
2172 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 },
2173 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, 2 },
2174 { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, 2 },
2175 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, 2 },
2176
2177 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 },
2178 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, 2 },
2179 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 },
2180 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, 2 },
2181 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 },
2182 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, 2 },
2183 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 },
2184 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, 2 },
2185 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 },
2186 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 },
2187 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 },
2188 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 },
2189 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 },
2190 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 },
2191 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, 2 },
2192 { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, 2 },
2193 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, 2 },
2194
2195 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 2 },
2196 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // widen to zmm
2197 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // vpmovwb
2198 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // vpmovwb
2199 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // vpmovwb
2200 };
2201
2202 static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
2203 // Mask sign extend has an instruction.
2204 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 },
2205 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, 1 },
2206 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 },
2207 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 },
2208 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 },
2209 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i1, 1 },
2210 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 },
2211 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 },
2212
2213 // Mask zero extend is a sext + shift.
2214 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 },
2215 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, 2 },
2216 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 },
2217 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 },
2218 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 },
2219 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i1, 2 },
2220 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 },
2221 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 },
2222
2223 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 },
2224 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, 2 },
2225 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 },
2226 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 },
2227 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },
2228 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 },
2229 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 },
2230 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i64, 2 },
2231
2232 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
2233 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },
2234
2235 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
2236 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },
2237
2238 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, 1 },
2239 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, 1 },
2240
2241 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 },
2242 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 },
2243 };
2244
2245 // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
2246 // 256-bit wide vectors.
2247
2248 static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
2249 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 },
2250 { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 },
2251 { ISD::FP_EXTEND, MVT::v16f64, MVT::v16f32, 4 }, // 2*vcvtps2pd+vextractf64x4
2252 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 },
2253
2254 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
2255 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
2256 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
2257 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 3 }, // sext+vpslld+vptestmd
2258 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
2259 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
2260 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
2261 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 3 }, // sext+vpslld+vptestmd
2262 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // zmm vpslld+vptestmd
2263 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // zmm vpslld+vptestmd
2264 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // zmm vpslld+vptestmd
2265 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 }, // vpslld+vptestmd
2266 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // zmm vpsllq+vptestmq
2267 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // zmm vpsllq+vptestmq
2268 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, // vpsllq+vptestmq
2269 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 2 }, // vpmovdb
2270 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 2 }, // vpmovdb
2271 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 2 }, // vpmovdb
2272 { ISD::TRUNCATE, MVT::v32i8, MVT::v16i32, 2 }, // vpmovdb
2273 { ISD::TRUNCATE, MVT::v64i8, MVT::v16i32, 2 }, // vpmovdb
2274 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2 }, // vpmovdw
2275 { ISD::TRUNCATE, MVT::v32i16, MVT::v16i32, 2 }, // vpmovdw
2276 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 2 }, // vpmovqb
2277 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1 }, // vpshufb
2278 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 2 }, // vpmovqb
2279 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i64, 2 }, // vpmovqb
2280 { ISD::TRUNCATE, MVT::v32i8, MVT::v8i64, 2 }, // vpmovqb
2281 { ISD::TRUNCATE, MVT::v64i8, MVT::v8i64, 2 }, // vpmovqb
2282 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 2 }, // vpmovqw
2283 { ISD::TRUNCATE, MVT::v16i16, MVT::v8i64, 2 }, // vpmovqw
2284 { ISD::TRUNCATE, MVT::v32i16, MVT::v8i64, 2 }, // vpmovqw
2285 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 }, // vpmovqd
2286 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // zmm vpmovqd
2287 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 5 },// 2*vpmovqd+concat+vpmovdb
2288
2289 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, // extend to v16i32
2290 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 8 },
2291 { ISD::TRUNCATE, MVT::v64i8, MVT::v32i16, 8 },
2292
2293 // Sign extend is zmm vpternlogd+vptruncdb.
2294 // Zero extend is zmm broadcast load+vptruncdw.
2295 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 3 },
2296 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 4 },
2297 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 3 },
2298 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 4 },
2299 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 3 },
2300 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 4 },
2301 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 3 },
2302 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 4 },
2303
2304 // Sign extend is zmm vpternlogd+vptruncdw.
2305 // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
2306 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 3 },
2307 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 4 },
2308 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 3 },
2309 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 4 },
2310 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 3 },
2311 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 4 },
2312 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 3 },
2313 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
2314
2315 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // zmm vpternlogd
2316 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // zmm vpternlogd+psrld
2317 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // zmm vpternlogd
2318 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // zmm vpternlogd+psrld
2319 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // zmm vpternlogd
2320 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // zmm vpternlogd+psrld
2321 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // zmm vpternlogq
2322 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // zmm vpternlogq+psrlq
2323 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // zmm vpternlogq
2324 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // zmm vpternlogq+psrlq
2325
2326 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, // vpternlogd
2327 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, // vpternlogd+psrld
2328 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 }, // vpternlogq
2329 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 }, // vpternlogq+psrlq
2330
2331 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
2332 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
2333 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
2334 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
2335 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
2336 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
2337 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
2338 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
2339 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
2340 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
2341
2342 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
2343 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
2344
2345 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
2346 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
2347 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v16i8, 2 },
2348 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 1 },
2349 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
2350 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 1 },
2351 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
2352 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
2353
2354 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
2355 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
2356 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v16i8, 2 },
2357 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 1 },
2358 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
2359 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 1 },
2360 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
2361 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
2362 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 },
2363 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 5 },
2364
2365 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 2 },
2366 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f64, 7 },
2367 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f64,15 },
2368 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f32,11 },
2369 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f64,31 },
2370 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, 3 },
2371 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f64, 7 },
2372 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f32, 5 },
2373 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f64,15 },
2374 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 1 },
2375 { ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f64, 3 },
2376
2377 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 },
2378 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, 3 },
2379 { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, 3 },
2380 { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 },
2381 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 3 },
2382 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, 3 },
2383 };
2384
2385 static const TypeConversionCostTblEntry AVX512BWVLConversionTbl[] {
2386 // Mask sign extend has an instruction.
2387 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 },
2388 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, 1 },
2389 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 },
2390 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, 1 },
2391 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 },
2392 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, 1 },
2393 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 },
2394 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, 1 },
2395 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 },
2396 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, 1 },
2397 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 },
2398 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 },
2399 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
2400 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 },
2401 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1, 1 },
2402 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v64i1, 1 },
2403 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1, 1 },
2404
2405 // Mask zero extend is a sext + shift.
2406 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 },
2407 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, 2 },
2408 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 },
2409 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, 2 },
2410 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 },
2411 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, 2 },
2412 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 },
2413 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, 2 },
2414 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 },
2415 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, 2 },
2416 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 },
2417 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 },
2418 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
2419 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 },
2420 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1, 2 },
2421 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v64i1, 2 },
2422 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1, 2 },
2423
2424 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 },
2425 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, 2 },
2426 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 },
2427 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, 2 },
2428 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 },
2429 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, 2 },
2430 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 },
2431 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, 2 },
2432 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 },
2433 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 },
2434 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 },
2435 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 },
2436 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 },
2437 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 },
2438 { ISD::TRUNCATE, MVT::v32i1, MVT::v16i16, 2 },
2439 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i8, 2 },
2440 { ISD::TRUNCATE, MVT::v64i1, MVT::v16i16, 2 },
2441
2442 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 },
2443 };
2444
2445 static const TypeConversionCostTblEntry AVX512DQVLConversionTbl[] = {
2446 // Mask sign extend has an instruction.
2447 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 },
2448 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, 1 },
2449 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 },
2450 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i1, 1 },
2451 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 },
2452 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i1, 1 },
2453 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, 1 },
2454 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 },
2455
2456 // Mask zero extend is a sext + shift.
2457 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 },
2458 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, 2 },
2459 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 },
2460 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i1, 2 },
2461 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 },
2462 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i1, 2 },
2463 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, 2 },
2464 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 },
2465
2466 { ISD::TRUNCATE, MVT::v16i1, MVT::v4i64, 2 },
2467 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, 2 },
2468 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 },
2469 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, 2 },
2470 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 },
2471 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 },
2472 { ISD::TRUNCATE, MVT::v8i1, MVT::v4i64, 2 },
2473 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },
2474
2475 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
2476 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
2477 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
2478 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
2479
2480 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
2481 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
2482 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
2483 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
2484
2485 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v4f32, 1 },
2486 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, 1 },
2487 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
2488 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, 1 },
2489
2490 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v4f32, 1 },
2491 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 },
2492 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
2493 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 },
2494 };
2495
2496 static const TypeConversionCostTblEntry AVX512VLConversionTbl[] = {
2497 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
2498 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
2499 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
2500 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 8 }, // split+2*v8i8
2501 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
2502 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
2503 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
2504 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 8 }, // split+2*v8i16
2505 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // vpslld+vptestmd
2506 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // vpslld+vptestmd
2507 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // vpslld+vptestmd
2508 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, 2 }, // vpslld+vptestmd
2509 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // vpsllq+vptestmq
2510 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // vpsllq+vptestmq
2511 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // vpmovqd
2512 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 }, // vpmovqb
2513 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 }, // vpmovqw
2514 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 }, // vpmovwb
2515
2516 // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
2517 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
2518 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 5 },
2519 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 6 },
2520 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 5 },
2521 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 6 },
2522 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 5 },
2523 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 6 },
2524 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 10 },
2525 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 12 },
2526
2527 // sign extend is vpcmpeq+maskedmove+vpmovdw
2528 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
2529 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 4 },
2530 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 5 },
2531 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 4 },
2532 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 5 },
2533 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 4 },
2534 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 5 },
2535 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 10 },
2536 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 12 },
2537
2538 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // vpternlogd
2539 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // vpternlogd+psrld
2540 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // vpternlogd
2541 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // vpternlogd+psrld
2542 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // vpternlogd
2543 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // vpternlogd+psrld
2544 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, 1 }, // vpternlogd
2545 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, 2 }, // vpternlogd+psrld
2546
2547 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // vpternlogq
2548 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // vpternlogq+psrlq
2549 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // vpternlogq
2550 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // vpternlogq+psrlq
2551
2552 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 1 },
2553 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 1 },
2554 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 1 },
2555 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 1 },
2556 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
2557 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
2558 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 1 },
2559 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 1 },
2560 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
2561 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
2562 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
2563 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
2564
2565 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
2566 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 1 },
2567 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
2568 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 1 },
2569
2570 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 1 },
2571 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 1 },
2572 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
2573 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 1 },
2574 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
2575 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 1 },
2576 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
2577 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
2578 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
2579 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
2580 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 5 },
2581 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
2582 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 5 },
2583
2584 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, 2 },
2585 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 2 },
2586 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f32, 5 },
2587
2588 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 1 },
2589 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 1 },
2590 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
2591 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 1 },
2592 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 1 },
2593 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 },
2594 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 },
2595 };
2596
2597 static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
2598 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
2599 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
2600 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
2601 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
2602 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
2603 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
2604
2605 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 2 },
2606 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 2 },
2607 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 2 },
2608 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 2 },
2609 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
2610 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
2611 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 2 },
2612 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 2 },
2613 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
2614 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
2615 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
2616 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
2617 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
2618 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
2619
2620 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },
2621
2622 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 4 },
2623 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 4 },
2624 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 1 },
2625 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 1 },
2626 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 1 },
2627 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, 4 },
2628 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, 4 },
2629 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 1 },
2630 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 1 },
2631 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 5 },
2632 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 },
2633 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2 },
2634
2635 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 3 },
2636 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 3 },
2637
2638 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, 1 },
2639 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, 1 },
2640 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, 1 },
2641 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 3 },
2642
2643 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 3 },
2644 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 3 },
2645 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, 1 },
2646 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 3 },
2647 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
2648 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4 },
2649 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 3 },
2650 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, 4 },
2651
2652 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 2 },
2653 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 2 },
2654 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 2 },
2655 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
2656 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
2657 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
2658 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 3 },
2659
2660 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 2 },
2661 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 2 },
2662 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 2 },
2663 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
2664 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 },
2665 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
2666 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 2 },
2667 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
2668 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
2669 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 4 },
2670 };
2671
2672 static const TypeConversionCostTblEntry AVXConversionTbl[] = {
2673 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
2674 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
2675 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
2676 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
2677 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
2678 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
2679
2680 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 3 },
2681 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 3 },
2682 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 3 },
2683 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 3 },
2684 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
2685 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
2686 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 3 },
2687 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 3 },
2688 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
2689 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
2690 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 3 },
2691 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 3 },
2692
2693 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 4 },
2694 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 5 },
2695 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 4 },
2696 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 9 },
2697 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, 11 },
2698
2699 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 },
2700 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
2701 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // and+extract+packuswb
2702 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, 5 },
2703 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
2704 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, 5 },
2705 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 3 }, // and+extract+2*packusdw
2706 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 },
2707
2708 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
2709 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 },
2710 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 },
2711 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 4 },
2712 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v16i8, 2 },
2713 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
2714 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v8i16, 2 },
2715 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
2716 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
2717 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 4 },
2718 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 5 },
2719 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 8 },
2720
2721 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 7 },
2722 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, 7 },
2723 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, 6 },
2724 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 4 },
2725 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v16i8, 2 },
2726 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
2727 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v8i16, 2 },
2728 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 4 },
2729 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 4 },
2730 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
2731 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 },
2732 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 },
2733 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 10 },
2734 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 10 },
2735 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 18 },
2736 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
2737 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 10 },
2738
2739 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, 2 },
2740 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f64, 2 },
2741 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v8f32, 2 },
2742 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v4f64, 2 },
2743 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 2 },
2744 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f64, 2 },
2745 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, 2 },
2746 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v4f64, 2 },
2747 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, 2 },
2748 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, 2 },
2749 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 5 },
2750
2751 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v8f32, 2 },
2752 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f64, 2 },
2753 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v8f32, 2 },
2754 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v4f64, 2 },
2755 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 2 },
2756 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f64, 2 },
2757 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, 2 },
2758 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v4f64, 2 },
2759 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 3 },
2760 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
2761 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 6 },
2762 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 7 },
2763 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, 7 },
2764
2765 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 1 },
2766 { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 1 },
2767 };
2768
2769 static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
2770 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, 1 },
2771 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, 1 },
2772 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, 1 },
2773 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, 1 },
2774 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, 1 },
2775 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, 1 },
2776 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, 1 },
2777 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, 1 },
2778 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, 1 },
2779 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, 1 },
2780 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, 1 },
2781 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, 1 },
2782
2783 // These truncates end up widening elements.
2784 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 1 }, // PMOVXZBQ
2785 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 1 }, // PMOVXZWQ
2786 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 1 }, // PMOVXZBD
2787
2788 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 2 },
2789 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 2 },
2790 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 2 },
2791
2792 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 1 },
2793 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 1 },
2794 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 1 },
2795 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 1 },
2796 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 1 },
2797 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
2798 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 1 },
2799 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
2800 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
2801 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 1 },
2802 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
2803
2804 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 1 },
2805 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 1 },
2806 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 4 },
2807 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 4 },
2808 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 1 },
2809 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
2810 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 1 },
2811 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
2812 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 3 },
2813 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 3 },
2814 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 2 },
2815 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 12 },
2816 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 22 },
2817 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 4 },
2818
2819 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 1 },
2820 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 1 },
2821 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 1 },
2822 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 1 },
2823 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, 2 },
2824 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, 2 },
2825 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, 1 },
2826 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, 1 },
2827 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
2828 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, 1 },
2829
2830 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 1 },
2831 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 },
2832 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 1 },
2833 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 4 },
2834 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, 2 },
2835 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, 2 },
2836 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, 1 },
2837 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, 1 },
2838 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 4 },
2839 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
2840 };
2841
2842 static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
2843 // These are somewhat magic numbers justified by comparing the
2844 // output of llvm-mca for our various supported scheduler models
2845 // and basing it off the worst case scenario.
2846 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 3 },
2847 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 3 },
2848 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 3 },
2849 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 3 },
2850 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 3 },
2851 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 4 },
2852 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 3 },
2853 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 4 },
2854 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 3 },
2855 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4 },
2856 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 8 },
2857 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 8 },
2858
2859 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 3 },
2860 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 3 },
2861 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 8 },
2862 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 9 },
2863 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 4 },
2864 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 4 },
2865 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 4 },
2866 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 4 },
2867 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 7 },
2868 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 7 },
2869 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
2870 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 15 },
2871 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 18 },
2872
2873 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 4 },
2874 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 4 },
2875 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 4 },
2876 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 4 },
2877 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, 6 },
2878 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, 6 },
2879 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, 5 },
2880 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, 5 },
2881 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 4 },
2882 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, 4 },
2883
2884 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 4 },
2885 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 },
2886 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 4 },
2887 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 15 },
2888 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, 6 },
2889 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, 6 },
2890 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, 5 },
2891 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, 5 },
2892 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 8 },
2893 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 8 },
2894
2895 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, 4 },
2896 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, 4 },
2897 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, 2 },
2898 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, 3 },
2899 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, 1 },
2900 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, 2 },
2901 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, 2 },
2902 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, 3 },
2903 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, 1 },
2904 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, 2 },
2905 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, 1 },
2906 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, 2 },
2907
2908 // These truncates are really widening elements.
2909 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 1 }, // PSHUFD
2910 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // PUNPCKLWD+DQ
2911 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // PUNPCKLBW+WD+PSHUFD
2912 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 1 }, // PUNPCKLWD
2913 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // PUNPCKLBW+WD
2914 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 1 }, // PUNPCKLBW
2915
2916 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 2 }, // PAND+PACKUSWB
2917 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 },
2918 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 3 }, // PAND+2*PACKUSWB
2919 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 },
2920 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 1 },
2921 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 3 },
2922 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
2923 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32,10 },
2924 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 4 }, // PAND+3*PACKUSWB
2925 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 2 }, // PSHUFD+PSHUFLW
2926 { ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, 1 }, // PSHUFD
2927 };
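// Note on the conversion tables above: each entry maps (ISD opcode, legal
// destination MVT, legal source MVT) to an instruction-count style cost, and
// the lookups below consult the most feature-specific table first, so the
// more generic tables only act as fallbacks.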
2928
2929 // Attempt to map directly to (simple) MVT types to let us match custom entries.
2930 EVT SrcTy = TLI->getValueType(DL, Src);
2931 EVT DstTy = TLI->getValueType(DL, Dst);
2932
2933 // The function getSimpleVT only handles simple value types.
2934 if (SrcTy.isSimple() && DstTy.isSimple()) {
2935 MVT SimpleSrcTy = SrcTy.getSimpleVT();
2936 MVT SimpleDstTy = DstTy.getSimpleVT();
2937
2938 if (ST->useAVX512Regs()) {
2939 if (ST->hasBWI())
2940 if (const auto *Entry = ConvertCostTableLookup(
2941 AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2942 return AdjustCost(Entry->Cost);
2943
2944 if (ST->hasDQI())
2945 if (const auto *Entry = ConvertCostTableLookup(
2946 AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2947 return AdjustCost(Entry->Cost);
2948
2949 if (ST->hasAVX512())
2950 if (const auto *Entry = ConvertCostTableLookup(
2951 AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2952 return AdjustCost(Entry->Cost);
2953 }
2954
2955 if (ST->hasBWI())
2956 if (const auto *Entry = ConvertCostTableLookup(
2957 AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2958 return AdjustCost(Entry->Cost);
2959
2960 if (ST->hasDQI())
2961 if (const auto *Entry = ConvertCostTableLookup(
2962 AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2963 return AdjustCost(Entry->Cost);
2964
2965 if (ST->hasAVX512())
2966 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
2967 SimpleDstTy, SimpleSrcTy))
2968 return AdjustCost(Entry->Cost);
2969
2970 if (ST->hasAVX2()) {
2971 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
2972 SimpleDstTy, SimpleSrcTy))
2973 return AdjustCost(Entry->Cost);
2974 }
2975
2976 if (ST->hasAVX()) {
2977 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
2978 SimpleDstTy, SimpleSrcTy))
2979 return AdjustCost(Entry->Cost);
2980 }
2981
2982 if (ST->hasSSE41()) {
2983 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
2984 SimpleDstTy, SimpleSrcTy))
2985 return AdjustCost(Entry->Cost);
2986 }
2987
2988 if (ST->hasSSE2()) {
2989 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
2990 SimpleDstTy, SimpleSrcTy))
2991 return AdjustCost(Entry->Cost);
2992 }
2993 }
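// (We only reach the code below when one of the types is not a simple MVT, or
// when none of the feature-specific tables had a matching entry.)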
2994
2995 // Fall back to legalized types.
2996 std::pair<InstructionCost, MVT> LTSrc = getTypeLegalizationCost(Src);
2997 std::pair<InstructionCost, MVT> LTDest = getTypeLegalizationCost(Dst);
2998
2999 // If we're truncating to the same legalized type - just assume it's free.
3000 if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second)
3001 return TTI::TCC_Free;
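// For the legalized-type lookups below, the table cost is scaled by
// std::max(LTSrc.first, LTDest.first), i.e. the larger legalization split
// factor, since the conversion is repeated for each legalized part.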
3002
3003 if (ST->useAVX512Regs()) {
3004 if (ST->hasBWI())
3005 if (const auto *Entry = ConvertCostTableLookup(
3006 AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second))
3007 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3008
3009 if (ST->hasDQI())
3010 if (const auto *Entry = ConvertCostTableLookup(
3011 AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second))
3012 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3013
3014 if (ST->hasAVX512())
3015 if (const auto *Entry = ConvertCostTableLookup(
3016 AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second))
3017 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3018 }
3019
3020 if (ST->hasBWI())
3021 if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
3022 LTDest.second, LTSrc.second))
3023 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3024
3025 if (ST->hasDQI())
3026 if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
3027 LTDest.second, LTSrc.second))
3028 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3029
3030 if (ST->hasAVX512())
3031 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3032 LTDest.second, LTSrc.second))
3033 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3034
3035 if (ST->hasAVX2())
3036 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3037 LTDest.second, LTSrc.second))
3038 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3039
3040 if (ST->hasAVX())
3041 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3042 LTDest.second, LTSrc.second))
3043 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3044
3045 if (ST->hasSSE41())
3046 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3047 LTDest.second, LTSrc.second))
3048 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3049
3050 if (ST->hasSSE2())
3051 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3052 LTDest.second, LTSrc.second))
3053 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3054
3055 // Fallback for i8/i16 sitofp/uitofp cases: we need to extend the source to
3056 // i32 before converting.
3057 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3058 1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) {
3059 Type *ExtSrc = Src->getWithNewBitWidth(32);
3060 unsigned ExtOpc =
3061 (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt;
3062
3063 // For scalar loads the extend would be free.
3064 InstructionCost ExtCost = 0;
3065 if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0))))
3066 ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind);
3067
3067 return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc,
3068 TTI::CastContextHint::None, CostKind);
3070 }
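// For example, an i16 -> f32 sitofp is costed as (sext i16 -> i32) +
// (sitofp i32 -> f32), with the extend treated as free when the source is a
// scalar load.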
3071
3072 // Fallback for fptosi/fptoui to i8/i16 cases: convert to i32 first and then
3073 // truncate the result.
3074 if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) &&
3075 1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) {
3076 Type *TruncDst = Dst->getWithNewBitWidth(32);
3077 return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) +
3078 getCastInstrCost(Instruction::Trunc, Dst, TruncDst,
3079 TTI::CastContextHint::None, CostKind);
3080 }
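// Likewise an f32 -> i16 fptosi is costed as (fptosi f32 -> i32) +
// (trunc i32 -> i16).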
3081
3082 return AdjustCost(
3083 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3084}
3085
3086 InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
3087 Type *CondTy,
3088 CmpInst::Predicate VecPred,
3089 TTI::TargetCostKind CostKind,
3090 const Instruction *I) {
3091 // Early out if this type isn't scalar/vector integer/float.
3092 if (!(ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy()))
3093 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3094 I);
3095
3096 // Legalize the type.
3097 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3098
3099 MVT MTy = LT.second;
3100
3101 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3102 assert(ISD && "Invalid opcode");
3103
3104 InstructionCost ExtraCost = 0;
3105 if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
3106 // Some vector comparison predicates cost extra instructions.
3107 // TODO: Adjust ExtraCost based on CostKind?
3108 // TODO: Should we invert this and assume worst case cmp costs
3109 // and reduce for particular predicates?
3110 if (MTy.isVector() &&
3111 !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
3112 (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
3113 ST->hasBWI())) {
3114 // Fall back to the instruction I if a specific predicate wasn't specified.
3115 CmpInst::Predicate Pred = VecPred;
3116 if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE ||
3117 Pred == CmpInst::BAD_FCMP_PREDICATE))
3118 Pred = cast<CmpInst>(I)->getPredicate();
3119
3120 bool CmpWithConstant = false;
3121 if (auto *CmpInstr = dyn_cast_or_null<CmpInst>(I))
3122 CmpWithConstant = isa<Constant>(CmpInstr->getOperand(1));
3123
3124 switch (Pred) {
3125 case CmpInst::Predicate::ICMP_NE:
3126 // xor(cmpeq(x,y),-1)
3127 ExtraCost = CmpWithConstant ? 0 : 1;
3128 break;
3129 case CmpInst::Predicate::ICMP_SGE:
3130 case CmpInst::Predicate::ICMP_SLE:
3131 // xor(cmpgt(x,y),-1)
3132 ExtraCost = CmpWithConstant ? 0 : 1;
3133 break;
3134 case CmpInst::Predicate::ICMP_ULT:
3135 case CmpInst::Predicate::ICMP_UGT:
3136 // cmpgt(xor(x,signbit),xor(y,signbit))
3137 // xor(cmpeq(pmaxu(x,y),x),-1)
3138 ExtraCost = CmpWithConstant ? 1 : 2;
3139 break;
3140 case CmpInst::Predicate::ICMP_ULE:
3141 case CmpInst::Predicate::ICMP_UGE:
3142 if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
3143 (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
3144 // cmpeq(psubus(x,y),0)
3145 // cmpeq(pminu(x,y),x)
3146 ExtraCost = 1;
3147 } else {
3148 // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
3149 ExtraCost = CmpWithConstant ? 2 : 3;
3150 }
3151 break;
3152 case CmpInst::Predicate::FCMP_ONE:
3153 case CmpInst::Predicate::FCMP_UEQ:
3154 // Without AVX we need to expand FCMP_ONE/FCMP_UEQ cases.
3155 // Use FCMP_UEQ expansion - FCMP_ONE should be the same.
3156 if (CondTy && !ST->hasAVX())
3157 return getCmpSelInstrCost(Opcode, ValTy, CondTy,
3158 CmpInst::Predicate::FCMP_UNO, CostKind) +
3159 getCmpSelInstrCost(Opcode, ValTy, CondTy,
3160 CmpInst::Predicate::FCMP_OEQ, CostKind) +
3161 getArithmeticInstrCost(Instruction::Or, CondTy, CostKind);
3162
3163 break;
3164 case CmpInst::Predicate::BAD_ICMP_PREDICATE:
3165 case CmpInst::Predicate::BAD_FCMP_PREDICATE:
3166 // Assume worst case scenario and add the maximum extra cost.
3167 ExtraCost = 3;
3168 break;
3169 default:
3170 break;
3171 }
3172 }
3173 }
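// Any ExtraCost accumulated above is added on top of the per-cost-kind table
// value for every legalized vector part in the lookups below.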
3174
3175 static const CostKindTblEntry SLMCostTbl[] = {
3176 // slm pcmpeq/pcmpgt throughput is 2
3177 { ISD::SETCC, MVT::v2i64, { 2, 5, 1, 2 } },
3178 // slm pblendvb/blendvpd/blendvps throughput is 4
3179 { ISD::SELECT, MVT::v2f64, { 4, 4, 1, 3 } }, // vblendvpd
3180 { ISD::SELECT, MVT::v4f32, { 4, 4, 1, 3 } }, // vblendvps
3181 { ISD::SELECT, MVT::v2i64, { 4, 4, 1, 3 } }, // pblendvb
3182 { ISD::SELECT, MVT::v8i32, { 4, 4, 1, 3 } }, // pblendvb
3183 { ISD::SELECT, MVT::v8i16, { 4, 4, 1, 3 } }, // pblendvb
3184 { ISD::SELECT, MVT::v16i8, { 4, 4, 1, 3 } }, // pblendvb
3185 };
3186
3187 static const CostKindTblEntry AVX512BWCostTbl[] = {
3188 { ISD::SETCC, MVT::v32i16, { 1, 1, 1, 1 } },
3189 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 1 } },
3190 { ISD::SETCC, MVT::v64i8, { 1, 1, 1, 1 } },
3191 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 1 } },
3192
3193 { ISD::SELECT, MVT::v32i16, { 1, 1, 1, 1 } },
3194 { ISD::SELECT, MVT::v64i8, { 1, 1, 1, 1 } },
3195 };
3196
3197 static const CostKindTblEntry AVX512CostTbl[] = {
3198 { ISD::SETCC, MVT::v8f64, { 1, 4, 1, 1 } },
3199 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 1 } },
3200 { ISD::SETCC, MVT::v16f32, { 1, 4, 1, 1 } },
3201 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 1 } },
3202
3203 { ISD::SETCC, MVT::v8i64, { 1, 1, 1, 1 } },
3204 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 1 } },
3205 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3206 { ISD::SETCC, MVT::v16i32, { 1, 1, 1, 1 } },
3207 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 1 } },
3208 { ISD::SETCC, MVT::v32i16, { 3, 7, 5, 5 } },
3209 { ISD::SETCC, MVT::v64i8, { 3, 7, 5, 5 } },
3210
3211 { ISD::SELECT, MVT::v8i64, { 1, 1, 1, 1 } },
3212 { ISD::SELECT, MVT::v4i64, { 1, 1, 1, 1 } },
3213 { ISD::SELECT, MVT::v2i64, { 1, 1, 1, 1 } },
3214 { ISD::SELECT, MVT::v16i32, { 1, 1, 1, 1 } },
3215 { ISD::SELECT, MVT::v8i32, { 1, 1, 1, 1 } },
3216 { ISD::SELECT, MVT::v4i32, { 1, 1, 1, 1 } },
3217 { ISD::SELECT, MVT::v8f64, { 1, 1, 1, 1 } },
3218 { ISD::SELECT, MVT::v4f64, { 1, 1, 1, 1 } },
3219 { ISD::SELECT, MVT::v2f64, { 1, 1, 1, 1 } },
3220 { ISD::SELECT, MVT::f64, { 1, 1, 1, 1 } },
3221 { ISD::SELECT, MVT::v16f32, { 1, 1, 1, 1 } },
3222 { ISD::SELECT, MVT::v8f32 , { 1, 1, 1, 1 } },
3223 { ISD::SELECT, MVT::v4f32, { 1, 1, 1, 1 } },
3224 { ISD::SELECT, MVT::f32 , { 1, 1, 1, 1 } },
3225
3226 { ISD::SELECT, MVT::v32i16, { 2, 2, 4, 4 } },
3227 { ISD::SELECT, MVT::v16i16, { 1, 1, 1, 1 } },
3228 { ISD::SELECT, MVT::v8i16, { 1, 1, 1, 1 } },
3229 { ISD::SELECT, MVT::v64i8, { 2, 2, 4, 4 } },
3230 { ISD::SELECT, MVT::v32i8, { 1, 1, 1, 1 } },
3231 { ISD::SELECT, MVT::v16i8, { 1, 1, 1, 1 } },
3232 };
3233
3234 static const CostKindTblEntry AVX2CostTbl[] = {
3235 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 2 } },
3236 { ISD::SETCC, MVT::v2f64, { 1, 4, 1, 1 } },
3237 { ISD::SETCC, MVT::f64, { 1, 4, 1, 1 } },
3238 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 2 } },
3239 { ISD::SETCC, MVT::v4f32, { 1, 4, 1, 1 } },
3240 { ISD::SETCC, MVT::f32, { 1, 4, 1, 1 } },
3241
3242 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 2 } },
3243 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 2 } },
3244 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 2 } },
3245 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 2 } },
3246
3247 { ISD::SELECT, MVT::v4f64, { 2, 2, 1, 2 } }, // vblendvpd
3248 { ISD::SELECT, MVT::v8f32, { 2, 2, 1, 2 } }, // vblendvps
3249 { ISD::SELECT, MVT::v4i64, { 2, 2, 1, 2 } }, // pblendvb
3250 { ISD::SELECT, MVT::v8i32, { 2, 2, 1, 2 } }, // pblendvb
3251 { ISD::SELECT, MVT::v16i16, { 2, 2, 1, 2 } }, // pblendvb
3252 { ISD::SELECT, MVT::v32i8, { 2, 2, 1, 2 } }, // pblendvb
3253 };
3254
3255 static const CostKindTblEntry XOPCostTbl[] = {
3256 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3257 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3258 };
3259
3260 static const CostKindTblEntry AVX1CostTbl[] = {
3261 { ISD::SETCC, MVT::v4f64, { 2, 3, 1, 2 } },
3262 { ISD::SETCC, MVT::v2f64, { 1, 3, 1, 1 } },
3263 { ISD::SETCC, MVT::f64, { 1, 3, 1, 1 } },
3264 { ISD::SETCC, MVT::v8f32, { 2, 3, 1, 2 } },
3265 { ISD::SETCC, MVT::v4f32, { 1, 3, 1, 1 } },
3266 { ISD::SETCC, MVT::f32, { 1, 3, 1, 1 } },
3267
3268 // AVX1 does not support 8-wide integer compare.
3269 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3270 { ISD::SETCC, MVT::v8i32, { 4, 2, 5, 6 } },
3271 { ISD::SETCC, MVT::v16i16, { 4, 2, 5, 6 } },
3272 { ISD::SETCC, MVT::v32i8, { 4, 2, 5, 6 } },
3273
3274 { ISD::SELECT, MVT::v4f64, { 3, 3, 1, 2 } }, // vblendvpd
3275 { ISD::SELECT, MVT::v8f32, { 3, 3, 1, 2 } }, // vblendvps
3276 { ISD::SELECT, MVT::v4i64, { 3, 3, 1, 2 } }, // vblendvpd
3277 { ISD::SELECT, MVT::v8i32, { 3, 3, 1, 2 } }, // vblendvps
3278 { ISD::SELECT, MVT::v16i16, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3279 { ISD::SELECT, MVT::v32i8, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3280 };
3281
3282 static const CostKindTblEntry SSE42CostTbl[] = {
3283 { ISD::SETCC, MVT::v2i64, { 1, 2, 1, 2 } },
3284 };
3285
3286 static const CostKindTblEntry SSE41CostTbl[] = {
3287 { ISD::SETCC, MVT::v2f64, { 1, 5, 1, 1 } },
3288 { ISD::SETCC, MVT::v4f32, { 1, 5, 1, 1 } },
3289
3290 { ISD::SELECT, MVT::v2f64, { 2, 2, 1, 2 } }, // blendvpd
3291 { ISD::SELECT, MVT::f64, { 2, 2, 1, 2 } }, // blendvpd
3292 { ISD::SELECT, MVT::v4f32, { 2, 2, 1, 2 } }, // blendvps
3293 { ISD::SELECT, MVT::f32 , { 2, 2, 1, 2 } }, // blendvps
3294 { ISD::SELECT, MVT::v2i64, { 2, 2, 1, 2 } }, // pblendvb
3295 { ISD::SELECT, MVT::v4i32, { 2, 2, 1, 2 } }, // pblendvb
3296 { ISD::SELECT, MVT::v8i16, { 2, 2, 1, 2 } }, // pblendvb
3297 { ISD::SELECT, MVT::v16i8, { 2, 2, 1, 2 } }, // pblendvb
3298 };
3299
3300 static const CostKindTblEntry SSE2CostTbl[] = {
3301 { ISD::SETCC, MVT::v2f64, { 2, 5, 1, 1 } },
3302 { ISD::SETCC, MVT::f64, { 1, 5, 1, 1 } },
3303
3304 { ISD::SETCC, MVT::v2i64, { 5, 4, 5, 5 } }, // pcmpeqd/pcmpgtd expansion
3305 { ISD::SETCC, MVT::v4i32, { 1, 1, 1, 1 } },
3306 { ISD::SETCC, MVT::v8i16, { 1, 1, 1, 1 } },
3307 { ISD::SETCC, MVT::v16i8, { 1, 1, 1, 1 } },
3308
3309 { ISD::SELECT, MVT::v2f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3310 { ISD::SELECT, MVT::f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3311 { ISD::SELECT, MVT::v2i64, { 2, 2, 3, 3 } }, // pand + pandn + por
3312 { ISD::SELECT, MVT::v4i32, { 2, 2, 3, 3 } }, // pand + pandn + por
3313 { ISD::SELECT, MVT::v8i16, { 2, 2, 3, 3 } }, // pand + pandn + por
3314 { ISD::SELECT, MVT::v16i8, { 2, 2, 3, 3 } }, // pand + pandn + por
3315 };
3316
3317 static const CostKindTblEntry SSE1CostTbl[] = {
3318 { ISD::SETCC, MVT::v4f32, { 2, 5, 1, 1 } },
3319 { ISD::SETCC, MVT::f32, { 1, 5, 1, 1 } },
3320
3321 { ISD::SELECT, MVT::v4f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3322 { ISD::SELECT, MVT::f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3323 };
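// The tables are consulted in order below (CPU-specific SLM costs first, then
// from the most specific feature set down to SSE1); the first entry that
// matches the opcode/type and records a cost for the requested cost kind wins.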
3324
3325 if (ST->useSLMArithCosts())
3326 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
3327 if (auto KindCost = Entry->Cost[CostKind])
3328 return LT.first * (ExtraCost + *KindCost);
3329
3330 if (ST->hasBWI())
3331 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
3332 if (auto KindCost = Entry->Cost[CostKind])
3333 return LT.first * (ExtraCost + *KindCost);
3334
3335 if (ST->hasAVX512())
3336 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
3337 if (auto KindCost = Entry->Cost[CostKind])
3338 return LT.first * (ExtraCost + *KindCost);
3339
3340 if (ST->hasAVX2())
3341 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
3342 if (auto KindCost = Entry->Cost[CostKind])
3343 return LT.first * (ExtraCost + *KindCost);
3344
3345 if (ST->hasXOP())
3346 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
3347 if (auto KindCost = Entry->Cost[CostKind])
3348 return LT.first * (ExtraCost + *KindCost);
3349
3350 if (ST->hasAVX())
3351 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
3352 if (auto KindCost = Entry->Cost[CostKind])
3353 return LT.first * (ExtraCost + *KindCost);
3354
3355 if (ST->hasSSE42())
3356 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
3357 if (auto KindCost = Entry->Cost[CostKind])
3358 return LT.first * (ExtraCost + *KindCost);
3359
3360 if (ST->hasSSE41())
3361 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
3362 if (auto KindCost = Entry->Cost[CostKind])
3363 return LT.first * (ExtraCost + *KindCost);
3364
3365 if (ST->hasSSE2())
3366 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
3367 if (auto KindCost = Entry->Cost[CostKind])
3368 return LT.first * (ExtraCost + *KindCost);
3369
3370 if (ST->hasSSE1())
3371 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
3372 if (auto KindCost = Entry->Cost[CostKind])
3373 return LT.first * (ExtraCost + *KindCost);
3374
3375 // Assume a 3cy latency for fp select ops.
3376 if (CostKind == TTI::TCK_Latency && Opcode == Instruction::Select)
3377 if (ValTy->getScalarType()->isFloatingPointTy())
3378 return 3;
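// Anything not matched above falls back to the target-independent
// implementation.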
3379
3380 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
3381}
3382
3384
3388 // Costs should match the codegen from:
3389 // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
3390 // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
3391 // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
3392 // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
3393 // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
3394
3395 // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
3396 // specialized in these tables yet.
3397 static const CostKindTblEntry AVX512VBMI2CostTbl[] = {
3398 { ISD::FSHL, MVT::v8i64, { 1, 1, 1, 1 } },
3399 { ISD::FSHL, MVT::v4i64, { 1, 1, 1, 1 } },
3400 { ISD::FSHL, MVT::v2i64, { 1, 1, 1, 1 } },
3401 { ISD::FSHL, MVT::v16i32, { 1, 1, 1, 1 } },
3402 { ISD::FSHL, MVT::v8i32, { 1, 1, 1, 1 } },
3403 { ISD::FSHL, MVT::v4i32, { 1, 1, 1, 1 } },
3404 { ISD::FSHL, MVT::v32i16, { 1, 1, 1, 1 } },
3405 { ISD::FSHL, MVT::v16i16, { 1, 1, 1, 1 } },
3406 { ISD::FSHL, MVT::v8i16, { 1, 1, 1, 1 } },
3407 { ISD::ROTL, MVT::v32i16, { 1, 1, 1, 1 } },
3408 { ISD::ROTL, MVT::v16i16, { 1, 1, 1, 1 } },
3409 { ISD::ROTL, MVT::v8i16, { 1, 1, 1, 1 } },
3410 { ISD::ROTR, MVT::v32i16, { 1, 1, 1, 1 } },
3411 { ISD::ROTR, MVT::v16i16, { 1, 1, 1, 1 } },
3412 { ISD::ROTR, MVT::v8i16, { 1, 1, 1, 1 } },
3413 { X86ISD::VROTLI, MVT::v32i16, { 1, 1, 1, 1 } },
3414 { X86ISD::VROTLI, MVT::v16i16, { 1, 1, 1, 1 } },
3415 { X86ISD::VROTLI, MVT::v8i16, { 1, 1, 1, 1 } },
3416 };
3417 static const CostKindTblEntry AVX512BITALGCostTbl[] = {
3418 { ISD::CTPOP, MVT::v32i16, { 1, 1, 1, 1 } },
3419 { ISD::CTPOP, MVT::v64i8, { 1, 1, 1, 1 } },
3420 { ISD::CTPOP, MVT::v16i16, { 1, 1, 1, 1 } },
3421 { ISD::CTPOP, MVT::v32i8, { 1, 1, 1, 1 } },
3422 { ISD::CTPOP, MVT::v8i16, { 1, 1, 1, 1 } },
3423 { ISD::CTPOP, MVT::v16i8, { 1, 1, 1, 1 } },
3424 };
3425 static const CostKindTblEntry AVX512VPOPCNTDQCostTbl[] = {
3426 { ISD::CTPOP, MVT::v8i64, { 1, 1, 1, 1 } },
3427 { ISD::CTPOP, MVT::v16i32, { 1, 1, 1, 1 } },
3428 { ISD::CTPOP, MVT::v4i64, { 1, 1, 1, 1 } },
3429 { ISD::CTPOP, MVT::v8i32, { 1, 1, 1, 1 } },
3430 { ISD::CTPOP, MVT::v2i64, { 1, 1, 1, 1 } },
3431 { ISD::CTPOP, MVT::v4i32, { 1, 1, 1, 1 } },
3432 };
3433 static const CostKindTblEntry AVX512CDCostTbl[] = {
3434 { ISD::CTLZ, MVT::v8i64, { 1, 5, 1, 1 } },
3435 { ISD::CTLZ, MVT::v16i32, { 1, 5, 1, 1 } },
3436 { ISD::CTLZ, MVT::v32i16, { 18, 27, 23, 27 } },
3437 { ISD::CTLZ, MVT::v64i8, { 3, 16, 9, 11 } },
3438 { ISD::CTLZ, MVT::v4i64, { 1, 5, 1, 1 } },
3439 { ISD::CTLZ, MVT::v8i32, { 1, 5, 1, 1 } },
3440 { ISD::CTLZ, MVT::v16i16, { 8, 19, 11, 13 } },
3441 { ISD::CTLZ, MVT::v32i8, { 2, 11, 9, 10 } },
3442 { ISD::CTLZ, MVT::v2i64, { 1, 5, 1, 1 } },
3443 { ISD::CTLZ, MVT::v4i32, { 1, 5, 1, 1 } },
3444 { ISD::CTLZ, MVT::v8i16, { 3, 15, 4, 6 } },
3445 { ISD::CTLZ, MVT::v16i8, { 2, 10, 9, 10 } },
3446
3447 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3448 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3449 { ISD::CTTZ, MVT::v4i64, { 1, 8, 6, 6 } },
3450 { ISD::CTTZ, MVT::v8i32, { 1, 8, 6, 6 } },
3451 { ISD::CTTZ, MVT::v2i64, { 1, 8, 6, 6 } },
3452 { ISD::CTTZ, MVT::v4i32, { 1, 8, 6, 6 } },
3453 };
3454 static const CostKindTblEntry AVX512BWCostTbl[] = {
3455 { ISD::ABS, MVT::v32i16, { 1, 1, 1, 1 } },
3456 { ISD::ABS, MVT::v64i8, { 1, 1, 1, 1 } },
3457 { ISD::BITREVERSE, MVT::v2i64, { 3, 10, 10, 11 } },
3458 { ISD::BITREVERSE, MVT::v4i64, { 3, 11, 10, 11 } },
3459 { ISD::BITREVERSE, MVT::v8i64, { 3, 12, 10, 14 } },
3460 { ISD::BITREVERSE, MVT::v4i32, { 3, 10, 10, 11 } },
3461 { ISD::BITREVERSE, MVT::v8i32, { 3, 11, 10, 11 } },
3462 { ISD::BITREVERSE, MVT::v16i32, { 3, 12, 10, 14 } },
3463 { ISD::BITREVERSE, MVT::v8i16, { 3, 10, 10, 11 } },
3464 { ISD::BITREVERSE, MVT::v16i16, { 3, 11, 10, 11 } },
3465 { ISD::BITREVERSE, MVT::v32i16, { 3, 12, 10, 14 } },
3466 { ISD::BITREVERSE, MVT::v16i8, { 2, 5, 9, 9 } },
3467 { ISD::BITREVERSE, MVT::v32i8, { 2, 5, 9, 9 } },
3468 { ISD::BITREVERSE, MVT::v64i8, { 2, 5, 9, 12 } },
3469 { ISD::BSWAP, MVT::v2i64, { 1, 1, 1, 2 } },
3470 { ISD::BSWAP, MVT::v4i64, { 1, 1, 1, 2 } },
3471 { ISD::BSWAP, MVT::v8i64, { 1, 1, 1, 2 } },
3472 { ISD::BSWAP, MVT::v4i32, { 1, 1, 1, 2 } },
3473 { ISD::BSWAP, MVT::v8i32, { 1, 1, 1, 2 } },
3474 { ISD::BSWAP, MVT::v16i32, { 1, 1, 1, 2 } },
3475 { ISD::BSWAP, MVT::v8i16, { 1, 1, 1, 2 } },
3476 { ISD::BSWAP, MVT::v16i16, { 1, 1, 1, 2 } },
3477 { ISD::BSWAP, MVT::v32i16, { 1, 1, 1, 2 } },
3478 { ISD::CTLZ, MVT::v8i64, { 8, 22, 23, 23 } },
3479 { ISD::CTLZ, MVT::v16i32, { 8, 23, 25, 25 } },
3480 { ISD::CTLZ, MVT::v32i16, { 4, 15, 15, 16 } },
3481 { ISD::CTLZ, MVT::v64i8, { 3, 12, 10, 9 } },
3482 { ISD::CTPOP, MVT::v2i64, { 3, 7, 10, 10 } },
3483 { ISD::CTPOP, MVT::v4i64, { 3, 7, 10, 10 } },
3484 { ISD::CTPOP, MVT::v8i64, { 3, 8, 10, 12 } },
3485 { ISD::CTPOP, MVT::v4i32, { 7, 11, 14, 14 } },
3486 { ISD::CTPOP, MVT::v8i32, { 7, 11, 14, 14 } },
3487 { ISD::CTPOP, MVT::v16i32, { 7, 12, 14, 16 } },
3488 { ISD::CTPOP, MVT::v8i16, { 2, 7, 11, 11 } },
3489 { ISD::CTPOP, MVT::v16i16, { 2, 7, 11, 11 } },
3490 { ISD::CTPOP, MVT::v32i16, { 3, 7, 11, 13 } },
3491 { ISD::CTPOP, MVT::v16i8, { 2, 4, 8, 8 } },
3492 { ISD::CTPOP, MVT::v32i8, { 2, 4, 8, 8 } },
3493 { ISD::CTPOP, MVT::v64i8, { 2, 5, 8, 10 } },
3494 { ISD::CTTZ, MVT::v8i16, { 3, 9, 14, 14 } },
3495 { ISD::CTTZ, MVT::v16i16, { 3, 9, 14, 14 } },
3496 { ISD::CTTZ, MVT::v32i16, { 3, 10, 14, 16 } },
3497 { ISD::CTTZ, MVT::v16i8, { 2, 6, 11, 11 } },
3498 { ISD::CTTZ, MVT::v32i8, { 2, 6, 11, 11 } },
3499 { ISD::CTTZ, MVT::v64i8, { 3, 7, 11, 13 } },
3500 { ISD::ROTL, MVT::v32i16, { 2, 8, 6, 8 } },
3501 { ISD::ROTL, MVT::v16i16, { 2, 8, 6, 7 } },
3502 { ISD::ROTL, MVT::v8i16, { 2, 7, 6, 7 } },
3503 { ISD::ROTL, MVT::v64i8, { 5, 6, 11, 12 } },
3504 { ISD::ROTL, MVT::v32i8, { 5, 15, 7, 10 } },
3505 { ISD::ROTL, MVT::v16i8, { 5, 15, 7, 10 } },
3506 { ISD::ROTR, MVT::v32i16, { 2, 8, 6, 8 } },
3507 { ISD::ROTR, MVT::v16i16, { 2, 8, 6, 7 } },
3508 { ISD::ROTR, MVT::v8i16, { 2, 7, 6, 7 } },
3509 { ISD::ROTR, MVT::v64i8, { 5, 6, 12, 14 } },
3510 { ISD::ROTR, MVT::v32i8, { 5, 14, 6, 9 } },
3511 { ISD::ROTR, MVT::v16i8, { 5, 14, 6, 9 } },
3512 { X86ISD::VROTLI, MVT::v32i16, { 2, 5, 3, 3 } },
3513 { X86ISD::VROTLI, MVT::v16i16, { 1, 5, 3, 3 } },
3514 { X86ISD::VROTLI, MVT::v8i16, { 1, 5, 3, 3 } },
3515 { X86ISD::VROTLI, MVT::v64i8, { 2, 9, 3, 4 } },
3516 { X86ISD::VROTLI, MVT::v32i8, { 1, 9, 3, 4 } },
3517 { X86ISD::VROTLI, MVT::v16i8, { 1, 8, 3, 4 } },
3518 { ISD::SADDSAT, MVT::v32i16, { 1 } },
3519 { ISD::SADDSAT, MVT::v64i8, { 1 } },
3520 { ISD::SMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3521 { ISD::SMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3522 { ISD::SMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3523 { ISD::SMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3524 { ISD::SSUBSAT, MVT::v32i16, { 1 } },
3525 { ISD::SSUBSAT, MVT::v64i8, { 1 } },
3526 { ISD::UADDSAT, MVT::v32i16, { 1 } },
3527 { ISD::UADDSAT, MVT::v64i8, { 1 } },
3528 { ISD::UMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3529 { ISD::UMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3530 { ISD::UMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3531 { ISD::UMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3532 { ISD::USUBSAT, MVT::v32i16, { 1 } },
3533 { ISD::USUBSAT, MVT::v64i8, { 1 } },
3534 };
3535 static const CostKindTblEntry AVX512CostTbl[] = {
3536 { ISD::ABS, MVT::v8i64, { 1, 1, 1, 1 } },
3537 { ISD::ABS, MVT::v4i64, { 1, 1, 1, 1 } },
3538 { ISD::ABS, MVT::v2i64, { 1, 1, 1, 1 } },
3539 { ISD::ABS, MVT::v16i32, { 1, 1, 1, 1 } },
3540 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 1 } },
3541 { ISD::ABS, MVT::v32i16, { 2, 7, 4, 4 } },
3542 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 1 } },
3543 { ISD::ABS, MVT::v64i8, { 2, 7, 4, 4 } },
3544 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 1 } },
3545 { ISD::BITREVERSE, MVT::v8i64, { 9, 13, 20, 20 } },
3546 { ISD::BITREVERSE, MVT::v16i32, { 9, 13, 20, 20 } },
3547 { ISD::BITREVERSE, MVT::v32i16, { 9, 13, 20, 20 } },
3548 { ISD::BITREVERSE, MVT::v64i8, { 6, 11, 17, 17 } },
3549 { ISD::BSWAP, MVT::v8i64, { 4, 7, 5, 5 } },
3550 { ISD::BSWAP, MVT::v16i32, { 4, 7, 5, 5 } },
3551 { ISD::BSWAP, MVT::v32i16, { 4, 7, 5, 5 } },
3552 { ISD::CTLZ, MVT::v8i64, { 10, 28, 32, 32 } },
3553 { ISD::CTLZ, MVT::v16i32, { 12, 30, 38, 38 } },
3554 { ISD::CTLZ, MVT::v32i16, { 8, 15, 29, 29 } },
3555 { ISD::CTLZ, MVT::v64i8, { 6, 11, 19, 19 } },
3556 { ISD::CTPOP, MVT::v8i64, { 16, 16, 19, 19 } },
3557 { ISD::CTPOP, MVT::v16i32, { 24, 19, 27, 27 } },
3558 { ISD::CTPOP, MVT::v32i16, { 18, 15, 22, 22 } },
3559 { ISD::CTPOP, MVT::v64i8, { 12, 11, 16, 16 } },
3560 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3561 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3562 { ISD::CTTZ, MVT::v32i16, { 7, 17, 27, 27 } },
3563 { ISD::CTTZ, MVT::v64i8, { 6, 13, 21, 21 } },
3564 { ISD::ROTL, MVT::v8i64, { 1, 1, 1, 1 } },
3565 { ISD::ROTL, MVT::v4i64, { 1, 1, 1, 1 } },
3566 { ISD::ROTL, MVT::v2i64, { 1, 1, 1, 1 } },
3567 { ISD::ROTL, MVT::v16i32, { 1, 1, 1, 1 } },
3568 { ISD::ROTL, MVT::v8i32, { 1, 1, 1, 1 } },
3569 { ISD::ROTL, MVT::v4i32, { 1, 1, 1, 1 } },
3570 { ISD::ROTR, MVT::v8i64, { 1, 1, 1, 1 } },
3571 { ISD::ROTR, MVT::v4i64, { 1, 1, 1, 1 } },
3572 { ISD::ROTR, MVT::v2i64, { 1, 1, 1, 1 } },
3573 { ISD::ROTR, MVT::v16i32, { 1, 1, 1, 1 } },
3574 { ISD::ROTR, MVT::v8i32, { 1, 1, 1, 1 } },
3575 { ISD::ROTR, MVT::v4i32, { 1, 1, 1, 1 } },
3576 { X86ISD::VROTLI, MVT::v8i64, { 1, 1, 1, 1 } },
3577 { X86ISD::VROTLI, MVT::v4i64, { 1, 1, 1, 1 } },
3578 { X86ISD::VROTLI, MVT::v2i64, { 1, 1, 1, 1 } },
3579 { X86ISD::VROTLI, MVT::v16i32, { 1, 1, 1, 1 } },
3580 { X86ISD::VROTLI, MVT::v8i32, { 1, 1, 1, 1 } },
3581 { X86ISD::VROTLI, MVT::v4i32, { 1, 1, 1, 1 } },
3582 { ISD::SMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3583 { ISD::SMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3584 { ISD::SMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3585 { ISD::SMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3586 { ISD::SMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3587 { ISD::SMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3588 { ISD::SMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3589 { ISD::SMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3590 { ISD::SMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3591 { ISD::SMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3592 { ISD::SMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3593 { ISD::SMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3594 { ISD::UMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3595 { ISD::UMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3596 { ISD::UMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3597 { ISD::UMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3598 { ISD::UMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3599 { ISD::UMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3600 { ISD::UMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3601 { ISD::UMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3602 { ISD::UMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3603 { ISD::UMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3604 { ISD::UMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3605 { ISD::UMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3606 { ISD::USUBSAT, MVT::v16i32, { 2 } }, // pmaxud + psubd
3607 { ISD::USUBSAT, MVT::v2i64, { 2 } }, // pmaxuq + psubq
3608 { ISD::USUBSAT, MVT::v4i64, { 2 } }, // pmaxuq + psubq
3609 { ISD::USUBSAT, MVT::v8i64, { 2 } }, // pmaxuq + psubq
3610 { ISD::UADDSAT, MVT::v16i32, { 3 } }, // not + pminud + paddd
3611 { ISD::UADDSAT, MVT::v2i64, { 3 } }, // not + pminuq + paddq
3612 { ISD::UADDSAT, MVT::v4i64, { 3 } }, // not + pminuq + paddq
3613 { ISD::UADDSAT, MVT::v8i64, { 3 } }, // not + pminuq + paddq
3614 { ISD::SADDSAT, MVT::v32i16, { 2 } },
3615 { ISD::SADDSAT, MVT::v64i8, { 2 } },
3616 { ISD::SSUBSAT, MVT::v32i16, { 2 } },
3617 { ISD::SSUBSAT, MVT::v64i8, { 2 } },
3618 { ISD::UADDSAT, MVT::v32i16, { 2 } },
3619 { ISD::UADDSAT, MVT::v64i8, { 2 } },
3620 { ISD::USUBSAT, MVT::v32i16, { 2 } },
3621 { ISD::USUBSAT, MVT::v64i8, { 2 } },
3622 { ISD::FMAXNUM, MVT::f32, { 2, 2, 3, 3 } },
3623 { ISD::FMAXNUM, MVT::v4f32, { 1, 1, 3, 3 } },
3624 { ISD::FMAXNUM, MVT::v8f32, { 2, 2, 3, 3 } },
3625 { ISD::FMAXNUM, MVT::v16f32, { 4, 4, 3, 3 } },
3626 { ISD::FMAXNUM, MVT::f64, { 2, 2, 3, 3 } },
3627 { ISD::FMAXNUM, MVT::v2f64, { 1, 1, 3, 3 } },
3628 { ISD::FMAXNUM, MVT::v4f64, { 2, 2, 3, 3 } },
3629 { ISD::FMAXNUM, MVT::v8f64, { 3, 3, 3, 3 } },
3630 { ISD::FSQRT, MVT::f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3631 { ISD::FSQRT, MVT::v4f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3632 { ISD::FSQRT, MVT::v8f32, { 6, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3633 { ISD::FSQRT, MVT::v16f32, { 12, 20, 1, 3 } }, // Skylake from http://www.agner.org/
3634 { ISD::FSQRT, MVT::f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3635 { ISD::FSQRT, MVT::v2f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3636 { ISD::FSQRT, MVT::v4f64, { 12, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3637 { ISD::FSQRT, MVT::v8f64, { 24, 32, 1, 3 } }, // Skylake from http://www.agner.org/
3638 };
3639 static const CostKindTblEntry XOPCostTbl[] = {
3640 { ISD::BITREVERSE, MVT::v4i64, { 3, 6, 5, 6 } },
3641 { ISD::BITREVERSE, MVT::v8i32, { 3, 6, 5, 6 } },
3642 { ISD::BITREVERSE, MVT::v16i16, { 3, 6, 5, 6 } },
3643 { ISD::BITREVERSE, MVT::v32i8, { 3, 6, 5, 6 } },
3644 { ISD::BITREVERSE, MVT::v2i64, { 2, 7, 1, 1 } },
3645 { ISD::BITREVERSE, MVT::v4i32, { 2, 7, 1, 1 } },
3646 { ISD::BITREVERSE, MVT::v8i16, { 2, 7, 1, 1 } },
3647 { ISD::BITREVERSE, MVT::v16i8, { 2, 7, 1, 1 } },
3648 { ISD::BITREVERSE, MVT::i64, { 2, 2, 3, 4 } },
3649 { ISD::BITREVERSE, MVT::i32, { 2, 2, 3, 4 } },
3650 { ISD::BITREVERSE, MVT::i16, { 2, 2, 3, 4 } },
3651 { ISD::BITREVERSE, MVT::i8, { 2, 2, 3, 4 } },
3652 // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
3653 { ISD::ROTL, MVT::v4i64, { 4, 7, 5, 6 } },
3654 { ISD::ROTL, MVT::v8i32, { 4, 7, 5, 6 } },
3655 { ISD::ROTL, MVT::v16i16, { 4, 7, 5, 6 } },
3656 { ISD::ROTL, MVT::v32i8, { 4, 7, 5, 6 } },
3657 { ISD::ROTL, MVT::v2i64, { 1, 3, 1, 1 } },
3658 { ISD::ROTL, MVT::v4i32, { 1, 3, 1, 1 } },
3659 { ISD::ROTL, MVT::v8i16, { 1, 3, 1, 1 } },
3660 { ISD::ROTL, MVT::v16i8, { 1, 3, 1, 1 } },
3661 { ISD::ROTR, MVT::v4i64, { 4, 7, 8, 9 } },
3662 { ISD::ROTR, MVT::v8i32, { 4, 7, 8, 9 } },
3663 { ISD::ROTR, MVT::v16i16, { 4, 7, 8, 9 } },
3664 { ISD::ROTR, MVT::v32i8, { 4, 7, 8, 9 } },
3665 { ISD::ROTR, MVT::v2i64, { 1, 3, 3, 3 } },
3666 { ISD::ROTR, MVT::v4i32, { 1, 3, 3, 3 } },
3667 { ISD::ROTR, MVT::v8i16, { 1, 3, 3, 3 } },
3668 { ISD::ROTR, MVT::v16i8, { 1, 3, 3, 3 } },
3669 { X86ISD::VROTLI, MVT::v4i64, { 4, 7, 5, 6 } },
3670 { X86ISD::VROTLI, MVT::v8i32, { 4, 7, 5, 6 } },
3671 { X86ISD::VROTLI, MVT::v16i16, { 4, 7, 5, 6 } },
3672 { X86ISD::VROTLI, MVT::v32i8, { 4, 7, 5, 6 } },
3673 { X86ISD::VROTLI, MVT::v2i64, { 1, 3, 1, 1 } },
3674 { X86ISD::VROTLI, MVT::v4i32, { 1, 3, 1, 1 } },
3675 { X86ISD::VROTLI, MVT::v8i16, { 1, 3, 1, 1 } },
3676 { X86ISD::VROTLI, MVT::v16i8, { 1, 3, 1, 1 } },
3677 };
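// (The SUB(0,Y) needed to emulate a variable ROTR via VPROT is why the ROTR
// rows above are slightly more expensive than the matching ROTL rows.)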
3678 static const CostKindTblEntry AVX2CostTbl[] = {
3679 { ISD::ABS, MVT::v2i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3680 { ISD::ABS, MVT::v4i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3681 { ISD::ABS, MVT::v4i32, { 1, 1, 1, 1 } },
3682 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 2 } },
3683 { ISD::ABS, MVT::v8i16, { 1, 1, 1, 1 } },
3684 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 2 } },
3685 { ISD::ABS, MVT::v16i8, { 1, 1, 1, 1 } },
3686 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 2 } },
3687 { ISD::BITREVERSE, MVT::v2i64, { 3, 11, 10, 11 } },
3688 { ISD::BITREVERSE, MVT::v4i64, { 5, 11, 10, 17 } },
3689 { ISD::BITREVERSE, MVT::v4i32, { 3, 11, 10, 11 } },
3690 { ISD::BITREVERSE, MVT::v8i32, { 5, 11, 10, 17 } },
3691 { ISD::BITREVERSE, MVT::v8i16, { 3, 11, 10, 11 } },
3692 { ISD::BITREVERSE, MVT::v16i16, { 5, 11, 10, 17 } },
3693 { ISD::BITREVERSE, MVT::v16i8, { 3, 6, 9, 9 } },
3694 { ISD::BITREVERSE, MVT::v32i8, { 4, 5, 9, 15 } },
3695 { ISD::BSWAP, MVT::v2i64, { 1, 2, 1, 2 } },
3696 { ISD::BSWAP, MVT::v4i64, { 1, 3, 1, 2 } },
3697 { ISD::BSWAP, MVT::v4i32, { 1, 2, 1, 2 } },
3698 { ISD::BSWAP, MVT::v8i32, { 1, 3, 1, 2 } },
3699 { ISD::BSWAP, MVT::v8i16, { 1, 2, 1, 2 } },
3700 { ISD::BSWAP, MVT::v16i16, { 1, 3, 1, 2 } },
3701 { ISD::CTLZ, MVT::v2i64, { 7, 18, 24, 25 } },
3702 { ISD::CTLZ, MVT::v4i64, { 14, 18, 24, 44 } },
3703 { ISD::CTLZ, MVT::v4i32, { 5, 16, 19, 20 } },
3704 { ISD::CTLZ, MVT::v8i32, { 10, 16, 19, 34 } },
3705 { ISD::CTLZ, MVT::v8i16, { 4, 13, 14, 15 } },
3706 { ISD::CTLZ, MVT::v16i16, { 6, 14, 14, 24 } },
3707 { ISD::CTLZ, MVT::v16i8, { 3, 12, 9, 10 } },
3708 { ISD::CTLZ, MVT::v32i8, { 4, 12, 9, 14 } },
3709 { ISD::CTPOP, MVT::v2i64, { 3, 9, 10, 10 } },
3710 { ISD::CTPOP, MVT::v4i64, { 4, 9, 10, 14 } },
3711 { ISD::CTPOP, MVT::v4i32, { 7, 12, 14, 14 } },
3712 { ISD::CTPOP, MVT::v8i32, { 7, 12, 14, 18 } },
3713 { ISD::CTPOP, MVT::v8i16, { 3, 7, 11, 11 } },
3714 { ISD::CTPOP, MVT::v16i16, { 6, 8, 11, 18 } },
3715 { ISD::CTPOP, MVT::v16i8, { 2, 5, 8, 8 } },
3716 { ISD::CTPOP, MVT::v32i8, { 3, 5, 8, 12 } },
3717 { ISD::CTTZ, MVT::v2i64, { 4, 11, 13, 13 } },
3718 { ISD::CTTZ, MVT::v4i64, { 5, 11, 13, 20 } },
3719 { ISD::CTTZ, MVT::v4i32, { 7, 14, 17, 17 } },
3720 { ISD::CTTZ, MVT::v8i32, { 7, 15, 17, 24 } },
3721 { ISD::CTTZ, MVT::v8i16, { 4, 9, 14, 14 } },
3722 { ISD::CTTZ, MVT::v16i16, { 6, 9, 14, 24 } },
3723 { ISD::CTTZ, MVT::v16i8, { 3, 7, 11, 11 } },
3724 { ISD::CTTZ, MVT::v32i8, { 5, 7, 11, 18 } },
3725 { ISD::SADDSAT, MVT::v16i16, { 1 } },
3726 { ISD::SADDSAT, MVT::v32i8, { 1 } },
3727 { ISD::SMAX, MVT::v2i64, { 2, 7, 2, 3 } },
3728 { ISD::SMAX, MVT::v4i64, { 2, 7, 2, 3 } },
3729 { ISD::SMAX, MVT::v8i32, { 1, 1, 1, 2 } },
3730 { ISD::SMAX, MVT::v16i16, { 1, 1, 1, 2 } },
3731 { ISD::SMAX, MVT::v32i8, { 1, 1, 1, 2 } },
3732 { ISD::SMIN, MVT::v2i64, { 2, 7, 2, 3 } },
3733 { ISD::SMIN, MVT::v4i64, { 2, 7, 2, 3 } },
3734 { ISD::SMIN, MVT::v8i32, { 1, 1, 1, 2 } },
3735 { ISD::SMIN, MVT::v16i16, { 1, 1, 1, 2 } },
3736 { ISD::SMIN, MVT::v32i8, { 1, 1, 1, 2 } },
3737 { ISD::SSUBSAT, MVT::v16i16, { 1 } },
3738 { ISD::SSUBSAT, MVT::v32i8, { 1 } },
3739 { ISD::UADDSAT, MVT::v16i16, { 1 } },
3740 { ISD::UADDSAT, MVT::v32i8, { 1 } },
3741 { ISD::UADDSAT, MVT::v8i32, { 3 } }, // not + pminud + paddd
3742 { ISD::UMAX, MVT::v2i64, { 2, 8, 5, 6 } },
3743 { ISD::UMAX, MVT::v4i64, { 2, 8, 5, 8 } },
3744 { ISD::UMAX, MVT::v8i32, { 1, 1, 1, 2 } },
3745 { ISD::UMAX, MVT::v16i16, { 1, 1, 1, 2 } },
3746 { ISD::UMAX, MVT::v32i8, { 1, 1, 1, 2 } },
3747 { ISD::UMIN, MVT::v2i64, { 2, 8, 5, 6 } },
3748 { ISD::UMIN, MVT::v4i64, { 2, 8, 5, 8 } },
3749 { ISD::UMIN, MVT::v8i32, { 1, 1, 1, 2 } },
3750 { ISD::UMIN, MVT::v16i16, { 1, 1, 1, 2 } },
3751 { ISD::UMIN, MVT::v32i8, { 1, 1, 1, 2 } },
3752 { ISD::USUBSAT, MVT::v16i16, { 1 } },
3753 { ISD::USUBSAT, MVT::v32i8, { 1 } },
3754 { ISD::USUBSAT, MVT::v8i32, { 2 } }, // pmaxud + psubd
3755 { ISD::FMAXNUM, MVT::f32, { 2, 7, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
3756 { ISD::FMAXNUM, MVT::v4f32, { 2, 7, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3757 { ISD::FMAXNUM, MVT::v8f32, { 3, 7, 3, 6 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3758 { ISD::FMAXNUM, MVT::f64, { 2, 7, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
3759 { ISD::FMAXNUM, MVT::v2f64, { 2, 7, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3760 { ISD::FMAXNUM, MVT::v4f64, { 3, 7, 3, 6 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3761 { ISD::FSQRT, MVT::f32, { 7, 15, 1, 1 } }, // vsqrtss
3762 { ISD::FSQRT, MVT::v4f32, { 7, 15, 1, 1 } }, // vsqrtps
3763 { ISD::FSQRT, MVT::v8f32, { 14, 21, 1, 3 } }, // vsqrtps
3764 { ISD::FSQRT, MVT::f64, { 14, 21, 1, 1 } }, // vsqrtsd
3765 { ISD::FSQRT, MVT::v2f64, { 14, 21, 1, 1 } }, // vsqrtpd
3766 { ISD::FSQRT, MVT::v4f64, { 28, 35, 1, 3 } }, // vsqrtpd
3767 };
3768 static const CostKindTblEntry AVX1CostTbl[] = {
3769 { ISD::ABS, MVT::v4i64, { 6, 8, 6, 12 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3770 { ISD::ABS, MVT::v8i32, { 3, 6, 4, 5 } },
3771 { ISD::ABS, MVT::v16i16, { 3, 6, 4, 5 } },
3772 { ISD::ABS, MVT::v32i8, { 3, 6, 4, 5 } },
3773 { ISD::BITREVERSE, MVT::v4i64, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
3774 { ISD::BITREVERSE, MVT::v2i64, { 8, 13, 10, 16 } },
3775 { ISD::BITREVERSE, MVT::v8i32, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
3776 { ISD::BITREVERSE, MVT::v4i32, { 8, 13, 10, 16 } },
3777 { ISD::BITREVERSE, MVT::v16i16, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
3778 { ISD::BITREVERSE, MVT::v8i16, { 8, 13, 10, 16 } },
3779 { ISD::BITREVERSE, MVT::v32i8, { 13, 15, 17, 26 } }, // 2 x 128-bit Op + extract/insert
3780 { ISD::BITREVERSE, MVT::v16i8, { 7, 7, 9, 13 } },
3781 { ISD::BSWAP, MVT::v4i64, { 5, 6, 5, 10 } },
3782 { ISD::BSWAP, MVT::v2i64, { 2, 2, 1, 3 } },
3783 { ISD::BSWAP, MVT::v8i32, { 5, 6, 5, 10 } },
3784 { ISD::BSWAP, MVT::v4i32, { 2, 2, 1, 3 } },
3785 { ISD::BSWAP, MVT::v16i16, { 5, 6, 5, 10 } },
3786 { ISD::BSWAP, MVT::v8i16, { 2, 2, 1, 3 } },
3787 { ISD::CTLZ, MVT::v4i64, { 29, 33, 49, 58 } }, // 2 x 128-bit Op + extract/insert
3788 { ISD::CTLZ, MVT::v2i64, { 14, 24, 24, 28 } },
3789 { ISD::CTLZ, MVT::v8i32, { 24, 28, 39, 48 } }, // 2 x 128-bit Op + extract/insert
3790 { ISD::CTLZ, MVT::v4i32, { 12, 20, 19, 23 } },
3791 { ISD::CTLZ, MVT::v16i16, { 19, 22, 29, 38 } }, // 2 x 128-bit Op + extract/insert
3792 { ISD::CTLZ, MVT::v8i16, { 9, 16, 14, 18 } },
3793 { ISD::CTLZ, MVT::v32i8, { 14, 15, 19, 28 } }, // 2 x 128-bit Op + extract/insert
3794 { ISD::CTLZ, MVT::v16i8, { 7, 12, 9, 13 } },
3795 { ISD::CTPOP, MVT::v4i64, { 14, 18, 19, 28 } }, // 2 x 128-bit Op + extract/insert
3796 { ISD::CTPOP, MVT::v2i64, { 7, 14, 10, 14 } },
3797 { ISD::CTPOP, MVT::v8i32, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
3798 { ISD::CTPOP, MVT::v4i32, { 9, 20, 14, 18 } },
3799 { ISD::CTPOP, MVT::v16i16, { 16, 21, 22, 31 } }, // 2 x 128-bit Op + extract/insert
3800 { ISD::CTPOP, MVT::v8i16, { 8, 18, 11, 15 } },
3801 { ISD::CTPOP, MVT::v32i8, { 13, 15, 16, 25 } }, // 2 x 128-bit Op + extract/insert
3802 { ISD::CTPOP, MVT::v16i8, { 6, 12, 8, 12 } },
3803 { ISD::CTTZ, MVT::v4i64, { 17, 22, 24, 33 } }, // 2 x 128-bit Op + extract/insert
3804 { ISD::CTTZ, MVT::v2i64, { 9, 19, 13, 17 } },
3805 { ISD::CTTZ, MVT::v8i32, { 21, 27, 32, 41 } }, // 2 x 128-bit Op + extract/insert
3806 { ISD::CTTZ, MVT::v4i32, { 11, 24, 17, 21 } },
3807 { ISD::CTTZ, MVT::v16i16, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
3808 { ISD::CTTZ, MVT::v8i16, { 9, 21, 14, 18 } },
3809 { ISD::CTTZ, MVT::v32i8, { 15, 18, 21, 30 } }, // 2 x 128-bit Op + extract/insert
3810 { ISD::CTTZ, MVT::v16i8, { 8, 16, 11, 15 } },
3811 { ISD::SADDSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
3812 { ISD::SADDSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
3813 { ISD::SMAX, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
3814 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 4 } },
3815 { ISD::SMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3816 { ISD::SMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3817 { ISD::SMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3818 { ISD::SMIN, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
3819 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
3820 { ISD::SMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3821 { ISD::SMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3822 { ISD::SMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3823 { ISD::SSUBSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
3824 { ISD::SSUBSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
3825 { ISD::UADDSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
3826 { ISD::UADDSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
3827 { ISD::UADDSAT, MVT::v8i32, { 8 } }, // 2 x 128-bit Op + extract/insert
3828 { ISD::UMAX, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
3829 { ISD::UMAX, MVT::v2i64, { 4, 8, 5, 7 } },
3830 { ISD::UMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3831 { ISD::UMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3832 { ISD::UMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3833 { ISD::UMIN, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
3834 { ISD::UMIN, MVT::v2i64, { 4, 8, 5, 7 } },
3835 { ISD::UMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3836 { ISD::UMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3837 { ISD::UMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3838 { ISD::USUBSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
3839 { ISD::USUBSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
3840 { ISD::USUBSAT, MVT::v8i32, { 6 } }, // 2 x 128-bit Op + extract/insert
3841 { ISD::FMAXNUM, MVT::f32, { 3, 6, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
3842 { ISD::FMAXNUM, MVT::v4f32, { 3, 6, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3843 { ISD::FMAXNUM, MVT::v8f32, { 5, 7, 3, 10 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3844 { ISD::FMAXNUM, MVT::f64, { 3, 6, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
3845 { ISD::FMAXNUM, MVT::v2f64, { 3, 6, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3846 { ISD::FMAXNUM, MVT::v4f64, { 5, 7, 3, 10 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3847 { ISD::FSQRT, MVT::f32, { 21, 21, 1, 1 } }, // vsqrtss
3848 { ISD::FSQRT, MVT::v4f32, { 21, 21, 1, 1 } }, // vsqrtps
3849 { ISD::FSQRT, MVT::v8f32, { 42, 42, 1, 3 } }, // vsqrtps
3850 { ISD::FSQRT, MVT::f64, { 27, 27, 1, 1 } }, // vsqrtsd
3851 { ISD::FSQRT, MVT::v2f64, { 27, 27, 1, 1 } }, // vsqrtpd
3852 { ISD::FSQRT, MVT::v4f64, { 54, 54, 1, 3 } }, // vsqrtpd
3853 };
3854 static const CostKindTblEntry GFNICostTbl[] = {
3855 { ISD::BITREVERSE, MVT::i8, { 3, 3, 3, 4 } }, // gf2p8affineqb
3856 { ISD::BITREVERSE, MVT::i16, { 3, 3, 4, 6 } }, // gf2p8affineqb
3857 { ISD::BITREVERSE, MVT::i32, { 3, 3, 4, 5 } }, // gf2p8affineqb
3858 { ISD::BITREVERSE, MVT::i64, { 3, 3, 4, 6 } }, // gf2p8affineqb
3859 { ISD::BITREVERSE, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
3860 { ISD::BITREVERSE, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
3861 { ISD::BITREVERSE, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
3862 { ISD::BITREVERSE, MVT::v8i16, { 1, 8, 2, 4 } }, // gf2p8affineqb
3863 { ISD::BITREVERSE, MVT::v16i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
3864 { ISD::BITREVERSE, MVT::v32i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
3865 { ISD::BITREVERSE, MVT::v4i32, { 1, 8, 2, 4 } }, // gf2p8affineqb
3866 { ISD::BITREVERSE, MVT::v8i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
3867 { ISD::BITREVERSE, MVT::v16i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
3868 { ISD::BITREVERSE, MVT::v2i64, { 1, 8, 2, 4 } }, // gf2p8affineqb
3869 { ISD::BITREVERSE, MVT::v4i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
3870 { ISD::BITREVERSE, MVT::v8i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
3871 };
3872 static const CostKindTblEntry GLMCostTbl[] = {
3873 { ISD::FSQRT, MVT::f32, { 19, 20, 1, 1 } }, // sqrtss
3874 { ISD::FSQRT, MVT::v4f32, { 37, 41, 1, 5 } }, // sqrtps
3875 { ISD::FSQRT, MVT::f64, { 34, 35, 1, 1 } }, // sqrtsd
3876 { ISD::FSQRT, MVT::v2f64, { 67, 71, 1, 5 } }, // sqrtpd
3877 };
3878 static const CostKindTblEntry SLMCostTbl[] = {
3879 { ISD::BSWAP, MVT::v2i64, { 5, 5, 1, 5 } },
3880 { ISD::BSWAP, MVT::v4i32, { 5, 5, 1, 5 } },
3881 { ISD::BSWAP, MVT::v8i16, { 5, 5, 1, 5 } },
3882 { ISD::FSQRT, MVT::f32, { 20, 20, 1, 1 } }, // sqrtss
3883 { ISD::FSQRT, MVT::v4f32, { 40, 41, 1, 5 } }, // sqrtps
3884 { ISD::FSQRT, MVT::f64, { 35, 35, 1, 1 } }, // sqrtsd
3885 { ISD::FSQRT, MVT::v2f64, { 70, 71, 1, 5 } }, // sqrtpd
3886 };
3887 static const CostKindTblEntry SSE42CostTbl[] = {
3888 { ISD::USUBSAT, MVT::v4i32, { 2 } }, // pmaxud + psubd
3889 { ISD::UADDSAT, MVT::v4i32, { 3 } }, // not + pminud + paddd
3890 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } }, // MAXSS + CMPUNORDSS + BLENDVPS
3891 { ISD::FMAXNUM, MVT::v4f32, { 4, 4, 4, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3892 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } }, // MAXSD + CMPUNORDSD + BLENDVPD
3893 { ISD::FMAXNUM, MVT::v2f64, { 4, 4, 4, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3894 { ISD::FSQRT, MVT::f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
3895 { ISD::FSQRT, MVT::v4f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
3896 };
3897 static const CostKindTblEntry SSE41CostTbl[] = {
3898 { ISD::ABS, MVT::v2i64, { 3, 4, 3, 5 } }, // BLENDVPD(X,PSUBQ(0,X),X)
3899 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 3 } },
3900 { ISD::SMAX, MVT::v4i32, { 1, 1, 1, 1 } },
3901 { ISD::SMAX, MVT::v16i8, { 1, 1, 1, 1 } },
3902 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
3903 { ISD::SMIN, MVT::v4i32, { 1, 1, 1, 1 } },
3904 { ISD::SMIN, MVT::v16i8, { 1, 1, 1, 1 } },
3905 { ISD::UMAX, MVT::v2i64, { 2, 11, 6, 7 } },
3906 { ISD::UMAX, MVT::v4i32, { 1, 1, 1, 1 } },
3907 { ISD::UMAX, MVT::v8i16, { 1, 1, 1, 1 } },
3908 { ISD::UMIN, MVT::v2i64, { 2, 11, 6, 7 } },
3909 { ISD::UMIN, MVT::v4i32, { 1, 1, 1, 1 } },
3910 { ISD::UMIN, MVT::v8i16, { 1, 1, 1, 1 } },
3911 };
3912 static const CostKindTblEntry SSSE3CostTbl[] = {
3913 { ISD::ABS, MVT::v4i32, { 1, 2, 1, 1 } },
3914 { ISD::ABS, MVT::v8i16, { 1, 2, 1, 1 } },
3915 { ISD::ABS, MVT::v16i8, { 1, 2, 1, 1 } },
3916 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 11, 21 } },
3917 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 11, 21 } },
3918 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 11, 21 } },
3919 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 10, 16 } },
3920 { ISD::BSWAP, MVT::v2i64, { 2, 3, 1, 5 } },
3921 { ISD::BSWAP, MVT::v4i32, { 2, 3, 1, 5 } },
3922 { ISD::BSWAP, MVT::v8i16, { 2, 3, 1, 5 } },
3923 { ISD::CTLZ, MVT::v2i64, { 18, 28, 28, 35 } },
3924 { ISD::CTLZ, MVT::v4i32, { 15, 20, 22, 28 } },
3925 { ISD::CTLZ, MVT::v8i16, { 13, 17, 16, 22 } },
3926 { ISD::CTLZ, MVT::v16i8, { 11, 15, 10, 16 } },
3927 { ISD::CTPOP, MVT::v2i64, { 13, 19, 12, 18 } },
3928 { ISD::CTPOP, MVT::v4i32, { 18, 24, 16, 22 } },
3929 { ISD::CTPOP, MVT::v8i16, { 13, 18, 14, 20 } },
3930 { ISD::CTPOP, MVT::v16i8, { 11, 12, 10, 16 } },
3931 { ISD::CTTZ, MVT::v2i64, { 13, 25, 15, 22 } },
3932 { ISD::CTTZ, MVT::v4i32, { 18, 26, 19, 25 } },
3933 { ISD::CTTZ, MVT::v8i16, { 13, 20, 17, 23 } },
3934 { ISD::CTTZ, MVT::v16i8, { 11, 16, 13, 19 } }
3935 };
3936 static const CostKindTblEntry SSE2CostTbl[] = {
3937 { ISD::ABS, MVT::v2i64, { 3, 6, 5, 5 } },
3938 { ISD::ABS, MVT::v4i32, { 1, 4, 4, 4 } },
3939 { ISD::ABS, MVT::v8i16, { 1, 2, 3, 3 } },
3940 { ISD::ABS, MVT::v16i8, { 1, 2, 3, 3 } },
3941 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 32, 32 } },
3942 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 30, 30 } },
3943 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 25, 25 } },
3944 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 21, 21 } },
3945 { ISD::BSWAP, MVT::v2i64, { 5, 6, 11, 11 } },
3946 { ISD::BSWAP, MVT::v4i32, { 5, 5, 9, 9 } },
3947 { ISD::BSWAP, MVT::v8i16, { 5, 5, 4, 5 } },
3948 { ISD::CTLZ, MVT::v2i64, { 10, 45, 36, 38 } },
3949 { ISD::CTLZ, MVT::v4i32, { 10, 45, 38, 40 } },
3950 { ISD::CTLZ, MVT::v8i16, { 9, 38, 32, 34 } },
3951 { ISD::CTLZ, MVT::v16i8, { 8, 39, 29, 32 } },
3952 { ISD::CTPOP, MVT::v2i64, { 12, 26, 16, 18 } },
3953 { ISD::CTPOP, MVT::v4i32, { 15, 29, 21, 23 } },
3954 { ISD::CTPOP, MVT::v8i16, { 13, 25, 18, 20 } },
3955 { ISD::CTPOP, MVT::v16i8, { 10, 21, 14, 16 } },
3956 { ISD::CTTZ, MVT::v2i64, { 14, 28, 19, 21 } },
3957 { ISD::CTTZ, MVT::v4i32, { 18, 31, 24, 26 } },
3958 { ISD::CTTZ, MVT::v8i16, { 16, 27, 21, 23 } },
3959 { ISD::CTTZ, MVT::v16i8, { 13, 23, 17, 19 } },
3960 { ISD::SADDSAT, MVT::v8i16, { 1 } },
3961 { ISD::SADDSAT, MVT::v16i8, { 1 } },
3962 { ISD::SMAX, MVT::v2i64, { 4, 8, 15, 15 } },
3963 { ISD::SMAX, MVT::v4i32, { 2, 4, 5, 5 } },
3964 { ISD::SMAX, MVT::v8i16, { 1, 1, 1, 1 } },
3965 { ISD::SMAX, MVT::v16i8, { 2, 4, 5, 5 } },
3966 { ISD::SMIN, MVT::v2i64, { 4, 8, 15, 15 } },
3967 { ISD::SMIN, MVT::v4i32, { 2, 4, 5, 5 } },
3968 { ISD::SMIN, MVT::v8i16, { 1, 1, 1, 1 } },
3969 { ISD::SMIN, MVT::v16i8, { 2, 4, 5, 5 } },
3970 { ISD::SSUBSAT, MVT::v8i16, { 1 } },
3971 { ISD::SSUBSAT, MVT::v16i8, { 1 } },
3972 { ISD::UADDSAT, MVT::v8i16, { 1 } },
3973 { ISD::UADDSAT, MVT::v16i8, { 1 } },
3974 { ISD::UMAX, MVT::v2i64, { 4, 8, 15, 15 } },
3975 { ISD::UMAX, MVT::v4i32, { 2, 5, 8, 8 } },
3976 { ISD::UMAX, MVT::v8i16, { 1, 3, 3, 3 } },
3977 { ISD::UMAX, MVT::v16i8, { 1, 1, 1, 1 } },
3978 { ISD::UMIN, MVT::v2i64, { 4, 8, 15, 15 } },
3979 { ISD::UMIN, MVT::v4i32, { 2, 5, 8, 8 } },
3980 { ISD::UMIN, MVT::v8i16, { 1, 3, 3, 3 } },
3981 { ISD::UMIN, MVT::v16i8, { 1, 1, 1, 1 } },
3982 { ISD::USUBSAT, MVT::v8i16, { 1 } },
3983 { ISD::USUBSAT, MVT::v16i8, { 1 } },
3984 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } },
3985 { ISD::FMAXNUM, MVT::v2f64, { 4, 6, 6, 6 } },
3986 { ISD::FSQRT, MVT::f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
3987 { ISD::FSQRT, MVT::v2f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
3988 };
3989 static const CostKindTblEntry SSE1CostTbl[] = {
3990 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } },
3991 { ISD::FMAXNUM, MVT::v4f32, { 4, 6, 6, 6 } },
3992 { ISD::FSQRT, MVT::f32, { 28, 30, 1, 2 } }, // Pentium III from http://www.agner.org/
3993 { ISD::FSQRT, MVT::v4f32, { 56, 56, 1, 2 } }, // Pentium III from http://www.agner.org/
3994 };
3995 static const CostKindTblEntry BMI64CostTbl[] = { // 64-bit targets
3996 { ISD::CTTZ, MVT::i64, { 1 } },
3997 };
3998 static const CostKindTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
3999 { ISD::CTTZ, MVT::i32, { 1 } },
4000 { ISD::CTTZ, MVT::i16, { 1 } },
4001 { ISD::CTTZ, MVT::i8, { 1 } },
4002 };
4003 static const CostKindTblEntry LZCNT64CostTbl[] = { // 64-bit targets
4004 { ISD::CTLZ, MVT::i64, { 1 } },
4005 };
4006 static const CostKindTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
4007 { ISD::CTLZ, MVT::i32, { 1 } },
4008 { ISD::CTLZ, MVT::i16, { 2 } },
4009 { ISD::CTLZ, MVT::i8, { 2 } },
4010 };
4011 static const CostKindTblEntry POPCNT64CostTbl[] = { // 64-bit targets
4012 { ISD::CTPOP, MVT::i64, { 1, 1, 1, 1 } }, // popcnt
4013 };
4014 static const CostKindTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
4015 { ISD::CTPOP, MVT::i32, { 1, 1, 1, 1 } }, // popcnt
4016 { ISD::CTPOP, MVT::i16, { 1, 1, 2, 2 } }, // popcnt(zext())
4017 { ISD::CTPOP, MVT::i8, { 1, 1, 2, 2 } }, // popcnt(zext())
4018 };
4019 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
4020 { ISD::ABS, MVT::i64, { 1, 2, 3, 4 } }, // SUB+CMOV
4021 { ISD::BITREVERSE, MVT::i64, { 10, 12, 20, 22 } },
4022 { ISD::BSWAP, MVT::i64, { 1, 2, 1, 2 } },
4023 { ISD::CTLZ, MVT::i64, { 4 } }, // BSR+XOR or BSR+XOR+CMOV
4024 { ISD::CTLZ_ZERO_UNDEF, MVT::i64,{ 1, 1, 1, 1 } }, // BSR+XOR
4025 { ISD::CTTZ, MVT::i64, { 3 } }, // TEST+BSF+CMOV/BRANCH
4026 { ISD::CTTZ_ZERO_UNDEF, MVT::i64,{ 1, 1, 1, 1 } }, // BSR
4027 { ISD::CTPOP, MVT::i64, { 10, 6, 19, 19 } },
4028 { ISD::ROTL, MVT::i64, { 2, 3, 1, 3 } },
4029 { ISD::ROTR, MVT::i64, { 2, 3, 1, 3 } },
4030 { X86ISD::VROTLI, MVT::i64, { 1, 1, 1, 1 } },
4031 { ISD::FSHL, MVT::i64, { 4, 4, 1, 4 } },
4032 { ISD::SMAX, MVT::i64, { 1, 3, 2, 3 } },
4033 { ISD::SMIN, MVT::i64, { 1, 3, 2, 3 } },
4034 { ISD::UMAX, MVT::i64, { 1, 3, 2, 3 } },
4035 { ISD::UMIN, MVT::i64, { 1, 3, 2, 3 } },
4036 { ISD::SADDO, MVT::i64, { 1 } },
4037 { ISD::UADDO, MVT::i64, { 1 } },
4038 { ISD::UMULO, MVT::i64, { 2 } }, // mulq + seto
4039 };
4040 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
4041 { ISD::ABS, MVT::i32, { 1, 2, 3, 4 } }, // SUB+XOR+SRA or SUB+CMOV
4042 { ISD::ABS, MVT::i16, { 2, 2, 3, 4 } }, // SUB+XOR+SRA or SUB+CMOV
4043 { ISD::ABS, MVT::i8, { 2, 4, 4, 4 } }, // SUB+XOR+SRA
4044 { ISD::BITREVERSE, MVT::i32, { 9, 12, 17, 19 } },
4045 { ISD::BITREVERSE, MVT::i16, { 9, 12, 17, 19 } },
4046 { ISD::BITREVERSE, MVT::i8, { 7, 9, 13, 14 } },
4047 { ISD::BSWAP, MVT::i32, { 1, 1, 1, 1 } },
4048 { ISD::BSWAP, MVT::i16, { 1, 2, 1, 2 } }, // ROL
4049 { ISD::CTLZ, MVT::i32, { 4 } }, // BSR+XOR or BSR+XOR+CMOV
4050 { ISD::CTLZ, MVT::i16, { 4 } }, // BSR+XOR or BSR+XOR+CMOV
4051 { ISD::CTLZ, MVT::i8, { 4 } }, // BSR+XOR or BSR+XOR+CMOV
4052 { ISD::CTLZ_ZERO_UNDEF, MVT::i32,{ 1, 1, 1, 1 } }, // BSR+XOR
4053 { ISD::CTLZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 3, 3 } }, // BSR+XOR
4054 { ISD::CTLZ_ZERO_UNDEF, MVT::i8, { 2, 2, 3, 3 } }, // BSR+XOR
4055 { ISD::CTTZ, MVT::i32, { 3 } }, // TEST+BSF+CMOV/BRANCH
4056 { ISD::CTTZ, MVT::i16, { 3 } }, // TEST+BSF+CMOV/BRANCH
4057 { ISD::CTTZ, MVT::i8, { 3 } }, // TEST+BSF+CMOV/BRANCH
4058 { ISD::CTTZ_ZERO_UNDEF, MVT::i32,{ 1, 1, 1, 1 } }, // BSF
4059 { ISD::CTTZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 1, 1 } }, // BSF
4060 { ISD::CTTZ_ZERO_UNDEF, MVT::i8, { 2, 2, 1, 1 } }, // BSF
4061 { ISD::CTPOP, MVT::i32, { 8, 7, 15, 15 } },
4062 { ISD::CTPOP, MVT::i16, { 9, 8, 17, 17 } },
4063 { ISD::CTPOP, MVT::i8, { 7, 6, 6, 6 } },
4064 { ISD::ROTL, MVT::i32, { 2, 3, 1, 3 } },
4065 { ISD::ROTL, MVT::i16, { 2, 3, 1, 3 } },
4066 { ISD::ROTL, MVT::i8, { 2, 3, 1, 3 } },
4067 { ISD::ROTR, MVT::i32, { 2, 3, 1, 3 } },
4068 { ISD::ROTR, MVT::i16, { 2, 3, 1, 3 } },
4069 { ISD::ROTR, MVT::i8, { 2, 3, 1, 3 } },
4070 { X86ISD::VROTLI, MVT::i32, { 1, 1, 1, 1 } },
4071 { X86ISD::VROTLI, MVT::i16, { 1, 1, 1, 1 } },
4072 { X86ISD::VROTLI, MVT::i8, { 1, 1, 1, 1 } },
4073 { ISD::FSHL, MVT::i32, { 4, 4, 1, 4 } },
4074 { ISD::FSHL, MVT::i16, { 4, 4, 2, 5 } },
4075 { ISD::FSHL, MVT::i8, { 4, 4, 2, 5 } },
4076 { ISD::SMAX, MVT::i32, { 1, 2, 2, 3 } },
4077 { ISD::SMAX, MVT::i16, { 1, 4, 2, 4 } },
4078 { ISD::SMAX, MVT::i8, { 1, 4, 2, 4 } },
4079 { ISD::SMIN, MVT::i32, { 1, 2, 2, 3 } },
4080 { ISD::SMIN, MVT::i16, { 1, 4, 2, 4 } },
4081 { ISD::SMIN, MVT::i8, { 1, 4, 2, 4 } },
4082 { ISD::UMAX, MVT::i32, { 1, 2, 2, 3 } },
4083 { ISD::UMAX, MVT::i16, { 1, 4, 2, 4 } },
4084 { ISD::UMAX, MVT::i8, { 1, 4, 2, 4 } },
4085 { ISD::UMIN, MVT::i32, { 1, 2, 2, 3 } },
4086 { ISD::UMIN, MVT::i16, { 1, 4, 2, 4 } },
4087 { ISD::UMIN, MVT::i8, { 1, 4, 2, 4 } },
4088 { ISD::SADDO, MVT::i32, { 1 } },
4089 { ISD::SADDO, MVT::i16, { 1 } },
4090 { ISD::SADDO, MVT::i8, { 1 } },
4091 { ISD::UADDO, MVT::i32, { 1 } },
4092 { ISD::UADDO, MVT::i16, { 1 } },
4093 { ISD::UADDO, MVT::i8, { 1 } },
4094 { ISD::UMULO, MVT::i32, { 2 } }, // mul + seto
4095 { ISD::UMULO, MVT::i16, { 2 } },
4096 { ISD::UMULO, MVT::i8, { 2 } },
4097 };
4098
4099 Type *RetTy = ICA.getReturnType();
4100 Type *OpTy = RetTy;
4101 Intrinsic::ID IID = ICA.getID();
4102 unsigned ISD = ISD::DELETED_NODE;
4103 switch (IID) {
4104 default:
4105 break;
4106 case Intrinsic::abs:
4107 ISD = ISD::ABS;
4108 break;
4109 case Intrinsic::bitreverse:
4110 ISD = ISD::BITREVERSE;
4111 break;
4112 case Intrinsic::bswap:
4113 ISD = ISD::BSWAP;
4114 break;
4115 case Intrinsic::ctlz:
4116 ISD = ISD::CTLZ;
4117 break;
4118 case Intrinsic::ctpop:
4119 ISD = ISD::CTPOP;
4120 break;
4121 case Intrinsic::cttz:
4122 ISD = ISD::CTTZ;
4123 break;
4124 case Intrinsic::fshl:
4125 ISD = ISD::FSHL;
4126 if (!ICA.isTypeBasedOnly()) {
4127 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4128 if (Args[0] == Args[1]) {
4129 ISD = ISD::ROTL;
4130 // Handle uniform constant rotation amounts.
4131 // TODO: Handle funnel-shift cases.
4132 const APInt *Amt;
4133 if (Args[2] &&
4134 PatternMatch::match(Args[2], PatternMatch::m_APInt(Amt)))
4135 ISD = X86ISD::VROTLI;
4136 }
4137 }
4138 break;
4139 case Intrinsic::fshr:
4140 // FSHR has same costs so don't duplicate.
4141 ISD = ISD::FSHL;
4142 if (!ICA.isTypeBasedOnly()) {
4143 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4144 if (Args[0] == Args[1]) {
4145 ISD = ISD::ROTR;
4146 // Handle uniform constant rotation amount.
4147 // TODO: Handle funnel-shift cases.
4148 const APInt *Amt;
4149 if (Args[2] &&
4150 PatternMatch::match(Args[2], PatternMatch::m_APInt(Amt)))
4151 ISD = X86ISD::VROTLI;
4152 }
4153 }
4154 break;
4155 case Intrinsic::maxnum:
4156 case Intrinsic::minnum:
4157 // FMINNUM has same costs so don't duplicate.
4158 ISD = ISD::FMAXNUM;
4159 break;
4160 case Intrinsic::sadd_sat:
4161 ISD = ISD::SADDSAT;
4162 break;
4163 case Intrinsic::smax:
4164 ISD = ISD::SMAX;
4165 break;
4166 case Intrinsic::smin:
4167 ISD = ISD::SMIN;
4168 break;
4169 case Intrinsic::ssub_sat:
4170 ISD = ISD::SSUBSAT;
4171 break;
4172 case Intrinsic::uadd_sat:
4173 ISD = ISD::UADDSAT;
4174 break;
4175 case Intrinsic::umax:
4176 ISD = ISD::UMAX;
4177 break;
4178 case Intrinsic::umin:
4179 ISD = ISD::UMIN;
4180 break;
4181 case Intrinsic::usub_sat:
4182 ISD = ISD::USUBSAT;
4183 break;
4184 case Intrinsic::sqrt:
4185 ISD = ISD::FSQRT;
4186 break;
4187 case Intrinsic::sadd_with_overflow:
4188 case Intrinsic::ssub_with_overflow:
4189 // SSUBO has same costs so don't duplicate.
4190 ISD = ISD::SADDO;
4191 OpTy = RetTy->getContainedType(0);
4192 break;
4193 case Intrinsic::uadd_with_overflow:
4194 case Intrinsic::usub_with_overflow:
4195 // USUBO has same costs so don't duplicate.
4196 ISD = ISD::UADDO;
4197 OpTy = RetTy->getContainedType(0);
4198 break;
4199 case Intrinsic::umul_with_overflow:
4200 case Intrinsic::smul_with_overflow:
4201 // SMULO has same costs so don't duplicate.
4202 ISD = ISD::UMULO;
4203 OpTy = RetTy->getContainedType(0);
4204 break;
4205 }
4206
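// If the intrinsic maps onto a known ISD/X86ISD opcode, consult the per-ISA
// cost tables below; otherwise fall back to the generic implementation.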
4207 if (ISD != ISD::DELETED_NODE) {
4208 // Legalize the type.
4209 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(OpTy);
4210 MVT MTy = LT.second;
4211
4212 // Without BMI/LZCNT see if we're only looking for a *_ZERO_UNDEF cost.
4213 if (((ISD == ISD::CTTZ && !ST->hasBMI()) ||
4214 (ISD == ISD::CTLZ && !ST->hasLZCNT())) &&
4215 !MTy.isVector() && !ICA.isTypeBasedOnly()) {
4216 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4217 if (auto *Cst = dyn_cast<ConstantInt>(Args[1]))
4218 if (Cst->isAllOnesValue())
4219 ISD = ISD == ISD::CTTZ ? ISD::CTTZ_ZERO_UNDEF : ISD::CTLZ_ZERO_UNDEF;
4220 }
4221
4222 // FSQRT is a single instruction.
4223 if (ISD == ISD::FSQRT && CostKind == TTI::TCK_CodeSize)
4224 return LT.first;
4225
4226 auto adjustTableCost = [](int ISD, unsigned Cost,
4227 InstructionCost LegalizationCost,
4228 FastMathFlags FMF) {
4229 // If there are no NaNs to deal with, then these are reduced to a
4230 // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we
4231 // assume is used in the non-fast case.
4232 if (ISD == ISD::FMAXNUM || ISD == ISD::FMINNUM) {
4233 if (FMF.noNaNs())
4234 return LegalizationCost * 1;
4235 }
4236 return LegalizationCost * (int)Cost;
4237 };
4238
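// The tables are consulted from the most feature-specific ISA level down to
// the generic 32/64-bit tables; the first matching entry that defines a cost
// for this cost kind wins.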
4239 if (ST->useGLMDivSqrtCosts())
4240 if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
4241 if (auto KindCost = Entry->Cost[CostKind])
4242 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4243 ICA.getFlags());
4244
4245 if (ST->useSLMArithCosts())
4246 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
4247 if (auto KindCost = Entry->Cost[CostKind])
4248 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4249 ICA.getFlags());
4250
4251 if (ST->hasVBMI2())
4252 if (const auto *Entry = CostTableLookup(AVX512VBMI2CostTbl, ISD, MTy))
4253 if (auto KindCost = Entry->Cost[CostKind])
4254 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4255 ICA.getFlags());
4256
4257 if (ST->hasBITALG())
4258 if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy))
4259 if (auto KindCost = Entry->Cost[CostKind])
4260 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4261 ICA.getFlags());
4262
4263 if (ST->hasVPOPCNTDQ())
4264 if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy))
4265 if (auto KindCost = Entry->Cost[CostKind])
4266 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4267 ICA.getFlags());
4268
4269 if (ST->hasGFNI())
4270 if (const auto *Entry = CostTableLookup(GFNICostTbl, ISD, MTy))
4271 if (auto KindCost = Entry->Cost[CostKind])
4272 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4273 ICA.getFlags());
4274
4275 if (ST->hasCDI())
4276 if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
4277 if (auto KindCost = Entry->Cost[CostKind])
4278 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4279 ICA.getFlags());
4280
4281 if (ST->hasBWI())
4282 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
4283 if (auto KindCost = Entry->Cost[CostKind])
4284 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4285 ICA.getFlags());
4286
4287 if (ST->hasAVX512())
4288 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
4289 if (auto KindCost = Entry->Cost[CostKind])
4290 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4291 ICA.getFlags());
4292
4293 if (ST->hasXOP())
4294 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
4295 if (auto KindCost = Entry->Cost[CostKind])
4296 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4297 ICA.getFlags());
4298
4299 if (ST->hasAVX2())
4300 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
4301 if (auto KindCost = Entry->Cost[CostKind])
4302 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4303 ICA.getFlags());
4304
4305 if (ST->hasAVX())
4306 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
4307 if (auto KindCost = Entry->Cost[CostKind])
4308 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4309 ICA.getFlags());
4310
4311 if (ST->hasSSE42())
4312 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
4313 if (auto KindCost = Entry->Cost[CostKind])
4314 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4315 ICA.getFlags());
4316
4317 if (ST->hasSSE41())
4318 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
4319 if (auto KindCost = Entry->Cost[CostKind])
4320 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4321 ICA.getFlags());
4322
4323 if (ST->hasSSSE3())
4324 if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
4325 if (auto KindCost = Entry->Cost[CostKind])
4326 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4327 ICA.getFlags());
4328
4329 if (ST->hasSSE2())
4330 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
4331 if (auto KindCost = Entry->Cost[CostKind])
4332 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4333 ICA.getFlags());
4334
4335 if (ST->hasSSE1())
4336 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
4337 if (auto KindCost = Entry->Cost[CostKind])
4338 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4339 ICA.getFlags());
4340
4341 if (ST->hasBMI()) {
4342 if (ST->is64Bit())
4343 if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
4344 if (auto KindCost = Entry->Cost[CostKind])
4345 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4346 ICA.getFlags());
4347
4348 if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
4349 if (auto KindCost = Entry->Cost[CostKind])
4350 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4351 ICA.getFlags());
4352 }
4353
4354 if (ST->hasLZCNT()) {
4355 if (ST->is64Bit())
4356 if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
4357 if (auto KindCost = Entry->Cost[CostKind])
4358 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4359 ICA.getFlags());
4360
4361 if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
4362 if (auto KindCost = Entry->Cost[CostKind])
4363 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4364 ICA.getFlags());
4365 }
4366
4367 if (ST->hasPOPCNT()) {
4368 if (ST->is64Bit())
4369 if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
4370 if (auto KindCost = Entry->Cost[CostKind])
4371 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4372 ICA.getFlags());
4373
4374 if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
4375 if (auto KindCost = Entry->Cost[CostKind])
4376 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4377 ICA.getFlags());
4378 }
4379
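// With fast MOVBE, a BSWAP whose only use is a store (or whose operand is a
// single-use load) can fold into the memory access, so treat it as free.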
4380 if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) {
4381 if (const Instruction *II = ICA.getInst()) {
4382 if (II->hasOneUse() && isa<StoreInst>(II->user_back()))
4383 return TTI::TCC_Free;
4384 if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) {
4385 if (LI->hasOneUse())
4386 return TTI::TCC_Free;
4387 }
4388 }
4389 }
4390
4391 if (ST->is64Bit())
4392 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
4393 if (auto KindCost = Entry->Cost[CostKind])
4394 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4395 ICA.getFlags());
4396
4397 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
4398 if (auto KindCost = Entry->Cost[CostKind])
4399 return adjustTableCost(Entry->ISD, *KindCost, LT.first, ICA.getFlags());
4400 }
4401
4402 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
4403 }
4404
4405 InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
4406 TTI::TargetCostKind CostKind,
4407 unsigned Index, Value *Op0,
4408 Value *Op1) {
4409 static const CostTblEntry SLMCostTbl[] = {
4410 { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 },
4411 { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 },
4412 { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 },
4413 { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 7 }
4414 };
4415
4416 assert(Val->isVectorTy() && "This must be a vector type");
4417 Type *ScalarType = Val->getScalarType();
4418 InstructionCost RegisterFileMoveCost = 0;
4419
4420 // Non-immediate extraction/insertion can be handled as a sequence of
4421 // aliased loads+stores via the stack.
4422 if (Index == -1U && (Opcode == Instruction::ExtractElement ||
4423 Opcode == Instruction::InsertElement)) {
4424 // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns:
4425 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
4426
4427 // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling.
4428 assert(isa<FixedVectorType>(Val) && "Fixed vector type expected");
4429 Align VecAlign = DL.getPrefTypeAlign(Val);
4430 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
4431
4432 // Extract - store vector to stack, load scalar.
4433 if (Opcode == Instruction::ExtractElement) {
4434 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4435 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
4436 CostKind);
4437 }
4438 // Insert - store vector to stack, store scalar, load vector.
4439 if (Opcode == Instruction::InsertElement) {
4440 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4441 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
4442 CostKind) +
4443 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind);
4444 }
4445 }
4446
4447 if (Index != -1U && (Opcode == Instruction::ExtractElement ||
4448 Opcode == Instruction::InsertElement)) {
4449 // Extractions of vXi1 elements are now efficiently handled by MOVMSK.
4450 if (Opcode == Instruction::ExtractElement &&
4451 ScalarType->getScalarSizeInBits() == 1 &&
4452 cast<FixedVectorType>(Val)->getNumElements() > 1)
4453 return 1;
4454
4455 // Legalize the type.
4456 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
4457
4458 // This type is legalized to a scalar type.
4459 if (!LT.second.isVector())
4460 return 0;
4461
4462 // The type may be split. Normalize the index to the new type.
4463 unsigned SizeInBits = LT.second.getSizeInBits();
4464 unsigned NumElts = LT.second.getVectorNumElements();
4465 unsigned SubNumElts = NumElts;
4466 Index = Index % NumElts;
4467
4468 // For >128-bit vectors, we need to extract higher 128-bit subvectors.
4469 // For inserts, we also need to insert the subvector back.
4470 if (SizeInBits > 128) {
4471 assert((SizeInBits % 128) == 0 && "Illegal vector");
4472 unsigned NumSubVecs = SizeInBits / 128;
4473 SubNumElts = NumElts / NumSubVecs;
4474 if (SubNumElts <= Index) {
4475 RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
4476 Index %= SubNumElts;
4477 }
4478 }
4479
4480 MVT MScalarTy = LT.second.getScalarType();
4481 auto IsCheapPInsrPExtrInsertPS = [&]() {
4482 // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
4483 // Also, assume insertps is relatively cheap on all >= SSE41 targets.
4484 return (MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4485 (MScalarTy.isInteger() && ST->hasSSE41()) ||
4486 (MScalarTy == MVT::f32 && ST->hasSSE41() &&
4487 Opcode == Instruction::InsertElement);
4488 };
4489
4490 if (Index == 0) {
4491 // Floating point scalars are already located in index #0.
4492 // Many insertions to #0 can fold away for scalar fp-ops, so let's assume
4493 // true for all.
4494 if (ScalarType->isFloatingPointTy() &&
4495 (Opcode != Instruction::InsertElement || !Op0 ||
4496 isa<UndefValue>(Op0)))
4497 return RegisterFileMoveCost;
4498
4499 if (Opcode == Instruction::InsertElement &&
4500 isa_and_nonnull<UndefValue>(Op0)) {
4501 // Consider the gather cost to be cheap.
4502 if (isa_and_nonnull<LoadInst>(Op1))
4503 return RegisterFileMoveCost;
4504 if (!IsCheapPInsrPExtrInsertPS()) {
4505 // mov constant-to-GPR + movd/movq GPR -> XMM.
4506 if (isa_and_nonnull<Constant>(Op1) && Op1->getType()->isIntegerTy())
4507 return 2 + RegisterFileMoveCost;
4508 // Assume movd/movq GPR -> XMM is relatively cheap on all targets.
4509 return 1 + RegisterFileMoveCost;
4510 }
4511 }
4512
4513 // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
4514 if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
4515 return 1 + RegisterFileMoveCost;
4516 }
4517
4518 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4519 assert(ISD && "Unexpected vector opcode");
4520 if (ST->useSLMArithCosts())
4521 if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
4522 return Entry->Cost + RegisterFileMoveCost;
4523
4524 // Consider cheap cases.
4525 if (IsCheapPInsrPExtrInsertPS())
4526 return 1 + RegisterFileMoveCost;
4527
4528 // For extractions we just need to shuffle the element to index 0, which
4529 // should be very cheap (assume cost = 1). For insertions we need to shuffle
4530 // the elements to its destination. In both cases we must handle the
4531 // subvector move(s).
4532 // If the vector type is already less than 128-bits then don't reduce it.
4533 // TODO: Under what circumstances should we shuffle using the full width?
4534 InstructionCost ShuffleCost = 1;
4535 if (Opcode == Instruction::InsertElement) {
4536 auto *SubTy = cast<VectorType>(Val);
4537 EVT VT = TLI->getValueType(DL, Val);
4538 if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
4539 SubTy = FixedVectorType::get(ScalarType, SubNumElts);
4540 ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, std::nullopt,
4541 CostKind, 0, SubTy);
4542 }
4543 int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
4544 return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
4545 }
4546
4547 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1) +
4548 RegisterFileMoveCost;
4549}
4550
4551 InstructionCost
4552 X86TTIImpl::getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts,
4553 bool Insert, bool Extract,
4554 TTI::TargetCostKind CostKind) {
4555 assert(DemandedElts.getBitWidth() ==
4556 cast<FixedVectorType>(Ty)->getNumElements() &&
4557 "Vector size mismatch");
4558
4559 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4560 MVT MScalarTy = LT.second.getScalarType();
4561 unsigned LegalVectorBitWidth = LT.second.getSizeInBits();
4562 InstructionCost Cost = 0;
4563
4564 constexpr unsigned LaneBitWidth = 128;
4565 assert((LegalVectorBitWidth < LaneBitWidth ||
4566 (LegalVectorBitWidth % LaneBitWidth) == 0) &&
4567 "Illegal vector");
4568
4569 const int NumLegalVectors = *LT.first.getValue();
4570 assert(NumLegalVectors >= 0 && "Negative cost!");
4571
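// The insert and extract paths below cost wide (>128-bit) vectors one
// 128-bit lane at a time, adding subvector extract/insert shuffles where a
// lane has to be moved out of or back into the full-width register.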
4572 // For insertions, an ISD::BUILD_VECTOR style vector initialization can be much
4573 // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT.
4574 if (Insert) {
4575 if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4576 (MScalarTy.isInteger() && ST->hasSSE41()) ||
4577 (MScalarTy == MVT::f32 && ST->hasSSE41())) {
4578 // For types we can insert directly, insertion into 128-bit sub vectors is
4579 // cheap, followed by a cheap chain of concatenations.
4580 if (LegalVectorBitWidth <= LaneBitWidth) {
4581 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert,
4582 /*Extract*/ false, CostKind);
4583 } else {
4584 // In each 128-bit lane, if at least one index is demanded but not all
4585 // indices are demanded and this 128-bit lane is not the first 128-bit lane
4586 // of the legalized vector, then this lane needs an extracti128; if a
4587 // 128-bit lane has at least one demanded index, this lane also needs an
4588 // inserti128.
4589
4590 // The following cases will help you build a better understanding:
4591 // Assume we insert several elements into a v8i32 vector in AVX2:
4592 // Case#1: inserting into index 1 needs vpinsrd + inserti128.
4593 // Case#2: inserting into index 5 needs extracti128 + vpinsrd +
4594 // inserti128.
4595 // Case#3: inserting into indices 4,5,6,7 needs 4*vpinsrd + inserti128.
4596 assert((LegalVectorBitWidth % LaneBitWidth) == 0 && "Illegal vector");
4597 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
4598 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
4599 unsigned NumLegalElts =
4600 LT.second.getVectorNumElements() * NumLegalVectors;
4601 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
4602 "Vector has been legalized to smaller element count");
4603 assert((NumLegalElts % NumLanesTotal) == 0 &&
4604 "Unexpected elts per lane");
4605 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
4606
4607 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
4608 auto *LaneTy =
4609 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
4610
4611 for (unsigned I = 0; I != NumLanesTotal; ++I) {
4612 APInt LaneEltMask = WidenedDemandedElts.extractBits(
4613 NumEltsPerLane, NumEltsPerLane * I);
4614 if (LaneEltMask.isZero())
4615 continue;
4616 // FIXME: we don't need to extract if all non-demanded elements
4617 // are legalization-inserted padding.
4618 if (!LaneEltMask.isAllOnes())
4619 Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
4620 CostKind, I * NumEltsPerLane, LaneTy);
4621 Cost += BaseT::getScalarizationOverhead(LaneTy, LaneEltMask, Insert,
4622 /*Extract*/ false, CostKind);
4623 }
4624
4625 APInt AffectedLanes =
4626 APIntOps::ScaleBitMask(WidenedDemandedElts, NumLanesTotal);
4627 APInt FullyAffectedLegalVectors = APIntOps::ScaleBitMask(
4628 AffectedLanes, NumLegalVectors, /*MatchAllBits=*/true);
4629 for (int LegalVec = 0; LegalVec != NumLegalVectors; ++LegalVec) {
4630 for (unsigned Lane = 0; Lane != NumLegalLanes; ++Lane) {
4631 unsigned I = NumLegalLanes * LegalVec + Lane;
4632 // No need to insert unaffected lane; or lane 0 of each legal vector
4633 // iff ALL lanes of that vector were affected and will be inserted.
4634 if (!AffectedLanes[I] ||
4635 (Lane == 0 && FullyAffectedLegalVectors[LegalVec]))
4636 continue;
4637 Cost += getShuffleCost(TTI::SK_InsertSubvector, Ty, std::nullopt,
4638 CostKind, I * NumEltsPerLane, LaneTy);
4639 }
4640 }
4641 }
4642 } else if (LT.second.isVector()) {
4643 // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
4644 // integer element as a SCALAR_TO_VECTOR, then we build the vector as a
4645 // series of UNPCK followed by CONCAT_VECTORS - all of these can be
4646 // considered cheap.
4647 if (Ty->isIntOrIntVectorTy())
4648 Cost += DemandedElts.popcount();
4649
4650 // Get the smaller of the legalized or original pow2-extended number of
4651 // vector elements, which represents the number of unpacks we'll end up
4652 // performing.
4653 unsigned NumElts = LT.second.getVectorNumElements();
4654 unsigned Pow2Elts =
4655 PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements());
4656 Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
4657 }
4658 }
4659
4660 if (Extract) {
4661 // vXi1 can be efficiently extracted with MOVMSK.
4662 // TODO: AVX512 predicate mask handling.
4663 // NOTE: This doesn't work well for roundtrip scalarization.
4664 if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) {
4665 unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
4666 unsigned MaxElts = ST->hasAVX2() ? 32 : 16;
4667 unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts;
4668 return MOVMSKCost;
4669 }
4670
4671 if (LT.second.isVector()) {
4672 unsigned NumLegalElts =
4673 LT.second.getVectorNumElements() * NumLegalVectors;
4674 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
4675 "Vector has been legalized to smaller element count");
4676
4677 // If we're extracting elements from a 128-bit subvector lane,
4678 // we only need to extract each lane once, not for every element.
4679 if (LegalVectorBitWidth > LaneBitWidth) {
4680 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
4681 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
4682 assert((NumLegalElts % NumLanesTotal) == 0 &&
4683 "Unexpected elts per lane");
4684 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
4685
4686 // Add cost for each demanded 128-bit subvector extraction.
4687 // Luckily this is a lot easier than for insertion.
4688 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
4689 auto *LaneTy =
4690 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
4691
4692 for (unsigned I = 0; I != NumLanesTotal; ++I) {
4693 APInt LaneEltMask = WidenedDemandedElts.extractBits(
4694 NumEltsPerLane, I * NumEltsPerLane);
4695 if (LaneEltMask.isZero())
4696 continue;
4697 Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
4698 CostKind, I * NumEltsPerLane, LaneTy);
4699 Cost += BaseT::getScalarizationOverhead(
4700 LaneTy, LaneEltMask, /*Insert*/ false, Extract, CostKind);
4701 }
4702
4703 return Cost;
4704 }
4705 }
4706
4707 // Fallback to default extraction.
4708 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ false,
4709 Extract, CostKind);
4710 }
4711
4712 return Cost;
4713}
4714
4715 InstructionCost
4716 X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
4717 int VF, const APInt &DemandedDstElts,
4718 TTI::TargetCostKind CostKind) {
4719 const unsigned EltTyBits = DL.getTypeSizeInBits(EltTy);
4720 // We don't differentiate element types here, only element bit width.
4721 EltTy = IntegerType::getIntNTy(EltTy->getContext(), EltTyBits);
4722
4723 auto bailout = [&]() {
4724 return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF,
4725 DemandedDstElts, CostKind);
4726 };
4727
4728 // For now, only deal with AVX512 cases.
4729 if (!ST->hasAVX512())
4730 return bailout();
4731
4732 // Do we have a native shuffle for this element type, or should we promote?
4733 unsigned PromEltTyBits = EltTyBits;
4734 switch (EltTyBits) {
4735 case 32:
4736 case 64:
4737 break; // AVX512F.
4738 case 16:
4739 if (!ST->hasBWI())
4740 PromEltTyBits = 32; // promote to i32, AVX512F.
4741 break; // AVX512BW
4742 case 8:
4743 if (!ST->hasVBMI())
4744 PromEltTyBits = 32; // promote to i32, AVX512F.
4745 break; // AVX512VBMI
4746 case 1:
4747 // There is no support for shuffling i1 elements. We *must* promote.
4748 if (ST->hasBWI()) {
4749 if (ST->hasVBMI())
4750 PromEltTyBits = 8; // promote to i8, AVX512VBMI.
4751 else
4752 PromEltTyBits = 16; // promote to i16, AVX512BW.
4753 break;
4754 }
4755 PromEltTyBits = 32; // promote to i32, AVX512F.
4756 break;
4757 default:
4758 return bailout();
4759 }
4760 auto *PromEltTy = IntegerType::getIntNTy(EltTy->getContext(), PromEltTyBits);
4761
4762 auto *SrcVecTy = FixedVectorType::get(EltTy, VF);
4763 auto *PromSrcVecTy = FixedVectorType::get(PromEltTy, VF);
4764
4765 int NumDstElements = VF * ReplicationFactor;
4766 auto *PromDstVecTy = FixedVectorType::get(PromEltTy, NumDstElements);
4767 auto *DstVecTy = FixedVectorType::get(EltTy, NumDstElements);
4768
4769 // Legalize the types.
4770 MVT LegalSrcVecTy = getTypeLegalizationCost(SrcVecTy).second;
4771 MVT LegalPromSrcVecTy = getTypeLegalizationCost(PromSrcVecTy).second;
4772 MVT LegalPromDstVecTy = getTypeLegalizationCost(PromDstVecTy).second;
4773 MVT LegalDstVecTy = getTypeLegalizationCost(DstVecTy).second;
4774 // They should have legalized into vector types.
4775 if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() ||
4776 !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector())
4777 return bailout();
4778
4779 if (PromEltTyBits != EltTyBits) {
4780 // If we have to perform the shuffle with wider elt type than our data type,
4781 // then we will first need to anyext (we don't care about the new bits)
4782 // the source elements, and then truncate Dst elements.
4783 InstructionCost PromotionCost;
4784 PromotionCost += getCastInstrCost(
4785 Instruction::SExt, /*Dst=*/PromSrcVecTy, /*Src=*/SrcVecTy,
4786 TTI::CastContextHint::None, CostKind);
4787 PromotionCost +=
4788 getCastInstrCost(Instruction::Trunc, /*Dst=*/DstVecTy,
4789 /*Src=*/PromDstVecTy,
4790 TTI::CastContextHint::None, CostKind);
4791 return PromotionCost + getReplicationShuffleCost(PromEltTy,
4792 ReplicationFactor, VF,
4793 DemandedDstElts, CostKind);
4794 }
4795
4796 assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits &&
4797 LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() &&
4798 "We expect that the legalization doesn't affect the element width, "
4799 "doesn't coalesce/split elements.");
4800
4801 unsigned NumEltsPerDstVec = LegalDstVecTy.getVectorNumElements();
4802 unsigned NumDstVectors =
4803 divideCeil(DstVecTy->getNumElements(), NumEltsPerDstVec);
4804
4805 auto *SingleDstVecTy = FixedVectorType::get(EltTy, NumEltsPerDstVec);
4806
4807 // Not all the produced Dst elements may be demanded. In our case,
4808 // given that a single Dst vector is formed by a single shuffle,
4809 // if all elements that will form a single Dst vector aren't demanded,
4810 // then we won't need to do that shuffle, so adjust the cost accordingly.
4811 APInt DemandedDstVectors = APIntOps::ScaleBitMask(
4812 DemandedDstElts.zext(NumDstVectors * NumEltsPerDstVec), NumDstVectors);
4813 unsigned NumDstVectorsDemanded = DemandedDstVectors.popcount();
4814
4815 InstructionCost SingleShuffleCost = getShuffleCost(
4816 TTI::SK_PermuteSingleSrc, SingleDstVecTy, /*Mask=*/std::nullopt, CostKind,
4817 /*Index=*/0, /*SubTp=*/nullptr);
4818 return NumDstVectorsDemanded * SingleShuffleCost;
4819}
4820
4821 InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
4822 MaybeAlign Alignment,
4823 unsigned AddressSpace,
4824 TTI::TargetCostKind CostKind,
4825 TTI::OperandValueInfo OpInfo,
4826 const Instruction *I) {
4827 // TODO: Handle other cost kinds.
4828 if (CostKind != TTI::TCK_RecipThroughput) {
4829 if (auto *SI = dyn_cast_or_null<StoreInst>(I)) {
4830 // Store instruction with index and scale costs 2 Uops.
4831 // Check the preceding GEP to identify non-const indices.
4832 if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) {
4833 if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
4834 return TTI::TCC_Basic * 2;
4835 }
4836 }
4837 return TTI::TCC_Basic;
4838 }
4839
4840 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
4841 "Invalid Opcode");
4842 // Type legalization can't handle structs
4843 if (TLI->getValueType(DL, Src, true) == MVT::Other)
4844 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
4845 CostKind);
4846
4847 // Legalize the type.
4848 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
4849
4850 auto *VTy = dyn_cast<FixedVectorType>(Src);
4851
4852 InstructionCost Cost = 0;
4853
4854 // Add a cost for constant load to vector.
4855 if (Opcode == Instruction::Store && OpInfo.isConstant())
4856 Cost += getMemoryOpCost(Instruction::Load, Src, DL.getABITypeAlign(Src),
4857 /*AddressSpace=*/0, CostKind);
4858
4859 // Handle the simple case of non-vectors.
4860 // NOTE: this assumes that legalization never creates vector from scalars!
4861 if (!VTy || !LT.second.isVector()) {
4862 // Each load/store unit costs 1.
4863 return (LT.second.isFloatingPoint() ? Cost : 0) + LT.first * 1;
4864 }
4865
4866 bool IsLoad = Opcode == Instruction::Load;
4867
4868 Type *EltTy = VTy->getElementType();
4869
4870 const int EltTyBits = DL.getTypeSizeInBits(EltTy);
4871
4872 // Source of truth: how many elements were there in the original IR vector?
4873 const unsigned SrcNumElt = VTy->getNumElements();
4874
4875 // How far have we gotten?
4876 int NumEltRemaining = SrcNumElt;
4877 // Note that we intentionally capture by-reference, NumEltRemaining changes.
4878 auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; };
4879
4880 const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8);
4881
4882 // Note that even if we can store 64 bits of an XMM, we still operate on XMM.
4883 const unsigned XMMBits = 128;
4884 if (XMMBits % EltTyBits != 0)
4885 // Vector size must be a multiple of the element size. I.e. no padding.
4886 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
4887 CostKind);
4888 const int NumEltPerXMM = XMMBits / EltTyBits;
4889
4890 auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM);
4891
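// Model the legalized access by repeatedly issuing the widest remaining op
// size and halving it until all source elements are covered, charging extra
// for subvector insert/extract and for slow sub-32-bit or unaligned 32-byte
// accesses.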
4892 for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0;
4893 NumEltRemaining > 0; CurrOpSizeBytes /= 2) {
4894 // How many elements would a single op deal with at once?
4895 if ((8 * CurrOpSizeBytes) % EltTyBits != 0)
4896 // Vector size must be a multiple of the element size. I.e. no padding.
4897 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
4898 CostKind);
4899 int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits;
4900
4901 assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?");
4902 assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||
4903 (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&
4904 "Unless we haven't halved the op size yet, "
4905 "we have less than two op's sized units of work left.");
4906
4907 auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM
4908 ? FixedVectorType::get(EltTy, CurrNumEltPerOp)
4909 : XMMVecTy;
4910
4911 assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&
4912 "After halving sizes, the vector elt count is no longer a multiple "
4913 "of number of elements per operation?");
4914 auto *CoalescedVecTy =
4915 CurrNumEltPerOp == 1
4916 ? CurrVecTy
4917 : FixedVectorType::get(
4918 IntegerType::get(Src->getContext(),
4919 EltTyBits * CurrNumEltPerOp),
4920 CurrVecTy->getNumElements() / CurrNumEltPerOp);
4921 assert(DL.getTypeSizeInBits(CoalescedVecTy) ==
4922 DL.getTypeSizeInBits(CurrVecTy) &&
4923 "coalescing elements doesn't change vector width.");
4924
4925 while (NumEltRemaining > 0) {
4926 assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumption?");
4927
4928 // Can we use this vector size, as per the remaining element count?
4929 // Iff the vector is naturally aligned, we can do a wide load regardless.
4930 if (NumEltRemaining < CurrNumEltPerOp &&
4931 (!IsLoad || Alignment.valueOrOne() < CurrOpSizeBytes) &&
4932 CurrOpSizeBytes != 1)
4933 break; // Try a smaller vector size.
4934
4935 bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;
4936
4937 // If we have fully processed the previous reg, we need to replenish it.
4938 if (SubVecEltsLeft == 0) {
4939 SubVecEltsLeft += CurrVecTy->getNumElements();
4940 // And that's free only for the 0'th subvector of a legalized vector.
4941 if (!Is0thSubVec)
4942 Cost += getShuffleCost(IsLoad ? TTI::ShuffleKind::SK_InsertSubvector
4943 : TTI::ShuffleKind::SK_ExtractSubvector,
4944 VTy, std::nullopt, CostKind, NumEltDone(),
4945 CurrVecTy);
4946 }
4947
4948 // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
4949 // for smaller widths (32/16/8) we have to insert/extract them separately.
4950 // Again, it's free for the 0'th subreg (if op is 32/64 bit wide,
4951 // but let's pretend that it is also true for 16/8 bit wide ops...)
4952 if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) {
4953 int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM;
4954 assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "");
4955 int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp;
4956 APInt DemandedElts =
4957 APInt::getBitsSet(CoalescedVecTy->getNumElements(),
4958 CoalescedVecEltIdx, CoalescedVecEltIdx + 1);
4959 assert(DemandedElts.popcount() == 1 && "Inserting single value");
4960 Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad,
4961 !IsLoad, CostKind);
4962 }
4963
4964 // This isn't exactly right. We're using slow unaligned 32-byte accesses
4965 // as a proxy for a double-pumped AVX memory interface such as on
4966 // Sandybridge.
4967 // Sub-32-bit loads/stores will be slower either with PINSR*/PEXTR* or
4968 // will be scalarized.
4969 if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
4970 Cost += 2;
4971 else if (CurrOpSizeBytes < 4)
4972 Cost += 2;
4973 else
4974 Cost += 1;
4975
4976 SubVecEltsLeft -= CurrNumEltPerOp;
4977 NumEltRemaining -= CurrNumEltPerOp;
4978 Alignment = commonAlignment(Alignment.valueOrOne(), CurrOpSizeBytes);
4979 }
4980 }
4981
4982 assert(NumEltRemaining <= 0 && "Should have processed all the elements.");
4983
4984 return Cost;
4985}
4986
4987 InstructionCost
4988 X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment,
4989 unsigned AddressSpace,
4990 TTI::TargetCostKind CostKind) {
4991 bool IsLoad = (Instruction::Load == Opcode);
4992 bool IsStore = (Instruction::Store == Opcode);
4993
4994 auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy);
4995 if (!SrcVTy)
4996 // For scalar types, take the regular cost without the mask.
4997 return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind);
4998
4999 unsigned NumElem = SrcVTy->getNumElements();
5000 auto *MaskTy =
5001 FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
5002 if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) ||
5003 (IsStore && !isLegalMaskedStore(SrcVTy, Alignment))) {
5004 // Scalarization
5005 APInt DemandedElts = APInt::getAllOnes(NumElem);
5006 InstructionCost MaskSplitCost = getScalarizationOverhead(
5007 MaskTy, DemandedElts, /*Insert*/ false, /*Extract*/ true, CostKind);
5008 InstructionCost ScalarCompareCost = getCmpSelInstrCost(
5009 Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
5010 CmpInst::BAD_ICMP_PREDICATE, CostKind);
5011 InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
5012 InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
5013 InstructionCost ValueSplitCost = getScalarizationOverhead(
5014 SrcVTy, DemandedElts, IsLoad, IsStore, CostKind);
5015 InstructionCost MemopCost =
5016 NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
5017 Alignment, AddressSpace, CostKind);
5018 return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
5019 }
5020
5021 // Legalize the type.
5022 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcVTy);
5023 auto VT = TLI->getValueType(DL, SrcVTy);
5024 InstructionCost Cost = 0;
5025 if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
5026 LT.second.getVectorNumElements() == NumElem)
5027 // Promotion requires extend/truncate for data and a shuffle for mask.
5028 Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, std::nullopt,
5029 CostKind, 0, nullptr) +
5030 getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, std::nullopt,
5031 CostKind, 0, nullptr);
5032
5033 else if (LT.first * LT.second.getVectorNumElements() > NumElem) {
5034 auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
5035 LT.second.getVectorNumElements());
5036 // Expanding requires filling the mask with zeroes
5037 Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, std::nullopt,
5038 CostKind, 0, MaskTy);
5039 }
5040
5041 // Pre-AVX512 - each maskmov load costs 2 + store costs ~8.
5042 if (!ST->hasAVX512())
5043 return Cost + LT.first * (IsLoad ? 2 : 8);
5044
5045 // AVX-512 masked load/store is cheaper
5046 return Cost + LT.first;
5047}
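// Illustrative example (not from the upstream sources): querying the masked
// load pricing above, with `TTI` and `Ctx` assumed to exist in the caller.
//   auto *VTy = FixedVectorType::get(Type::getFloatTy(Ctx), 8);
//   InstructionCost C = TTI.getMaskedMemoryOpCost(Instruction::Load, VTy,
//                                                 Align(4), /*AddressSpace=*/0,
//                                                 TTI::TCK_RecipThroughput);
// v8f32 masked loads are legal from AVX onwards, so no scalarization applies;
// LT.first is 1 and the pre-AVX512 branch returns roughly 2 (one maskmov),
// while an AVX-512 subtarget returns roughly 1.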
5048
5049 InstructionCost
5050 X86TTIImpl::getPointersChainCost(ArrayRef<const Value *> Ptrs,
5051 const Value *Base,
5052 const TTI::PointersChainInfo &Info,
5053 Type *AccessTy, TTI::TargetCostKind CostKind) {
5054 if (Info.isSameBase() && Info.isKnownStride()) {
5055 // If all the pointers have known stride all the differences are translated
5056 // into constants. X86 memory addressing allows encoding it into
5057 // displacement. So we just need to take the base GEP cost.
5058 if (const auto *BaseGEP = dyn_cast<GetElementPtrInst>(Base)) {
5059 SmallVector<const Value *> Indices(BaseGEP->indices());
5060 return getGEPCost(BaseGEP->getSourceElementType(),
5061 BaseGEP->getPointerOperand(), Indices, nullptr,
5062 CostKind);
5063 }
5064 return TTI::TCC_Free;
5065 }
5066 return BaseT::getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind);
5067}
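// Illustrative example (not from the upstream sources): the kind of pointer
// chain the shortcut above is meant to model. All pointers share one base and
// differ by compile-time-known strides, so the differences fold into x86
// [base + disp] addressing and only the base GEP is charged:
//   %p0 = getelementptr inbounds i32, ptr %base, i64 %i
//   %p1 = getelementptr inbounds i32, ptr %base, i64 %i.plus.1   ; %i + 1
//   %p2 = getelementptr inbounds i32, ptr %base, i64 %i.plus.2   ; %i + 2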
5068
5069 InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty,
5070 ScalarEvolution *SE,
5071 const SCEV *Ptr) {
5072 // Address computations in vectorized code with non-consecutive addresses will
5073 // likely result in more instructions compared to scalar code where the
5074 // computation can more often be merged into the index mode. The resulting
5075 // extra micro-ops can significantly decrease throughput.
5076 const unsigned NumVectorInstToHideOverhead = 10;
5077
5078 // Cost modeling of Strided Access Computation is hidden by the indexing
5079 // modes of X86 regardless of the stride value. We don't believe that there
5080 // is a difference between constant strided access in general and a constant
5081 // stride value which is less than or equal to 64.
5082 // Even in the case of (loop invariant) stride whose value is not known at
5083 // compile time, the address computation will not incur more than one extra
5084 // ADD instruction.
5085 if (Ty->isVectorTy() && SE && !ST->hasAVX2()) {
5086 // TODO: AVX2 is the current cut-off because we don't have correct
5087 // interleaving costs for prior ISA's.
5088 if (!BaseT::isStridedAccess(Ptr))
5089 return NumVectorInstToHideOverhead;
5090 if (!BaseT::getConstantStrideStep(SE, Ptr))
5091 return 1;
5092 }
5093
5094 return BaseT::getAddressComputationCost(Ty, SE, Ptr);
5095}
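// Illustrative note (not from the upstream sources): the practical effect of
// the hook above is that, pre-AVX2, a vectorized loop whose addresses are not
// a simple strided access (e.g. the indirect access in `sum += a[b[i]]`) is
// charged NumVectorInstToHideOverhead (10) per address computation, which
// strongly discourages non-consecutive vector addressing on those subtargets.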
5096
5097 InstructionCost
5098 X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
5099 std::optional<FastMathFlags> FMF,
5100 TTI::TargetCostKind CostKind) {
5101 if (TTI::requiresOrderedReduction(FMF))
5102 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5103
5104 // We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
5105 // and use it as the cost.
5106
5107 static const CostTblEntry SLMCostTbl[] = {
5108 { ISD::FADD, MVT::v2f64, 3 },
5109 { ISD::ADD, MVT::v2i64, 5 },
5110 };
5111
5112 static const CostTblEntry SSE2CostTbl[] = {
5113 { ISD::FADD, MVT::v2f64, 2 },
5114 { ISD::FADD, MVT::v2f32, 2 },
5115 { ISD::FADD, MVT::v4f32, 4 },
5116 { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
5117 { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
5118 { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
5119 { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3".
5120 { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3".
5121 { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
5122 { ISD::ADD, MVT::v2i8, 2 },
5123 { ISD::ADD, MVT::v4i8, 2 },
5124 { ISD::ADD, MVT::v8i8, 2 },
5125 { ISD::ADD, MVT::v16i8, 3 },
5126 };
5127
5128 static const CostTblEntry AVX1CostTbl[] = {
5129 { ISD::FADD, MVT::v4f64, 3 },
5130 { ISD::FADD, MVT::v4f32, 3 },
5131 { ISD::FADD, MVT::v8f32, 4 },
5132 { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
5133 { ISD::ADD, MVT::v4i64, 3 },
5134 { ISD::ADD, MVT::v8i32, 5 },
5135 { ISD::ADD, MVT::v16i16, 5 },
5136 { ISD::ADD, MVT::v32i8, 4 },
5137 };
5138
5139 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5140 assert(ISD && "Invalid opcode");
5141
5142 // Before legalizing the type, give a chance to look up illegal narrow types
5143 // in the table.
5144 // FIXME: Is there a better way to do this?
5145 EVT VT = TLI->getValueType(DL, ValTy);
5146 if (VT.isSimple()) {
5147 MVT MTy = VT.getSimpleVT();
5148 if (ST->useSLMArithCosts())
5149 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5150 return Entry->Cost;
5151
5152 if (ST->hasAVX())
5153 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5154 return Entry->Cost;
5155
5156 if (ST->hasSSE2())
5157 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5158 return Entry->Cost;
5159 }
5160
5161 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5162
5163 MVT MTy = LT.second;
5164
5165 auto *ValVTy = cast<FixedVectorType>(ValTy);
5166
5167 // Special case: vXi8 mul reductions are performed as vXi16.
5168 if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) {
5169 auto *WideSclTy = IntegerType::get(ValVTy->getContext(), 16);
5170 auto *WideVecTy = FixedVectorType::get(WideSclTy, ValVTy->getNumElements());
5171 return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy,
5172 TTI::CastContextHint::None,
5173 CostKind) +
5174 getArithmeticReductionCost(Opcode, WideVecTy, FMF, CostKind);
5175 }
5176
5177 InstructionCost ArithmeticCost = 0;
5178 if (LT.first != 1 && MTy.isVector() &&
5179 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5180 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5181 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5182 MTy.getVectorNumElements());
5183 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5184 ArithmeticCost *= LT.first - 1;
5185 }
5186
5187 if (ST->useSLMArithCosts())
5188 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5189 return ArithmeticCost + Entry->Cost;
5190
5191 if (ST->hasAVX())
5192 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5193 return ArithmeticCost + Entry->Cost;
5194
5195 if (ST->hasSSE2())
5196 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5197 return ArithmeticCost + Entry->Cost;
5198
5199 // FIXME: These assume a naive kshift+binop lowering, which is probably
5200 // conservative in most cases.
5201 static const CostTblEntry AVX512BoolReduction[] = {
5202 { ISD::AND, MVT::v2i1, 3 },
5203 { ISD::AND, MVT::v4i1, 5 },
5204 { ISD::AND, MVT::v8i1, 7 },
5205 { ISD::AND, MVT::v16i1, 9 },
5206 { ISD::AND, MVT::v32i1, 11 },
5207 { ISD::AND, MVT::v64i1, 13 },
5208 { ISD::OR, MVT::v2i1, 3 },
5209 { ISD::OR, MVT::v4i1, 5 },
5210 { ISD::OR, MVT::v8i1, 7 },
5211 { ISD::OR, MVT::v16i1, 9 },
5212 { ISD::OR, MVT::v32i1, 11 },
5213 { ISD::OR, MVT::v64i1, 13 },
5214 };
5215
5216 static const CostTblEntry AVX2BoolReduction[] = {
5217 { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp
5218 { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp
5219 { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp
5220 { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp
5221 };
5222
5223 static const CostTblEntry AVX1BoolReduction[] = {
5224 { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp
5225 { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp
5226 { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5227 { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5228 { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp
5229 { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp
5230 { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5231 { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5232 };
5233
5234 static const CostTblEntry SSE2BoolReduction[] = {
5235 { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp
5236 { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp
5237 { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp
5238 { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp
5239 { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp
5240 { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp
5241 { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp
5242 { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp
5243 };
5244
5245 // Handle bool allof/anyof patterns.
5246 if (ValVTy->getElementType()->isIntegerTy(1)) {
5247 InstructionCost ArithmeticCost = 0;
5248 if (LT.first != 1 && MTy.isVector() &&
5249 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5250 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5251 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5252 MTy.getVectorNumElements());
5253 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5254 ArithmeticCost *= LT.first - 1;
5255 }
5256
5257 if (ST->hasAVX512())
5258 if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy))
5259 return ArithmeticCost + Entry->Cost;
5260 if (ST->hasAVX2())
5261 if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
5262 return ArithmeticCost + Entry->Cost;
5263 if (ST->hasAVX())
5264 if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
5265 return ArithmeticCost + Entry->Cost;
5266 if (ST->hasSSE2())
5267 if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
5268 return ArithmeticCost + Entry->Cost;
5269
5270 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5271 }
5272
5273 unsigned NumVecElts = ValVTy->getNumElements();
5274 unsigned ScalarSize = ValVTy->getScalarSizeInBits();
5275
5276 // Special case power of 2 reductions where the scalar type isn't changed
5277 // by type legalization.
5278 if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
5279 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5280
5281 InstructionCost ReductionCost = 0;
5282
5283 auto *Ty = ValVTy;
5284 if (LT.first != 1 && MTy.isVector() &&
5285 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5286 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5287 Ty = FixedVectorType::get(ValVTy->getElementType(),
5288 MTy.getVectorNumElements());
5289 ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5290 ReductionCost *= LT.first - 1;
5291 NumVecElts = MTy.getVectorNumElements();
5292 }
5293
5294 // Now handle reduction with the legal type, taking into account size changes
5295 // at each level.
5296 while (NumVecElts > 1) {
5297 // Determine the size of the remaining vector we need to reduce.
5298 unsigned Size = NumVecElts * ScalarSize;
5299 NumVecElts /= 2;
5300 // If we're reducing from 256/512 bits, use an extract_subvector.
5301 if (Size > 128) {
5302 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5303 ReductionCost +=
5304 getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt, CostKind,
5305 NumVecElts, SubTy);
5306 Ty = SubTy;
5307 } else if (Size == 128) {
5308 // Reducing from 128 bits is a permute of v2f64/v2i64.
5309 FixedVectorType *ShufTy;
5310 if (ValVTy->isFloatingPointTy())
5311 ShufTy =
5312 FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2);
5313 else
5314 ShufTy =
5315 FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
5316 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5317 std::nullopt, CostKind, 0, nullptr);
5318 } else if (Size == 64) {
5319 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5320 FixedVectorType *ShufTy;
5321 if (ValVTy->isFloatingPointTy())
5322 ShufTy =
5323 FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4);
5324 else
5325 ShufTy =
5326 FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
5327 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5328 std::nullopt, CostKind, 0, nullptr);
5329 } else {
5330 // Reducing from smaller size is a shift by immediate.
5331 auto *ShiftTy = FixedVectorType::get(
5332 Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size);
5333 ReductionCost += getArithmeticInstrCost(
5334 Instruction::LShr, ShiftTy, CostKind,
5335 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
5336 {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None});
5337 }
5338
5339 // Add the arithmetic op for this level.
5340 ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind);
5341 }
5342
5343 // Add the final extract element to the cost.
5344 return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5345 CostKind, 0, nullptr, nullptr);
5346}
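// Illustrative worked example (not from the upstream sources): an 'add'
// reduction of <8 x i32> on an AVX2 target (legal type v8i32, ScalarSize 32)
// walks the halving loop above as:
//   256 -> 128 bits : extract_subvector (to v4i32) + add
//   128 ->  64 bits : permute of v2i64             + add
//    64 ->  32 bits : shuffle of v4i32             + add
// plus the final extractelement of lane 0; the individual shuffle/add costs
// come from the per-subtarget tables used by the helpers.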
5347
5348 InstructionCost X86TTIImpl::getMinMaxCost(Intrinsic::ID IID, Type *Ty,
5349 TTI::TargetCostKind CostKind,
5350 FastMathFlags FMF) {
5351 IntrinsicCostAttributes ICA(IID, Ty, {Ty, Ty}, FMF);
5352 return getIntrinsicInstrCost(ICA, CostKind);
5353}
5354
5355 InstructionCost
5356 X86TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *ValTy,
5357 FastMathFlags FMF,
5358 TTI::TargetCostKind CostKind) {
5359 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5360
5361 MVT MTy = LT.second;
5362
5363 int ISD;
5364 if (ValTy->isIntOrIntVectorTy()) {
5365 ISD = (IID == Intrinsic::umin || IID == Intrinsic::umax) ? ISD::UMIN
5366 : ISD::SMIN;
5367 } else {
5368 assert(ValTy->isFPOrFPVectorTy() &&
5369 "Expected float point or integer vector type.");
5370 ISD = (IID == Intrinsic::minnum || IID == Intrinsic::maxnum)
5371 ? ISD::FMINNUM
5372 : ISD::FMINIMUM;
5373 }
5374
5375 // We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
5376 // and use it as the cost.
5377
5378 static const CostTblEntry SSE2CostTbl[] = {
5379 {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw
5380 {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw
5381 {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw
5382 };
5383
5384 static const CostTblEntry SSE41CostTbl[] = {
5385 {ISD::SMIN, MVT::v2i16, 3}, // same as sse2
5386 {ISD::SMIN, MVT::v4i16, 5}, // same as sse2
5387 {ISD::UMIN, MVT::v2i16, 5}, // same as sse2
5388 {ISD::UMIN, MVT::v4i16, 7}, // same as sse2
5389 {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor
5390 {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax
5391 {ISD::SMIN, MVT::v2i8, 3}, // pminsb
5392 {ISD::SMIN, MVT::v4i8, 5}, // pminsb
5393 {ISD::SMIN, MVT::v8i8, 7}, // pminsb
5394 {ISD::SMIN, MVT::v16i8, 6},
5395 {ISD::UMIN, MVT::v2i8, 3}, // same as sse2
5396 {ISD::UMIN, MVT::v4i8, 5}, // same as sse2
5397 {ISD::UMIN, MVT::v8i8, 7}, // same as sse2
5398 {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax
5399 };
5400
5401 static const CostTblEntry AVX1CostTbl[] = {
5402 {ISD::SMIN, MVT::v16i16, 6},
5403 {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax
5404 {ISD::SMIN, MVT::v32i8, 8},
5405 {ISD::UMIN, MVT::v32i8, 8},
5406 };
5407
5408 static const CostTblEntry AVX512BWCostTbl[] = {
5409 {ISD::SMIN, MVT::v32i16, 8},
5410 {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax
5411 {ISD::SMIN, MVT::v64i8, 10},
5412 {ISD::UMIN, MVT::v64i8, 10},
5413 };
5414
5415 // Before legalizing the type, give a chance to look up illegal narrow types
5416 // in the table.
5417 // FIXME: Is there a better way to do this?
5418 EVT VT = TLI->getValueType(DL, ValTy);
5419 if (VT.isSimple()) {
5420 MVT MTy = VT.getSimpleVT();
5421 if (ST->hasBWI())
5422 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5423 return Entry->Cost;
5424
5425 if (ST->hasAVX())
5426 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5427 return Entry->Cost;
5428
5429 if (ST->hasSSE41())
5430 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5431 return Entry->Cost;
5432
5433 if (ST->hasSSE2())
5434 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5435 return Entry->Cost;
5436 }
5437
5438 auto *ValVTy = cast<FixedVectorType>(ValTy);
5439 unsigned NumVecElts = ValVTy->getNumElements();
5440
5441 auto *Ty = ValVTy;
5442 InstructionCost MinMaxCost = 0;
5443 if (LT.first != 1 && MTy.isVector() &&
5444 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5445 // Type needs to be split. We need LT.first - 1 operations.
5446 Ty = FixedVectorType::get(ValVTy->getElementType(),
5447 MTy.getVectorNumElements());
5448 MinMaxCost = getMinMaxCost(IID, Ty, CostKind, FMF);
5449 MinMaxCost *= LT.first - 1;
5450 NumVecElts = MTy.getVectorNumElements();
5451 }
5452
5453 if (ST->hasBWI())
5454 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5455 return MinMaxCost + Entry->Cost;
5456
5457 if (ST->hasAVX())
5458 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5459 return MinMaxCost + Entry->Cost;
5460
5461 if (ST->hasSSE41())
5462 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5463 return MinMaxCost + Entry->Cost;
5464
5465 if (ST->hasSSE2())
5466 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5467 return MinMaxCost + Entry->Cost;
5468
5469 unsigned ScalarSize = ValTy->getScalarSizeInBits();
5470
5471 // Special case power of 2 reductions where the scalar type isn't changed
5472 // by type legalization.
5473 if (!isPowerOf2_32(ValVTy->getNumElements()) ||
5474 ScalarSize != MTy.getScalarSizeInBits())
5475 return BaseT::getMinMaxReductionCost(IID, ValTy, FMF, CostKind);
5476
5477 // Now handle reduction with the legal type, taking into account size changes
5478 // at each level.
5479 while (NumVecElts > 1) {
5480 // Determine the size of the remaining vector we need to reduce.
5481 unsigned Size = NumVecElts * ScalarSize;
5482 NumVecElts /= 2;
5483 // If we're reducing from 256/512 bits, use an extract_subvector.
5484 if (Size > 128) {
5485 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5486 MinMaxCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
5487 CostKind, NumVecElts, SubTy);
5488 Ty = SubTy;
5489 } else if (Size == 128) {
5490 // Reducing from 128 bits is a permute of v2f64/v2i64.
5491 VectorType *ShufTy;
5492 if (ValTy->isFloatingPointTy())
5493 ShufTy =
5494 FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2);
5495 else
5496 ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
5497 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5498 std::nullopt, CostKind, 0, nullptr);
5499 } else if (Size == 64) {
5500 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5501 FixedVectorType *ShufTy;
5502 if (ValTy->isFloatingPointTy())
5503 ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4);
5504 else
5505 ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
5506 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5507 std::nullopt, CostKind, 0, nullptr);
5508 } else {
5509 // Reducing from smaller size is a shift by immediate.
5510 auto *ShiftTy = FixedVectorType::get(
5511 Type::getIntNTy(ValTy->getContext(), Size), 128 / Size);
5512 MinMaxCost += getArithmeticInstrCost(
5513 Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput,
5514 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
5515 {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None});
5516 }
5517
5518 // Add the arithmetic op for this level.
5519 MinMaxCost += getMinMaxCost(IID, Ty, CostKind, FMF);
5520 }
5521
5522 // Add the final extract element to the cost.
5523 return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5524 CostKind, 0, nullptr, nullptr);
5525}
5526
5527/// Calculate the cost of materializing a 64-bit value. This helper
5528/// method might only calculate a fraction of a larger immediate. Therefore it
5529/// is valid to return a cost of ZERO.
5530 InstructionCost X86TTIImpl::getIntImmCost(int64_t Val) {
5531 if (Val == 0)
5532 return TTI::TCC_Free;
5533
5534 if (isInt<32>(Val))
5535 return TTI::TCC_Basic;
5536
5537 return 2 * TTI::TCC_Basic;
5538}
5539
5540 InstructionCost X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
5541 TTI::TargetCostKind CostKind) {
5542 assert(Ty->isIntegerTy());
5543
5544 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5545 if (BitSize == 0)
5546 return ~0U;
5547
5548 // Never hoist constants larger than 128bit, because this might lead to
5549 // incorrect code generation or assertions in codegen.
5550 // Fixme: Create a cost model for types larger than i128 once the codegen
5551 // issues have been fixed.
5552 if (BitSize > 128)
5553 return TTI::TCC_Free;
5554
5555 if (Imm == 0)
5556 return TTI::TCC_Free;
5557
5558 // Sign-extend all constants to a multiple of 64-bit.
5559 APInt ImmVal = Imm;
5560 if (BitSize % 64 != 0)
5561 ImmVal = Imm.sext(alignTo(BitSize, 64));
5562
5563 // Split the constant into 64-bit chunks and calculate the cost for each
5564 // chunk.
5565 InstructionCost Cost = 0;
5566 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
5567 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
5568 int64_t Val = Tmp.getSExtValue();
5569 Cost += getIntImmCost(Val);
5570 }
5571 // We need at least one instruction to materialize the constant.
5572 return std::max<InstructionCost>(1, Cost);
5573}
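// Illustrative worked example (not from the upstream sources): how the 64-bit
// chunking above prices an i128 immediate.
//   APInt Imm(128, 1);
//   Imm <<= 64;   // only the high 64-bit chunk is now non-zero
// The low chunk is 0 (TCC_Free), the high chunk fits in 32 bits (TCC_Basic),
// and the result is clamped to at least one instruction, so the total is 1.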
5574
5576 const APInt &Imm, Type *Ty,
5578 Instruction *Inst) {
5579 assert(Ty->isIntegerTy());
5580
5581 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5582 // There is no cost model for constants with a bit size of 0. Return TCC_Free
5583 // here, so that constant hoisting will ignore this constant.
5584 if (BitSize == 0)
5585 return TTI::TCC_Free;
5586
5587 unsigned ImmIdx = ~0U;
5588 switch (Opcode) {
5589 default:
5590 return TTI::TCC_Free;
5591 case Instruction::GetElementPtr:
5592 // Always hoist the base address of a GetElementPtr. This prevents the
5593 // creation of new constants for every base constant that gets constant
5594 // folded with the offset.
5595 if (Idx == 0)
5596 return 2 * TTI::TCC_Basic;
5597 return TTI::TCC_Free;
5598 case Instruction::Store:
5599 ImmIdx = 0;
5600 break;
5601 case Instruction::ICmp:
5602 // This is an imperfect hack to prevent constant hoisting of
5603 // compares that might be trying to check if a 64-bit value fits in
5604 // 32 bits. The backend can optimize these cases using a right shift by 32.
5605 // Ideally we would check the compare predicate here. There are also other
5606 // similar immediates the backend can use shifts for.
5607 if (Idx == 1 && Imm.getBitWidth() == 64) {
5608 uint64_t ImmVal = Imm.getZExtValue();
5609 if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
5610 return TTI::TCC_Free;
5611 }
5612 ImmIdx = 1;
5613 break;
5614 case Instruction::And:
5615 // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
5616 // by using a 32-bit operation with implicit zero extension. Detect such
5617 // immediates here as the normal path expects bit 31 to be sign extended.
5618 if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.isIntN(32))
5619 return TTI::TCC_Free;
5620 ImmIdx = 1;
5621 break;
5622 case Instruction::Add:
5623 case Instruction::Sub:
5624 // For add/sub, we can use the opposite instruction for INT32_MIN.
5625 if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.getZExtValue() == 0x80000000)
5626 return TTI::TCC_Free;
5627 ImmIdx = 1;
5628 break;
5629 case Instruction::UDiv:
5630 case Instruction::SDiv:
5631 case Instruction::URem:
5632 case Instruction::SRem:
5633 // Division by constant is typically expanded later into a different
5634 // instruction sequence. This completely changes the constants.
5635 // Report them as "free" to stop ConstantHoist from marking them as opaque.
5636 return TTI::TCC_Free;
5637 case Instruction::Mul:
5638 case Instruction::Or:
5639 case Instruction::Xor:
5640 ImmIdx = 1;
5641 break;
5642 // Always return TCC_Free for the shift value of a shift instruction.
5643 case Instruction::Shl:
5644 case Instruction::LShr:
5645 case Instruction::AShr:
5646 if (Idx == 1)
5647 return TTI::TCC_Free;
5648 break;
5649 case Instruction::Trunc:
5650 case Instruction::ZExt:
5651 case Instruction::SExt:
5652 case Instruction::IntToPtr:
5653 case Instruction::PtrToInt:
5654 case Instruction::BitCast:
5655 case Instruction::PHI:
5656 case Instruction::Call:
5657 case Instruction::Select:
5658 case Instruction::Ret:
5659 case Instruction::Load:
5660 break;
5661 }
5662
5663 if (Idx == ImmIdx) {
5664 uint64_t NumConstants = divideCeil(BitSize, 64);
5665 InstructionCost Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
5666 return (Cost <= NumConstants * TTI::TCC_Basic)
5667 ? static_cast<int>(TTI::TCC_Free)
5668 : Cost;
5669 }
5670
5671 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
5672}
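// Illustrative examples (not from the upstream sources) of the special cases
// above, written as the IR the heuristics are aimed at:
//   %c = icmp ult i64 %x, 4294967296   ; Imm == 0x100000000 -> TCC_Free, the
//                                      ; backend can test the upper half instead
//   %m = and i64 %x, 4294967295        ; 32-bit mask usable with implicit
//                                      ; zero-extension -> TCC_Free as well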
5673
5675 const APInt &Imm, Type *Ty,
5677 assert(Ty->isIntegerTy());
5678
5679 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5680 // There is no cost model for constants with a bit size of 0. Return TCC_Free
5681 // here, so that constant hoisting will ignore this constant.
5682 if (BitSize == 0)
5683 return TTI::TCC_Free;
5684
5685 switch (IID) {
5686 default:
5687 return TTI::TCC_Free;
5688 case Intrinsic::sadd_with_overflow:
5689 case Intrinsic::uadd_with_overflow:
5690 case Intrinsic::ssub_with_overflow:
5691 case Intrinsic::usub_with_overflow:
5692 case Intrinsic::smul_with_overflow:
5693 case Intrinsic::umul_with_overflow:
5694 if ((Idx == 1) && Imm.getBitWidth() <= 64 && Imm.isSignedIntN(32))
5695 return TTI::TCC_Free;
5696 break;
5697 case Intrinsic::experimental_stackmap:
5698 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
5699 return TTI::TCC_Free;
5700 break;
5701 case Intrinsic::experimental_patchpoint_void:
5702 case Intrinsic::experimental_patchpoint:
5703 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
5704 return TTI::TCC_Free;
5705 break;
5706 }
5707 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
5708}
5709
5710 InstructionCost X86TTIImpl::getCFInstrCost(unsigned Opcode,
5711 TTI::TargetCostKind CostKind,
5712 const Instruction *I) {
5713 if (CostKind != TTI::TCK_RecipThroughput)
5714 return Opcode == Instruction::PHI ? 0 : 1;
5715 // Branches are assumed to be predicted.
5716 return 0;
5717}
5718
5719int X86TTIImpl::getGatherOverhead() const {
5720 // Some CPUs have more overhead for gather. The specified overhead is relative
5721 // to the Load operation. "2" is the number provided by Intel architects. This
5722 // parameter is used for cost estimation of Gather Op and comparison with
5723 // other alternatives.
5724 // TODO: Remove the explicit hasAVX512()? That would mean we would only
5725 // enable gather when a specific -march is given.
5726 if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather()))
5727 return 2;
5728
5729 return 1024;
5730}
5731
5732int X86TTIImpl::getScatterOverhead() const {
5733 if (ST->hasAVX512())
5734 return 2;
5735
5736 return 1024;
5737}
5738
5739// Return an average cost of Gather / Scatter instruction, maybe improved later.
5740// FIXME: Add TargetCostKind support.
5741 InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode,
5742 TTI::TargetCostKind CostKind,
5743 Type *SrcVTy, const Value *Ptr,
5744 Align Alignment,
5745 unsigned AddressSpace) {
5746
5747 assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
5748 unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
5749
5750 // Try to reduce index size from 64 bit (default for GEP)
5751 // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
5752 // operation will use 16 x 64 indices which do not fit in a zmm and need
5753 // to be split. Also check that the base pointer is the same for all lanes,
5754 // and that there's at most one variable index.
5755 auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) {
5756 unsigned IndexSize = DL.getPointerSizeInBits();
5757 const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
5758 if (IndexSize < 64 || !GEP)
5759 return IndexSize;
5760
5761 unsigned NumOfVarIndices = 0;
5762 const Value *Ptrs = GEP->getPointerOperand();
5763 if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
5764 return IndexSize;
5765 for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I) {
5766 if (isa<Constant>(GEP->getOperand(I)))
5767 continue;
5768 Type *IndxTy = GEP->getOperand(I)->getType();
5769 if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy))
5770 IndxTy = IndexVTy->getElementType();
5771 if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
5772 !isa<SExtInst>(GEP->getOperand(I))) ||
5773 ++NumOfVarIndices > 1)
5774 return IndexSize; // 64
5775 }
5776 return (unsigned)32;
5777 };
5778
5779 // Trying to reduce IndexSize to 32 bits for vector 16.
5780 // By default the IndexSize is equal to pointer size.
5781 unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
5782 ? getIndexSizeInBits(Ptr, DL)
5783 : DL.getPointerSizeInBits();
5784
5785 auto *IndexVTy = FixedVectorType::get(
5786 IntegerType::get(SrcVTy->getContext(), IndexSize), VF);
5787 std::pair<InstructionCost, MVT> IdxsLT = getTypeLegalizationCost(IndexVTy);
5788 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(SrcVTy);
5789 InstructionCost::CostType SplitFactor =
5790 *std::max(IdxsLT.first, SrcLT.first).getValue();
5791 if (SplitFactor > 1) {
5792 // Handle splitting of vector of pointers
5793 auto *SplitSrcTy =
5794 FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
5795 return SplitFactor * getGSVectorCost(Opcode, CostKind, SplitSrcTy, Ptr,
5796 Alignment, AddressSpace);
5797 }
5798
5799 // The gather / scatter cost is given by Intel architects. It is a rough
5800 // number since we are looking at one instruction at a time.
5801 const int GSOverhead = (Opcode == Instruction::Load)
5802 ? getGatherOverhead()
5803 : getScatterOverhead();
5804 return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
5805 MaybeAlign(Alignment), AddressSpace,
5806 CostKind);
5807}
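// Illustrative note (not from the upstream sources): for a VF=16 gather of
// i32 elements on AVX-512, a GEP whose only variable index is i32 (or an i64
// produced by sext from i32) lets the lambda above keep IndexSize at 32, so
// both the index and data vectors fit in single ZMM registers, SplitFactor
// stays 1, and the result is getGatherOverhead() plus 16 scalar-load costs.
// With a genuine 64-bit index the <16 x i64> index vector forces a split into
// two 8-wide operations via the recursive call.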
5808
5809/// Return the cost of full scalarization of gather / scatter operation.
5810///
5811/// Opcode - Load or Store instruction.
5812/// SrcVTy - The type of the data vector that should be gathered or scattered.
5813/// VariableMask - The mask is non-constant at compile time.
5814/// Alignment - Alignment for one element.
5815/// AddressSpace - pointer[s] address space.
5816/// TODO: Remove this and use getCommonMaskedMemoryOpCost directly.
5817 InstructionCost X86TTIImpl::getGSScalarCost(unsigned Opcode,
5818 TTI::TargetCostKind CostKind,
5819 Type *SrcVTy, bool VariableMask,
5820 Align Alignment,
5821 unsigned AddressSpace) {
5822 Type *ScalarTy = SrcVTy->getScalarType();
5823 unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
5824 APInt DemandedElts = APInt::getAllOnes(VF);
5825
5826 InstructionCost MaskUnpackCost = 0;
5827 if (VariableMask) {
5828 auto *MaskTy =
5829 FixedVectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
5830 MaskUnpackCost = getScalarizationOverhead(
5831 MaskTy, DemandedElts, /*Insert=*/false, /*Extract=*/true, CostKind);
5832 InstructionCost ScalarCompareCost = getCmpSelInstrCost(
5833 Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()), nullptr,
5834 CmpInst::BAD_ICMP_PREDICATE, CostKind);
5835 InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
5836 MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
5837 }
5838
5839 InstructionCost AddressUnpackCost = getScalarizationOverhead(
5840 FixedVectorType::get(PointerType::getUnqual(ScalarTy->getContext()), VF),
5841 DemandedElts, /*Insert=*/false, /*Extract=*/true, CostKind);
5842
5843 // The cost of the scalar loads/stores.
5844 InstructionCost MemoryOpCost =
5845 VF * getMemoryOpCost(Opcode, ScalarTy, MaybeAlign(Alignment),
5846 AddressSpace, CostKind);
5847
5848 // The cost of forming the vector from loaded scalars/
5849 // scalarizing the vector to perform scalar stores.
5850 InstructionCost InsertExtractCost = getScalarizationOverhead(
5851 cast<FixedVectorType>(SrcVTy), DemandedElts,
5852 /*Insert=*/Opcode == Instruction::Load,
5853 /*Extract=*/Opcode == Instruction::Store, CostKind);
5854
5855 return AddressUnpackCost + MemoryOpCost + MaskUnpackCost + InsertExtractCost;
5856}
5857
5858 /// Calculate the cost of Gather / Scatter operation
5859 InstructionCost X86TTIImpl::getGatherScatterOpCost(
5860 unsigned Opcode, Type *SrcVTy, const Value *Ptr, bool VariableMask,
5861 Align Alignment, TTI::TargetCostKind CostKind,
5862 const Instruction *I = nullptr) {
5863 if (CostKind != TTI::TCK_RecipThroughput) {
5864 if ((Opcode == Instruction::Load &&
5865 isLegalMaskedGather(SrcVTy, Align(Alignment)) &&
5866 !forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
5867 Align(Alignment))) ||
5868 (Opcode == Instruction::Store &&
5869 isLegalMaskedScatter(SrcVTy, Align(Alignment)) &&
5870 !forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
5871 Align(Alignment))))
5872 return 1;
5873 return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask,
5874 Alignment, CostKind, I);
5875 }
5876
5877 assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
5878 PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
5879 if (!PtrTy && Ptr->getType()->isVectorTy())
5880 PtrTy = dyn_cast<PointerType>(
5881 cast<VectorType>(Ptr->getType())->getElementType());
5882 assert(PtrTy && "Unexpected type for Ptr argument");
5883 unsigned AddressSpace = PtrTy->getAddressSpace();
5884
5885 if ((Opcode == Instruction::Load &&
5886 (!isLegalMaskedGather(SrcVTy, Align(Alignment)) ||
5887 forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
5888 Align(Alignment)))) ||
5889 (Opcode == Instruction::Store &&
5890 (!isLegalMaskedScatter(SrcVTy, Align(Alignment)) ||
5891 forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
5892 Align(Alignment)))))
5893 return getGSScalarCost(Opcode, CostKind, SrcVTy, VariableMask, Alignment,
5894 AddressSpace);
5895
5896 return getGSVectorCost(Opcode, CostKind, SrcVTy, Ptr, Alignment,
5897 AddressSpace);
5898}
5899
5900 bool X86TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
5901 const TargetTransformInfo::LSRCost &C2) {
5902 // X86 specific here are "instruction number 1st priority".
5903 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
5904 C1.NumIVMuls, C1.NumBaseAdds,
5905 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
5906 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
5907 C2.NumIVMuls, C2.NumBaseAdds,
5908 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
5909}
5910
5911 bool X86TTIImpl::canMacroFuseCmp() {
5912 return ST->hasMacroFusion() || ST->hasBranchFusion();
5913}
5914
5915bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
5916 if (!ST->hasAVX())
5917 return false;
5918
5919 // The backend can't handle a single element vector.
5920 if (isa<VectorType>(DataTy) &&
5921 cast<FixedVectorType>(DataTy)->getNumElements() == 1)
5922 return false;
5923 Type *ScalarTy = DataTy->getScalarType();
5924
5925 if (ScalarTy->isPointerTy())
5926 return true;
5927
5928 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
5929 return true;
5930
5931 if (ScalarTy->isHalfTy() && ST->hasBWI())
5932 return true;
5933
5934 if (ScalarTy->isBFloatTy() && ST->hasBF16())
5935 return true;
5936
5937 if (!ScalarTy->isIntegerTy())
5938 return false;
5939
5940 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
5941 return IntWidth == 32 || IntWidth == 64 ||
5942 ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
5943}
5944
5945bool X86TTIImpl::isLegalMaskedStore(Type *DataType, Align Alignment) {
5946 return isLegalMaskedLoad(DataType, Alignment);
5947}
5948
5949bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) {
5950 unsigned DataSize = DL.getTypeStoreSize(DataType);
5951 // The only supported nontemporal loads are for aligned vectors of 16 or 32
5952 // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2
5953 // (the equivalent stores only require AVX).
5954 if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
5955 return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2();
5956
5957 return false;
5958}
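// Illustrative example (not from the upstream sources), with `TTI` and `Ctx`
// assumed to exist in the caller:
//   TTI.isLegalNTLoad(FixedVectorType::get(Type::getFloatTy(Ctx), 8), Align(32))
// is true only on AVX2 subtargets, while the 16-byte variant needs just SSE1;
// the matching 32-byte nontemporal stores (below) only require AVX.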
5959
5960bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) {
5961 unsigned DataSize = DL.getTypeStoreSize(DataType);
5962
5963 // SSE4A supports nontemporal stores of float and double at arbitrary
5964 // alignment.
5965 if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
5966 return true;
5967
5968 // Besides the SSE4A subtarget exception above, only aligned stores are
5969 // available nontemporally on any other subtarget. And only stores with a size
5970 // of 4..32 bytes (powers of 2, only) are permitted.
5971 if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
5972 !isPowerOf2_32(DataSize))
5973 return false;
5974
5975 // 32-byte vector nontemporal stores are supported by AVX (the equivalent
5976 // loads require AVX2).
5977 if (DataSize == 32)
5978 return ST->hasAVX();
5979 if (DataSize == 16)
5980 return ST->hasSSE1();
5981 return true;
5982}
5983
5984 bool X86TTIImpl::isLegalBroadcastLoad(Type *ElementTy,
5985 ElementCount NumElements) const {
5986 // movddup
5987 return ST->hasSSE3() && !NumElements.isScalable() &&
5988 NumElements.getFixedValue() == 2 &&
5989 ElementTy == Type::getDoubleTy(ElementTy->getContext());
5990}
5991
5993 if (!isa<VectorType>(DataTy))
5994 return false;
5995
5996 if (!ST->hasAVX512())
5997 return false;
5998
5999 // The backend can't handle a single element vector.
6000 if (cast<FixedVectorType>(DataTy)->getNumElements() == 1)
6001 return false;
6002
6003 Type *ScalarTy = cast<VectorType>(DataTy)->getElementType();
6004
6005 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6006 return true;
6007
6008 if (!ScalarTy->isIntegerTy())
6009 return false;
6010
6011 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6012 return IntWidth == 32 || IntWidth == 64 ||
6013 ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
6014}
6015
6016 bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy, Align Alignment) {
6017 return isLegalMaskedExpandLoad(DataTy, Alignment);
6018}
6019
6020bool X86TTIImpl::supportsGather() const {
6021 // Some CPUs have better gather performance than others.
6022 // TODO: Remove the explicit ST->hasAVX512()? That would mean we would only
6023 // enable gather when a specific -march is given.
6024 return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2());
6025}
6026
6027 bool X86TTIImpl::forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) {
6028 // Gather / Scatter for vector 2 is not profitable on KNL / SKX.
6029 // Vector-4 of gather/scatter instruction does not exist on KNL. We can extend
6030 // it to 8 elements, but zeroing upper bits of the mask vector will add more
6031 // instructions. Right now we give the scalar cost of vector-4 for KNL. TODO:
6032 // Check, maybe the gather/scatter instruction is better in the VariableMask
6033 // case.
6034 unsigned NumElts = cast<FixedVectorType>(VTy)->getNumElements();
6035 return NumElts == 1 ||
6036 (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX())));
6037}
6038
6039 bool X86TTIImpl::isLegalMaskedGatherScatter(Type *DataTy, Align Alignment) {
6040 Type *ScalarTy = DataTy->getScalarType();
6041 if (ScalarTy->isPointerTy())
6042 return true;
6043
6044 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6045 return true;
6046
6047 if (!ScalarTy->isIntegerTy())
6048 return false;
6049
6050 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6051 return IntWidth == 32 || IntWidth == 64;
6052}
6053
6054 bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
6055 if (!supportsGather() || !ST->preferGather())
6056 return false;
6057 return isLegalMaskedGatherScatter(DataTy, Alignment);
6058}
6059
6060bool X86TTIImpl::isLegalAltInstr(VectorType *VecTy, unsigned Opcode0,
6061 unsigned Opcode1,
6062 const SmallBitVector &OpcodeMask) const {
6063 // ADDSUBPS 4xf32 SSE3
6064 // VADDSUBPS 4xf32 AVX
6065 // VADDSUBPS 8xf32 AVX2
6066 // ADDSUBPD 2xf64 SSE3
6067 // VADDSUBPD 2xf64 AVX
6068 // VADDSUBPD 4xf64 AVX2
6069
6070 unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
6071 assert(OpcodeMask.size() == NumElements && "Mask and VecTy are incompatible");
6072 if (!isPowerOf2_32(NumElements))
6073 return false;
6074 // Check the opcode pattern. We apply the mask on the opcode arguments and
6075 // then check if it is what we expect.
6076 for (int Lane : seq<int>(0, NumElements)) {
6077 unsigned Opc = OpcodeMask.test(Lane) ? Opcode1 : Opcode0;
6078 // We expect FSub for even lanes and FAdd for odd lanes.
6079 if (Lane % 2 == 0 && Opc != Instruction::FSub)
6080 return false;
6081 if (Lane % 2 == 1 && Opc != Instruction::FAdd)
6082 return false;
6083 }
6084 // Now check that the pattern is supported by the target ISA.
6085 Type *ElemTy = cast<VectorType>(VecTy)->getElementType();
6086 if (ElemTy->isFloatTy())
6087 return ST->hasSSE3() && NumElements % 4 == 0;
6088 if (ElemTy->isDoubleTy())
6089 return ST->hasSSE3() && NumElements % 2 == 0;
6090 return false;
6091}
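// Illustrative sketch (not from the upstream sources): the opcode mask the
// check above expects for a 4-wide addsub pattern (FSub in even lanes, FAdd
// in odd lanes), assuming `VecTy` is <4 x float> and `TTI` exists:
//   SmallBitVector OpcodeMask(4);
//   OpcodeMask.set(1);
//   OpcodeMask.set(3);   // lanes 1 and 3 take Opcode1
//   bool OK = TTI.isLegalAltInstr(VecTy, Instruction::FSub, Instruction::FAdd,
//                                 OpcodeMask);   // true on SSE3+: (V)ADDSUBPS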
6092
6093bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) {
6094 // AVX2 doesn't support scatter
6095 if (!ST->hasAVX512() || !ST->preferScatter())
6096 return false;
6097 return isLegalMaskedGatherScatter(DataType, Alignment);
6098}
6099
6100bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
6101 EVT VT = TLI->getValueType(DL, DataType);
6102 return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
6103}
6104
6105 bool X86TTIImpl::isExpensiveToSpeculativelyExecute(const Instruction *I) {
6106 // FDIV is always expensive, even if it has a very low uop count.
6107 // TODO: Still necessary for recent CPUs with low latency/throughput fdiv?
6108 if (I->getOpcode() == Instruction::FDiv)
6109 return true;
6110
6111 return BaseT::isExpensiveToSpeculativelyExecute(I);
6112}
6113
6114 bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
6115 return false;
6116}
6117
6118 bool X86TTIImpl::areInlineCompatible(const Function *Caller,
6119 const Function *Callee) const {
6120 const TargetMachine &TM = getTLI()->getTargetMachine();
6121
6122 // Work this as a subsetting of subtarget features.
6123 const FeatureBitset &CallerBits =
6124 TM.getSubtargetImpl(*Caller)->getFeatureBits();
6125 const FeatureBitset &CalleeBits =
6126 TM.getSubtargetImpl(*Callee)->getFeatureBits();
6127
6128 // Check whether features are the same (apart from the ignore list).
6129 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
6130 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
6131 if (RealCallerBits == RealCalleeBits)
6132 return true;
6133
6134 // If the features are a subset, we need to additionally check for calls
6135 // that may become ABI-incompatible as a result of inlining.
6136 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
6137 return false;
6138
6139 for (const Instruction &I : instructions(Callee)) {
6140 if (const auto *CB = dyn_cast<CallBase>(&I)) {
6141 // Having more target features is fine for inline ASM.
6142 if (CB->isInlineAsm())
6143 continue;
6144
6145 SmallVector<Type *, 8> Types;
6146 for (Value *Arg : CB->args())
6147 Types.push_back(Arg->getType());
6148 if (!CB->getType()->isVoidTy())
6149 Types.push_back(CB->getType());
6150
6151 // Simple types are always ABI compatible.
6152 auto IsSimpleTy = [](Type *Ty) {
6153 return !Ty->isVectorTy() && !Ty->isAggregateType();
6154 };
6155 if (all_of(Types, IsSimpleTy))
6156 continue;
6157
6158 if (Function *NestedCallee = CB->getCalledFunction()) {
6159 // Assume that intrinsics are always ABI compatible.
6160 if (NestedCallee->isIntrinsic())
6161 continue;
6162
6163 // Do a precise compatibility check.
6164 if (!areTypesABICompatible(Caller, NestedCallee, Types))
6165 return false;
6166 } else {
6167 // We don't know the target features of the callee,
6168 // assume it is incompatible.
6169 return false;
6170 }
6171 }
6172 }
6173 return true;
6174}
6175
6176 bool X86TTIImpl::areTypesABICompatible(const Function *Caller,
6177 const Function *Callee,
6178 const ArrayRef<Type *> &Types) const {
6179 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
6180 return false;
6181
6182 // If we get here, we know the target features match. If one function
6183 // considers 512-bit vectors legal and the other does not, consider them
6184 // incompatible.
6185 const TargetMachine &TM = getTLI()->getTargetMachine();
6186
6187 if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
6188 TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs())
6189 return true;
6190
6191 // Consider the arguments compatible if they aren't vectors or aggregates.
6192 // FIXME: Look at the size of vectors.
6193 // FIXME: Look at the element types of aggregates to see if there are vectors.
6194 return llvm::none_of(Types,
6195 [](Type *T) { return T->isVectorTy() || T->isAggregateType(); });
6196}
6197
6199X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
6201 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
6202 Options.NumLoadsPerBlock = 2;
6203 // All GPR and vector loads can be unaligned.
6204 Options.AllowOverlappingLoads = true;
6205 if (IsZeroCmp) {
6206 // Only enable vector loads for equality comparison. Right now the vector
6207 // version is not as fast for three way compare (see #33329).
6208 const unsigned PreferredWidth = ST->getPreferVectorWidth();
6209 if (PreferredWidth >= 512 && ST->hasAVX512() && ST->hasEVEX512())
6210 Options.LoadSizes.push_back(64);
6211 if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32);
6212 if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
6213 }
6214 if (ST->is64Bit()) {
6215 Options.LoadSizes.push_back(8);
6216 }
6217 Options.LoadSizes.push_back(4);
6218 Options.LoadSizes.push_back(2);
6219 Options.LoadSizes.push_back(1);
6220 return Options;
6221}
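// Illustrative note (not from the upstream sources): on a 64-bit AVX2
// subtarget with a 256-bit preferred vector width, the equality-only
// (IsZeroCmp) configuration built above is LoadSizes = {32, 16, 8, 4, 2, 1}
// with two loads per block and overlapping loads allowed, so a 31-byte
// memcmp()==0 can expand to a pair of overlapping 16-byte compares instead
// of a byte loop.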
6222
6223 bool X86TTIImpl::prefersVectorizedAddressing() const {
6224 return supportsGather();
6225}
6226
6227 bool X86TTIImpl::supportsEfficientVectorElementLoadStore() const {
6228 return false;
6229}
6230
6231 bool X86TTIImpl::enableInterleavedAccessVectorization() {
6232 // TODO: We expect this to be beneficial regardless of arch,
6233 // but there are currently some unexplained performance artifacts on Atom.
6234 // As a temporary solution, disable on Atom.
6235 return !(ST->isAtom());
6236}
6237
6238// Get estimation for interleaved load/store operations and strided load.
6239// \p Indices contains indices for strided load.
6240// \p Factor - the factor of interleaving.
6241// AVX-512 provides 3-src shuffles that significantly reduces the cost.
6243 unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
6244 ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
6245 TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
6246 // VecTy for interleave memop is <VF*Factor x Elt>.
6247 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6248 // VecTy = <12 x i32>.
6249
6250 // Calculate the number of memory operations (NumOfMemOps), required
6251 // for load/store the VecTy.
6252 MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6253 unsigned VecTySize = DL.getTypeStoreSize(VecTy);
6254 unsigned LegalVTSize = LegalVT.getStoreSize();
6255 unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
6256
6257 // Get the cost of one memory operation.
6258 auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
6259 LegalVT.getVectorNumElements());
6260 InstructionCost MemOpCost;
6261 bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps;
6262 if (UseMaskedMemOp)
6263 MemOpCost = getMaskedMemoryOpCost(Opcode, SingleMemOpTy, Alignment,
6264 AddressSpace, CostKind);
6265 else
6266 MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, MaybeAlign(Alignment),
6267 AddressSpace, CostKind);
6268
6269 unsigned VF = VecTy->getNumElements() / Factor;
6270 MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);
6271
6272 InstructionCost MaskCost;
6273 if (UseMaskedMemOp) {
6274 APInt DemandedLoadStoreElts = APInt::getZero(VecTy->getNumElements());
6275 for (unsigned Index : Indices) {
6276 assert(Index < Factor && "Invalid index for interleaved memory op");
6277 for (unsigned Elm = 0; Elm < VF; Elm++)
6278 DemandedLoadStoreElts.setBit(Index + Elm * Factor);
6279 }
6280
6281 Type *I1Type = Type::getInt1Ty(VecTy->getContext());
6282
6283 MaskCost = getReplicationShuffleCost(
6284 I1Type, Factor, VF,
6285 UseMaskForGaps ? DemandedLoadStoreElts
6286 : APInt::getAllOnes(VecTy->getNumElements()),
6287 CostKind);
6288
6289 // The Gaps mask is invariant and created outside the loop, therefore the
6290 // cost of creating it is not accounted for here. However if we have both
6291 // a MaskForGaps and some other mask that guards the execution of the
6292 // memory access, we need to account for the cost of And-ing the two masks
6293 // inside the loop.
6294 if (UseMaskForGaps) {
6295 auto *MaskVT = FixedVectorType::get(I1Type, VecTy->getNumElements());
6296 MaskCost += getArithmeticInstrCost(BinaryOperator::And, MaskVT, CostKind);
6297 }
6298 }
6299
6300 if (Opcode == Instruction::Load) {
6301 // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
6302 // contain the cost of the optimized shuffle sequence that the
6303 // X86InterleavedAccess pass will generate.
6304 // The cost of loads and stores are computed separately from the table.
6305
6306 // X86InterleavedAccess support only the following interleaved-access group.
6307 static const CostTblEntry AVX512InterleavedLoadTbl[] = {
6308 {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8
6309 {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8
6310 {3, MVT::v64i8, 22}, //(load 96i8 and) deinterleave into 3 x 32i8
6311 };
6312
6313 if (const auto *Entry =
6314 CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
6315 return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6316 // If an entry does not exist, fall back to the default implementation.
6317
6318 // Kind of shuffle depends on number of loaded values.
6319 // If we load the entire data in one register, we can use a 1-src shuffle.
6320 // Otherwise, we'll merge 2 sources in each operation.
6321 TTI::ShuffleKind ShuffleKind =
6322 (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
6323
6324 InstructionCost ShuffleCost = getShuffleCost(
6325 ShuffleKind, SingleMemOpTy, std::nullopt, CostKind, 0, nullptr);
6326
6327 unsigned NumOfLoadsInInterleaveGrp =
6328 Indices.size() ? Indices.size() : Factor;
6329 auto *ResultTy = FixedVectorType::get(VecTy->getElementType(),
6330 VecTy->getNumElements() / Factor);
6331 InstructionCost NumOfResults =
6332 getTypeLegalizationCost(ResultTy).first * NumOfLoadsInInterleaveGrp;
6333
6334 // About half of the loads may be folded into shuffles when we have only
6335 // one result. If we have more than one result, or the loads are masked,
6336 // we do not fold loads at all.
6337 unsigned NumOfUnfoldedLoads =
6338 UseMaskedMemOp || NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
6339
6340 // Get a number of shuffle operations per result.
6341 unsigned NumOfShufflesPerResult =
6342 std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
6343
6344 // The SK_PermuteTwoSrc shuffle clobbers one of the source operands.
6345 // When we have more than one destination, we need additional instructions
6346 // to keep sources.
6347 InstructionCost NumOfMoves = 0;
6348 if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
6349 NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
6350
6351 InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
6352 MaskCost + NumOfUnfoldedLoads * MemOpCost +
6353 NumOfMoves;
6354
6355 return Cost;
6356 }
6357
6358 // Store.
6359 assert(Opcode == Instruction::Store &&
6360 "Expected Store Instruction at this point");
6361 // X86InterleavedAccess support only the following interleaved-access group.
6362 static const CostTblEntry AVX512InterleavedStoreTbl[] = {
6363 {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
6364 {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
6365 {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 96i8 (and store)
6366
6367 {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store)
6368 {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
6369 {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
6370 {4, MVT::v64i8, 24} // interleave 4 x 32i8 into 256i8 (and store)
6371 };
6372
6373 if (const auto *Entry =
6374 CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
6375 return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6376 // If an entry does not exist, fall back to the default implementation.
6377
6378 // There are no strided stores at the moment, and a store can't be folded
6379 // into a shuffle.
6380 unsigned NumOfSources = Factor; // The number of values to be merged.
6381 InstructionCost ShuffleCost = getShuffleCost(
6382 TTI::SK_PermuteTwoSrc, SingleMemOpTy, std::nullopt, CostKind, 0, nullptr);
6383 unsigned NumOfShufflesPerStore = NumOfSources - 1;
6384
6385 // The SK_PermuteTwoSrc shuffle clobbers one of the source operands.
6386 // We need additional instructions to keep sources.
6387 unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
6388 InstructionCost Cost =
6389 MaskCost +
6390 NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
6391 NumOfMoves;
6392 return Cost;
6393}
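// Illustrative worked example (not from the upstream sources): a stride-3
// interleaved load of <48 x i8> (VF = 16, Factor = 3) on AVX-512 legalizes to
// a single 64-byte memory operation, and the {3, v16i8, 12} entry in
// AVX512InterleavedLoadTbl above prices the deinterleaving shuffles, so the
// unmasked cost is roughly 1 * MemOpCost + 12.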
6394
6395 InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
6396 unsigned Opcode, Type *BaseTy, unsigned Factor, ArrayRef<unsigned> Indices,
6397 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
6398 bool UseMaskForCond, bool UseMaskForGaps) {
6399 auto *VecTy = cast<FixedVectorType>(BaseTy);
6400
6401 auto isSupportedOnAVX512 = [&](Type *VecTy) {
6402 Type *EltTy = cast<VectorType>(VecTy)->getElementType();
6403 if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
6404 EltTy->isIntegerTy(32) || EltTy->isPointerTy())
6405 return true;
6406 if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy())
6407 return ST->hasBWI();
6408 if (EltTy->isBFloatTy())
6409 return ST->hasBF16();
6410 return false;
6411 };
6412 if (ST->hasAVX512() && isSupportedOnAVX512(VecTy))
6413 return getInterleavedMemoryOpCostAVX512(
6414 Opcode, VecTy, Factor, Indices, Alignment,
6415 AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
6416
6417 if (UseMaskForCond || UseMaskForGaps)
6418 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6419 Alignment, AddressSpace, CostKind,
6420 UseMaskForCond, UseMaskForGaps);
6421
6422 // Get estimation for interleaved load/store operations for SSE-AVX2.
6423 // As opposed to AVX-512, SSE-AVX2 do not have generic shuffles that allow
6424 // computing the cost using a generic formula as a function of generic
6425 // shuffles. We therefore use a lookup table instead, filled according to
6426 // the instruction sequences that codegen currently generates.
6427
6428 // VecTy for interleave memop is <VF*Factor x Elt>.
6429 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6430 // VecTy = <12 x i32>.
6431 MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6432
6433 // This function can be called with VecTy=<6xi128>, Factor=3, in which case
6434 // the VF=2, while v2i128 is an unsupported MVT vector type
6435 // (see MachineValueType.h::getVectorVT()).
6436 if (!LegalVT.isVector())
6437 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6438 Alignment, AddressSpace, CostKind);
6439
6440 unsigned VF = VecTy->getNumElements() / Factor;
6441 Type *ScalarTy = VecTy->getElementType();
6442 // Deduplicate entries, model floats/pointers as appropriately-sized integers.
6443 if (!ScalarTy->isIntegerTy())
6444 ScalarTy =
6445 Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy));
6446
6447 // Get the cost of all the memory operations.
6448 // FIXME: discount dead loads.
6449 InstructionCost MemOpCosts = getMemoryOpCost(
6450 Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, CostKind);
6451
6452 auto *VT = FixedVectorType::get(ScalarTy, VF);
6453 EVT ETy = TLI->getValueType(DL, VT);
6454 if (!ETy.isSimple())
6455 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6456 Alignment, AddressSpace, CostKind);
6457
6458 // TODO: Complete for other data-types and strides.
6459 // Each combination of Stride, element bit width and VF results in a different
6460 // sequence; The cost tables are therefore accessed with:
6461 // Factor (stride) and VectorType=VFxiN.
6462 // The Cost accounts only for the shuffle sequence;
6463 // The cost of the loads/stores is accounted for separately.
6464 //
6465 static const CostTblEntry AVX2InterleavedLoadTbl[] = {
6466 {2, MVT::v2i8, 2}, // (load 4i8 and) deinterleave into 2 x 2i8
6467 {2, MVT::v4i8, 2}, // (load 8i8 and) deinterleave into 2 x 4i8
6468 {2, MVT::v8i8, 2}, // (load 16i8 and) deinterleave into 2 x 8i8
6469 {2, MVT::v16i8, 4}, // (load 32i8 and) deinterleave into 2 x 16i8
6470 {2, MVT::v32i8, 6}, // (load 64i8 and) deinterleave into 2 x 32i8
6471
6472 {2, MVT::v8i16, 6}, // (load 16i16 and) deinterleave into 2 x 8i16
6473 {2, MVT::v16i16, 9}, // (load 32i16 and) deinterleave into 2 x 16i16
6474 {2, MVT::v32i16, 18}, // (load 64i16 and) deinterleave into 2 x 32i16
6475
6476 {2, MVT::v8i32, 4}, // (load 16i32 and) deinterleave into 2 x 8i32
6477 {2, MVT::v16i32, 8}, // (load 32i32 and) deinterleave into 2 x 16i32
6478 {2, MVT::v32i32, 16}, // (load 64i32 and) deinterleave into 2 x 32i32
6479
6480 {2, MVT::v4i64, 4}, // (load 8i64 and) deinterleave into 2 x 4i64
6481 {2, MVT::v8i64, 8}, // (load 16i64 and) deinterleave into 2 x 8i64
6482 {2, MVT::v16i64, 16}, // (load 32i64 and) deinterleave into 2 x 16i64
6483 {2, MVT::v32i64, 32}, // (load 64i64 and) deinterleave into 2 x 32i64
6484
6485 {3, MVT::v2i8, 3}, // (load 6i8 and) deinterleave into 3 x 2i8
6486 {3, MVT::v4i8, 3}, // (load 12i8 and) deinterleave into 3 x 4i8
6487 {3, MVT::v8i8, 6}, // (load 24i8 and) deinterleave into 3 x 8i8
6488 {3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8
6489 {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8
6490
6491 {3, MVT::v2i16, 5}, // (load 6i16 and) deinterleave into 3 x 2i16
6492 {3, MVT::v4i16, 7}, // (load 12i16 and) deinterleave into 3 x 4i16
6493 {3, MVT::v8i16, 9}, // (load 24i16 and) deinterleave into 3 x 8i16
6494 {3, MVT::v16i16, 28}, // (load 48i16 and) deinterleave into 3 x 16i16
6495 {3, MVT::v32i16, 56}, // (load 96i16 and) deinterleave into 3 x 32i16
6496
6497 {3, MVT::v2i32, 3}, // (load 6i32 and) deinterleave into 3 x 2i32
6498 {3, MVT::v4i32, 3}, // (load 12i32 and) deinterleave into 3 x 4i32
6499 {3, MVT::v8i32, 7}, // (load 24i32 and) deinterleave into 3 x 8i32
6500 {3, MVT::v16i32, 14}, // (load 48i32 and) deinterleave into 3 x 16i32
6501 {3, MVT::v32i32, 32}, // (load 96i32 and) deinterleave into 3 x 32i32
6502
6503 {3, MVT::v2i64, 1}, // (load 6i64 and) deinterleave into 3 x 2i64
6504 {3, MVT::v4i64, 5}, // (load 12i64 and) deinterleave into 3 x 4i64
6505 {3, MVT::v8i64, 10}, // (load 24i64 and) deinterleave into 3 x 8i64
6506 {3, MVT::v16i64, 20}, // (load 48i64 and) deinterleave into 3 x 16i64
6507
6508 {4, MVT::v2i8, 4}, // (load 8i8 and) deinterleave into 4 x 2i8
6509 {4, MVT::v4i8, 4}, // (load 16i8 and) deinterleave into 4 x 4i8
6510 {4, MVT::v8i8, 12}, // (load 32i8 and) deinterleave into 4 x 8i8
6511 {4, MVT::v16i8, 24}, // (load 64i8 and) deinterleave into 4 x 16i8
6512 {4, MVT::v32i8, 56}, // (load 128i8 and) deinterleave into 4 x 32i8
6513
6514 {4, MVT::v2i16, 6}, // (load 8i16 and) deinterleave into 4 x 2i16
6515 {4, MVT::v4i16, 17}, // (load 16i16 and) deinterleave into 4 x 4i16
6516 {4, MVT::v8i16, 33}, // (load 32i16 and) deinterleave into 4 x 8i16
6517 {4, MVT::v16i16, 75}, // (load 64i16 and) deinterleave into 4 x 16i16
6518 {4, MVT::v32i16, 150}, // (load 128i16 and) deinterleave into 4 x 32i16
6519
6520 {4, MVT::v2i32, 4}, // (load 8i32 and) deinterleave into 4 x 2i32
6521 {4, MVT::v4i32, 8}, // (load 16i32 and) deinterleave into 4 x 4i32
6522 {4, MVT::v8i32, 16}, // (load 32i32 and) deinterleave into 4 x 8i32
6523 {4, MVT::v16i32, 32}, // (load 64i32 and) deinterleave into 4 x 16i32
6524 {4, MVT::v32i32, 68}, // (load 128i32 and) deinterleave into 4 x 32i32
6525
6526 {4, MVT::v2i64, 6}, // (load 8i64 and) deinterleave into 4 x 2i64
6527 {4, MVT::v4i64, 8}, // (load 16i64 and) deinterleave into 4 x 4i64
6528 {4, MVT::v8i64, 20}, // (load 32i64 and) deinterleave into 4 x 8i64
6529 {4, MVT::v16i64, 40}, // (load 64i64 and) deinterleave into 4 x 16i64
6530
6531 {6, MVT::v2i8, 6}, // (load 12i8 and) deinterleave into 6 x 2i8
6532 {6, MVT::v4i8, 14}, // (load 24i8 and) deinterleave into 6 x 4i8
6533 {6, MVT::v8i8, 18}, // (load 48i8 and) deinterleave into 6 x 8i8
6534 {6, MVT::v16i8, 43}, // (load 96i8 and) deinterleave into 6 x 16i8
6535 {6, MVT::v32i8, 82}, // (load 192i8 and) deinterleave into 6 x 32i8
6536
6537 {6, MVT::v2i16, 13}, // (load 12i16 and) deinterleave into 6 x 2i16
6538 {6, MVT::v4i16, 9}, // (load 24i16 and) deinterleave into 6 x 4i16
6539 {6, MVT::v8i16, 39}, // (load 48i16 and) deinterleave into 6 x 8i16
6540 {6, MVT::v16i16, 106}, // (load 96i16 and) deinterleave into 6 x 16i16
6541 {6, MVT::v32i16, 212}, // (load 192i16 and) deinterleave into 6 x 32i16
6542
6543 {6, MVT::v2i32, 6}, // (load 12i32 and) deinterleave into 6 x 2i32
6544 {6, MVT::v4i32, 15}, // (load 24i32 and) deinterleave into 6 x 4i32
6545 {6, MVT::v8i32, 31}, // (load 48i32 and) deinterleave into 6 x 8i32
6546 {6, MVT::v16i32, 64}, // (load 96i32 and) deinterleave into 6 x 16i32
6547
6548 {6, MVT::v2i64, 6}, // (load 12i64 and) deinterleave into 6 x 2i64
6549 {6, MVT::v4i64, 18}, // (load 24i64 and) deinterleave into 6 x 4i64
6550 {6, MVT::v8i64, 36}, // (load 48i64 and) deinterleave into 6 x 8i64
6551
6552 {8, MVT::v8i32, 40} // (load 64i32 and) deinterleave into 8 x 8i32
6553 };
6554
6555 static const CostTblEntry SSSE3InterleavedLoadTbl[] = {
6556 {2, MVT::v4i16, 2}, // (load 8i16 and) deinterleave into 2 x 4i16
6557 };
6558
6559 static const CostTblEntry SSE2InterleavedLoadTbl[] = {
6560 {2, MVT::v2i16, 2}, // (load 4i16 and) deinterleave into 2 x 2i16
6561 {2, MVT::v4i16, 7}, // (load 8i16 and) deinterleave into 2 x 4i16
6562
6563 {2, MVT::v2i32, 2}, // (load 4i32 and) deinterleave into 2 x 2i32
6564 {2, MVT::v4i32, 2}, // (load 8i32 and) deinterleave into 2 x 4i32
6565
6566 {2, MVT::v2i64, 2}, // (load 4i64 and) deinterleave into 2 x 2i64
6567 };
6568
6569 static const CostTblEntry AVX2InterleavedStoreTbl[] = {
6570 {2, MVT::v16i8, 3}, // interleave 2 x 16i8 into 32i8 (and store)
6571 {2, MVT::v32i8, 4}, // interleave 2 x 32i8 into 64i8 (and store)
6572
6573 {2, MVT::v8i16, 3}, // interleave 2 x 8i16 into 16i16 (and store)
6574 {2, MVT::v16i16, 4}, // interleave 2 x 16i16 into 32i16 (and store)
6575 {2, MVT::v32i16, 8}, // interleave 2 x 32i16 into 64i16 (and store)
6576
6577 {2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32 (and store)
6578 {2, MVT::v8i32, 4}, // interleave 2 x 8i32 into 16i32 (and store)
6579 {2, MVT::v16i32, 8}, // interleave 2 x 16i32 into 32i32 (and store)
6580 {2, MVT::v32i32, 16}, // interleave 2 x 32i32 into 64i32 (and store)
6581
6582 {2, MVT::v2i64, 2}, // interleave 2 x 2i64 into 4i64 (and store)
6583 {2, MVT::v4i64, 4}, // interleave 2 x 4i64 into 8i64 (and store)
6584 {2, MVT::v8i64, 8}, // interleave 2 x 8i64 into 16i64 (and store)
6585 {2, MVT::v16i64, 16}, // interleave 2 x 16i64 into 32i64 (and store)
6586 {2, MVT::v32i64, 32}, // interleave 2 x 32i64 into 64i64 (and store)
6587
6588 {3, MVT::v2i8, 4}, // interleave 3 x 2i8 into 6i8 (and store)
6589 {3, MVT::v4i8, 4}, // interleave 3 x 4i8 into 12i8 (and store)
6590 {3, MVT::v8i8, 6}, // interleave 3 x 8i8 into 24i8 (and store)
6591 {3, MVT::v16i8, 11}, // interleave 3 x 16i8 into 48i8 (and store)
6592 {3, MVT::v32i8, 13}, // interleave 3 x 32i8 into 96i8 (and store)
6593
6594 {3, MVT::v2i16, 4}, // interleave 3 x 2i16 into 6i16 (and store)
6595 {3, MVT::v4i16, 6}, // interleave 3 x 4i16 into 12i16 (and store)
6596 {3, MVT::v8i16, 12}, // interleave 3 x 8i16 into 24i16 (and store)
6597 {3, MVT::v16i16, 27}, // interleave 3 x 16i16 into 48i16 (and store)
6598 {3, MVT::v32i16, 54}, // interleave 3 x 32i16 into 96i16 (and store)
6599
6600 {3, MVT::v2i32, 4}, // interleave 3 x 2i32 into 6i32 (and store)
6601 {3, MVT::v4i32, 5}, // interleave 3 x 4i32 into 12i32 (and store)
6602 {3, MVT::v8i32, 11}, // interleave 3 x 8i32 into 24i32 (and store)
6603 {3, MVT::v16i32, 22}, // interleave 3 x 16i32 into 48i32 (and store)
6604 {3, MVT::v32i32, 48}, // interleave 3 x 32i32 into 96i32 (and store)
6605
6606 {3, MVT::v2i64, 4}, // interleave 3 x 2i64 into 6i64 (and store)
6607 {3, MVT::v4i64, 6}, // interleave 3 x 4i64 into 12i64 (and store)
6608 {3, MVT::v8i64, 12}, // interleave 3 x 8i64 into 24i64 (and store)
6609 {3, MVT::v16i64, 24}, // interleave 3 x 16i64 into 48i64 (and store)
6610
6611 {4, MVT::v2i8, 4}, // interleave 4 x 2i8 into 8i8 (and store)
6612 {4, MVT::v4i8, 4}, // interleave 4 x 4i8 into 16i8 (and store)
6613 {4, MVT::v8i8, 4}, // interleave 4 x 8i8 into 32i8 (and store)
6614 {4, MVT::v16i8, 8}, // interleave 4 x 16i8 into 64i8 (and store)
6615 {4, MVT::v32i8, 12}, // interleave 4 x 32i8 into 128i8 (and store)
6616
6617 {4, MVT::v2i16, 2}, // interleave 4 x 2i16 into 8i16 (and store)
6618 {4, MVT::v4i16, 6}, // interleave 4 x 4i16 into 16i16 (and store)
6619 {4, MVT::v8i16, 10}, // interleave 4 x 8i16 into 32i16 (and store)
6620 {4, MVT::v16i16, 32}, // interleave 4 x 16i16 into 64i16 (and store)
6621 {4, MVT::v32i16, 64}, // interleave 4 x 32i16 into 128i16 (and store)
6622
6623 {4, MVT::v2i32, 5}, // interleave 4 x 2i32 into 8i32 (and store)
6624 {4, MVT::v4i32, 6}, // interleave 4 x 4i32 into 16i32 (and store)
6625 {4, MVT::v8i32, 16}, // interleave 4 x 8i32 into 32i32 (and store)
6626 {4, MVT::v16i32, 32}, // interleave 4 x 16i32 into 64i32 (and store)
6627 {4, MVT::v32i32, 64}, // interleave 4 x 32i32 into 128i32 (and store)
6628
6629 {4, MVT::v2i64, 6}, // interleave 4 x 2i64 into 8i64 (and store)
6630 {4, MVT::v4i64, 8}, // interleave 4 x 4i64 into 16i64 (and store)
6631 {4, MVT::v8i64, 20}, // interleave 4 x 8i64 into 32i64 (and store)
6632 {4, MVT::v16i64, 40}, // interleave 4 x 16i64 into 64i64 (and store)
6633
6634 {6, MVT::v2i8, 7}, // interleave 6 x 2i8 into 12i8 (and store)
6635 {6, MVT::v4i8, 9}, // interleave 6 x 4i8 into 24i8 (and store)
6636 {6, MVT::v8i8, 16}, // interleave 6 x 8i8 into 48i8 (and store)
6637 {6, MVT::v16i8, 27}, // interleave 6 x 16i8 into 96i8 (and store)
6638 {6, MVT::v32i8, 90}, // interleave 6 x 32i8 into 192i8 (and store)
6639
6640 {6, MVT::v2i16, 10}, // interleave 6 x 2i16 into 12i16 (and store)
6641 {6, MVT::v4i16, 15}, // interleave 6 x 4i16 into 24i16 (and store)
6642 {6, MVT::v8i16, 21}, // interleave 6 x 8i16 into 48i16 (and store)
6643 {6, MVT::v16i16, 58}, // interleave 6 x 16i16 into 96i16 (and store)
6644 {6, MVT::v32i16, 90}, // interleave 6 x 32i16 into 192i16 (and store)
6645
6646 {6, MVT::v2i32, 9}, // interleave 6 x 2i32 into 12i32 (and store)
6647 {6, MVT::v4i32, 12}, // interleave 6 x 4i32 into 24i32 (and store)
6648 {6, MVT::v8i32, 33}, // interleave 6 x 8i32 into 48i32 (and store)
6649 {6, MVT::v16i32, 66}, // interleave 6 x 16i32 into 96i32 (and store)
6650
6651 {6, MVT::v2i64, 8}, // interleave 6 x 2i64 into 12i64 (and store)
6652 {6, MVT::v4i64, 15}, // interleave 6 x 4i64 into 24i64 (and store)
6653 {6, MVT::v8i64, 30}, // interleave 6 x 8i64 into 48i64 (and store)
6654 };
6655
6656 static const CostTblEntry SSE2InterleavedStoreTbl[] = {
6657 {2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8 (and store)
6658 {2, MVT::v4i8, 1}, // interleave 2 x 4i8 into 8i8 (and store)
6659 {2, MVT::v8i8, 1}, // interleave 2 x 8i8 into 16i8 (and store)
6660
6661 {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16 (and store)
6662 {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16 (and store)
6663
6664 {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32 (and store)
6665 };
6666
6667 if (Opcode == Instruction::Load) {
6668 auto GetDiscountedCost = [Factor, NumMembers = Indices.size(),
6669 MemOpCosts](const CostTblEntry *Entry) {
6670 // NOTE: this is just an approximation!
6671 // It can over- or under-estimate the cost!
6672 return MemOpCosts + divideCeil(NumMembers * Entry->Cost, Factor);
6673 };
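    // For example, with values from the AVX2 load table above: Factor=4 and
    // per-member type v16i8 has a full shuffle-sequence cost of 24; if only
    // 2 of the 4 members are requested (Indices.size() == 2), this
    // approximation charges MemOpCosts + divideCeil(2 * 24, 4), i.e.
    // MemOpCosts + 12.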
6674
6675 if (ST->hasAVX2())
6676 if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor,
6677 ETy.getSimpleVT()))
6678 return GetDiscountedCost(Entry);
6679
6680 if (ST->hasSSSE3())
6681 if (const auto *Entry = CostTableLookup(SSSE3InterleavedLoadTbl, Factor,
6682 ETy.getSimpleVT()))
6683 return GetDiscountedCost(Entry);
6684
6685 if (ST->hasSSE2())
6686 if (const auto *Entry = CostTableLookup(SSE2InterleavedLoadTbl, Factor,
6687 ETy.getSimpleVT()))
6688 return GetDiscountedCost(Entry);
6689 } else {
6690 assert(Opcode == Instruction::Store &&
6691 "Expected Store Instruction at this point");
6692 assert((!Indices.size() || Indices.size() == Factor) &&
6693 "Interleaved store only supports fully-interleaved groups.");
6694 if (ST->hasAVX2())
6695 if (const auto *Entry = CostTableLookup(AVX2InterleavedStoreTbl, Factor,
6696 ETy.getSimpleVT()))
6697 return MemOpCosts + Entry->Cost;
6698
6699 if (ST->hasSSE2())
6700 if (const auto *Entry = CostTableLookup(SSE2InterleavedStoreTbl, Factor,
6701 ETy.getSimpleVT()))
6702 return MemOpCosts + Entry->Cost;
6703 }
6704
6705 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6706 Alignment, AddressSpace, CostKind,
6707 UseMaskForCond, UseMaskForGaps);
6708}
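// A hypothetical caller-side query of this hook (all names and values below
// are illustrative, not taken from this file) might look like:
//   InstructionCost C = TTI.getInterleavedMemoryOpCost(
//       Instruction::Load, FixedVectorType::get(Int32Ty, 12), /*Factor=*/3,
//       /*Indices=*/{0, 1, 2}, Align(4), /*AddressSpace=*/0,
//       TTI::TCK_RecipThroughput);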
6709
6710InstructionCost X86TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
6711 int64_t BaseOffset,
6712 bool HasBaseReg, int64_t Scale,
6713 unsigned AddrSpace) const {
6714 // Scaling factors are not free at all.
6715 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
6716 // will take 2 allocations in the out of order engine instead of 1
6717 // for plain addressing mode, i.e. inst (reg1).
6718 // E.g.,
6719 // vaddps (%rsi,%rdx), %ymm0, %ymm1
6720 // Requires two allocations (one for the load, one for the computation)
6721 // whereas:
6722 // vaddps (%rsi), %ymm0, %ymm1
6723 // Requires just 1 allocation, i.e., freeing an allocation for other operations
6724 // and leaving fewer micro operations to execute.
6725 //
6726 // For some X86 architectures, this is even worse because for instance for
6727 // stores, the complex addressing mode forces the instruction to use the
6728 // "load" ports instead of the dedicated "store" port.
6729 // E.g., on Haswell:
6730 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
6731 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
6732 TargetLoweringBase::AddrMode AM;
6733 AM.BaseGV = BaseGV;
6734 AM.BaseOffs = BaseOffset;
6735 AM.HasBaseReg = HasBaseReg;
6736 AM.Scale = Scale;
6737 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
6738 // Scale represents reg2 * scale, thus account for 1
6739 // as soon as we use a second register.
6740 return AM.Scale != 0;
6741 return -1;
6742}
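// Concretely: a legal base-register-only mode (e.g. (%rsi)) has Scale == 0 and
// costs 0, a legal scaled-index mode (e.g. (%rsi,%rdx,4)) costs 1, and an
// addressing mode the target cannot fold yields -1, which callers are expected
// to treat as "not supported".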