X86TargetTransformInfo.cpp
1//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements a TargetTransformInfo analysis pass specific to the
10/// X86 target machine. It uses the target's detailed information to provide
11/// more precise answers to certain TTI queries, while letting the target
12/// independent and default TTI implementations handle the rest.
13///
14//===----------------------------------------------------------------------===//
15/// A note about the cost model numbers used below: they correspond to some
16/// "generic" X86 CPU rather than to a specific CPU model. Usually the
17/// numbers correspond to the CPU where the feature first appeared. For
18/// example, if we check Subtarget.hasSSE42() in the lookups below, the cost
19/// is based on Nehalem, as that was the first CPU to support that feature
20/// level and thus most likely has the worst case cost, although we may
21/// discard an outlying worst cost from one CPU (e.g. Atom).
22///
23/// Some examples of other technologies/CPUs:
24/// SSE 3 - Pentium4 / Athlon64
25/// SSE 4.1 - Penryn
26/// SSE 4.2 - Nehalem / Silvermont
27/// AVX - Sandy Bridge / Jaguar / Bulldozer
28/// AVX2 - Haswell / Ryzen
29/// AVX-512 - Xeon Phi / Skylake
30///
31/// And some examples of instruction target-dependent costs (latency):
32/// divss sqrtss rsqrtss
33/// AMD K7 11-16 19 3
34/// Piledriver 9-24 13-15 5
35/// Jaguar 14 16 2
36/// Pentium II,III 18 30 2
37/// Nehalem 7-14 7-18 3
38/// Haswell 10-13 11 5
39///
40/// Interpreting the 4 TargetCostKind types:
41/// TCK_RecipThroughput and TCK_Latency should try to match the worst case
42/// values reported by the CPU scheduler models (and llvm-mca).
43/// TCK_CodeSize should match the instruction count (e.g. divss = 1), NOT the
44/// actual encoding size of the instruction.
45/// TCK_SizeAndLatency should match the worst case micro-op counts reported
46/// by the CPU scheduler models (and llvm-mca), to ensure that they are
47/// compatible with the MicroOpBufferSize and LoopMicroOpBufferSize values
48/// which are often used as the cost thresholds where TCK_SizeAndLatency is requested.
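/// As a worked example of reading the per-kind cost arrays below, the AVX512
/// table entry { ISD::FDIV, MVT::f32, { 3, 11, 1, 1 } } reads as: reciprocal
/// throughput 3, latency 11, 1 instruction for TCK_CodeSize and 1 micro-op
/// for TCK_SizeAndLatency, for a scalar f32 fdiv at that feature level.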
49//===----------------------------------------------------------------------===//
50
51#include "X86TargetTransformInfo.h"
52#include "llvm/Analysis/TargetTransformInfo.h"
53#include "llvm/CodeGen/BasicTTIImpl.h"
54#include "llvm/CodeGen/CostTable.h"
55#include "llvm/CodeGen/TargetLowering.h"
56#include "llvm/IR/InstIterator.h"
57#include "llvm/IR/IntrinsicInst.h"
58#include "llvm/Support/Debug.h"
59#include <optional>
60
61using namespace llvm;
62
63#define DEBUG_TYPE "x86tti"
64
65//===----------------------------------------------------------------------===//
66//
67// X86 cost model.
68//
69//===----------------------------------------------------------------------===//
70
71 // Helper struct to store/access costs for each cost kind.
72 // TODO: Move this to allow other targets to use it?
73 struct CostKindCosts {
74   unsigned RecipThroughputCost = ~0U;
75   unsigned LatencyCost = ~0U;
76   unsigned CodeSizeCost = ~0U;
77   unsigned SizeAndLatencyCost = ~0U;
78
79   std::optional<unsigned>
80   operator[](TargetTransformInfo::TargetCostKind Kind) const {
81     unsigned Cost = ~0U;
82     switch (Kind) {
83     case TargetTransformInfo::TCK_RecipThroughput:
84       Cost = RecipThroughputCost;
85       break;
86     case TargetTransformInfo::TCK_Latency:
87       Cost = LatencyCost;
88       break;
89     case TargetTransformInfo::TCK_CodeSize:
90       Cost = CodeSizeCost;
91       break;
92     case TargetTransformInfo::TCK_SizeAndLatency:
93       Cost = SizeAndLatencyCost;
94       break;
95     }
96     if (Cost == ~0U)
97       return std::nullopt;
98     return Cost;
99   }
100 };
101
102 using CostKindTblEntry = CostTblEntryT<CostKindCosts>;
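// Every cost table below is queried with the same idiom: legalize the IR
// type, look up the (ISD opcode, MVT) pair, and use the per-kind cost if the
// table provides one, scaled by the number of legalized operations. A minimal
// sketch of that recurring pattern (SomeCostTable stands in for any of the
// tables defined later in this file):
//
//   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
//   if (const auto *Entry = CostTableLookup(SomeCostTable, ISD, LT.second))
//     if (auto KindCost = Entry->Cost[CostKind])
//       return LT.first * *KindCost;
//
// Entry->Cost[CostKind] yields std::nullopt for cost kinds the table does not
// cover, which lets control fall through to later, more generic tables.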
103 TargetTransformInfo::PopcntSupportKind
104 X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
105   assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
106 // TODO: Currently the __builtin_popcount() implementation using SSE3
107 // instructions is inefficient. Once the problem is fixed, we should
108 // call ST->hasSSE3() instead of ST->hasPOPCNT().
109 return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
110}
111
112 std::optional<unsigned> X86TTIImpl::getCacheSize(
113     TargetTransformInfo::CacheLevel Level) const {
114   switch (Level) {
115   case TargetTransformInfo::CacheLevel::L1D:
116 // - Penryn
117 // - Nehalem
118 // - Westmere
119 // - Sandy Bridge
120 // - Ivy Bridge
121 // - Haswell
122 // - Broadwell
123 // - Skylake
124 // - Kabylake
125     return 32 * 1024; // 32 KByte
126   case TargetTransformInfo::CacheLevel::L2:
127     // - Penryn
128 // - Nehalem
129 // - Westmere
130 // - Sandy Bridge
131 // - Ivy Bridge
132 // - Haswell
133 // - Broadwell
134 // - Skylake
135 // - Kabylake
136 return 256 * 1024; // 256 KByte
137 }
138
139 llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
140}
141
142 std::optional<unsigned> X86TTIImpl::getCacheAssociativity(
143     TargetTransformInfo::CacheLevel Level) const {
144 // - Penryn
145 // - Nehalem
146 // - Westmere
147 // - Sandy Bridge
148 // - Ivy Bridge
149 // - Haswell
150 // - Broadwell
151 // - Skylake
152 // - Kabylake
153   switch (Level) {
154   case TargetTransformInfo::CacheLevel::L1D:
155     [[fallthrough]];
156   case TargetTransformInfo::CacheLevel::L2:
157     return 8;
158 }
159
160 llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
161}
162
163unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
164 bool Vector = (ClassID == 1);
165 if (Vector && !ST->hasSSE1())
166 return 0;
167
168 if (ST->is64Bit()) {
169 if (Vector && ST->hasAVX512())
170 return 32;
171 return 16;
172 }
173 return 8;
174}
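// E.g. a 64-bit target with AVX-512 reports 16 GPRs (ClassID 0) and 32 vector
// registers (ClassID 1, ZMM0-ZMM31); 32-bit targets report 8 registers for
// either class (or 0 vector registers without SSE1).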
175
176 TypeSize
177 X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
178   unsigned PreferVectorWidth = ST->getPreferVectorWidth();
179   switch (K) {
180   case TargetTransformInfo::RGK_Scalar:
181     return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
182   case TargetTransformInfo::RGK_FixedWidthVector:
183     if (ST->hasAVX512() && ST->hasEVEX512() && PreferVectorWidth >= 512)
184 return TypeSize::getFixed(512);
185 if (ST->hasAVX() && PreferVectorWidth >= 256)
186 return TypeSize::getFixed(256);
187 if (ST->hasSSE1() && PreferVectorWidth >= 128)
188 return TypeSize::getFixed(128);
189     return TypeSize::getFixed(0);
190   case TargetTransformInfo::RGK_ScalableVector:
191     return TypeSize::getScalable(0);
192 }
193
194 llvm_unreachable("Unsupported register kind");
195}
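// E.g. with AVX2 but a preferred vector width of 128 (say, via
// -mprefer-vector-width=128) this reports a 128-bit fixed vector register,
// steering the vectorizers towards XMM-sized vectorization factors even
// though wider YMM registers exist.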
196
197 unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
198   return getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
199       .getFixedValue();
200}
201
202 unsigned X86TTIImpl::getMaxInterleaveFactor(ElementCount VF) {
203   // If the loop will not be vectorized, don't interleave the loop.
204   // Let the regular unroller handle it instead, which saves the overflow
205   // check and memory check cost.
206 if (VF.isScalar())
207 return 1;
208
209 if (ST->isAtom())
210 return 1;
211
212 // Sandybridge and Haswell have multiple execution ports and pipelined
213 // vector units.
214 if (ST->hasAVX())
215 return 4;
216
217 return 2;
218}
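// E.g. once a loop is actually vectorized (VF > 1), AVX-capable cores report
// an interleave factor of 4, Atom reports 1, and other SSE cores report 2.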
219
220 InstructionCost X86TTIImpl::getArithmeticInstrCost(
221     unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
222     TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
223     ArrayRef<const Value *> Args,
224     const Instruction *CxtI) {
225
226 // vXi8 multiplications are always promoted to vXi16.
227 // Sub-128-bit types can be extended/packed more efficiently.
228 if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
229 Ty->getPrimitiveSizeInBits() <= 64 && Ty->getScalarSizeInBits() == 8) {
230 Type *WideVecTy =
231 VectorType::getExtendedElementVectorType(cast<VectorType>(Ty));
232     return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
233                             TTI::CastContextHint::None,
234                             CostKind) +
235            getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
236                             TTI::CastContextHint::None,
237                             CostKind) +
238 getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info);
239 }
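  // E.g. for the sub-128-bit case mul <8 x i8>, the cost above works out to
  //   Cost(zext v8i8 -> v8i16) + Cost(trunc v8i16 -> v8i8) + Cost(mul v8i16)
  // roughly mirroring the extend/multiply/pack sequence the backend emits.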
240
241 // Legalize the type.
242 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
243
244 int ISD = TLI->InstructionOpcodeToISD(Opcode);
245 assert(ISD && "Invalid opcode");
246
247 if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() &&
248 (LT.second.getScalarType() == MVT::i32 ||
249 LT.second.getScalarType() == MVT::i64)) {
250 // Check if the operands can be represented as a smaller datatype.
251 bool Op1Signed = false, Op2Signed = false;
252 unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
253 unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
254 unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
255 bool SignedMode = Op1Signed || Op2Signed;
256
257 // If both vXi32 are representable as i15 and at least one is constant,
258 // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we
259 // can treat this as PMADDWD which has the same costs as a vXi16 multiply.
260 if (OpMinSize <= 15 && !ST->isPMADDWDSlow() &&
261 LT.second.getScalarType() == MVT::i32) {
262 bool Op1Constant =
263 isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]);
264 bool Op2Constant =
265 isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]);
266 bool Op1Sext = isa<SExtInst>(Args[0]) &&
267 (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41()));
268 bool Op2Sext = isa<SExtInst>(Args[1]) &&
269 (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41()));
270
271 bool IsZeroExtended = !Op1Signed || !Op2Signed;
272 bool IsConstant = Op1Constant || Op2Constant;
273 bool IsSext = Op1Sext || Op2Sext;
274 if (IsConstant || IsZeroExtended || IsSext)
275 LT.second =
276 MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements());
277 }
278
279 // Check if the vXi32 operands can be shrunk into a smaller datatype.
280 // This should match the codegen from reduceVMULWidth.
281 // TODO: Make this generic (!ST->SSE41 || ST->isPMULLDSlow()).
282 if (ST->useSLMArithCosts() && LT.second == MVT::v4i32) {
283 if (OpMinSize <= 7)
284 return LT.first * 3; // pmullw/sext
285 if (!SignedMode && OpMinSize <= 8)
286 return LT.first * 3; // pmullw/zext
287 if (OpMinSize <= 15)
288 return LT.first * 5; // pmullw/pmulhw/pshuf
289 if (!SignedMode && OpMinSize <= 16)
290 return LT.first * 5; // pmullw/pmulhw/pshuf
291 }
292
293 // If both vXi64 are representable as (unsigned) i32, then we can perform
294     // the multiply with a single PMULUDQ instruction.
295 // TODO: Add (SSE41+) PMULDQ handling for signed extensions.
296 if (!SignedMode && OpMinSize <= 32 && LT.second.getScalarType() == MVT::i64)
297 ISD = X86ISD::PMULUDQ;
298 }
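  // E.g. a v2i64 multiply whose operands both zero-extend from i32 (or are
  // otherwise known to fit in 32 bits) is costed via X86ISD::PMULUDQ above,
  // matching the single pmuludq the backend can use instead of the full
  // 64-bit multiply sequence.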
299
300 // Vector multiply by pow2 will be simplified to shifts.
301 // Vector multiply by -pow2 will be simplified to shifts/negates.
302 if (ISD == ISD::MUL && Op2Info.isConstant() &&
303 (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2())) {
304     InstructionCost Cost =
305         getArithmeticInstrCost(Instruction::Shl, Ty, CostKind,
306 Op1Info.getNoProps(), Op2Info.getNoProps());
307 if (Op2Info.isNegatedPowerOf2())
308 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
309 return Cost;
310 }
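  // E.g. a multiply by the constant splat 8 is costed above as a single shl
  // by 3, and a multiply by -8 as the shl plus a subtract for the negation.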
311
312   // On X86, vector signed division by a power-of-two constant is
313   // normally expanded to the sequence SRA + SRL + ADD + SRA.
314 // The OperandValue properties may not be the same as that of the previous
315 // operation; conservatively assume OP_None.
316 if ((ISD == ISD::SDIV || ISD == ISD::SREM) &&
317 Op2Info.isConstant() && Op2Info.isPowerOf2()) {
318     InstructionCost Cost =
319         2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
320 Op1Info.getNoProps(), Op2Info.getNoProps());
321 Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
322 Op1Info.getNoProps(), Op2Info.getNoProps());
323 Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
324 Op1Info.getNoProps(), Op2Info.getNoProps());
325
326 if (ISD == ISD::SREM) {
327 // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
328 Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(),
329 Op2Info.getNoProps());
330 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info.getNoProps(),
331 Op2Info.getNoProps());
332 }
333
334 return Cost;
335 }
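  // E.g. vXi32 sdiv by a splat constant 4 is costed above as the expansion
  //   sign = ashr x, 31 ; bias = lshr sign, 30 ; tmp = add x, bias ;
  //   res = ashr tmp, 2
  // i.e. 2*AShr + LShr + Add, with SREM additionally paying for the multiply
  // and subtract needed for x - (x/C)*C.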
336
337 // Vector unsigned division/remainder will be simplified to shifts/masks.
338 if ((ISD == ISD::UDIV || ISD == ISD::UREM) &&
339 Op2Info.isConstant() && Op2Info.isPowerOf2()) {
340 if (ISD == ISD::UDIV)
341 return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
342 Op1Info.getNoProps(), Op2Info.getNoProps());
343 // UREM
344 return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
345 Op1Info.getNoProps(), Op2Info.getNoProps());
346 }
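  // E.g. vXi32 udiv by a splat constant 8 is costed as a single lshr by 3,
  // and urem by 8 as a single 'and' with the mask 7.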
347
348 static const CostKindTblEntry AVX512BWUniformConstCostTable[] = {
349 { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
350 { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
351 { ISD::SRA, MVT::v16i8, { 1, 8, 4, 5 } }, // psrlw, pand, pxor, psubb.
352 { ISD::SHL, MVT::v32i8, { 1, 8, 2, 3 } }, // psllw + pand.
353 { ISD::SRL, MVT::v32i8, { 1, 8, 2, 3 } }, // psrlw + pand.
354 { ISD::SRA, MVT::v32i8, { 1, 9, 4, 5 } }, // psrlw, pand, pxor, psubb.
355 { ISD::SHL, MVT::v64i8, { 1, 8, 2, 3 } }, // psllw + pand.
356 { ISD::SRL, MVT::v64i8, { 1, 8, 2, 3 } }, // psrlw + pand.
357 { ISD::SRA, MVT::v64i8, { 1, 9, 4, 6 } }, // psrlw, pand, pxor, psubb.
358
359 { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // psllw
360 { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw
361 { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // psraw
362 { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // psllw
363 { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw
364 { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // psraw
365 };
366
367 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasBWI())
368 if (const auto *Entry =
369 CostTableLookup(AVX512BWUniformConstCostTable, ISD, LT.second))
370 if (auto KindCost = Entry->Cost[CostKind])
371 return LT.first * *KindCost;
372
373 static const CostKindTblEntry AVX512UniformConstCostTable[] = {
374 { ISD::SHL, MVT::v64i8, { 2, 12, 5, 6 } }, // psllw + pand.
375 { ISD::SRL, MVT::v64i8, { 2, 12, 5, 6 } }, // psrlw + pand.
376 { ISD::SRA, MVT::v64i8, { 3, 10, 12, 12 } }, // psrlw, pand, pxor, psubb.
377
378 { ISD::SHL, MVT::v16i16, { 2, 7, 4, 4 } }, // psllw + split.
379 { ISD::SRL, MVT::v16i16, { 2, 7, 4, 4 } }, // psrlw + split.
380 { ISD::SRA, MVT::v16i16, { 2, 7, 4, 4 } }, // psraw + split.
381
382 { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } }, // pslld
383 { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } }, // psrld
384 { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } }, // psrad
385 { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } }, // pslld
386 { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } }, // psrld
387 { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } }, // psrad
388
389 { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } }, // psraq
390 { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } }, // psllq
391 { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } }, // psrlq
392 { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } }, // psraq
393 { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } }, // psllq
394 { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } }, // psrlq
395 { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } }, // psraq
396
397 { ISD::SDIV, MVT::v16i32, { 6 } }, // pmuludq sequence
398 { ISD::SREM, MVT::v16i32, { 8 } }, // pmuludq+mul+sub sequence
399 { ISD::UDIV, MVT::v16i32, { 5 } }, // pmuludq sequence
400 { ISD::UREM, MVT::v16i32, { 7 } }, // pmuludq+mul+sub sequence
401 };
402
403 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX512())
404 if (const auto *Entry =
405 CostTableLookup(AVX512UniformConstCostTable, ISD, LT.second))
406 if (auto KindCost = Entry->Cost[CostKind])
407 return LT.first * *KindCost;
408
409 static const CostKindTblEntry AVX2UniformConstCostTable[] = {
410 { ISD::SHL, MVT::v16i8, { 1, 8, 2, 3 } }, // psllw + pand.
411 { ISD::SRL, MVT::v16i8, { 1, 8, 2, 3 } }, // psrlw + pand.
412 { ISD::SRA, MVT::v16i8, { 2, 10, 5, 6 } }, // psrlw, pand, pxor, psubb.
413 { ISD::SHL, MVT::v32i8, { 2, 8, 2, 4 } }, // psllw + pand.
414 { ISD::SRL, MVT::v32i8, { 2, 8, 2, 4 } }, // psrlw + pand.
415 { ISD::SRA, MVT::v32i8, { 3, 10, 5, 9 } }, // psrlw, pand, pxor, psubb.
416
417 { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw
418 { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw
419 { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw
420 { ISD::SHL, MVT::v16i16,{ 2, 2, 1, 2 } }, // psllw
421 { ISD::SRL, MVT::v16i16,{ 2, 2, 1, 2 } }, // psrlw
422 { ISD::SRA, MVT::v16i16,{ 2, 2, 1, 2 } }, // psraw
423
424 { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
425 { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld
426 { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad
427 { ISD::SHL, MVT::v8i32, { 2, 2, 1, 2 } }, // pslld
428 { ISD::SRL, MVT::v8i32, { 2, 2, 1, 2 } }, // psrld
429 { ISD::SRA, MVT::v8i32, { 2, 2, 1, 2 } }, // psrad
430
431 { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq
432 { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq
433 { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle.
434 { ISD::SHL, MVT::v4i64, { 2, 2, 1, 2 } }, // psllq
435 { ISD::SRL, MVT::v4i64, { 2, 2, 1, 2 } }, // psrlq
436 { ISD::SRA, MVT::v4i64, { 4, 4, 3, 6 } }, // psrad + shuffle + split.
437
438 { ISD::SDIV, MVT::v8i32, { 6 } }, // pmuludq sequence
439 { ISD::SREM, MVT::v8i32, { 8 } }, // pmuludq+mul+sub sequence
440 { ISD::UDIV, MVT::v8i32, { 5 } }, // pmuludq sequence
441 { ISD::UREM, MVT::v8i32, { 7 } }, // pmuludq+mul+sub sequence
442 };
443
444 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX2())
445 if (const auto *Entry =
446 CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second))
447 if (auto KindCost = Entry->Cost[CostKind])
448 return LT.first * *KindCost;
449
450 static const CostKindTblEntry AVXUniformConstCostTable[] = {
451 { ISD::SHL, MVT::v16i8, { 2, 7, 2, 3 } }, // psllw + pand.
452 { ISD::SRL, MVT::v16i8, { 2, 7, 2, 3 } }, // psrlw + pand.
453 { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.
454 { ISD::SHL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psllw + pand) + split.
455 { ISD::SRL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psrlw + pand) + split.
456 { ISD::SRA, MVT::v32i8, { 7, 7, 12, 13 } }, // 2*(psrlw, pand, pxor, psubb) + split.
457
458 { ISD::SHL, MVT::v8i16, { 1, 2, 1, 1 } }, // psllw.
459 { ISD::SRL, MVT::v8i16, { 1, 2, 1, 1 } }, // psrlw.
460 { ISD::SRA, MVT::v8i16, { 1, 2, 1, 1 } }, // psraw.
461 { ISD::SHL, MVT::v16i16,{ 3, 6, 4, 5 } }, // psllw + split.
462 { ISD::SRL, MVT::v16i16,{ 3, 6, 4, 5 } }, // psrlw + split.
463 { ISD::SRA, MVT::v16i16,{ 3, 6, 4, 5 } }, // psraw + split.
464
465 { ISD::SHL, MVT::v4i32, { 1, 2, 1, 1 } }, // pslld.
466 { ISD::SRL, MVT::v4i32, { 1, 2, 1, 1 } }, // psrld.
467 { ISD::SRA, MVT::v4i32, { 1, 2, 1, 1 } }, // psrad.
468 { ISD::SHL, MVT::v8i32, { 3, 6, 4, 5 } }, // pslld + split.
469 { ISD::SRL, MVT::v8i32, { 3, 6, 4, 5 } }, // psrld + split.
470 { ISD::SRA, MVT::v8i32, { 3, 6, 4, 5 } }, // psrad + split.
471
472 { ISD::SHL, MVT::v2i64, { 1, 2, 1, 1 } }, // psllq.
473 { ISD::SRL, MVT::v2i64, { 1, 2, 1, 1 } }, // psrlq.
474 { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle.
475 { ISD::SHL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psllq + split.
476 { ISD::SRL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psrlq + split.
477 { ISD::SRA, MVT::v4i64, { 5, 7, 8, 9 } }, // 2 x psrad + shuffle + split.
478
479 { ISD::SDIV, MVT::v8i32, { 14 } }, // 2*pmuludq sequence + split.
480 { ISD::SREM, MVT::v8i32, { 18 } }, // 2*pmuludq+mul+sub sequence + split.
481 { ISD::UDIV, MVT::v8i32, { 12 } }, // 2*pmuludq sequence + split.
482 { ISD::UREM, MVT::v8i32, { 16 } }, // 2*pmuludq+mul+sub sequence + split.
483 };
484
485 // XOP has faster vXi8 shifts.
486 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX() &&
487 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
488 if (const auto *Entry =
489 CostTableLookup(AVXUniformConstCostTable, ISD, LT.second))
490 if (auto KindCost = Entry->Cost[CostKind])
491 return LT.first * *KindCost;
492
493 static const CostKindTblEntry SSE2UniformConstCostTable[] = {
494 { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
495 { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
496 { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.
497
498 { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw.
499 { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw.
500 { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw.
501
502 { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
503 { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld.
504 { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad.
505
506 { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq.
507 { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq.
508 { ISD::SRA, MVT::v2i64, { 3, 5, 6, 6 } }, // 2 x psrad + shuffle.
509
510 { ISD::SDIV, MVT::v4i32, { 6 } }, // pmuludq sequence
511 { ISD::SREM, MVT::v4i32, { 8 } }, // pmuludq+mul+sub sequence
512 { ISD::UDIV, MVT::v4i32, { 5 } }, // pmuludq sequence
513 { ISD::UREM, MVT::v4i32, { 7 } }, // pmuludq+mul+sub sequence
514 };
515
516 // XOP has faster vXi8 shifts.
517 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasSSE2() &&
518 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
519 if (const auto *Entry =
520 CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
521 if (auto KindCost = Entry->Cost[CostKind])
522 return LT.first * *KindCost;
523
524 static const CostKindTblEntry AVX512BWConstCostTable[] = {
525 { ISD::SDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence
526 { ISD::SREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
527 { ISD::UDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence
528 { ISD::UREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
529
530 { ISD::SDIV, MVT::v32i16, { 6 } }, // vpmulhw sequence
531 { ISD::SREM, MVT::v32i16, { 8 } }, // vpmulhw+mul+sub sequence
532 { ISD::UDIV, MVT::v32i16, { 6 } }, // vpmulhuw sequence
533 { ISD::UREM, MVT::v32i16, { 8 } }, // vpmulhuw+mul+sub sequence
534 };
535
536 if (Op2Info.isConstant() && ST->hasBWI())
537 if (const auto *Entry =
538 CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
539 if (auto KindCost = Entry->Cost[CostKind])
540 return LT.first * *KindCost;
541
542 static const CostKindTblEntry AVX512ConstCostTable[] = {
543 { ISD::SDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence
544 { ISD::SREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
545 { ISD::UDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence
546 { ISD::UREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
547
548 { ISD::SDIV, MVT::v32i16, { 12 } }, // 2*vpmulhw sequence
549 { ISD::SREM, MVT::v32i16, { 16 } }, // 2*vpmulhw+mul+sub sequence
550 { ISD::UDIV, MVT::v32i16, { 12 } }, // 2*vpmulhuw sequence
551 { ISD::UREM, MVT::v32i16, { 16 } }, // 2*vpmulhuw+mul+sub sequence
552
553 { ISD::SDIV, MVT::v16i32, { 15 } }, // vpmuldq sequence
554 { ISD::SREM, MVT::v16i32, { 17 } }, // vpmuldq+mul+sub sequence
555 { ISD::UDIV, MVT::v16i32, { 15 } }, // vpmuludq sequence
556 { ISD::UREM, MVT::v16i32, { 17 } }, // vpmuludq+mul+sub sequence
557 };
558
559 if (Op2Info.isConstant() && ST->hasAVX512())
560 if (const auto *Entry =
561 CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
562 if (auto KindCost = Entry->Cost[CostKind])
563 return LT.first * *KindCost;
564
565 static const CostKindTblEntry AVX2ConstCostTable[] = {
566 { ISD::SDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence
567 { ISD::SREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
568 { ISD::UDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence
569 { ISD::UREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
570
571 { ISD::SDIV, MVT::v16i16, { 6 } }, // vpmulhw sequence
572 { ISD::SREM, MVT::v16i16, { 8 } }, // vpmulhw+mul+sub sequence
573 { ISD::UDIV, MVT::v16i16, { 6 } }, // vpmulhuw sequence
574 { ISD::UREM, MVT::v16i16, { 8 } }, // vpmulhuw+mul+sub sequence
575
576 { ISD::SDIV, MVT::v8i32, { 15 } }, // vpmuldq sequence
577 { ISD::SREM, MVT::v8i32, { 19 } }, // vpmuldq+mul+sub sequence
578 { ISD::UDIV, MVT::v8i32, { 15 } }, // vpmuludq sequence
579 { ISD::UREM, MVT::v8i32, { 19 } }, // vpmuludq+mul+sub sequence
580 };
581
582 if (Op2Info.isConstant() && ST->hasAVX2())
583 if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
584 if (auto KindCost = Entry->Cost[CostKind])
585 return LT.first * *KindCost;
586
587 static const CostKindTblEntry AVXConstCostTable[] = {
588 { ISD::SDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split.
589 { ISD::SREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
590 { ISD::UDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split.
591 { ISD::UREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
592
593 { ISD::SDIV, MVT::v16i16, { 14 } }, // 2*pmulhw sequence + split.
594 { ISD::SREM, MVT::v16i16, { 18 } }, // 2*pmulhw+mul+sub sequence + split.
595 { ISD::UDIV, MVT::v16i16, { 14 } }, // 2*pmulhuw sequence + split.
596 { ISD::UREM, MVT::v16i16, { 18 } }, // 2*pmulhuw+mul+sub sequence + split.
597
598 { ISD::SDIV, MVT::v8i32, { 32 } }, // vpmuludq sequence
599 { ISD::SREM, MVT::v8i32, { 38 } }, // vpmuludq+mul+sub sequence
600 { ISD::UDIV, MVT::v8i32, { 32 } }, // 2*pmuludq sequence + split.
601 { ISD::UREM, MVT::v8i32, { 42 } }, // 2*pmuludq+mul+sub sequence + split.
602 };
603
604 if (Op2Info.isConstant() && ST->hasAVX())
605 if (const auto *Entry = CostTableLookup(AVXConstCostTable, ISD, LT.second))
606 if (auto KindCost = Entry->Cost[CostKind])
607 return LT.first * *KindCost;
608
609 static const CostKindTblEntry SSE41ConstCostTable[] = {
610 { ISD::SDIV, MVT::v4i32, { 15 } }, // vpmuludq sequence
611 { ISD::SREM, MVT::v4i32, { 20 } }, // vpmuludq+mul+sub sequence
612 };
613
614 if (Op2Info.isConstant() && ST->hasSSE41())
615 if (const auto *Entry =
616 CostTableLookup(SSE41ConstCostTable, ISD, LT.second))
617 if (auto KindCost = Entry->Cost[CostKind])
618 return LT.first * *KindCost;
619
620 static const CostKindTblEntry SSE2ConstCostTable[] = {
621 { ISD::SDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
622 { ISD::SREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
623 { ISD::UDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
624 { ISD::UREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
625
626 { ISD::SDIV, MVT::v8i16, { 6 } }, // pmulhw sequence
627 { ISD::SREM, MVT::v8i16, { 8 } }, // pmulhw+mul+sub sequence
628 { ISD::UDIV, MVT::v8i16, { 6 } }, // pmulhuw sequence
629 { ISD::UREM, MVT::v8i16, { 8 } }, // pmulhuw+mul+sub sequence
630
631 { ISD::SDIV, MVT::v4i32, { 19 } }, // pmuludq sequence
632 { ISD::SREM, MVT::v4i32, { 24 } }, // pmuludq+mul+sub sequence
633 { ISD::UDIV, MVT::v4i32, { 15 } }, // pmuludq sequence
634 { ISD::UREM, MVT::v4i32, { 20 } }, // pmuludq+mul+sub sequence
635 };
636
637 if (Op2Info.isConstant() && ST->hasSSE2())
638 if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
639 if (auto KindCost = Entry->Cost[CostKind])
640 return LT.first * *KindCost;
641
642 static const CostKindTblEntry AVX512BWUniformCostTable[] = {
643 { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand.
644 { ISD::SRL, MVT::v16i8, { 3,10, 5, 8 } }, // psrlw + pand.
645 { ISD::SRA, MVT::v16i8, { 4,12, 8,12 } }, // psrlw, pand, pxor, psubb.
646 { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand.
647 { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand.
648 { ISD::SRA, MVT::v32i8, { 5,10,10,13 } }, // psrlw, pand, pxor, psubb.
649 { ISD::SHL, MVT::v64i8, { 4, 7, 6, 8 } }, // psllw + pand.
650 { ISD::SRL, MVT::v64i8, { 4, 8, 7,10 } }, // psrlw + pand.
651 { ISD::SRA, MVT::v64i8, { 5,10,10,15 } }, // psrlw, pand, pxor, psubb.
652
653 { ISD::SHL, MVT::v32i16, { 2, 4, 2, 3 } }, // psllw
654 { ISD::SRL, MVT::v32i16, { 2, 4, 2, 3 } }, // psrlw
655 { ISD::SRA, MVT::v32i16, { 2, 4, 2, 3 } }, // psraw
656 };
657
658 if (ST->hasBWI() && Op2Info.isUniform())
659 if (const auto *Entry =
660 CostTableLookup(AVX512BWUniformCostTable, ISD, LT.second))
661 if (auto KindCost = Entry->Cost[CostKind])
662 return LT.first * *KindCost;
663
664 static const CostKindTblEntry AVX512UniformCostTable[] = {
665 { ISD::SHL, MVT::v32i16, { 5,10, 5, 7 } }, // psllw + split.
666 { ISD::SRL, MVT::v32i16, { 5,10, 5, 7 } }, // psrlw + split.
667 { ISD::SRA, MVT::v32i16, { 5,10, 5, 7 } }, // psraw + split.
668
669 { ISD::SHL, MVT::v16i32, { 2, 4, 2, 3 } }, // pslld
670 { ISD::SRL, MVT::v16i32, { 2, 4, 2, 3 } }, // psrld
671 { ISD::SRA, MVT::v16i32, { 2, 4, 2, 3 } }, // psrad
672
673 { ISD::SRA, MVT::v2i64, { 1, 2, 1, 2 } }, // psraq
674 { ISD::SHL, MVT::v4i64, { 1, 4, 1, 2 } }, // psllq
675 { ISD::SRL, MVT::v4i64, { 1, 4, 1, 2 } }, // psrlq
676 { ISD::SRA, MVT::v4i64, { 1, 4, 1, 2 } }, // psraq
677 { ISD::SHL, MVT::v8i64, { 1, 4, 1, 2 } }, // psllq
678 { ISD::SRL, MVT::v8i64, { 1, 4, 1, 2 } }, // psrlq
679 { ISD::SRA, MVT::v8i64, { 1, 4, 1, 2 } }, // psraq
680 };
681
682 if (ST->hasAVX512() && Op2Info.isUniform())
683 if (const auto *Entry =
684 CostTableLookup(AVX512UniformCostTable, ISD, LT.second))
685 if (auto KindCost = Entry->Cost[CostKind])
686 return LT.first * *KindCost;
687
688 static const CostKindTblEntry AVX2UniformCostTable[] = {
689 // Uniform splats are cheaper for the following instructions.
690 { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand.
691 { ISD::SRL, MVT::v16i8, { 3, 9, 5, 8 } }, // psrlw + pand.
692 { ISD::SRA, MVT::v16i8, { 4, 5, 9,13 } }, // psrlw, pand, pxor, psubb.
693 { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand.
694 { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand.
695 { ISD::SRA, MVT::v32i8, { 6, 9,11,16 } }, // psrlw, pand, pxor, psubb.
696
697 { ISD::SHL, MVT::v8i16, { 1, 2, 1, 2 } }, // psllw.
698 { ISD::SRL, MVT::v8i16, { 1, 2, 1, 2 } }, // psrlw.
699 { ISD::SRA, MVT::v8i16, { 1, 2, 1, 2 } }, // psraw.
700 { ISD::SHL, MVT::v16i16, { 2, 4, 2, 3 } }, // psllw.
701 { ISD::SRL, MVT::v16i16, { 2, 4, 2, 3 } }, // psrlw.
702 { ISD::SRA, MVT::v16i16, { 2, 4, 2, 3 } }, // psraw.
703
704 { ISD::SHL, MVT::v4i32, { 1, 2, 1, 2 } }, // pslld
705 { ISD::SRL, MVT::v4i32, { 1, 2, 1, 2 } }, // psrld
706 { ISD::SRA, MVT::v4i32, { 1, 2, 1, 2 } }, // psrad
707 { ISD::SHL, MVT::v8i32, { 2, 4, 2, 3 } }, // pslld
708 { ISD::SRL, MVT::v8i32, { 2, 4, 2, 3 } }, // psrld
709 { ISD::SRA, MVT::v8i32, { 2, 4, 2, 3 } }, // psrad
710
711 { ISD::SHL, MVT::v2i64, { 1, 2, 1, 2 } }, // psllq
712 { ISD::SRL, MVT::v2i64, { 1, 2, 1, 2 } }, // psrlq
713 { ISD::SRA, MVT::v2i64, { 2, 4, 5, 7 } }, // 2 x psrad + shuffle.
714 { ISD::SHL, MVT::v4i64, { 2, 4, 1, 2 } }, // psllq
715 { ISD::SRL, MVT::v4i64, { 2, 4, 1, 2 } }, // psrlq
716 { ISD::SRA, MVT::v4i64, { 4, 6, 5, 9 } }, // 2 x psrad + shuffle.
717 };
718
719 if (ST->hasAVX2() && Op2Info.isUniform())
720 if (const auto *Entry =
721 CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
722 if (auto KindCost = Entry->Cost[CostKind])
723 return LT.first * *KindCost;
724
725 static const CostKindTblEntry AVXUniformCostTable[] = {
726 { ISD::SHL, MVT::v16i8, { 4, 4, 6, 8 } }, // psllw + pand.
727 { ISD::SRL, MVT::v16i8, { 4, 8, 5, 8 } }, // psrlw + pand.
728 { ISD::SRA, MVT::v16i8, { 6, 6, 9,13 } }, // psrlw, pand, pxor, psubb.
729 { ISD::SHL, MVT::v32i8, { 7, 8,11,14 } }, // psllw + pand + split.
730 { ISD::SRL, MVT::v32i8, { 7, 9,10,14 } }, // psrlw + pand + split.
731 { ISD::SRA, MVT::v32i8, { 10,11,16,21 } }, // psrlw, pand, pxor, psubb + split.
732
733 { ISD::SHL, MVT::v8i16, { 1, 3, 1, 2 } }, // psllw.
734 { ISD::SRL, MVT::v8i16, { 1, 3, 1, 2 } }, // psrlw.
735 { ISD::SRA, MVT::v8i16, { 1, 3, 1, 2 } }, // psraw.
736 { ISD::SHL, MVT::v16i16, { 3, 7, 5, 7 } }, // psllw + split.
737 { ISD::SRL, MVT::v16i16, { 3, 7, 5, 7 } }, // psrlw + split.
738 { ISD::SRA, MVT::v16i16, { 3, 7, 5, 7 } }, // psraw + split.
739
740 { ISD::SHL, MVT::v4i32, { 1, 3, 1, 2 } }, // pslld.
741 { ISD::SRL, MVT::v4i32, { 1, 3, 1, 2 } }, // psrld.
742 { ISD::SRA, MVT::v4i32, { 1, 3, 1, 2 } }, // psrad.
743 { ISD::SHL, MVT::v8i32, { 3, 7, 5, 7 } }, // pslld + split.
744 { ISD::SRL, MVT::v8i32, { 3, 7, 5, 7 } }, // psrld + split.
745 { ISD::SRA, MVT::v8i32, { 3, 7, 5, 7 } }, // psrad + split.
746
747 { ISD::SHL, MVT::v2i64, { 1, 3, 1, 2 } }, // psllq.
748 { ISD::SRL, MVT::v2i64, { 1, 3, 1, 2 } }, // psrlq.
749 { ISD::SRA, MVT::v2i64, { 3, 4, 5, 7 } }, // 2 x psrad + shuffle.
750 { ISD::SHL, MVT::v4i64, { 3, 7, 4, 6 } }, // psllq + split.
751 { ISD::SRL, MVT::v4i64, { 3, 7, 4, 6 } }, // psrlq + split.
752 { ISD::SRA, MVT::v4i64, { 6, 7,10,13 } }, // 2 x (2 x psrad + shuffle) + split.
753 };
754
755 // XOP has faster vXi8 shifts.
756 if (ST->hasAVX() && Op2Info.isUniform() &&
757 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
758 if (const auto *Entry =
759 CostTableLookup(AVXUniformCostTable, ISD, LT.second))
760 if (auto KindCost = Entry->Cost[CostKind])
761 return LT.first * *KindCost;
762
763 static const CostKindTblEntry SSE2UniformCostTable[] = {
764 // Uniform splats are cheaper for the following instructions.
765 { ISD::SHL, MVT::v16i8, { 9, 10, 6, 9 } }, // psllw + pand.
766 { ISD::SRL, MVT::v16i8, { 9, 13, 5, 9 } }, // psrlw + pand.
767 { ISD::SRA, MVT::v16i8, { 11, 15, 9,13 } }, // pcmpgtb sequence.
768
769 { ISD::SHL, MVT::v8i16, { 2, 2, 1, 2 } }, // psllw.
770 { ISD::SRL, MVT::v8i16, { 2, 2, 1, 2 } }, // psrlw.
771 { ISD::SRA, MVT::v8i16, { 2, 2, 1, 2 } }, // psraw.
772
773 { ISD::SHL, MVT::v4i32, { 2, 2, 1, 2 } }, // pslld
774 { ISD::SRL, MVT::v4i32, { 2, 2, 1, 2 } }, // psrld.
775 { ISD::SRA, MVT::v4i32, { 2, 2, 1, 2 } }, // psrad.
776
777 { ISD::SHL, MVT::v2i64, { 2, 2, 1, 2 } }, // psllq.
778 { ISD::SRL, MVT::v2i64, { 2, 2, 1, 2 } }, // psrlq.
779 { ISD::SRA, MVT::v2i64, { 5, 9, 5, 7 } }, // 2*psrlq + xor + sub.
780 };
781
782 if (ST->hasSSE2() && Op2Info.isUniform() &&
783 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
784 if (const auto *Entry =
785 CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
786 if (auto KindCost = Entry->Cost[CostKind])
787 return LT.first * *KindCost;
788
789 static const CostKindTblEntry AVX512DQCostTable[] = {
790 { ISD::MUL, MVT::v2i64, { 2, 15, 1, 3 } }, // pmullq
791 { ISD::MUL, MVT::v4i64, { 2, 15, 1, 3 } }, // pmullq
792 { ISD::MUL, MVT::v8i64, { 3, 15, 1, 3 } } // pmullq
793 };
794
795 // Look for AVX512DQ lowering tricks for custom cases.
796 if (ST->hasDQI())
797 if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
798 if (auto KindCost = Entry->Cost[CostKind])
799 return LT.first * *KindCost;
800
801 static const CostKindTblEntry AVX512BWCostTable[] = {
802 { ISD::SHL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsllvw/pack sequence.
803 { ISD::SRL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsrlvw/pack sequence.
804 { ISD::SRA, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsravw/pack sequence.
805 { ISD::SHL, MVT::v32i8, { 4, 23,11,16 } }, // extend/vpsllvw/pack sequence.
806 { ISD::SRL, MVT::v32i8, { 4, 30,12,18 } }, // extend/vpsrlvw/pack sequence.
807 { ISD::SRA, MVT::v32i8, { 6, 13,24,30 } }, // extend/vpsravw/pack sequence.
808 { ISD::SHL, MVT::v64i8, { 6, 19,13,15 } }, // extend/vpsllvw/pack sequence.
809 { ISD::SRL, MVT::v64i8, { 7, 27,15,18 } }, // extend/vpsrlvw/pack sequence.
810 { ISD::SRA, MVT::v64i8, { 15, 15,30,30 } }, // extend/vpsravw/pack sequence.
811
812 { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsllvw
813 { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsrlvw
814 { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsravw
815 { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsllvw
816 { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsrlvw
817 { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsravw
818 { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsllvw
819 { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsrlvw
820 { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsravw
821
822 { ISD::ADD, MVT::v64i8, { 1, 1, 1, 1 } }, // paddb
823 { ISD::ADD, MVT::v32i16, { 1, 1, 1, 1 } }, // paddw
824
825 { ISD::ADD, MVT::v32i8, { 1, 1, 1, 1 } }, // paddb
826 { ISD::ADD, MVT::v16i16, { 1, 1, 1, 1 } }, // paddw
827 { ISD::ADD, MVT::v8i32, { 1, 1, 1, 1 } }, // paddd
828 { ISD::ADD, MVT::v4i64, { 1, 1, 1, 1 } }, // paddq
829
830 { ISD::SUB, MVT::v64i8, { 1, 1, 1, 1 } }, // psubb
831 { ISD::SUB, MVT::v32i16, { 1, 1, 1, 1 } }, // psubw
832
833 { ISD::MUL, MVT::v64i8, { 5, 10,10,11 } },
834 { ISD::MUL, MVT::v32i16, { 1, 5, 1, 1 } }, // pmullw
835
836 { ISD::SUB, MVT::v32i8, { 1, 1, 1, 1 } }, // psubb
837 { ISD::SUB, MVT::v16i16, { 1, 1, 1, 1 } }, // psubw
838 { ISD::SUB, MVT::v8i32, { 1, 1, 1, 1 } }, // psubd
839 { ISD::SUB, MVT::v4i64, { 1, 1, 1, 1 } }, // psubq
840 };
841
842 // Look for AVX512BW lowering tricks for custom cases.
843 if (ST->hasBWI())
844 if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
845 if (auto KindCost = Entry->Cost[CostKind])
846 return LT.first * *KindCost;
847
848 static const CostKindTblEntry AVX512CostTable[] = {
849 { ISD::SHL, MVT::v64i8, { 15, 19,27,33 } }, // vpblendv+split sequence.
850 { ISD::SRL, MVT::v64i8, { 15, 19,30,36 } }, // vpblendv+split sequence.
851 { ISD::SRA, MVT::v64i8, { 37, 37,51,63 } }, // vpblendv+split sequence.
852
853 { ISD::SHL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence.
854 { ISD::SRL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence.
855 { ISD::SRA, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsravd/pack sequence.
856
857 { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } },
858 { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } },
859 { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } },
860 { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } },
861 { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } },
862 { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } },
863 { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } },
864 { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } },
865 { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } },
866
867 { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } },
868 { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } },
869 { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } },
870 { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } },
871 { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } },
872 { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } },
873 { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } },
874 { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } },
875 { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } },
876
877 { ISD::ADD, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*paddb + split
878 { ISD::ADD, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*paddw + split
879
880 { ISD::SUB, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*psubb + split
881 { ISD::SUB, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*psubw + split
882
883 { ISD::AND, MVT::v32i8, { 1, 1, 1, 1 } },
884 { ISD::AND, MVT::v16i16, { 1, 1, 1, 1 } },
885 { ISD::AND, MVT::v8i32, { 1, 1, 1, 1 } },
886 { ISD::AND, MVT::v4i64, { 1, 1, 1, 1 } },
887
888 { ISD::OR, MVT::v32i8, { 1, 1, 1, 1 } },
889 { ISD::OR, MVT::v16i16, { 1, 1, 1, 1 } },
890 { ISD::OR, MVT::v8i32, { 1, 1, 1, 1 } },
891 { ISD::OR, MVT::v4i64, { 1, 1, 1, 1 } },
892
893 { ISD::XOR, MVT::v32i8, { 1, 1, 1, 1 } },
894 { ISD::XOR, MVT::v16i16, { 1, 1, 1, 1 } },
895 { ISD::XOR, MVT::v8i32, { 1, 1, 1, 1 } },
896 { ISD::XOR, MVT::v4i64, { 1, 1, 1, 1 } },
897
898 { ISD::MUL, MVT::v16i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
899 { ISD::MUL, MVT::v8i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
900 { ISD::MUL, MVT::v4i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
901 { ISD::MUL, MVT::v8i64, { 6, 9, 8, 8 } }, // 3*pmuludq/3*shift/2*add
902 { ISD::MUL, MVT::i64, { 1 } }, // Skylake from http://www.agner.org/
903
904 { X86ISD::PMULUDQ, MVT::v8i64, { 1, 5, 1, 1 } },
905
906 { ISD::FNEG, MVT::v8f64, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
907 { ISD::FADD, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
908 { ISD::FADD, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
909 { ISD::FSUB, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
910 { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
911 { ISD::FMUL, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
912 { ISD::FMUL, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
913 { ISD::FMUL, MVT::v2f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
914 { ISD::FMUL, MVT::f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
915
916 { ISD::FDIV, MVT::f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
917 { ISD::FDIV, MVT::v2f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
918 { ISD::FDIV, MVT::v4f64, { 8, 14, 1, 1 } }, // Skylake from http://www.agner.org/
919 { ISD::FDIV, MVT::v8f64, { 16, 23, 1, 3 } }, // Skylake from http://www.agner.org/
920
921 { ISD::FNEG, MVT::v16f32, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
922 { ISD::FADD, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
923 { ISD::FADD, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
924 { ISD::FSUB, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
925 { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
926 { ISD::FMUL, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
927 { ISD::FMUL, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
928 { ISD::FMUL, MVT::v4f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
929 { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
930
931 { ISD::FDIV, MVT::f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
932 { ISD::FDIV, MVT::v4f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
933 { ISD::FDIV, MVT::v8f32, { 5, 11, 1, 1 } }, // Skylake from http://www.agner.org/
934 { ISD::FDIV, MVT::v16f32, { 10, 18, 1, 3 } }, // Skylake from http://www.agner.org/
935 };
936
937 if (ST->hasAVX512())
938 if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
939 if (auto KindCost = Entry->Cost[CostKind])
940 return LT.first * *KindCost;
941
942 static const CostKindTblEntry AVX2ShiftCostTable[] = {
943     // vXi64/vXi32 shifts are legal on AVX2, even though we mark them as
944     // custom in order to detect the cases where the shift amount is a scalar.
945 { ISD::SHL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsllvd (Haswell from agner.org)
946 { ISD::SRL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
947 { ISD::SRA, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsravd (Haswell from agner.org)
948 { ISD::SHL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsllvd (Haswell from agner.org)
949 { ISD::SRL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
950 { ISD::SRA, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsravd (Haswell from agner.org)
951 { ISD::SHL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsllvq (Haswell from agner.org)
952 { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsrlvq (Haswell from agner.org)
953 { ISD::SHL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsllvq (Haswell from agner.org)
954 { ISD::SRL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsrlvq (Haswell from agner.org)
955 };
956
957 if (ST->hasAVX512()) {
958 if (ISD == ISD::SHL && LT.second == MVT::v32i16 && Op2Info.isConstant())
959 // On AVX512, a packed v32i16 shift left by a constant build_vector
960 // is lowered into a vector multiply (vpmullw).
961 return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
962 Op1Info.getNoProps(), Op2Info.getNoProps());
963 }
964
965 // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
966 if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
967 if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
968 Op2Info.isConstant())
969 // On AVX2, a packed v16i16 shift left by a constant build_vector
970 // is lowered into a vector multiply (vpmullw).
971 return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
972 Op1Info.getNoProps(), Op2Info.getNoProps());
973
974 if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
975 if (auto KindCost = Entry->Cost[CostKind])
976 return LT.first * *KindCost;
977 }
978
979 static const CostKindTblEntry XOPShiftCostTable[] = {
980 // 128bit shifts take 1cy, but right shifts require negation beforehand.
981 { ISD::SHL, MVT::v16i8, { 1, 3, 1, 1 } },
982 { ISD::SRL, MVT::v16i8, { 2, 3, 1, 1 } },
983 { ISD::SRA, MVT::v16i8, { 2, 3, 1, 1 } },
984 { ISD::SHL, MVT::v8i16, { 1, 3, 1, 1 } },
985 { ISD::SRL, MVT::v8i16, { 2, 3, 1, 1 } },
986 { ISD::SRA, MVT::v8i16, { 2, 3, 1, 1 } },
987 { ISD::SHL, MVT::v4i32, { 1, 3, 1, 1 } },
988 { ISD::SRL, MVT::v4i32, { 2, 3, 1, 1 } },
989 { ISD::SRA, MVT::v4i32, { 2, 3, 1, 1 } },
990 { ISD::SHL, MVT::v2i64, { 1, 3, 1, 1 } },
991 { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } },
992 { ISD::SRA, MVT::v2i64, { 2, 3, 1, 1 } },
993 // 256bit shifts require splitting if AVX2 didn't catch them above.
994 { ISD::SHL, MVT::v32i8, { 4, 7, 5, 6 } },
995 { ISD::SRL, MVT::v32i8, { 6, 7, 5, 6 } },
996 { ISD::SRA, MVT::v32i8, { 6, 7, 5, 6 } },
997 { ISD::SHL, MVT::v16i16, { 4, 7, 5, 6 } },
998 { ISD::SRL, MVT::v16i16, { 6, 7, 5, 6 } },
999 { ISD::SRA, MVT::v16i16, { 6, 7, 5, 6 } },
1000 { ISD::SHL, MVT::v8i32, { 4, 7, 5, 6 } },
1001 { ISD::SRL, MVT::v8i32, { 6, 7, 5, 6 } },
1002 { ISD::SRA, MVT::v8i32, { 6, 7, 5, 6 } },
1003 { ISD::SHL, MVT::v4i64, { 4, 7, 5, 6 } },
1004 { ISD::SRL, MVT::v4i64, { 6, 7, 5, 6 } },
1005 { ISD::SRA, MVT::v4i64, { 6, 7, 5, 6 } },
1006 };
1007
1008 // Look for XOP lowering tricks.
1009 if (ST->hasXOP()) {
1010 // If the right shift is constant then we'll fold the negation so
1011 // it's as cheap as a left shift.
1012 int ShiftISD = ISD;
1013 if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) && Op2Info.isConstant())
1014 ShiftISD = ISD::SHL;
1015 if (const auto *Entry =
1016 CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
1017 if (auto KindCost = Entry->Cost[CostKind])
1018 return LT.first * *KindCost;
1019 }
1020
1021 if (ISD == ISD::SHL && !Op2Info.isUniform() && Op2Info.isConstant()) {
1022 MVT VT = LT.second;
1023     // A vector shift left by a non-uniform constant can be lowered
1024     // into a vector multiply.
1025 if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
1026 ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
1027 ISD = ISD::MUL;
1028 }
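  // E.g. shl <4 x i32> %x, <1, 2, 3, 4> is costed below as a multiply by
  // <2, 4, 8, 16>, matching how such shifts are lowered once a legal vector
  // multiply of that width is available.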
1029
1030 static const CostKindTblEntry GLMCostTable[] = {
1031 { ISD::FDIV, MVT::f32, { 18, 19, 1, 1 } }, // divss
1032 { ISD::FDIV, MVT::v4f32, { 35, 36, 1, 1 } }, // divps
1033 { ISD::FDIV, MVT::f64, { 33, 34, 1, 1 } }, // divsd
1034 { ISD::FDIV, MVT::v2f64, { 65, 66, 1, 1 } }, // divpd
1035 };
1036
1037 if (ST->useGLMDivSqrtCosts())
1038 if (const auto *Entry = CostTableLookup(GLMCostTable, ISD, LT.second))
1039 if (auto KindCost = Entry->Cost[CostKind])
1040 return LT.first * *KindCost;
1041
1042 static const CostKindTblEntry SLMCostTable[] = {
1043 { ISD::MUL, MVT::v4i32, { 11, 11, 1, 7 } }, // pmulld
1044 { ISD::MUL, MVT::v8i16, { 2, 5, 1, 1 } }, // pmullw
1045 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // mulsd
1046 { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // mulss
1047 { ISD::FMUL, MVT::v2f64, { 4, 7, 1, 1 } }, // mulpd
1048 { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // mulps
1049 { ISD::FDIV, MVT::f32, { 17, 19, 1, 1 } }, // divss
1050 { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 6 } }, // divps
1051 { ISD::FDIV, MVT::f64, { 32, 34, 1, 1 } }, // divsd
1052 { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 6 } }, // divpd
1053 { ISD::FADD, MVT::v2f64, { 2, 4, 1, 1 } }, // addpd
1054 { ISD::FSUB, MVT::v2f64, { 2, 4, 1, 1 } }, // subpd
1055 // v2i64/v4i64 mul is custom lowered as a series of long:
1056 // multiplies(3), shifts(3) and adds(2)
1057 // slm muldq version throughput is 2 and addq throughput 4
1058 // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
1059     // 2X4 (addq throughput) = 17
1060 { ISD::MUL, MVT::v2i64, { 17, 22, 9, 9 } },
1061     // slm addq/subq throughput is 4
1062 { ISD::ADD, MVT::v2i64, { 4, 2, 1, 2 } },
1063 { ISD::SUB, MVT::v2i64, { 4, 2, 1, 2 } },
1064 };
1065
1066 if (ST->useSLMArithCosts())
1067 if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, LT.second))
1068 if (auto KindCost = Entry->Cost[CostKind])
1069 return LT.first * *KindCost;
1070
1071 static const CostKindTblEntry AVX2CostTable[] = {
1072 { ISD::SHL, MVT::v16i8, { 6, 21,11,16 } }, // vpblendvb sequence.
1073 { ISD::SHL, MVT::v32i8, { 6, 23,11,22 } }, // vpblendvb sequence.
1074 { ISD::SHL, MVT::v8i16, { 5, 18, 5,10 } }, // extend/vpsrlvd/pack sequence.
1075 { ISD::SHL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsrlvd/pack sequence.
1076
1077 { ISD::SRL, MVT::v16i8, { 6, 27,12,18 } }, // vpblendvb sequence.
1078 { ISD::SRL, MVT::v32i8, { 8, 30,12,24 } }, // vpblendvb sequence.
1079 { ISD::SRL, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsrlvd/pack sequence.
1080 { ISD::SRL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsrlvd/pack sequence.
1081
1082 { ISD::SRA, MVT::v16i8, { 17, 17,24,30 } }, // vpblendvb sequence.
1083 { ISD::SRA, MVT::v32i8, { 18, 20,24,43 } }, // vpblendvb sequence.
1084 { ISD::SRA, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsravd/pack sequence.
1085 { ISD::SRA, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsravd/pack sequence.
1086 { ISD::SRA, MVT::v2i64, { 4, 5, 5, 5 } }, // srl/xor/sub sequence.
1087 { ISD::SRA, MVT::v4i64, { 8, 8, 5, 9 } }, // srl/xor/sub sequence.
1088
1089 { ISD::SUB, MVT::v32i8, { 1, 1, 1, 2 } }, // psubb
1090 { ISD::ADD, MVT::v32i8, { 1, 1, 1, 2 } }, // paddb
1091 { ISD::SUB, MVT::v16i16, { 1, 1, 1, 2 } }, // psubw
1092 { ISD::ADD, MVT::v16i16, { 1, 1, 1, 2 } }, // paddw
1093 { ISD::SUB, MVT::v8i32, { 1, 1, 1, 2 } }, // psubd
1094 { ISD::ADD, MVT::v8i32, { 1, 1, 1, 2 } }, // paddd
1095 { ISD::SUB, MVT::v4i64, { 1, 1, 1, 2 } }, // psubq
1096 { ISD::ADD, MVT::v4i64, { 1, 1, 1, 2 } }, // paddq
1097
1098 { ISD::MUL, MVT::v16i8, { 5, 18, 6,12 } }, // extend/pmullw/pack
1099 { ISD::MUL, MVT::v32i8, { 6, 11,10,19 } }, // unpack/pmullw
1100 { ISD::MUL, MVT::v16i16, { 2, 5, 1, 2 } }, // pmullw
1101 { ISD::MUL, MVT::v8i32, { 4, 10, 1, 2 } }, // pmulld
1102 { ISD::MUL, MVT::v4i32, { 2, 10, 1, 2 } }, // pmulld
1103 { ISD::MUL, MVT::v4i64, { 6, 10, 8,13 } }, // 3*pmuludq/3*shift/2*add
1104 { ISD::MUL, MVT::v2i64, { 6, 10, 8, 8 } }, // 3*pmuludq/3*shift/2*add
1105
1106 { X86ISD::PMULUDQ, MVT::v4i64, { 1, 5, 1, 1 } },
1107
1108 { ISD::FNEG, MVT::v4f64, { 1, 1, 1, 2 } }, // vxorpd
1109 { ISD::FNEG, MVT::v8f32, { 1, 1, 1, 2 } }, // vxorps
1110
1111 { ISD::FADD, MVT::f64, { 1, 4, 1, 1 } }, // vaddsd
1112 { ISD::FADD, MVT::f32, { 1, 4, 1, 1 } }, // vaddss
1113 { ISD::FADD, MVT::v2f64, { 1, 4, 1, 1 } }, // vaddpd
1114 { ISD::FADD, MVT::v4f32, { 1, 4, 1, 1 } }, // vaddps
1115 { ISD::FADD, MVT::v4f64, { 1, 4, 1, 2 } }, // vaddpd
1116 { ISD::FADD, MVT::v8f32, { 1, 4, 1, 2 } }, // vaddps
1117
1118 { ISD::FSUB, MVT::f64, { 1, 4, 1, 1 } }, // vsubsd
1119 { ISD::FSUB, MVT::f32, { 1, 4, 1, 1 } }, // vsubss
1120 { ISD::FSUB, MVT::v2f64, { 1, 4, 1, 1 } }, // vsubpd
1121 { ISD::FSUB, MVT::v4f32, { 1, 4, 1, 1 } }, // vsubps
1122 { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 2 } }, // vsubpd
1123 { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 2 } }, // vsubps
1124
1125 { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // vmulsd
1126 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // vmulss
1127 { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // vmulpd
1128 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // vmulps
1129 { ISD::FMUL, MVT::v4f64, { 1, 5, 1, 2 } }, // vmulpd
1130 { ISD::FMUL, MVT::v8f32, { 1, 5, 1, 2 } }, // vmulps
1131
1132 { ISD::FDIV, MVT::f32, { 7, 13, 1, 1 } }, // vdivss
1133 { ISD::FDIV, MVT::v4f32, { 7, 13, 1, 1 } }, // vdivps
1134 { ISD::FDIV, MVT::v8f32, { 14, 21, 1, 3 } }, // vdivps
1135 { ISD::FDIV, MVT::f64, { 14, 20, 1, 1 } }, // vdivsd
1136 { ISD::FDIV, MVT::v2f64, { 14, 20, 1, 1 } }, // vdivpd
1137 { ISD::FDIV, MVT::v4f64, { 28, 35, 1, 3 } }, // vdivpd
1138 };
1139
1140 // Look for AVX2 lowering tricks for custom cases.
1141 if (ST->hasAVX2())
1142 if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
1143 if (auto KindCost = Entry->Cost[CostKind])
1144 return LT.first * *KindCost;
1145
1146 static const CostKindTblEntry AVX1CostTable[] = {
1147 // We don't have to scalarize unsupported ops. We can issue two half-sized
1148 // operations and we only need to extract the upper YMM half.
1149 // Two ops + 1 extract + 1 insert = 4.
1150 { ISD::MUL, MVT::v32i8, { 12, 13, 22, 23 } }, // unpack/pmullw + split
1151 { ISD::MUL, MVT::v16i16, { 4, 8, 5, 6 } }, // pmullw + split
1152 { ISD::MUL, MVT::v8i32, { 5, 8, 5, 10 } }, // pmulld + split
1153 { ISD::MUL, MVT::v4i32, { 2, 5, 1, 3 } }, // pmulld
1154 { ISD::MUL, MVT::v4i64, { 12, 15, 19, 20 } },
1155
1156 { ISD::AND, MVT::v32i8, { 1, 1, 1, 2 } }, // vandps
1157 { ISD::AND, MVT::v16i16, { 1, 1, 1, 2 } }, // vandps
1158 { ISD::AND, MVT::v8i32, { 1, 1, 1, 2 } }, // vandps
1159 { ISD::AND, MVT::v4i64, { 1, 1, 1, 2 } }, // vandps
1160
1161 { ISD::OR, MVT::v32i8, { 1, 1, 1, 2 } }, // vorps
1162 { ISD::OR, MVT::v16i16, { 1, 1, 1, 2 } }, // vorps
1163 { ISD::OR, MVT::v8i32, { 1, 1, 1, 2 } }, // vorps
1164 { ISD::OR, MVT::v4i64, { 1, 1, 1, 2 } }, // vorps
1165
1166 { ISD::XOR, MVT::v32i8, { 1, 1, 1, 2 } }, // vxorps
1167 { ISD::XOR, MVT::v16i16, { 1, 1, 1, 2 } }, // vxorps
1168 { ISD::XOR, MVT::v8i32, { 1, 1, 1, 2 } }, // vxorps
1169 { ISD::XOR, MVT::v4i64, { 1, 1, 1, 2 } }, // vxorps
1170
1171 { ISD::SUB, MVT::v32i8, { 4, 2, 5, 6 } }, // psubb + split
1172 { ISD::ADD, MVT::v32i8, { 4, 2, 5, 6 } }, // paddb + split
1173 { ISD::SUB, MVT::v16i16, { 4, 2, 5, 6 } }, // psubw + split
1174 { ISD::ADD, MVT::v16i16, { 4, 2, 5, 6 } }, // paddw + split
1175 { ISD::SUB, MVT::v8i32, { 4, 2, 5, 6 } }, // psubd + split
1176 { ISD::ADD, MVT::v8i32, { 4, 2, 5, 6 } }, // paddd + split
1177 { ISD::SUB, MVT::v4i64, { 4, 2, 5, 6 } }, // psubq + split
1178 { ISD::ADD, MVT::v4i64, { 4, 2, 5, 6 } }, // paddq + split
1179 { ISD::SUB, MVT::v2i64, { 1, 1, 1, 1 } }, // psubq
1180 { ISD::ADD, MVT::v2i64, { 1, 1, 1, 1 } }, // paddq
1181
1182 { ISD::SHL, MVT::v16i8, { 10, 21,11,17 } }, // pblendvb sequence.
1183 { ISD::SHL, MVT::v32i8, { 22, 22,27,40 } }, // pblendvb sequence + split.
1184 { ISD::SHL, MVT::v8i16, { 6, 9,11,11 } }, // pblendvb sequence.
1185 { ISD::SHL, MVT::v16i16, { 13, 16,24,25 } }, // pblendvb sequence + split.
1186 { ISD::SHL, MVT::v4i32, { 3, 11, 4, 6 } }, // pslld/paddd/cvttps2dq/pmulld
1187 { ISD::SHL, MVT::v8i32, { 9, 11,12,17 } }, // pslld/paddd/cvttps2dq/pmulld + split
1188 { ISD::SHL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
1189 { ISD::SHL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.
1190
1191 { ISD::SRL, MVT::v16i8, { 11, 27,12,18 } }, // pblendvb sequence.
1192 { ISD::SRL, MVT::v32i8, { 23, 23,30,43 } }, // pblendvb sequence + split.
1193 { ISD::SRL, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
1194 { ISD::SRL, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
1195 { ISD::SRL, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
1196 { ISD::SRL, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
1197 { ISD::SRL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
1198 { ISD::SRL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.
1199
1200 { ISD::SRA, MVT::v16i8, { 21, 22,24,36 } }, // pblendvb sequence.
1201 { ISD::SRA, MVT::v32i8, { 44, 45,51,76 } }, // pblendvb sequence + split.
1202 { ISD::SRA, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
1203 { ISD::SRA, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
1204 { ISD::SRA, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
1205 { ISD::SRA, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
1206 { ISD::SRA, MVT::v2i64, { 5, 6,10,14 } }, // Shift each lane + blend.
1207 { ISD::SRA, MVT::v4i64, { 12, 12,22,30 } }, // Shift each lane + blend + split.
1208
1209 { ISD::FNEG, MVT::v4f64, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
1210 { ISD::FNEG, MVT::v8f32, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
1211
1212 { ISD::FADD, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1213 { ISD::FADD, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1214 { ISD::FADD, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1215 { ISD::FADD, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1216 { ISD::FADD, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1217 { ISD::FADD, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1218
1219 { ISD::FSUB, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1220 { ISD::FSUB, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1221 { ISD::FSUB, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1222 { ISD::FSUB, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1223 { ISD::FSUB, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1224 { ISD::FSUB, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1225
1226 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1227 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1228 { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1229 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1230 { ISD::FMUL, MVT::v4f64, { 4, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
1231 { ISD::FMUL, MVT::v8f32, { 2, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
1232
1233 { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
1234 { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
1235 { ISD::FDIV, MVT::v8f32, { 28, 29, 1, 3 } }, // SNB from http://www.agner.org/
1236 { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
1237 { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
1238 { ISD::FDIV, MVT::v4f64, { 44, 45, 1, 3 } }, // SNB from http://www.agner.org/
1239 };
1240
1241 if (ST->hasAVX())
1242 if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
1243 if (auto KindCost = Entry->Cost[CostKind])
1244 return LT.first * *KindCost;
1245
1246 static const CostKindTblEntry SSE42CostTable[] = {
1247 { ISD::FADD, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1248 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1249 { ISD::FADD, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1250 { ISD::FADD, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1251
1252 { ISD::FSUB, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1253 { ISD::FSUB, MVT::f32 , { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1254 { ISD::FSUB, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1255 { ISD::FSUB, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1256
1257 { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1258 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1259 { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1260 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1261
1262 { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1263 { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1264 { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1265 { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1266
1267 { ISD::MUL, MVT::v2i64, { 6, 10,10,10 } } // 3*pmuludq/3*shift/2*add
1268 };
1269
1270 if (ST->hasSSE42())
1271 if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
1272 if (auto KindCost = Entry->Cost[CostKind])
1273 return LT.first * *KindCost;
1274
1275 static const CostKindTblEntry SSE41CostTable[] = {
1276 { ISD::SHL, MVT::v16i8, { 15, 24,17,22 } }, // pblendvb sequence.
1277 { ISD::SHL, MVT::v8i16, { 11, 14,11,11 } }, // pblendvb sequence.
1278 { ISD::SHL, MVT::v4i32, { 14, 20, 4,10 } }, // pslld/paddd/cvttps2dq/pmulld
1279
1280 { ISD::SRL, MVT::v16i8, { 16, 27,18,24 } }, // pblendvb sequence.
1281 { ISD::SRL, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1282 { ISD::SRL, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1283 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1284
1285 { ISD::SRA, MVT::v16i8, { 38, 41,30,36 } }, // pblendvb sequence.
1286 { ISD::SRA, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1287 { ISD::SRA, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1288 { ISD::SRA, MVT::v2i64, { 8, 17, 5, 7 } }, // splat+shuffle sequence.
1289
1290 { ISD::MUL, MVT::v16i8, { 5, 18,10,12 } }, // 2*unpack/2*pmullw/2*and/pack
1291 { ISD::MUL, MVT::v4i32, { 2, 11, 1, 1 } } // pmulld (Nehalem from agner.org)
1292 };
1293
1294 if (ST->hasSSE41())
1295 if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
1296 if (auto KindCost = Entry->Cost[CostKind])
1297 return LT.first * *KindCost;
1298
1299 static const CostKindTblEntry SSE2CostTable[] = {
1300 // We don't correctly identify costs of casts because they are marked as
1301 // custom.
1302 { ISD::SHL, MVT::v16i8, { 13, 21,26,28 } }, // cmpgtb sequence.
1303 { ISD::SHL, MVT::v8i16, { 24, 27,16,20 } }, // cmpgtw sequence.
1304 { ISD::SHL, MVT::v4i32, { 17, 19,10,12 } }, // pslld/paddd/cvttps2dq/pmuludq.
1305 { ISD::SHL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1306
1307 { ISD::SRL, MVT::v16i8, { 14, 28,27,30 } }, // cmpgtb sequence.
1308 { ISD::SRL, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1309 { ISD::SRL, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1310 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1311
1312 { ISD::SRA, MVT::v16i8, { 27, 30,54,54 } }, // unpacked cmpgtb sequence.
1313 { ISD::SRA, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1314 { ISD::SRA, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1315 { ISD::SRA, MVT::v2i64, { 8, 11,12,16 } }, // srl/xor/sub splat+shuffle sequence.
1316
1317 { ISD::AND, MVT::v16i8, { 1, 1, 1, 1 } }, // pand
1318 { ISD::AND, MVT::v8i16, { 1, 1, 1, 1 } }, // pand
1319 { ISD::AND, MVT::v4i32, { 1, 1, 1, 1 } }, // pand
1320 { ISD::AND, MVT::v2i64, { 1, 1, 1, 1 } }, // pand
1321
1322 { ISD::OR, MVT::v16i8, { 1, 1, 1, 1 } }, // por
1323 { ISD::OR, MVT::v8i16, { 1, 1, 1, 1 } }, // por
1324 { ISD::OR, MVT::v4i32, { 1, 1, 1, 1 } }, // por
1325 { ISD::OR, MVT::v2i64, { 1, 1, 1, 1 } }, // por
1326
1327 { ISD::XOR, MVT::v16i8, { 1, 1, 1, 1 } }, // pxor
1328 { ISD::XOR, MVT::v8i16, { 1, 1, 1, 1 } }, // pxor
1329 { ISD::XOR, MVT::v4i32, { 1, 1, 1, 1 } }, // pxor
1330 { ISD::XOR, MVT::v2i64, { 1, 1, 1, 1 } }, // pxor
1331
1332 { ISD::ADD, MVT::v2i64, { 1, 2, 1, 2 } }, // paddq
1333 { ISD::SUB, MVT::v2i64, { 1, 2, 1, 2 } }, // psubq
1334
1335 { ISD::MUL, MVT::v16i8, { 5, 18,12,12 } }, // 2*unpack/2*pmullw/2*and/pack
1336 { ISD::MUL, MVT::v8i16, { 1, 5, 1, 1 } }, // pmullw
1337 { ISD::MUL, MVT::v4i32, { 6, 8, 7, 7 } }, // 3*pmuludq/4*shuffle
1338 { ISD::MUL, MVT::v2i64, { 7, 10,10,10 } }, // 3*pmuludq/3*shift/2*add
1339
1340 { X86ISD::PMULUDQ, MVT::v2i64, { 1, 5, 1, 1 } },
1341
1342 { ISD::FDIV, MVT::f32, { 23, 23, 1, 1 } }, // Pentium IV from http://www.agner.org/
1343 { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 1 } }, // Pentium IV from http://www.agner.org/
1344 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // Pentium IV from http://www.agner.org/
1345 { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 1 } }, // Pentium IV from http://www.agner.org/
1346
1347 { ISD::FNEG, MVT::f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1348 { ISD::FNEG, MVT::f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1349 { ISD::FNEG, MVT::v4f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1350 { ISD::FNEG, MVT::v2f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1351
1352 { ISD::FADD, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1353 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1354 { ISD::FADD, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1355
1356 { ISD::FSUB, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1357 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1358 { ISD::FSUB, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1359
1360 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1361 { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1362 };
1363
1364 if (ST->hasSSE2())
1365 if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
1366 if (auto KindCost = Entry->Cost[CostKind])
1367 return LT.first * *KindCost;
1368
1369 static const CostKindTblEntry SSE1CostTable[] = {
1370 { ISD::FDIV, MVT::f32, { 17, 18, 1, 1 } }, // Pentium III from http://www.agner.org/
1371 { ISD::FDIV, MVT::v4f32, { 34, 48, 1, 1 } }, // Pentium III from http://www.agner.org/
1372
1373 { ISD::FNEG, MVT::f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1374 { ISD::FNEG, MVT::v4f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1375
1376 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1377 { ISD::FADD, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1378
1379 { ISD::FSUB, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1380 { ISD::FSUB, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1381
1382 { ISD::FMUL, MVT::f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1383 { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1384 };
1385
1386 if (ST->hasSSE1())
1387 if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
1388 if (auto KindCost = Entry->Cost[CostKind])
1389 return LT.first * *KindCost;
1390
1391 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
1392 { ISD::ADD, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1393 { ISD::SUB, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1394 { ISD::MUL, MVT::i64, { 2, 6, 1, 2 } },
1395 };
1396
1397 if (ST->is64Bit())
1398 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
1399 if (auto KindCost = Entry->Cost[CostKind])
1400 return LT.first * *KindCost;
1401
1402 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
1403 { ISD::ADD, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1404 { ISD::ADD, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1405 { ISD::ADD, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1406
1407 { ISD::SUB, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1408 { ISD::SUB, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1409 { ISD::SUB, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1410
1411 { ISD::MUL, MVT::i8, { 3, 4, 1, 1 } },
1412 { ISD::MUL, MVT::i16, { 2, 4, 1, 1 } },
1413 { ISD::MUL, MVT::i32, { 1, 4, 1, 1 } },
1414
1415 { ISD::FNEG, MVT::f64, { 2, 2, 1, 3 } }, // (x87)
1416 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1417 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1418 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // (x87)
1419 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // (x87)
1420 };
1421
1422 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
1423 if (auto KindCost = Entry->Cost[CostKind])
1424 return LT.first * *KindCost;
1425
1426 // It is not a good idea to vectorize division. We have to scalarize it and
1427  // in the process we will often end up having to spill regular
1428 // registers. The overhead of division is going to dominate most kernels
1429  // anyway, so try hard to prevent vectorization of division - it is
1430 // generally a bad idea. Assume somewhat arbitrarily that we have to be able
1431 // to hide "20 cycles" for each lane.
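  // For example, a <4 x i32> udiv that legalizes to a single v4i32 register is
  // costed here as 20 * 1 * 4 * (scalar udiv cost).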
1432 if (CostKind == TTI::TCK_RecipThroughput && LT.second.isVector() &&
1433 (ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
1434 ISD == ISD::UREM)) {
1435 InstructionCost ScalarCost =
1436        getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
1437                               Op1Info.getNoProps(), Op2Info.getNoProps());
1438 return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
1439 }
1440
1441 // Handle some basic single instruction code size cases.
1442 if (CostKind == TTI::TCK_CodeSize) {
1443 switch (ISD) {
1444 case ISD::FADD:
1445 case ISD::FSUB:
1446 case ISD::FMUL:
1447 case ISD::FDIV:
1448 case ISD::FNEG:
1449 case ISD::AND:
1450 case ISD::OR:
1451 case ISD::XOR:
1452 return LT.first;
1453 break;
1454 }
1455 }
1456
1457 // Fallback to the default implementation.
1458 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1459 Args, CxtI);
1460}
1461
1462InstructionCost
1463X86TTIImpl::getAltInstrCost(VectorType *VecTy, unsigned Opcode0,
1464                            unsigned Opcode1, const SmallBitVector &OpcodeMask,
1465                            TTI::TargetCostKind CostKind) const {
1466  if (isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask))
1467    return TTI::TCC_Basic;
1468  return InstructionCost::getInvalid();
1469}
1470
1471InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1472                                           VectorType *BaseTp,
1473                                           ArrayRef<int> Mask,
1474                                           TTI::TargetCostKind CostKind,
1475                                           int Index, VectorType *SubTp,
1476                                           ArrayRef<const Value *> Args, const Instruction *CxtI) {
1477 // 64-bit packed float vectors (v2f32) are widened to type v4f32.
1478 // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
1479 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(BaseTp);
1480
1481 Kind = improveShuffleKindFromMask(Kind, Mask, BaseTp, Index, SubTp);
1482
1483 // Recognize a basic concat_vector shuffle.
1484 if (Kind == TTI::SK_PermuteTwoSrc &&
1485 Mask.size() == (2 * BaseTp->getElementCount().getKnownMinValue()) &&
1486 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
1487    return getShuffleCost(TTI::SK_InsertSubvector,
1488                          VectorType::getDoubleElementsVectorType(BaseTp), Mask,
1489                          CostKind, Mask.size() / 2, BaseTp);
1490
1491 // Treat Transpose as 2-op shuffles - there's no difference in lowering.
1492 if (Kind == TTI::SK_Transpose)
1493 Kind = TTI::SK_PermuteTwoSrc;
1494
1495 // For Broadcasts we are splatting the first element from the first input
1496  // register, so we only need to reference that input and all the output
1497 // registers are the same.
1498 if (Kind == TTI::SK_Broadcast)
1499 LT.first = 1;
1500
1501 // Treat <X x bfloat> shuffles as <X x half>.
1502 if (LT.second.isVector() && LT.second.getScalarType() == MVT::bf16)
1503 LT.second = LT.second.changeVectorElementType(MVT::f16);
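  // (Shuffle lowering only depends on the element bit width, so bf16 and f16
  // behave identically here.)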
1504
1505 // Subvector extractions are free if they start at the beginning of a
1506 // vector and cheap if the subvectors are aligned.
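  // For example, extracting the low <4 x i32> half of a legal <8 x i32> is
  // free, while extracting the aligned upper half costs SubLT.first.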
1507 if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
1508 int NumElts = LT.second.getVectorNumElements();
1509 if ((Index % NumElts) == 0)
1510 return 0;
1511 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1512 if (SubLT.second.isVector()) {
1513 int NumSubElts = SubLT.second.getVectorNumElements();
1514 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1515 return SubLT.first;
1516 // Handle some cases for widening legalization. For now we only handle
1517 // cases where the original subvector was naturally aligned and evenly
1518 // fit in its legalized subvector type.
1519 // FIXME: Remove some of the alignment restrictions.
1520 // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
1521 // vectors.
1522 int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
1523 if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
1524 (NumSubElts % OrigSubElts) == 0 &&
1525 LT.second.getVectorElementType() ==
1526 SubLT.second.getVectorElementType() &&
1527 LT.second.getVectorElementType().getSizeInBits() ==
1528              BaseTp->getElementType()->getPrimitiveSizeInBits()) {
1529        assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
1530 "Unexpected number of elements!");
1531 auto *VecTy = FixedVectorType::get(BaseTp->getElementType(),
1532 LT.second.getVectorNumElements());
1533 auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
1534 SubLT.second.getVectorNumElements());
1535 int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
1536 InstructionCost ExtractCost =
1537 getShuffleCost(TTI::SK_ExtractSubvector, VecTy, std::nullopt,
1538 CostKind, ExtractIndex, SubTy);
1539
1540 // If the original size is 32-bits or more, we can use pshufd. Otherwise
1541 // if we have SSSE3 we can use pshufb.
1542 if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
1543 return ExtractCost + 1; // pshufd or pshufb
1544
1545 assert(SubTp->getPrimitiveSizeInBits() == 16 &&
1546 "Unexpected vector size");
1547
1548 return ExtractCost + 2; // worst case pshufhw + pshufd
1549 }
1550 }
1551 // If the extract subvector is not optimal, treat it as single op shuffle.
1552    Kind = TTI::SK_PermuteSingleSrc;
1553  }
1554
1555 // Subvector insertions are cheap if the subvectors are aligned.
1556 // Note that in general, the insertion starting at the beginning of a vector
1557 // isn't free, because we need to preserve the rest of the wide vector.
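  // For example, inserting a <4 x i32> at element 4 of a legal <8 x i32> costs
  // SubLT.first, while an unaligned insertion falls through to the two-source
  // permute tables below.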
1558 if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
1559 int NumElts = LT.second.getVectorNumElements();
1560 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1561 if (SubLT.second.isVector()) {
1562 int NumSubElts = SubLT.second.getVectorNumElements();
1563 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1564 return SubLT.first;
1565 }
1566
1567 // If the insertion isn't aligned, treat it like a 2-op shuffle.
1568 Kind = TTI::SK_PermuteTwoSrc;
1569 }
1570
1571 // Handle some common (illegal) sub-vector types as they are often very cheap
1572 // to shuffle even on targets without PSHUFB.
1573 EVT VT = TLI->getValueType(DL, BaseTp);
1574 if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
1575 !ST->hasSSSE3()) {
1576 static const CostTblEntry SSE2SubVectorShuffleTbl[] = {
1577 {TTI::SK_Broadcast, MVT::v4i16, 1}, // pshuflw
1578 {TTI::SK_Broadcast, MVT::v2i16, 1}, // pshuflw
1579 {TTI::SK_Broadcast, MVT::v8i8, 2}, // punpck/pshuflw
1580 {TTI::SK_Broadcast, MVT::v4i8, 2}, // punpck/pshuflw
1581 {TTI::SK_Broadcast, MVT::v2i8, 1}, // punpck
1582
1583 {TTI::SK_Reverse, MVT::v4i16, 1}, // pshuflw
1584 {TTI::SK_Reverse, MVT::v2i16, 1}, // pshuflw
1585 {TTI::SK_Reverse, MVT::v4i8, 3}, // punpck/pshuflw/packus
1586 {TTI::SK_Reverse, MVT::v2i8, 1}, // punpck
1587
1588 {TTI::SK_Splice, MVT::v4i16, 2}, // punpck+psrldq
1589 {TTI::SK_Splice, MVT::v2i16, 2}, // punpck+psrldq
1590 {TTI::SK_Splice, MVT::v4i8, 2}, // punpck+psrldq
1591 {TTI::SK_Splice, MVT::v2i8, 2}, // punpck+psrldq
1592
1593 {TTI::SK_PermuteTwoSrc, MVT::v4i16, 2}, // punpck/pshuflw
1594 {TTI::SK_PermuteTwoSrc, MVT::v2i16, 2}, // punpck/pshuflw
1595 {TTI::SK_PermuteTwoSrc, MVT::v8i8, 7}, // punpck/pshuflw
1596 {TTI::SK_PermuteTwoSrc, MVT::v4i8, 4}, // punpck/pshuflw
1597 {TTI::SK_PermuteTwoSrc, MVT::v2i8, 2}, // punpck
1598
1599 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw
1600 {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw
1601 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 5}, // punpck/pshuflw
1602 {TTI::SK_PermuteSingleSrc, MVT::v4i8, 3}, // punpck/pshuflw
1603 {TTI::SK_PermuteSingleSrc, MVT::v2i8, 1}, // punpck
1604 };
1605
1606 if (ST->hasSSE2())
1607 if (const auto *Entry =
1608 CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
1609 return Entry->Cost;
1610 }
1611
1612  // We are going to permute multiple sources and the result will be in
1613  // multiple destinations. We only provide an accurate cost for splits where
1614  // the element type remains the same.
1615 if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
1616 MVT LegalVT = LT.second;
1617 if (LegalVT.isVector() &&
1618 LegalVT.getVectorElementType().getSizeInBits() ==
1619            BaseTp->getElementType()->getPrimitiveSizeInBits() &&
1620        LegalVT.getVectorNumElements() <
1621 cast<FixedVectorType>(BaseTp)->getNumElements()) {
1622 unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
1623 unsigned LegalVTSize = LegalVT.getStoreSize();
1624 // Number of source vectors after legalization:
1625 unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
1626 // Number of destination vectors after legalization:
1627 InstructionCost NumOfDests = LT.first;
1628
1629 auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
1630 LegalVT.getVectorNumElements());
1631
1632 if (!Mask.empty() && NumOfDests.isValid()) {
1633 // Try to perform better estimation of the permutation.
1634 // 1. Split the source/destination vectors into real registers.
1635 // 2. Do the mask analysis to identify which real registers are
1636 // permuted. If more than 1 source registers are used for the
1637 // destination register building, the cost for this destination register
1638 // is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one
1639 // source register is used, build mask and calculate the cost as a cost
1640 // of PermuteSingleSrc.
1641 // Also, for the single register permute we try to identify if the
1642 // destination register is just a copy of the source register or the
1643 // copy of the previous destination register (the cost is
1644 // TTI::TCC_Basic). If the source register is just reused, the cost for
1645 // this operation is 0.
1646 NumOfDests =
1647          getTypeLegalizationCost(
1648              FixedVectorType::get(BaseTp->getElementType(), Mask.size()))
1649 .first;
1650 unsigned E = *NumOfDests.getValue();
1651 unsigned NormalizedVF =
1652 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
1653 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
1654 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
1655 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
1656 copy(Mask, NormalizedMask.begin());
1657 unsigned PrevSrcReg = 0;
1658 ArrayRef<int> PrevRegMask;
1659        InstructionCost Cost = 0;
1660        processShuffleMasks(
1661            NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
1662 [this, SingleOpTy, CostKind, &PrevSrcReg, &PrevRegMask,
1663 &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
1664 if (!ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size())) {
1665 // Check if the previous register can be just copied to the next
1666 // one.
1667 if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
1668 PrevRegMask != RegMask)
1669                  Cost += getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
1670                                         RegMask, CostKind, 0, nullptr);
1671 else
1672 // Just a copy of previous destination register.
1673                  Cost += TTI::TCC_Basic;
1674                return;
1675 }
1676 if (SrcReg != DestReg &&
1677 any_of(RegMask, [](int I) { return I != PoisonMaskElem; })) {
1678 // Just a copy of the source register.
1679              Cost += TTI::TCC_Basic;
1680            }
1681 PrevSrcReg = SrcReg;
1682 PrevRegMask = RegMask;
1683 },
1684 [this, SingleOpTy, CostKind, &Cost](ArrayRef<int> RegMask,
1685 unsigned /*Unused*/,
1686 unsigned /*Unused*/) {
1687 Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask,
1688 CostKind, 0, nullptr);
1689 });
1690 return Cost;
1691 }
1692
1693 InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
1694 return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
1695 std::nullopt, CostKind, 0, nullptr);
1696 }
1697
1698 return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
1699 }
1700
1701 // For 2-input shuffles, we must account for splitting the 2 inputs into many.
1702 if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
1703 // We assume that source and destination have the same vector type.
1704 InstructionCost NumOfDests = LT.first;
1705 InstructionCost NumOfShufflesPerDest = LT.first * 2 - 1;
1706 LT.first = NumOfDests * NumOfShufflesPerDest;
1707 }
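  // For example, a two-source shuffle of <16 x i32> on an AVX2 target has
  // LT.first == 2, so this becomes 2 * (2 * 2 - 1) = 6 legal-width shuffles
  // before the per-shuffle table cost below is applied.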
1708
1709 static const CostTblEntry AVX512VBMIShuffleTbl[] = {
1710 {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
1711 {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb
1712
1713 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
1714 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb
1715
1716 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b
1717 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b
1718 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 2} // vpermt2b
1719 };
1720
1721 if (ST->hasVBMI())
1722 if (const auto *Entry =
1723 CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
1724 return LT.first * Entry->Cost;
1725
1726 static const CostTblEntry AVX512BWShuffleTbl[] = {
1727 {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
1728 {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw
1729 {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
1730
1731 {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw
1732 {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw
1733 {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw
1734 {TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2
1735
1736 {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw
1737 {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw
1738 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw
1739 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw
1740 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16
1741
1742 {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w
1743 {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w
1744 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w
1745 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w
1746 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
1747
1748 {TTI::SK_Select, MVT::v32i16, 1}, // vblendmw
1749 {TTI::SK_Select, MVT::v64i8, 1}, // vblendmb
1750
1751 {TTI::SK_Splice, MVT::v32i16, 2}, // vshufi64x2 + palignr
1752 {TTI::SK_Splice, MVT::v32f16, 2}, // vshufi64x2 + palignr
1753 {TTI::SK_Splice, MVT::v64i8, 2}, // vshufi64x2 + palignr
1754 };
1755
1756 if (ST->hasBWI())
1757 if (const auto *Entry =
1758 CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
1759 return LT.first * Entry->Cost;
1760
1761 static const CostKindTblEntry AVX512ShuffleTbl[] = {
1762 {TTI::SK_Broadcast, MVT::v8f64, { 1, 1, 1, 1 } }, // vbroadcastsd
1763 {TTI::SK_Broadcast, MVT::v16f32, { 1, 1, 1, 1 } }, // vbroadcastss
1764 {TTI::SK_Broadcast, MVT::v8i64, { 1, 1, 1, 1 } }, // vpbroadcastq
1765 {TTI::SK_Broadcast, MVT::v16i32, { 1, 1, 1, 1 } }, // vpbroadcastd
1766 {TTI::SK_Broadcast, MVT::v32i16, { 1, 1, 1, 1 } }, // vpbroadcastw
1767 {TTI::SK_Broadcast, MVT::v32f16, { 1, 1, 1, 1 } }, // vpbroadcastw
1768 {TTI::SK_Broadcast, MVT::v64i8, { 1, 1, 1, 1 } }, // vpbroadcastb
1769
1770 {TTI::SK_Reverse, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
1771 {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1772 {TTI::SK_Reverse, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
1773 {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1774 {TTI::SK_Reverse, MVT::v32i16, { 7, 7, 7, 7 } }, // per mca
1775 {TTI::SK_Reverse, MVT::v32f16, { 7, 7, 7, 7 } }, // per mca
1776 {TTI::SK_Reverse, MVT::v64i8, { 7, 7, 7, 7 } }, // per mca
1777
1778 {TTI::SK_Splice, MVT::v8f64, { 1, 1, 1, 1 } }, // vpalignd
1779 {TTI::SK_Splice, MVT::v4f64, { 1, 1, 1, 1 } }, // vpalignd
1780 {TTI::SK_Splice, MVT::v16f32, { 1, 1, 1, 1 } }, // vpalignd
1781 {TTI::SK_Splice, MVT::v8f32, { 1, 1, 1, 1 } }, // vpalignd
1782 {TTI::SK_Splice, MVT::v8i64, { 1, 1, 1, 1 } }, // vpalignd
1783 {TTI::SK_Splice, MVT::v4i64, { 1, 1, 1, 1 } }, // vpalignd
1784 {TTI::SK_Splice, MVT::v16i32, { 1, 1, 1, 1 } }, // vpalignd
1785 {TTI::SK_Splice, MVT::v8i32, { 1, 1, 1, 1 } }, // vpalignd
1786 {TTI::SK_Splice, MVT::v32i16, { 4, 4, 4, 4 } }, // split + palignr
1787 {TTI::SK_Splice, MVT::v32f16, { 4, 4, 4, 4 } }, // split + palignr
1788 {TTI::SK_Splice, MVT::v64i8, { 4, 4, 4, 4 } }, // split + palignr
1789
1790 {TTI::SK_PermuteSingleSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
1791 {TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermpd
1792 {TTI::SK_PermuteSingleSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermpd
1793 {TTI::SK_PermuteSingleSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1794 {TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermps
1795 {TTI::SK_PermuteSingleSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermps
1796 {TTI::SK_PermuteSingleSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
1797 {TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermq
1798 {TTI::SK_PermuteSingleSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermq
1799 {TTI::SK_PermuteSingleSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1800 {TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermd
1801 {TTI::SK_PermuteSingleSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermd
1802 {TTI::SK_PermuteSingleSrc, MVT::v16i8, { 1, 3, 1, 1 } }, // pshufb
1803
1804 {TTI::SK_PermuteTwoSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermt2pd
1805 {TTI::SK_PermuteTwoSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermt2ps
1806 {TTI::SK_PermuteTwoSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermt2q
1807 {TTI::SK_PermuteTwoSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermt2d
1808 {TTI::SK_PermuteTwoSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermt2pd
1809 {TTI::SK_PermuteTwoSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermt2ps
1810 {TTI::SK_PermuteTwoSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermt2q
1811 {TTI::SK_PermuteTwoSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermt2d
1812 {TTI::SK_PermuteTwoSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermt2pd
1813 {TTI::SK_PermuteTwoSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermt2ps
1814 {TTI::SK_PermuteTwoSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermt2q
1815 {TTI::SK_PermuteTwoSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermt2d
1816
1817 // FIXME: This just applies the type legalization cost rules above
1818 // assuming these completely split.
1819 {TTI::SK_PermuteSingleSrc, MVT::v32i16, { 14, 14, 14, 14 } },
1820 {TTI::SK_PermuteSingleSrc, MVT::v32f16, { 14, 14, 14, 14 } },
1821 {TTI::SK_PermuteSingleSrc, MVT::v64i8, { 14, 14, 14, 14 } },
1822 {TTI::SK_PermuteTwoSrc, MVT::v32i16, { 42, 42, 42, 42 } },
1823 {TTI::SK_PermuteTwoSrc, MVT::v32f16, { 42, 42, 42, 42 } },
1824 {TTI::SK_PermuteTwoSrc, MVT::v64i8, { 42, 42, 42, 42 } },
1825
1826 {TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vpternlogq
1827 {TTI::SK_Select, MVT::v32f16, { 1, 1, 1, 1 } }, // vpternlogq
1828 {TTI::SK_Select, MVT::v64i8, { 1, 1, 1, 1 } }, // vpternlogq
1829 {TTI::SK_Select, MVT::v8f64, { 1, 1, 1, 1 } }, // vblendmpd
1830 {TTI::SK_Select, MVT::v16f32, { 1, 1, 1, 1 } }, // vblendmps
1831 {TTI::SK_Select, MVT::v8i64, { 1, 1, 1, 1 } }, // vblendmq
1832 {TTI::SK_Select, MVT::v16i32, { 1, 1, 1, 1 } }, // vblendmd
1833 };
1834
1835 if (ST->hasAVX512())
1836 if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
1837 if (auto KindCost = Entry->Cost[CostKind])
1838 return LT.first * *KindCost;
1839
1840 static const CostTblEntry AVX2ShuffleTbl[] = {
1841 {TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastpd
1842 {TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastps
1843 {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq
1844 {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd
1845 {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
1846 {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw
1847 {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb
1848
1849 {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd
1850 {TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps
1851 {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq
1852 {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd
1853 {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
1854 {TTI::SK_Reverse, MVT::v16f16, 2}, // vperm2i128 + pshufb
1855 {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb
1856
1857 {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
1858 {TTI::SK_Select, MVT::v16f16, 1}, // vpblendvb
1859 {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb
1860
1861 {TTI::SK_Splice, MVT::v8i32, 2}, // vperm2i128 + vpalignr
1862 {TTI::SK_Splice, MVT::v8f32, 2}, // vperm2i128 + vpalignr
1863 {TTI::SK_Splice, MVT::v16i16, 2}, // vperm2i128 + vpalignr
1864 {TTI::SK_Splice, MVT::v16f16, 2}, // vperm2i128 + vpalignr
1865 {TTI::SK_Splice, MVT::v32i8, 2}, // vperm2i128 + vpalignr
1866
1867 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
1868 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
1869 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
1870 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
1871 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
1872 // + vpblendvb
1873 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 4}, // vperm2i128 + 2*vpshufb
1874 // + vpblendvb
1875 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb
1876 // + vpblendvb
1877
1878 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd
1879 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3}, // 2*vpermps + vblendps
1880 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd
1881 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd
1882 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
1883 // + vpblendvb
1884 {TTI::SK_PermuteTwoSrc, MVT::v16f16, 7}, // 2*vperm2i128 + 4*vpshufb
1885 // + vpblendvb
1886 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb
1887 // + vpblendvb
1888 };
1889
1890 if (ST->hasAVX2())
1891 if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
1892 return LT.first * Entry->Cost;
1893
1894 static const CostTblEntry XOPShuffleTbl[] = {
1895 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd
1896 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps
1897 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd
1898 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps
1899 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
1900 // + vinsertf128
1901 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm
1902 // + vinsertf128
1903
1904 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
1905 // + vinsertf128
1906 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm
1907 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm
1908 // + vinsertf128
1909 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm
1910 };
1911
1912 if (ST->hasXOP())
1913 if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
1914 return LT.first * Entry->Cost;
1915
1916 static const CostTblEntry AVX1ShuffleTbl[] = {
1917 {TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1918 {TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1919 {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1920 {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1921 {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
1922 {TTI::SK_Broadcast, MVT::v16f16, 3}, // vpshuflw + vpshufd + vinsertf128
1923 {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128
1924
1925 {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1926 {TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1927 {TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1928 {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1929 {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
1930 // + vinsertf128
1931 {TTI::SK_Reverse, MVT::v16f16, 4}, // vextractf128 + 2*pshufb
1932 // + vinsertf128
1933 {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb
1934 // + vinsertf128
1935
1936 {TTI::SK_Select, MVT::v4i64, 1}, // vblendpd
1937 {TTI::SK_Select, MVT::v4f64, 1}, // vblendpd
1938 {TTI::SK_Select, MVT::v8i32, 1}, // vblendps
1939 {TTI::SK_Select, MVT::v8f32, 1}, // vblendps
1940 {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
1941 {TTI::SK_Select, MVT::v16f16, 3}, // vpand + vpandn + vpor
1942 {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor
1943
1944 {TTI::SK_Splice, MVT::v4i64, 2}, // vperm2f128 + shufpd
1945 {TTI::SK_Splice, MVT::v4f64, 2}, // vperm2f128 + shufpd
1946 {TTI::SK_Splice, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1947 {TTI::SK_Splice, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1948 {TTI::SK_Splice, MVT::v16i16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
1949 {TTI::SK_Splice, MVT::v16f16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
1950 {TTI::SK_Splice, MVT::v32i8, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
1951
1952 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd
1953 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd
1954 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1955 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1956 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
1957 // + 2*por + vinsertf128
1958 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 8}, // vextractf128 + 4*pshufb
1959 // + 2*por + vinsertf128
1960 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb
1961 // + 2*por + vinsertf128
1962
1963 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd
1964 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd
1965 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1966 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1967 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
1968 // + 4*por + vinsertf128
1969 {TTI::SK_PermuteTwoSrc, MVT::v16f16, 15}, // 2*vextractf128 + 8*pshufb
1970 // + 4*por + vinsertf128
1971 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb
1972 // + 4*por + vinsertf128
1973 };
1974
1975 if (ST->hasAVX())
1976 if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
1977 return LT.first * Entry->Cost;
1978
1979 static const CostTblEntry SSE41ShuffleTbl[] = {
1980 {TTI::SK_Select, MVT::v2i64, 1}, // pblendw
1981 {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1982 {TTI::SK_Select, MVT::v4i32, 1}, // pblendw
1983 {TTI::SK_Select, MVT::v4f32, 1}, // blendps
1984 {TTI::SK_Select, MVT::v8i16, 1}, // pblendw
1985 {TTI::SK_Select, MVT::v8f16, 1}, // pblendw
1986 {TTI::SK_Select, MVT::v16i8, 1} // pblendvb
1987 };
1988
1989 if (ST->hasSSE41())
1990 if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
1991 return LT.first * Entry->Cost;
1992
1993 static const CostTblEntry SSSE3ShuffleTbl[] = {
1994 {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
1995 {TTI::SK_Broadcast, MVT::v8f16, 1}, // pshufb
1996 {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb
1997
1998 {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
1999 {TTI::SK_Reverse, MVT::v8f16, 1}, // pshufb
2000 {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb
2001
2002 {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
2003 {TTI::SK_Select, MVT::v8f16, 3}, // 2*pshufb + por
2004 {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por
2005
2006 {TTI::SK_Splice, MVT::v4i32, 1}, // palignr
2007 {TTI::SK_Splice, MVT::v4f32, 1}, // palignr
2008 {TTI::SK_Splice, MVT::v8i16, 1}, // palignr
2009 {TTI::SK_Splice, MVT::v8f16, 1}, // palignr
2010 {TTI::SK_Splice, MVT::v16i8, 1}, // palignr
2011
2012 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
2013 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // pshufb
2014 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
2015
2016 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
2017 {TTI::SK_PermuteTwoSrc, MVT::v8f16, 3}, // 2*pshufb + por
2018 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
2019 };
2020
2021 if (ST->hasSSSE3())
2022 if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
2023 return LT.first * Entry->Cost;
2024
2025 static const CostTblEntry SSE2ShuffleTbl[] = {
2026 {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd
2027 {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
2028 {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
2029 {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
2030 {TTI::SK_Broadcast, MVT::v8f16, 2}, // pshuflw + pshufd
2031 {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd
2032
2033 {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
2034 {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
2035 {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
2036 {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
2037 {TTI::SK_Reverse, MVT::v8f16, 3}, // pshuflw + pshufhw + pshufd
2038 {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
2039 // + 2*pshufd + 2*unpck + packus
2040
2041 {TTI::SK_Select, MVT::v2i64, 1}, // movsd
2042 {TTI::SK_Select, MVT::v2f64, 1}, // movsd
2043 {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
2044 {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
2045 {TTI::SK_Select, MVT::v8f16, 3}, // pand + pandn + por
2046 {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por
2047
2048 {TTI::SK_Splice, MVT::v2i64, 1}, // shufpd
2049 {TTI::SK_Splice, MVT::v2f64, 1}, // shufpd
2050 {TTI::SK_Splice, MVT::v4i32, 2}, // 2*{unpck,movsd,pshufd}
2051 {TTI::SK_Splice, MVT::v8i16, 3}, // psrldq + psrlldq + por
2052 {TTI::SK_Splice, MVT::v8f16, 3}, // psrldq + psrlldq + por
2053 {TTI::SK_Splice, MVT::v16i8, 3}, // psrldq + psrlldq + por
2054
2055 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
2056 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd
2057 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
2058 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
2059 // + pshufd/unpck
2060 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 5}, // 2*pshuflw + 2*pshufhw
2061 // + pshufd/unpck
2062 { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
2063 // + 2*pshufd + 2*unpck + 2*packus
2064
2065 { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd
2066 { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd
2067 { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd}
2068 { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute
2069 { TTI::SK_PermuteTwoSrc, MVT::v8f16, 8 }, // blend+permute
2070 { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
2071 };
2072
2073 static const CostTblEntry SSE3BroadcastLoadTbl[] = {
2074 {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup
2075 };
2076
2077 if (ST->hasSSE2()) {
2078 bool IsLoad =
2079 llvm::any_of(Args, [](const auto &V) { return isa<LoadInst>(V); });
2080 if (ST->hasSSE3() && IsLoad)
2081 if (const auto *Entry =
2082 CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) {
2083        assert(isLegalBroadcastLoad(BaseTp->getElementType(),
2084                                    LT.second.getVectorElementCount()) &&
2085 "Table entry missing from isLegalBroadcastLoad()");
2086 return LT.first * Entry->Cost;
2087 }
2088
2089 if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
2090 return LT.first * Entry->Cost;
2091 }
2092
2093 static const CostTblEntry SSE1ShuffleTbl[] = {
2094 { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
2095 { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
2096 { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps
2097 { TTI::SK_Splice, MVT::v4f32, 2 }, // 2*shufps
2098 { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
2099 { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
2100 };
2101
2102 if (ST->hasSSE1())
2103 if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
2104 return LT.first * Entry->Cost;
2105
2106 return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
2107}
2108
2109InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
2110                                             Type *Src,
2111                                             TTI::CastContextHint CCH,
2112                                             TTI::TargetCostKind CostKind,
2113                                             const Instruction *I) {
2114 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2115 assert(ISD && "Invalid opcode");
2116
2117 // TODO: Allow non-throughput costs that aren't binary.
2118 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
2119    if (CostKind != TTI::TCK_RecipThroughput)
2120      return Cost == 0 ? 0 : 1;
2121 return Cost;
2122 };
2123
2124 // The cost tables include both specific, custom (non-legal) src/dst type
2125 // conversions and generic, legalized types. We test for customs first, before
2126 // falling back to legalization.
2127 // FIXME: Need a better design of the cost table to handle non-simple types of
2128 // potential massive combinations (elem_num x src_type x dst_type).
2129 static const TypeConversionCostTblEntry AVX512BWConversionTbl[] {
2130 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
2131 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
2132
2133 // Mask sign extend has an instruction.
2134 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 },
2135 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, 1 },
2136 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 },
2137 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, 1 },
2138 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 },
2139 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, 1 },
2140 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 },
2141 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, 1 },
2142 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 },
2143 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, 1 },
2144 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 },
2145 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 },
2146 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
2147 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 },
2148 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, 1 },
2149 { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, 1 },
2150 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, 1 },
2151
2152 // Mask zero extend is a sext + shift.
2153 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 },
2154 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, 2 },
2155 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 },
2156 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, 2 },
2157 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 },
2158 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, 2 },
2159 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 },
2160 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, 2 },
2161 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 },
2162 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, 2 },
2163 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 },
2164 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 },
2165 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
2166 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 },
2167 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, 2 },
2168 { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, 2 },
2169 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, 2 },
2170
2171 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 },
2172 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, 2 },
2173 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 },
2174 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, 2 },
2175 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 },
2176 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, 2 },
2177 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 },
2178 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, 2 },
2179 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 },
2180 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 },
2181 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 },
2182 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 },
2183 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 },
2184 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 },
2185 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, 2 },
2186 { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, 2 },
2187 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, 2 },
2188
2189 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 2 },
2190 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // widen to zmm
2191 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // vpmovwb
2192 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // vpmovwb
2193 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // vpmovwb
2194 };
2195
2196 static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
2197 // Mask sign extend has an instruction.
2198 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 },
2199 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, 1 },
2200 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 },
2201 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 },
2202 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 },
2203 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i1, 1 },
2204 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 },
2205 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 },
2206
2207 // Mask zero extend is a sext + shift.
2208 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 },
2209 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, 2 },
2210 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 },
2211 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 },
2212 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 },
2213 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i1, 2 },
2214 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 },
2215 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 },
2216
2217 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 },
2218 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, 2 },
2219 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 },
2220 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 },
2221 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },
2222 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 },
2223 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 },
2224 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i64, 2 },
2225
2226 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
2227 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },
2228
2229 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
2230 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },
2231
2232 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, 1 },
2233 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, 1 },
2234
2235 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 },
2236 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 },
2237 };
2238
2239 // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
2240 // 256-bit wide vectors.
2241
2242 static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
2243 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 },
2244 { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 },
2245 { ISD::FP_EXTEND, MVT::v16f64, MVT::v16f32, 4 }, // 2*vcvtps2pd+vextractf64x4
2246 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 },
2247
2248 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
2249 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
2250 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
2251 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 3 }, // sext+vpslld+vptestmd
2252 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
2253 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
2254 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
2255 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 3 }, // sext+vpslld+vptestmd
2256 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // zmm vpslld+vptestmd
2257 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // zmm vpslld+vptestmd
2258 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // zmm vpslld+vptestmd
2259 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 }, // vpslld+vptestmd
2260 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // zmm vpsllq+vptestmq
2261 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // zmm vpsllq+vptestmq
2262 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, // vpsllq+vptestmq
2263 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 2 }, // vpmovdb
2264 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 2 }, // vpmovdb
2265 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 2 }, // vpmovdb
2266 { ISD::TRUNCATE, MVT::v32i8, MVT::v16i32, 2 }, // vpmovdb
2267 { ISD::TRUNCATE, MVT::v64i8, MVT::v16i32, 2 }, // vpmovdb
2268 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2 }, // vpmovdw
2269 { ISD::TRUNCATE, MVT::v32i16, MVT::v16i32, 2 }, // vpmovdw
2270 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 2 }, // vpmovqb
2271 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1 }, // vpshufb
2272 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 2 }, // vpmovqb
2273 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i64, 2 }, // vpmovqb
2274 { ISD::TRUNCATE, MVT::v32i8, MVT::v8i64, 2 }, // vpmovqb
2275 { ISD::TRUNCATE, MVT::v64i8, MVT::v8i64, 2 }, // vpmovqb
2276 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 2 }, // vpmovqw
2277 { ISD::TRUNCATE, MVT::v16i16, MVT::v8i64, 2 }, // vpmovqw
2278 { ISD::TRUNCATE, MVT::v32i16, MVT::v8i64, 2 }, // vpmovqw
2279 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 }, // vpmovqd
2280 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // zmm vpmovqd
2281 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 5 },// 2*vpmovqd+concat+vpmovdb
2282
2283 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, // extend to v16i32
2284 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 8 },
2285 { ISD::TRUNCATE, MVT::v64i8, MVT::v32i16, 8 },
2286
2287 // Sign extend is zmm vpternlogd+vptruncdb.
2288 // Zero extend is zmm broadcast load+vptruncdw.
2289 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 3 },
2290 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 4 },
2291 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 3 },
2292 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 4 },
2293 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 3 },
2294 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 4 },
2295 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 3 },
2296 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 4 },
2297
2298 // Sign extend is zmm vpternlogd+vptruncdw.
2299 // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
2300 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 3 },
2301 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 4 },
2302 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 3 },
2303 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 4 },
2304 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 3 },
2305 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 4 },
2306 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 3 },
2307 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
2308
2309 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // zmm vpternlogd
2310 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // zmm vpternlogd+psrld
2311 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // zmm vpternlogd
2312 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // zmm vpternlogd+psrld
2313 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // zmm vpternlogd
2314 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // zmm vpternlogd+psrld
2315 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // zmm vpternlogq
2316 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // zmm vpternlogq+psrlq
2317 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // zmm vpternlogq
2318 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // zmm vpternlogq+psrlq
2319
2320 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, // vpternlogd
2321 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, // vpternlogd+psrld
2322 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 }, // vpternlogq
2323 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 }, // vpternlogq+psrlq
2324
2325 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
2326 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
2327 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
2328 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
2329 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
2330 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
2331 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
2332 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
2333 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
2334 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
2335
2336 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
2337 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
2338
2339 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
2340 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
2341 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v16i8, 2 },
2342 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 1 },
2343 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
2344 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 1 },
2345 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
2346 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
2347
2348 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
2349 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
2350 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v16i8, 2 },
2351 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 1 },
2352 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
2353 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 1 },
2354 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
2355 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
2356 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 },
2357 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 5 },
2358
2359 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 2 },
2360 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f64, 7 },
2361 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f64,15 },
2362 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f32,11 },
2363 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f64,31 },
2364 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, 3 },
2365 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f64, 7 },
2366 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f32, 5 },
2367 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f64,15 },
2368 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 1 },
2369 { ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f64, 3 },
2370
2371 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 },
2372 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, 3 },
2373 { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, 3 },
2374 { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 },
2375 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 3 },
2376 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, 3 },
2377 };
2378
2379 static const TypeConversionCostTblEntry AVX512BWVLConversionTbl[] {
2380 // Mask sign extend has an instruction.
2381 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 },
2382 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, 1 },
2383 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 },
2384 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, 1 },
2385 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 },
2386 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, 1 },
2387 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 },
2388 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, 1 },
2389 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 },
2390 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, 1 },
2391 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 },
2392 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 },
2393 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
2394 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 },
2395 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1, 1 },
2396 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v64i1, 1 },
2397 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1, 1 },
2398
2399 // Mask zero extend is a sext + shift.
2400 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 },
2401 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, 2 },
2402 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 },
2403 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, 2 },
2404 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 },
2405 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, 2 },
2406 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 },
2407 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, 2 },
2408 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 },
2409 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, 2 },
2410 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 },
2411 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 },
2412 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
2413 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 },
2414 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1, 2 },
2415 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v64i1, 2 },
2416 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1, 2 },
2417
2418 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 },
2419 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, 2 },
2420 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 },
2421 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, 2 },
2422 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 },
2423 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, 2 },
2424 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 },
2425 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, 2 },
2426 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 },
2427 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 },
2428 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 },
2429 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 },
2430 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 },
2431 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 },
2432 { ISD::TRUNCATE, MVT::v32i1, MVT::v16i16, 2 },
2433 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i8, 2 },
2434 { ISD::TRUNCATE, MVT::v64i1, MVT::v16i16, 2 },
2435
2436 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 },
2437 };
2438
2439 static const TypeConversionCostTblEntry AVX512DQVLConversionTbl[] = {
2440 // Mask sign extend has an instruction.
2441 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 },
2442 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, 1 },
2443 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 },
2444 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i1, 1 },
2445 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 },
2446 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i1, 1 },
2447 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, 1 },
2448 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 },
2449
2450 // Mask zero extend is a sext + shift.
2451 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 },
2452 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, 2 },
2453 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 },
2454 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i1, 2 },
2455 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 },
2456 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i1, 2 },
2457 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, 2 },
2458 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 },
2459
2460 { ISD::TRUNCATE, MVT::v16i1, MVT::v4i64, 2 },
2461 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, 2 },
2462 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 },
2463 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, 2 },
2464 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 },
2465 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 },
2466 { ISD::TRUNCATE, MVT::v8i1, MVT::v4i64, 2 },
2467 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },
2468
2469 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
2470 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
2471 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
2472 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
2473
2474 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
2475 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
2476 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
2477 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
2478
2479 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v4f32, 1 },
2480 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, 1 },
2481 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
2482 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, 1 },
2483
2484 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v4f32, 1 },
2485 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 },
2486 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
2487 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 },
2488 };
2489
2490 static const TypeConversionCostTblEntry AVX512VLConversionTbl[] = {
2491 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
2492 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
2493 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
2494 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 8 }, // split+2*v8i8
2495 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
2496 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
2497 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
2498 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 8 }, // split+2*v8i16
2499 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // vpslld+vptestmd
2500 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // vpslld+vptestmd
2501 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // vpslld+vptestmd
2502 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, 2 }, // vpslld+vptestmd
2503 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // vpsllq+vptestmq
2504 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // vpsllq+vptestmq
2505 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // vpmovqd
2506 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 }, // vpmovqb
2507 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 }, // vpmovqw
2508 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 }, // vpmovwb
2509
2510 // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
2511 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
2512 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 5 },
2513 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 6 },
2514 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 5 },
2515 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 6 },
2516 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 5 },
2517 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 6 },
2518 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 10 },
2519 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 12 },
2520
2521 // sign extend is vpcmpeq+maskedmove+vpmovdw
2522 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
2523 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 4 },
2524 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 5 },
2525 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 4 },
2526 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 5 },
2527 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 4 },
2528 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 5 },
2529 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 10 },
2530 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 12 },
2531
2532 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // vpternlogd
2533 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // vpternlogd+psrld
2534 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // vpternlogd
2535 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // vpternlogd+psrld
2536 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // vpternlogd
2537 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // vpternlogd+psrld
2538 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, 1 }, // vpternlogd
2539 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, 2 }, // vpternlogd+psrld
2540
2541 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // vpternlogq
2542 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // vpternlogq+psrlq
2543 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // vpternlogq
2544 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // vpternlogq+psrlq
2545
2546 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 1 },
2547 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 1 },
2548 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 1 },
2549 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 1 },
2550 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
2551 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
2552 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 1 },
2553 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 1 },
2554 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
2555 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
2556 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
2557 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
2558
2559 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
2560 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 1 },
2561 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
2562 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 1 },
2563
2564 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 1 },
2565 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 1 },
2566 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
2567 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 1 },
2568 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
2569 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 1 },
2570 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
2571 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
2572 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
2573 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
2574 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 5 },
2575 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
2576 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 5 },
2577
2578 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, 2 },
2579 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 2 },
2580 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f32, 5 },
2581
2582 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 1 },
2583 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 1 },
2584 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
2585 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 1 },
2586 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 1 },
2587 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 },
2588 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 },
2589 };
2590
2591 static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
2592 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
2593 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
2594 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
2595 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
2596 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
2597 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
2598
2599 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 2 },
2600 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 2 },
2601 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 2 },
2602 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 2 },
2603 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
2604 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
2605 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 2 },
2606 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 2 },
2607 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
2608 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
2609 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
2610 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
2611 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
2612 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
2613
2614 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },
2615
2616 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 4 },
2617 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 4 },
2618 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 1 },
2619 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 1 },
2620 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 1 },
2621 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, 4 },
2622 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, 4 },
2623 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 1 },
2624 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 1 },
2625 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 5 },
2626 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 },
2627 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2 },
2628
2629 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 3 },
2630 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 3 },
2631
2632 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, 1 },
2633 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, 1 },
2634 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, 1 },
2635 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 3 },
2636
2637 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 3 },
2638 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 3 },
2639 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, 1 },
2640 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 3 },
2641 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
2642 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4 },
2643 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 3 },
2644 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, 4 },
2645
2646 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 2 },
2647 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 2 },
2648 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 2 },
2649 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
2650 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
2651 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
2652 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 3 },
2653
2654 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 2 },
2655 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 2 },
2656 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 2 },
2657 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
2658 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 },
2659 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
2660 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 2 },
2661 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
2662 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
2663 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 4 },
2664 };
2665
2666 static const TypeConversionCostTblEntry AVXConversionTbl[] = {
2667 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 6 },
2668 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
2669 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 },
2670 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
2671 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
2672 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
2673
2674 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 3 },
2675 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 3 },
2676 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 3 },
2677 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 3 },
2678 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
2679 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
2680 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 3 },
2681 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 3 },
2682 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
2683 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
2684 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 3 },
2685 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 3 },
2686
2687 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 4 },
2688 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 5 },
2689 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 4 },
2690 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 9 },
2691 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, 11 },
2692
2693 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 },
2694 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
2695 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // and+extract+packuswb
2696 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, 5 },
2697 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
2698 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, 5 },
2699 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 3 }, // and+extract+2*packusdw
2700 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 },
2701
2702 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
2703 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 },
2704 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 },
2705 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 4 },
2706 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v16i8, 2 },
2707 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
2708 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v8i16, 2 },
2709 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
2710 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
2711 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 4 },
2712 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 5 },
2713 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 8 },
2714
2715 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 7 },
2716 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, 7 },
2717 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, 6 },
2718 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 4 },
2719 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v16i8, 2 },
2720 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
2721 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v8i16, 2 },
2722 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 4 },
2723 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 4 },
2724 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
2725 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 },
2726 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 },
2727 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 10 },
2728 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 10 },
2729 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 18 },
2730 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
2731 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 10 },
2732
2733 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, 2 },
2734 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f64, 2 },
2735 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v8f32, 2 },
2736 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v4f64, 2 },
2737 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 2 },
2738 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f64, 2 },
2739 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, 2 },
2740 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v4f64, 2 },
2741 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, 2 },
2742 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, 2 },
2743 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 5 },
2744
2745 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v8f32, 2 },
2746 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f64, 2 },
2747 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v8f32, 2 },
2748 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v4f64, 2 },
2749 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 2 },
2750 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f64, 2 },
2751 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, 2 },
2752 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v4f64, 2 },
2753 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 3 },
2754 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
2755 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 6 },
2756 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 7 },
2757 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, 7 },
2758
2759 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 1 },
2760 { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 1 },
2761 };
2762
2763 static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
2764 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, 1 },
2765 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, 1 },
2766 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, 1 },
2767 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, 1 },
2768 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, 1 },
2769 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, 1 },
2770 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, 1 },
2771 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, 1 },
2772 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, 1 },
2773 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, 1 },
2774 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, 1 },
2775 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, 1 },
2776
2777 // These truncates end up widening elements.
2778 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 1 }, // PMOVZXBQ
2779 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 1 }, // PMOVZXWQ
2780 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 1 }, // PMOVZXBD
2781
2782 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 2 },
2783 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 2 },
2784 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 2 },
2785
2786 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 1 },
2787 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 1 },
2788 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 1 },
2789 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 1 },
2790 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 1 },
2791 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
2792 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 1 },
2793 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
2794 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
2795 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 1 },
2796 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
2797
2798 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 1 },
2799 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 1 },
2800 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 4 },
2801 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 4 },
2802 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 1 },
2803 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
2804 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 1 },
2805 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
2806 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 3 },
2807 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 3 },
2808 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 2 },
2809 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 12 },
2810 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 22 },
2811 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 4 },
2812
2813 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 1 },
2814 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 1 },
2815 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 1 },
2816 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 1 },
2817 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, 2 },
2818 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, 2 },
2819 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, 1 },
2820 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, 1 },
2821 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
2822 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, 1 },
2823
2824 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 1 },
2825 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 },
2826 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 1 },
2827 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 4 },
2828 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, 2 },
2829 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, 2 },
2830 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, 1 },
2831 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, 1 },
2832 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 4 },
2833 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
2834 };
2835
2836 static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
2837 // These are somewhat magic numbers justified by comparing the
2838 // output of llvm-mca for our various supported scheduler models
2839 // and basing them on the worst case scenario.
2840 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 3 },
2841 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 3 },
2842 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 3 },
2843 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 3 },
2844 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 3 },
2845 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 4 },
2846 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 3 },
2847 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 4 },
2848 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 3 },
2849 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4 },
2850 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 8 },
2851 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 8 },
2852
2853 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 3 },
2854 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 3 },
2855 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 8 },
2856 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 9 },
2857 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 4 },
2858 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 4 },
2859 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 4 },
2860 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 4 },
2861 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 7 },
2862 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 7 },
2863 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
2864 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 15 },
2865 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 18 },
2866
2867 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 4 },
2868 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 4 },
2869 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 4 },
2870 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 4 },
2871 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, 6 },
2872 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, 6 },
2873 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, 5 },
2874 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, 5 },
2875 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 4 },
2876 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, 4 },
2877
2878 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 4 },
2879 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 },
2880 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 4 },
2881 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 15 },
2882 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, 6 },
2883 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, 6 },
2884 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, 5 },
2885 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, 5 },
2886 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 8 },
2887 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 8 },
2888
2889 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, 4 },
2890 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, 4 },
2891 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, 2 },
2892 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, 3 },
2893 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, 1 },
2894 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, 2 },
2895 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, 2 },
2896 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, 3 },
2897 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, 1 },
2898 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, 2 },
2899 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, 1 },
2900 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, 2 },
2901
2902 // These truncates are really widening elements.
2903 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 1 }, // PSHUFD
2904 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // PUNPCKLWD+DQ
2905 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // PUNPCKLBW+WD+PSHUFD
2906 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 1 }, // PUNPCKLWD
2907 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // PUNPCKLBW+WD
2908 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 1 }, // PUNPCKLBW
2909
2910 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 2 }, // PAND+PACKUSWB
2911 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 },
2912 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 3 }, // PAND+2*PACKUSWB
2913 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 },
2914 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 1 },
2915 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 3 },
2916 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
2917 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 10 },
2918 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 4 }, // PAND+3*PACKUSWB
2919 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 2 }, // PSHUFD+PSHUFLW
2920 { ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, 1 }, // PSHUFD
2921 };
2922
2923 // Attempt to map directly to (simple) MVT types to let us match custom entries.
2924 EVT SrcTy = TLI->getValueType(DL, Src);
2925 EVT DstTy = TLI->getValueType(DL, Dst);
2926
2927 // The function getSimpleVT only handles simple value types.
2928 if (SrcTy.isSimple() && DstTy.isSimple()) {
2929 MVT SimpleSrcTy = SrcTy.getSimpleVT();
2930 MVT SimpleDstTy = DstTy.getSimpleVT();
2931
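// Note (editorial comment, not in the upstream file): the lookups below walk the
// tables from the most specific feature set (512-bit AVX512 variants) down to
// SSE2, returning the first matching entry.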
2932 if (ST->useAVX512Regs()) {
2933 if (ST->hasBWI())
2934 if (const auto *Entry = ConvertCostTableLookup(
2935 AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2936 return AdjustCost(Entry->Cost);
2937
2938 if (ST->hasDQI())
2939 if (const auto *Entry = ConvertCostTableLookup(
2940 AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2941 return AdjustCost(Entry->Cost);
2942
2943 if (ST->hasAVX512())
2944 if (const auto *Entry = ConvertCostTableLookup(
2945 AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2946 return AdjustCost(Entry->Cost);
2947 }
2948
2949 if (ST->hasBWI())
2950 if (const auto *Entry = ConvertCostTableLookup(
2951 AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2952 return AdjustCost(Entry->Cost);
2953
2954 if (ST->hasDQI())
2955 if (const auto *Entry = ConvertCostTableLookup(
2956 AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2957 return AdjustCost(Entry->Cost);
2958
2959 if (ST->hasAVX512())
2960 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
2961 SimpleDstTy, SimpleSrcTy))
2962 return AdjustCost(Entry->Cost);
2963
2964 if (ST->hasAVX2()) {
2965 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
2966 SimpleDstTy, SimpleSrcTy))
2967 return AdjustCost(Entry->Cost);
2968 }
2969
2970 if (ST->hasAVX()) {
2971 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
2972 SimpleDstTy, SimpleSrcTy))
2973 return AdjustCost(Entry->Cost);
2974 }
2975
2976 if (ST->hasSSE41()) {
2977 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
2978 SimpleDstTy, SimpleSrcTy))
2979 return AdjustCost(Entry->Cost);
2980 }
2981
2982 if (ST->hasSSE2()) {
2983 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
2984 SimpleDstTy, SimpleSrcTy))
2985 return AdjustCost(Entry->Cost);
2986 }
2987 }
2988
2989 // Fall back to legalized types.
2990 std::pair<InstructionCost, MVT> LTSrc = getTypeLegalizationCost(Src);
2991 std::pair<InstructionCost, MVT> LTDest = getTypeLegalizationCost(Dst);
2992
2993 // If we're truncating to the same legalized type, just assume it's free.
2994 if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second)
2995 return TTI::TCC_Free;
2996
2997 if (ST->useAVX512Regs()) {
2998 if (ST->hasBWI())
2999 if (const auto *Entry = ConvertCostTableLookup(
3000 AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second))
3001 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3002
3003 if (ST->hasDQI())
3004 if (const auto *Entry = ConvertCostTableLookup(
3005 AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second))
3006 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3007
3008 if (ST->hasAVX512())
3009 if (const auto *Entry = ConvertCostTableLookup(
3010 AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second))
3011 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3012 }
3013
3014 if (ST->hasBWI())
3015 if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
3016 LTDest.second, LTSrc.second))
3017 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3018
3019 if (ST->hasDQI())
3020 if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
3021 LTDest.second, LTSrc.second))
3022 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3023
3024 if (ST->hasAVX512())
3025 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3026 LTDest.second, LTSrc.second))
3027 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3028
3029 if (ST->hasAVX2())
3030 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3031 LTDest.second, LTSrc.second))
3032 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3033
3034 if (ST->hasAVX())
3035 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3036 LTDest.second, LTSrc.second))
3037 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3038
3039 if (ST->hasSSE41())
3040 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3041 LTDest.second, LTSrc.second))
3042 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3043
3044 if (ST->hasSSE2())
3045 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3046 LTDest.second, LTSrc.second))
3047 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
3048
3049 // Fallback: for i8/i16 sitofp/uitofp cases we need to extend the source to
3050 // i32 and cost it as an extend plus an i32 sitofp.
3051 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3052 1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) {
3053 Type *ExtSrc = Src->getWithNewBitWidth(32);
3054 unsigned ExtOpc =
3055 (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt;
3056
3057 // For scalar loads the extend would be free.
3058 InstructionCost ExtCost = 0;
3059 if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0))))
3060 ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind);
3061
3062 return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc,
3063 TTI::CastContextHint::None, CostKind);
3064 }
3065
3066 // Fallback: for fptosi/fptoui to i8/i16 we cost an i32 fptosi plus a
3067 // truncate down to the destination type.
3068 if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) &&
3069 1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) {
3070 Type *TruncDst = Dst->getWithNewBitWidth(32);
3071 return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) +
3072 getCastInstrCost(Instruction::Trunc, Dst, TruncDst,
3073 TTI::CastContextHint::None, CostKind);
3074 }
3075
3076 return AdjustCost(
3077 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3078}
3079
3080 InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
3081 Type *CondTy,
3082 CmpInst::Predicate VecPred,
3083 TTI::TargetCostKind CostKind,
3084 const Instruction *I) {
3085 // Early out if this type isn't scalar/vector integer/float.
3086 if (!(ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy()))
3087 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3088 I);
3089
3090 // Legalize the type.
3091 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3092
3093 MVT MTy = LT.second;
3094
3095 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3096 assert(ISD && "Invalid opcode");
3097
3098 InstructionCost ExtraCost = 0;
3099 if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
3100 // Some vector comparison predicates cost extra instructions.
3101 // TODO: Adjust ExtraCost based on CostKind?
3102 // TODO: Should we invert this and assume worst case cmp costs
3103 // and reduce for particular predicates?
3104 if (MTy.isVector() &&
3105 !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
3106 (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
3107 ST->hasBWI())) {
3108 // Fall back to I's predicate if a specific predicate wasn't specified.
3109 CmpInst::Predicate Pred = VecPred;
3110 if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE ||
3111 Pred == CmpInst::BAD_FCMP_PREDICATE))
3112 Pred = cast<CmpInst>(I)->getPredicate();
3113
3114 bool CmpWithConstant = false;
3115 if (auto *CmpInstr = dyn_cast_or_null<CmpInst>(I))
3116 CmpWithConstant = isa<Constant>(CmpInstr->getOperand(1));
3117
3118 switch (Pred) {
3119 case CmpInst::Predicate::ICMP_NE:
3120 // xor(cmpeq(x,y),-1)
3121 ExtraCost = CmpWithConstant ? 0 : 1;
3122 break;
3123 case CmpInst::Predicate::ICMP_SGE:
3124 case CmpInst::Predicate::ICMP_SLE:
3125 // xor(cmpgt(x,y),-1)
3126 ExtraCost = CmpWithConstant ? 0 : 1;
3127 break;
3128 case CmpInst::Predicate::ICMP_ULT:
3129 case CmpInst::Predicate::ICMP_UGT:
3130 // cmpgt(xor(x,signbit),xor(y,signbit))
3131 // xor(cmpeq(pmaxu(x,y),x),-1)
3132 ExtraCost = CmpWithConstant ? 1 : 2;
3133 break;
3134 case CmpInst::Predicate::ICMP_ULE:
3135 case CmpInst::Predicate::ICMP_UGE:
3136 if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
3137 (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
3138 // cmpeq(psubus(x,y),0)
3139 // cmpeq(pminu(x,y),x)
3140 ExtraCost = 1;
3141 } else {
3142 // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
3143 ExtraCost = CmpWithConstant ? 2 : 3;
3144 }
3145 break;
3146 case CmpInst::Predicate::FCMP_ONE:
3147 case CmpInst::Predicate::FCMP_UEQ:
3148 // Without AVX we need to expand FCMP_ONE/FCMP_UEQ cases.
3149 // Use FCMP_UEQ expansion - FCMP_ONE should be the same.
3150 if (CondTy && !ST->hasAVX())
3151 return getCmpSelInstrCost(Opcode, ValTy, CondTy,
3152 CmpInst::Predicate::FCMP_UNO, CostKind) +
3153 getCmpSelInstrCost(Opcode, ValTy, CondTy,
3154 CmpInst::Predicate::FCMP_OEQ, CostKind) +
3155 getArithmeticInstrCost(Instruction::Or, CondTy, CostKind);
3156
3157 break;
3158 case CmpInst::Predicate::BAD_ICMP_PREDICATE:
3159 case CmpInst::Predicate::BAD_FCMP_PREDICATE:
3160 // Assume worst case scenario and add the maximum extra cost.
3161 ExtraCost = 3;
3162 break;
3163 default:
3164 break;
3165 }
3166 }
3167 }
3168
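// Note (editorial comment, not in the upstream file): the CostKindTblEntry costs
// below are per cost kind, in the order
// { RecipThroughput, Latency, CodeSize, SizeAndLatency }.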
3169 static const CostKindTblEntry SLMCostTbl[] = {
3170 // slm pcmpeq/pcmpgt throughput is 2
3171 { ISD::SETCC, MVT::v2i64, { 2, 5, 1, 2 } },
3172 // slm pblendvb/blendvpd/blendvps throughput is 4
3173 { ISD::SELECT, MVT::v2f64, { 4, 4, 1, 3 } }, // vblendvpd
3174 { ISD::SELECT, MVT::v4f32, { 4, 4, 1, 3 } }, // vblendvps
3175 { ISD::SELECT, MVT::v2i64, { 4, 4, 1, 3 } }, // pblendvb
3176 { ISD::SELECT, MVT::v8i32, { 4, 4, 1, 3 } }, // pblendvb
3177 { ISD::SELECT, MVT::v8i16, { 4, 4, 1, 3 } }, // pblendvb
3178 { ISD::SELECT, MVT::v16i8, { 4, 4, 1, 3 } }, // pblendvb
3179 };
3180
3181 static const CostKindTblEntry AVX512BWCostTbl[] = {
3182 { ISD::SETCC, MVT::v32i16, { 1, 1, 1, 1 } },
3183 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 1 } },
3184 { ISD::SETCC, MVT::v64i8, { 1, 1, 1, 1 } },
3185 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 1 } },
3186
3187 { ISD::SELECT, MVT::v32i16, { 1, 1, 1, 1 } },
3188 { ISD::SELECT, MVT::v64i8, { 1, 1, 1, 1 } },
3189 };
3190
3191 static const CostKindTblEntry AVX512CostTbl[] = {
3192 { ISD::SETCC, MVT::v8f64, { 1, 4, 1, 1 } },
3193 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 1 } },
3194 { ISD::SETCC, MVT::v16f32, { 1, 4, 1, 1 } },
3195 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 1 } },
3196
3197 { ISD::SETCC, MVT::v8i64, { 1, 1, 1, 1 } },
3198 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 1 } },
3199 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3200 { ISD::SETCC, MVT::v16i32, { 1, 1, 1, 1 } },
3201 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 1 } },
3202 { ISD::SETCC, MVT::v32i16, { 3, 7, 5, 5 } },
3203 { ISD::SETCC, MVT::v64i8, { 3, 7, 5, 5 } },
3204
3205 { ISD::SELECT, MVT::v8i64, { 1, 1, 1, 1 } },
3206 { ISD::SELECT, MVT::v4i64, { 1, 1, 1, 1 } },
3207 { ISD::SELECT, MVT::v2i64, { 1, 1, 1, 1 } },
3208 { ISD::SELECT, MVT::v16i32, { 1, 1, 1, 1 } },
3209 { ISD::SELECT, MVT::v8i32, { 1, 1, 1, 1 } },
3210 { ISD::SELECT, MVT::v4i32, { 1, 1, 1, 1 } },
3211 { ISD::SELECT, MVT::v8f64, { 1, 1, 1, 1 } },
3212 { ISD::SELECT, MVT::v4f64, { 1, 1, 1, 1 } },
3213 { ISD::SELECT, MVT::v2f64, { 1, 1, 1, 1 } },
3214 { ISD::SELECT, MVT::f64, { 1, 1, 1, 1 } },
3215 { ISD::SELECT, MVT::v16f32, { 1, 1, 1, 1 } },
3216 { ISD::SELECT, MVT::v8f32 , { 1, 1, 1, 1 } },
3217 { ISD::SELECT, MVT::v4f32, { 1, 1, 1, 1 } },
3218 { ISD::SELECT, MVT::f32 , { 1, 1, 1, 1 } },
3219
3220 { ISD::SELECT, MVT::v32i16, { 2, 2, 4, 4 } },
3221 { ISD::SELECT, MVT::v16i16, { 1, 1, 1, 1 } },
3222 { ISD::SELECT, MVT::v8i16, { 1, 1, 1, 1 } },
3223 { ISD::SELECT, MVT::v64i8, { 2, 2, 4, 4 } },
3224 { ISD::SELECT, MVT::v32i8, { 1, 1, 1, 1 } },
3225 { ISD::SELECT, MVT::v16i8, { 1, 1, 1, 1 } },
3226 };
3227
3228 static const CostKindTblEntry AVX2CostTbl[] = {
3229 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 2 } },
3230 { ISD::SETCC, MVT::v2f64, { 1, 4, 1, 1 } },
3231 { ISD::SETCC, MVT::f64, { 1, 4, 1, 1 } },
3232 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 2 } },
3233 { ISD::SETCC, MVT::v4f32, { 1, 4, 1, 1 } },
3234 { ISD::SETCC, MVT::f32, { 1, 4, 1, 1 } },
3235
3236 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 2 } },
3237 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 2 } },
3238 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 2 } },
3239 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 2 } },
3240
3241 { ISD::SELECT, MVT::v4f64, { 2, 2, 1, 2 } }, // vblendvpd
3242 { ISD::SELECT, MVT::v8f32, { 2, 2, 1, 2 } }, // vblendvps
3243 { ISD::SELECT, MVT::v4i64, { 2, 2, 1, 2 } }, // pblendvb
3244 { ISD::SELECT, MVT::v8i32, { 2, 2, 1, 2 } }, // pblendvb
3245 { ISD::SELECT, MVT::v16i16, { 2, 2, 1, 2 } }, // pblendvb
3246 { ISD::SELECT, MVT::v32i8, { 2, 2, 1, 2 } }, // pblendvb
3247 };
3248
3249 static const CostKindTblEntry XOPCostTbl[] = {
3250 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3251 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3252 };
3253
3254 static const CostKindTblEntry AVX1CostTbl[] = {
3255 { ISD::SETCC, MVT::v4f64, { 2, 3, 1, 2 } },
3256 { ISD::SETCC, MVT::v2f64, { 1, 3, 1, 1 } },
3257 { ISD::SETCC, MVT::f64, { 1, 3, 1, 1 } },
3258 { ISD::SETCC, MVT::v8f32, { 2, 3, 1, 2 } },
3259 { ISD::SETCC, MVT::v4f32, { 1, 3, 1, 1 } },
3260 { ISD::SETCC, MVT::f32, { 1, 3, 1, 1 } },
3261
3262 // AVX1 does not support 8-wide integer compare.
3263 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3264 { ISD::SETCC, MVT::v8i32, { 4, 2, 5, 6 } },
3265 { ISD::SETCC, MVT::v16i16, { 4, 2, 5, 6 } },
3266 { ISD::SETCC, MVT::v32i8, { 4, 2, 5, 6 } },
3267
3268 { ISD::SELECT, MVT::v4f64, { 3, 3, 1, 2 } }, // vblendvpd
3269 { ISD::SELECT, MVT::v8f32, { 3, 3, 1, 2 } }, // vblendvps
3270 { ISD::SELECT, MVT::v4i64, { 3, 3, 1, 2 } }, // vblendvpd
3271 { ISD::SELECT, MVT::v8i32, { 3, 3, 1, 2 } }, // vblendvps
3272 { ISD::SELECT, MVT::v16i16, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3273 { ISD::SELECT, MVT::v32i8, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3274 };
3275
3276 static const CostKindTblEntry SSE42CostTbl[] = {
3277 { ISD::SETCC, MVT::v2i64, { 1, 2, 1, 2 } },
3278 };
3279
3280 static const CostKindTblEntry SSE41CostTbl[] = {
3281 { ISD::SETCC, MVT::v2f64, { 1, 5, 1, 1 } },
3282 { ISD::SETCC, MVT::v4f32, { 1, 5, 1, 1 } },
3283
3284 { ISD::SELECT, MVT::v2f64, { 2, 2, 1, 2 } }, // blendvpd
3285 { ISD::SELECT, MVT::f64, { 2, 2, 1, 2 } }, // blendvpd
3286 { ISD::SELECT, MVT::v4f32, { 2, 2, 1, 2 } }, // blendvps
3287 { ISD::SELECT, MVT::f32 , { 2, 2, 1, 2 } }, // blendvps
3288 { ISD::SELECT, MVT::v2i64, { 2, 2, 1, 2 } }, // pblendvb
3289 { ISD::SELECT, MVT::v4i32, { 2, 2, 1, 2 } }, // pblendvb
3290 { ISD::SELECT, MVT::v8i16, { 2, 2, 1, 2 } }, // pblendvb
3291 { ISD::SELECT, MVT::v16i8, { 2, 2, 1, 2 } }, // pblendvb
3292 };
3293
3294 static const CostKindTblEntry SSE2CostTbl[] = {
3295 { ISD::SETCC, MVT::v2f64, { 2, 5, 1, 1 } },
3296 { ISD::SETCC, MVT::f64, { 1, 5, 1, 1 } },
3297
3298 { ISD::SETCC, MVT::v2i64, { 5, 4, 5, 5 } }, // pcmpeqd/pcmpgtd expansion
3299 { ISD::SETCC, MVT::v4i32, { 1, 1, 1, 1 } },
3300 { ISD::SETCC, MVT::v8i16, { 1, 1, 1, 1 } },
3301 { ISD::SETCC, MVT::v16i8, { 1, 1, 1, 1 } },
3302
3303 { ISD::SELECT, MVT::v2f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3304 { ISD::SELECT, MVT::f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3305 { ISD::SELECT, MVT::v2i64, { 2, 2, 3, 3 } }, // pand + pandn + por
3306 { ISD::SELECT, MVT::v4i32, { 2, 2, 3, 3 } }, // pand + pandn + por
3307 { ISD::SELECT, MVT::v8i16, { 2, 2, 3, 3 } }, // pand + pandn + por
3308 { ISD::SELECT, MVT::v16i8, { 2, 2, 3, 3 } }, // pand + pandn + por
3309 };
3310
3311 static const CostKindTblEntry SSE1CostTbl[] = {
3312 { ISD::SETCC, MVT::v4f32, { 2, 5, 1, 1 } },
3313 { ISD::SETCC, MVT::f32, { 1, 5, 1, 1 } },
3314
3315 { ISD::SELECT, MVT::v4f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3316 { ISD::SELECT, MVT::f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3317 };
3318
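// Note (editorial comment, not in the upstream file): look up the cost in the
// most specific table the subtarget supports; the matched per-kind cost plus
// ExtraCost is scaled by the type legalization factor LT.first.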
3319 if (ST->useSLMArithCosts())
3320 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
3321 if (auto KindCost = Entry->Cost[CostKind])
3322 return LT.first * (ExtraCost + *KindCost);
3323
3324 if (ST->hasBWI())
3325 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
3326 if (auto KindCost = Entry->Cost[CostKind])
3327 return LT.first * (ExtraCost + *KindCost);
3328
3329 if (ST->hasAVX512())
3330 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
3331 if (auto KindCost = Entry->Cost[CostKind])
3332 return LT.first * (ExtraCost + *KindCost);
3333
3334 if (ST->hasAVX2())
3335 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
3336 if (auto KindCost = Entry->Cost[CostKind])
3337 return LT.first * (ExtraCost + *KindCost);
3338
3339 if (ST->hasXOP())
3340 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
3341 if (auto KindCost = Entry->Cost[CostKind])
3342 return LT.first * (ExtraCost + *KindCost);
3343
3344 if (ST->hasAVX())
3345 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
3346 if (auto KindCost = Entry->Cost[CostKind])
3347 return LT.first * (ExtraCost + *KindCost);
3348
3349 if (ST->hasSSE42())
3350 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
3351 if (auto KindCost = Entry->Cost[CostKind])
3352 return LT.first * (ExtraCost + *KindCost);
3353
3354 if (ST->hasSSE41())
3355 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
3356 if (auto KindCost = Entry->Cost[CostKind])
3357 return LT.first * (ExtraCost + *KindCost);
3358
3359 if (ST->hasSSE2())
3360 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
3361 if (auto KindCost = Entry->Cost[CostKind])
3362 return LT.first * (ExtraCost + *KindCost);
3363
3364 if (ST->hasSSE1())
3365 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
3366 if (auto KindCost = Entry->Cost[CostKind])
3367 return LT.first * (ExtraCost + *KindCost);
3368
3369 // Assume a 3cy latency for fp select ops.
3370 if (CostKind == TTI::TCK_Latency && Opcode == Instruction::Select)
3371 if (ValTy->getScalarType()->isFloatingPointTy())
3372 return 3;
3373
3374 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
3375}
3376
3377 unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; }
3378
3379 InstructionCost
3380 X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
3381 TTI::TargetCostKind CostKind) {
3382 // Costs should match the codegen from:
3383 // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
3384 // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
3385 // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
3386 // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
3387 // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
3388
3389 // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
3390 // specialized in these tables yet.
3391 static const CostKindTblEntry AVX512VBMI2CostTbl[] = {
3392 { ISD::FSHL, MVT::v8i64, { 1, 1, 1, 1 } },
3393 { ISD::FSHL, MVT::v4i64, { 1, 1, 1, 1 } },
3394 { ISD::FSHL, MVT::v2i64, { 1, 1, 1, 1 } },
3395 { ISD::FSHL, MVT::v16i32, { 1, 1, 1, 1 } },
3396 { ISD::FSHL, MVT::v8i32, { 1, 1, 1, 1 } },
3397 { ISD::FSHL, MVT::v4i32, { 1, 1, 1, 1 } },
3398 { ISD::FSHL, MVT::v32i16, { 1, 1, 1, 1 } },
3399 { ISD::FSHL, MVT::v16i16, { 1, 1, 1, 1 } },
3400 { ISD::FSHL, MVT::v8i16, { 1, 1, 1, 1 } },
3401 { ISD::ROTL, MVT::v32i16, { 1, 1, 1, 1 } },
3402 { ISD::ROTL, MVT::v16i16, { 1, 1, 1, 1 } },
3403 { ISD::ROTL, MVT::v8i16, { 1, 1, 1, 1 } },
3404 { ISD::ROTR, MVT::v32i16, { 1, 1, 1, 1 } },
3405 { ISD::ROTR, MVT::v16i16, { 1, 1, 1, 1 } },
3406 { ISD::ROTR, MVT::v8i16, { 1, 1, 1, 1 } },
3407 };
3408 static const CostKindTblEntry AVX512BITALGCostTbl[] = {
3409 { ISD::CTPOP, MVT::v32i16, { 1, 1, 1, 1 } },
3410 { ISD::CTPOP, MVT::v64i8, { 1, 1, 1, 1 } },
3411 { ISD::CTPOP, MVT::v16i16, { 1, 1, 1, 1 } },
3412 { ISD::CTPOP, MVT::v32i8, { 1, 1, 1, 1 } },
3413 { ISD::CTPOP, MVT::v8i16, { 1, 1, 1, 1 } },
3414 { ISD::CTPOP, MVT::v16i8, { 1, 1, 1, 1 } },
3415 };
3416 static const CostKindTblEntry AVX512VPOPCNTDQCostTbl[] = {
3417 { ISD::CTPOP, MVT::v8i64, { 1, 1, 1, 1 } },
3418 { ISD::CTPOP, MVT::v16i32, { 1, 1, 1, 1 } },
3419 { ISD::CTPOP, MVT::v4i64, { 1, 1, 1, 1 } },
3420 { ISD::CTPOP, MVT::v8i32, { 1, 1, 1, 1 } },
3421 { ISD::CTPOP, MVT::v2i64, { 1, 1, 1, 1 } },
3422 { ISD::CTPOP, MVT::v4i32, { 1, 1, 1, 1 } },
3423 };
3424 static const CostKindTblEntry AVX512CDCostTbl[] = {
3425 { ISD::CTLZ, MVT::v8i64, { 1, 5, 1, 1 } },
3426 { ISD::CTLZ, MVT::v16i32, { 1, 5, 1, 1 } },
3427 { ISD::CTLZ, MVT::v32i16, { 18, 27, 23, 27 } },
3428 { ISD::CTLZ, MVT::v64i8, { 3, 16, 9, 11 } },
3429 { ISD::CTLZ, MVT::v4i64, { 1, 5, 1, 1 } },
3430 { ISD::CTLZ, MVT::v8i32, { 1, 5, 1, 1 } },
3431 { ISD::CTLZ, MVT::v16i16, { 8, 19, 11, 13 } },
3432 { ISD::CTLZ, MVT::v32i8, { 2, 11, 9, 10 } },
3433 { ISD::CTLZ, MVT::v2i64, { 1, 5, 1, 1 } },
3434 { ISD::CTLZ, MVT::v4i32, { 1, 5, 1, 1 } },
3435 { ISD::CTLZ, MVT::v8i16, { 3, 15, 4, 6 } },
3436 { ISD::CTLZ, MVT::v16i8, { 2, 10, 9, 10 } },
3437
3438 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3439 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3440 { ISD::CTTZ, MVT::v4i64, { 1, 8, 6, 6 } },
3441 { ISD::CTTZ, MVT::v8i32, { 1, 8, 6, 6 } },
3442 { ISD::CTTZ, MVT::v2i64, { 1, 8, 6, 6 } },
3443 { ISD::CTTZ, MVT::v4i32, { 1, 8, 6, 6 } },
3444 };
3445 static const CostKindTblEntry AVX512BWCostTbl[] = {
3446 { ISD::ABS, MVT::v32i16, { 1, 1, 1, 1 } },
3447 { ISD::ABS, MVT::v64i8, { 1, 1, 1, 1 } },
3448 { ISD::BITREVERSE, MVT::v2i64, { 3, 10, 10, 11 } },
3449 { ISD::BITREVERSE, MVT::v4i64, { 3, 11, 10, 11 } },
3450 { ISD::BITREVERSE, MVT::v8i64, { 3, 12, 10, 14 } },
3451 { ISD::BITREVERSE, MVT::v4i32, { 3, 10, 10, 11 } },
3452 { ISD::BITREVERSE, MVT::v8i32, { 3, 11, 10, 11 } },
3453 { ISD::BITREVERSE, MVT::v16i32, { 3, 12, 10, 14 } },
3454 { ISD::BITREVERSE, MVT::v8i16, { 3, 10, 10, 11 } },
3455 { ISD::BITREVERSE, MVT::v16i16, { 3, 11, 10, 11 } },
3456 { ISD::BITREVERSE, MVT::v32i16, { 3, 12, 10, 14 } },
3457 { ISD::BITREVERSE, MVT::v16i8, { 2, 5, 9, 9 } },
3458 { ISD::BITREVERSE, MVT::v32i8, { 2, 5, 9, 9 } },
3459 { ISD::BITREVERSE, MVT::v64i8, { 2, 5, 9, 12 } },
3460 { ISD::BSWAP, MVT::v2i64, { 1, 1, 1, 2 } },
3461 { ISD::BSWAP, MVT::v4i64, { 1, 1, 1, 2 } },
3462 { ISD::BSWAP, MVT::v8i64, { 1, 1, 1, 2 } },
3463 { ISD::BSWAP, MVT::v4i32, { 1, 1, 1, 2 } },
3464 { ISD::BSWAP, MVT::v8i32, { 1, 1, 1, 2 } },
3465 { ISD::BSWAP, MVT::v16i32, { 1, 1, 1, 2 } },
3466 { ISD::BSWAP, MVT::v8i16, { 1, 1, 1, 2 } },
3467 { ISD::BSWAP, MVT::v16i16, { 1, 1, 1, 2 } },
3468 { ISD::BSWAP, MVT::v32i16, { 1, 1, 1, 2 } },
3469 { ISD::CTLZ, MVT::v8i64, { 8, 22, 23, 23 } },
3470 { ISD::CTLZ, MVT::v16i32, { 8, 23, 25, 25 } },
3471 { ISD::CTLZ, MVT::v32i16, { 4, 15, 15, 16 } },
3472 { ISD::CTLZ, MVT::v64i8, { 3, 12, 10, 9 } },
3473 { ISD::CTPOP, MVT::v2i64, { 3, 7, 10, 10 } },
3474 { ISD::CTPOP, MVT::v4i64, { 3, 7, 10, 10 } },
3475 { ISD::CTPOP, MVT::v8i64, { 3, 8, 10, 12 } },
3476 { ISD::CTPOP, MVT::v4i32, { 7, 11, 14, 14 } },
3477 { ISD::CTPOP, MVT::v8i32, { 7, 11, 14, 14 } },
3478 { ISD::CTPOP, MVT::v16i32, { 7, 12, 14, 16 } },
3479 { ISD::CTPOP, MVT::v8i16, { 2, 7, 11, 11 } },
3480 { ISD::CTPOP, MVT::v16i16, { 2, 7, 11, 11 } },
3481 { ISD::CTPOP, MVT::v32i16, { 3, 7, 11, 13 } },
3482 { ISD::CTPOP, MVT::v16i8, { 2, 4, 8, 8 } },
3483 { ISD::CTPOP, MVT::v32i8, { 2, 4, 8, 8 } },
3484 { ISD::CTPOP, MVT::v64i8, { 2, 5, 8, 10 } },
3485 { ISD::CTTZ, MVT::v8i16, { 3, 9, 14, 14 } },
3486 { ISD::CTTZ, MVT::v16i16, { 3, 9, 14, 14 } },
3487 { ISD::CTTZ, MVT::v32i16, { 3, 10, 14, 16 } },
3488 { ISD::CTTZ, MVT::v16i8, { 2, 6, 11, 11 } },
3489 { ISD::CTTZ, MVT::v32i8, { 2, 6, 11, 11 } },
3490 { ISD::CTTZ, MVT::v64i8, { 3, 7, 11, 13 } },
3491 { ISD::ROTL, MVT::v32i16, { 2, 8, 6, 8 } },
3492 { ISD::ROTL, MVT::v16i16, { 2, 8, 6, 7 } },
3493 { ISD::ROTL, MVT::v8i16, { 2, 7, 6, 7 } },
3494 { ISD::ROTL, MVT::v64i8, { 5, 6, 11, 12 } },
3495 { ISD::ROTL, MVT::v32i8, { 5, 15, 7, 10 } },
3496 { ISD::ROTL, MVT::v16i8, { 5, 15, 7, 10 } },
3497 { ISD::ROTR, MVT::v32i16, { 2, 8, 6, 8 } },
3498 { ISD::ROTR, MVT::v16i16, { 2, 8, 6, 7 } },
3499 { ISD::ROTR, MVT::v8i16, { 2, 7, 6, 7 } },
3500 { ISD::ROTR, MVT::v64i8, { 5, 6, 12, 14 } },
3501 { ISD::ROTR, MVT::v32i8, { 5, 14, 6, 9 } },
3502 { ISD::ROTR, MVT::v16i8, { 5, 14, 6, 9 } },
3503 { ISD::SADDSAT, MVT::v32i16, { 1 } },
3504 { ISD::SADDSAT, MVT::v64i8, { 1 } },
3505 { ISD::SMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3506 { ISD::SMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3507 { ISD::SMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3508 { ISD::SMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3509 { ISD::SSUBSAT, MVT::v32i16, { 1 } },
3510 { ISD::SSUBSAT, MVT::v64i8, { 1 } },
3511 { ISD::UADDSAT, MVT::v32i16, { 1 } },
3512 { ISD::UADDSAT, MVT::v64i8, { 1 } },
3513 { ISD::UMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3514 { ISD::UMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3515 { ISD::UMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3516 { ISD::UMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3517 { ISD::USUBSAT, MVT::v32i16, { 1 } },
3518 { ISD::USUBSAT, MVT::v64i8, { 1 } },
3519 };
3520 static const CostKindTblEntry AVX512CostTbl[] = {
3521 { ISD::ABS, MVT::v8i64, { 1, 1, 1, 1 } },
3522 { ISD::ABS, MVT::v4i64, { 1, 1, 1, 1 } },
3523 { ISD::ABS, MVT::v2i64, { 1, 1, 1, 1 } },
3524 { ISD::ABS, MVT::v16i32, { 1, 1, 1, 1 } },
3525 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 1 } },
3526 { ISD::ABS, MVT::v32i16, { 2, 7, 4, 4 } },
3527 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 1 } },
3528 { ISD::ABS, MVT::v64i8, { 2, 7, 4, 4 } },
3529 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 1 } },
3530 { ISD::BITREVERSE, MVT::v8i64, { 9, 13, 20, 20 } },
3531 { ISD::BITREVERSE, MVT::v16i32, { 9, 13, 20, 20 } },
3532 { ISD::BITREVERSE, MVT::v32i16, { 9, 13, 20, 20 } },
3533 { ISD::BITREVERSE, MVT::v64i8, { 6, 11, 17, 17 } },
3534 { ISD::BSWAP, MVT::v8i64, { 4, 7, 5, 5 } },
3535 { ISD::BSWAP, MVT::v16i32, { 4, 7, 5, 5 } },
3536 { ISD::BSWAP, MVT::v32i16, { 4, 7, 5, 5 } },
3537 { ISD::CTLZ, MVT::v8i64, { 10, 28, 32, 32 } },
3538 { ISD::CTLZ, MVT::v16i32, { 12, 30, 38, 38 } },
3539 { ISD::CTLZ, MVT::v32i16, { 8, 15, 29, 29 } },
3540 { ISD::CTLZ, MVT::v64i8, { 6, 11, 19, 19 } },
3541 { ISD::CTPOP, MVT::v8i64, { 16, 16, 19, 19 } },
3542 { ISD::CTPOP, MVT::v16i32, { 24, 19, 27, 27 } },
3543 { ISD::CTPOP, MVT::v32i16, { 18, 15, 22, 22 } },
3544 { ISD::CTPOP, MVT::v64i8, { 12, 11, 16, 16 } },
3545 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3546 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3547 { ISD::CTTZ, MVT::v32i16, { 7, 17, 27, 27 } },
3548 { ISD::CTTZ, MVT::v64i8, { 6, 13, 21, 21 } },
3549 { ISD::ROTL, MVT::v8i64, { 1, 1, 1, 1 } },
3550 { ISD::ROTL, MVT::v4i64, { 1, 1, 1, 1 } },
3551 { ISD::ROTL, MVT::v2i64, { 1, 1, 1, 1 } },
3552 { ISD::ROTL, MVT::v16i32, { 1, 1, 1, 1 } },
3553 { ISD::ROTL, MVT::v8i32, { 1, 1, 1, 1 } },
3554 { ISD::ROTL, MVT::v4i32, { 1, 1, 1, 1 } },
3555 { ISD::ROTR, MVT::v8i64, { 1, 1, 1, 1 } },
3556 { ISD::ROTR, MVT::v4i64, { 1, 1, 1, 1 } },
3557 { ISD::ROTR, MVT::v2i64, { 1, 1, 1, 1 } },
3558 { ISD::ROTR, MVT::v16i32, { 1, 1, 1, 1 } },
3559 { ISD::ROTR, MVT::v8i32, { 1, 1, 1, 1 } },
3560 { ISD::ROTR, MVT::v4i32, { 1, 1, 1, 1 } },
3561 { ISD::SMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3562 { ISD::SMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3563 { ISD::SMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3564 { ISD::SMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3565 { ISD::SMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3566 { ISD::SMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3567 { ISD::SMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3568 { ISD::SMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3569 { ISD::SMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3570 { ISD::SMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3571 { ISD::SMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3572 { ISD::SMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3573 { ISD::UMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3574 { ISD::UMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3575 { ISD::UMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3576 { ISD::UMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3577 { ISD::UMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3578 { ISD::UMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3579 { ISD::UMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3580 { ISD::UMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3581 { ISD::UMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3582 { ISD::UMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3583 { ISD::UMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3584 { ISD::UMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3585 { ISD::USUBSAT, MVT::v16i32, { 2 } }, // pmaxud + psubd
3586 { ISD::USUBSAT, MVT::v2i64, { 2 } }, // pmaxuq + psubq
3587 { ISD::USUBSAT, MVT::v4i64, { 2 } }, // pmaxuq + psubq
3588 { ISD::USUBSAT, MVT::v8i64, { 2 } }, // pmaxuq + psubq
3589 { ISD::UADDSAT, MVT::v16i32, { 3 } }, // not + pminud + paddd
3590 { ISD::UADDSAT, MVT::v2i64, { 3 } }, // not + pminuq + paddq
3591 { ISD::UADDSAT, MVT::v4i64, { 3 } }, // not + pminuq + paddq
3592 { ISD::UADDSAT, MVT::v8i64, { 3 } }, // not + pminuq + paddq
3593 { ISD::SADDSAT, MVT::v32i16, { 2 } },
3594 { ISD::SADDSAT, MVT::v64i8, { 2 } },
3595 { ISD::SSUBSAT, MVT::v32i16, { 2 } },
3596 { ISD::SSUBSAT, MVT::v64i8, { 2 } },
3597 { ISD::UADDSAT, MVT::v32i16, { 2 } },
3598 { ISD::UADDSAT, MVT::v64i8, { 2 } },
3599 { ISD::USUBSAT, MVT::v32i16, { 2 } },
3600 { ISD::USUBSAT, MVT::v64i8, { 2 } },
3601 { ISD::FMAXNUM, MVT::f32, { 2, 2, 3, 3 } },
3602 { ISD::FMAXNUM, MVT::v4f32, { 1, 1, 3, 3 } },
3603 { ISD::FMAXNUM, MVT::v8f32, { 2, 2, 3, 3 } },
3604 { ISD::FMAXNUM, MVT::v16f32, { 4, 4, 3, 3 } },
3605 { ISD::FMAXNUM, MVT::f64, { 2, 2, 3, 3 } },
3606 { ISD::FMAXNUM, MVT::v2f64, { 1, 1, 3, 3 } },
3607 { ISD::FMAXNUM, MVT::v4f64, { 2, 2, 3, 3 } },
3608 { ISD::FMAXNUM, MVT::v8f64, { 3, 3, 3, 3 } },
3609 { ISD::FSQRT, MVT::f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3610 { ISD::FSQRT, MVT::v4f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3611 { ISD::FSQRT, MVT::v8f32, { 6, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3612 { ISD::FSQRT, MVT::v16f32, { 12, 20, 1, 3 } }, // Skylake from http://www.agner.org/
3613 { ISD::FSQRT, MVT::f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3614 { ISD::FSQRT, MVT::v2f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3615 { ISD::FSQRT, MVT::v4f64, { 12, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3616 { ISD::FSQRT, MVT::v8f64, { 24, 32, 1, 3 } }, // Skylake from http://www.agner.org/
3617 };
3618 static const CostKindTblEntry XOPCostTbl[] = {
3619 { ISD::BITREVERSE, MVT::v4i64, { 3, 6, 5, 6 } },
3620 { ISD::BITREVERSE, MVT::v8i32, { 3, 6, 5, 6 } },
3621 { ISD::BITREVERSE, MVT::v16i16, { 3, 6, 5, 6 } },
3622 { ISD::BITREVERSE, MVT::v32i8, { 3, 6, 5, 6 } },
3623 { ISD::BITREVERSE, MVT::v2i64, { 2, 7, 1, 1 } },
3624 { ISD::BITREVERSE, MVT::v4i32, { 2, 7, 1, 1 } },
3625 { ISD::BITREVERSE, MVT::v8i16, { 2, 7, 1, 1 } },
3626 { ISD::BITREVERSE, MVT::v16i8, { 2, 7, 1, 1 } },
3627 { ISD::BITREVERSE, MVT::i64, { 2, 2, 3, 4 } },
3628 { ISD::BITREVERSE, MVT::i32, { 2, 2, 3, 4 } },
3629 { ISD::BITREVERSE, MVT::i16, { 2, 2, 3, 4 } },
3630 { ISD::BITREVERSE, MVT::i8, { 2, 2, 3, 4 } },
3631 // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
3632 { ISD::ROTL, MVT::v4i64, { 4, 7, 5, 6 } },
3633 { ISD::ROTL, MVT::v8i32, { 4, 7, 5, 6 } },
3634 { ISD::ROTL, MVT::v16i16, { 4, 7, 5, 6 } },
3635 { ISD::ROTL, MVT::v32i8, { 4, 7, 5, 6 } },
3636 { ISD::ROTL, MVT::v2i64, { 1, 3, 1, 1 } },
3637 { ISD::ROTL, MVT::v4i32, { 1, 3, 1, 1 } },
3638 { ISD::ROTL, MVT::v8i16, { 1, 3, 1, 1 } },
3639 { ISD::ROTL, MVT::v16i8, { 1, 3, 1, 1 } },
3640 { ISD::ROTR, MVT::v4i64, { 4, 7, 8, 9 } },
3641 { ISD::ROTR, MVT::v8i32, { 4, 7, 8, 9 } },
3642 { ISD::ROTR, MVT::v16i16, { 4, 7, 8, 9 } },
3643 { ISD::ROTR, MVT::v32i8, { 4, 7, 8, 9 } },
3644 { ISD::ROTR, MVT::v2i64, { 1, 3, 3, 3 } },
3645 { ISD::ROTR, MVT::v4i32, { 1, 3, 3, 3 } },
3646 { ISD::ROTR, MVT::v8i16, { 1, 3, 3, 3 } },
3647 { ISD::ROTR, MVT::v16i8, { 1, 3, 3, 3 } }
3648 };
3649 static const CostKindTblEntry AVX2CostTbl[] = {
3650 { ISD::ABS, MVT::v2i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3651 { ISD::ABS, MVT::v4i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3652 { ISD::ABS, MVT::v4i32, { 1, 1, 1, 1 } },
3653 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 2 } },
3654 { ISD::ABS, MVT::v8i16, { 1, 1, 1, 1 } },
3655 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 2 } },
3656 { ISD::ABS, MVT::v16i8, { 1, 1, 1, 1 } },
3657 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 2 } },
3658 { ISD::BITREVERSE, MVT::v2i64, { 3, 11, 10, 11 } },
3659 { ISD::BITREVERSE, MVT::v4i64, { 5, 11, 10, 17 } },
3660 { ISD::BITREVERSE, MVT::v4i32, { 3, 11, 10, 11 } },
3661 { ISD::BITREVERSE, MVT::v8i32, { 5, 11, 10, 17 } },
3662 { ISD::BITREVERSE, MVT::v8i16, { 3, 11, 10, 11 } },
3663 { ISD::BITREVERSE, MVT::v16i16, { 5, 11, 10, 17 } },
3664 { ISD::BITREVERSE, MVT::v16i8, { 3, 6, 9, 9 } },
3665 { ISD::BITREVERSE, MVT::v32i8, { 4, 5, 9, 15 } },
3666 { ISD::BSWAP, MVT::v2i64, { 1, 2, 1, 2 } },
3667 { ISD::BSWAP, MVT::v4i64, { 1, 3, 1, 2 } },
3668 { ISD::BSWAP, MVT::v4i32, { 1, 2, 1, 2 } },
3669 { ISD::BSWAP, MVT::v8i32, { 1, 3, 1, 2 } },
3670 { ISD::BSWAP, MVT::v8i16, { 1, 2, 1, 2 } },
3671 { ISD::BSWAP, MVT::v16i16, { 1, 3, 1, 2 } },
3672 { ISD::CTLZ, MVT::v2i64, { 7, 18, 24, 25 } },
3673 { ISD::CTLZ, MVT::v4i64, { 14, 18, 24, 44 } },
3674 { ISD::CTLZ, MVT::v4i32, { 5, 16, 19, 20 } },
3675 { ISD::CTLZ, MVT::v8i32, { 10, 16, 19, 34 } },
3676 { ISD::CTLZ, MVT::v8i16, { 4, 13, 14, 15 } },
3677 { ISD::CTLZ, MVT::v16i16, { 6, 14, 14, 24 } },
3678 { ISD::CTLZ, MVT::v16i8, { 3, 12, 9, 10 } },
3679 { ISD::CTLZ, MVT::v32i8, { 4, 12, 9, 14 } },
3680 { ISD::CTPOP, MVT::v2i64, { 3, 9, 10, 10 } },
3681 { ISD::CTPOP, MVT::v4i64, { 4, 9, 10, 14 } },
3682 { ISD::CTPOP, MVT::v4i32, { 7, 12, 14, 14 } },
3683 { ISD::CTPOP, MVT::v8i32, { 7, 12, 14, 18 } },
3684 { ISD::CTPOP, MVT::v8i16, { 3, 7, 11, 11 } },
3685 { ISD::CTPOP, MVT::v16i16, { 6, 8, 11, 18 } },
3686 { ISD::CTPOP, MVT::v16i8, { 2, 5, 8, 8 } },
3687 { ISD::CTPOP, MVT::v32i8, { 3, 5, 8, 12 } },
3688 { ISD::CTTZ, MVT::v2i64, { 4, 11, 13, 13 } },
3689 { ISD::CTTZ, MVT::v4i64, { 5, 11, 13, 20 } },
3690 { ISD::CTTZ, MVT::v4i32, { 7, 14, 17, 17 } },
3691 { ISD::CTTZ, MVT::v8i32, { 7, 15, 17, 24 } },
3692 { ISD::CTTZ, MVT::v8i16, { 4, 9, 14, 14 } },
3693 { ISD::CTTZ, MVT::v16i16, { 6, 9, 14, 24 } },
3694 { ISD::CTTZ, MVT::v16i8, { 3, 7, 11, 11 } },
3695 { ISD::CTTZ, MVT::v32i8, { 5, 7, 11, 18 } },
3696 { ISD::SADDSAT, MVT::v16i16, { 1 } },
3697 { ISD::SADDSAT, MVT::v32i8, { 1 } },
3698 { ISD::SMAX, MVT::v2i64, { 2, 7, 2, 3 } },
3699 { ISD::SMAX, MVT::v4i64, { 2, 7, 2, 3 } },
3700 { ISD::SMAX, MVT::v8i32, { 1, 1, 1, 2 } },
3701 { ISD::SMAX, MVT::v16i16, { 1, 1, 1, 2 } },
3702 { ISD::SMAX, MVT::v32i8, { 1, 1, 1, 2 } },
3703 { ISD::SMIN, MVT::v2i64, { 2, 7, 2, 3 } },
3704 { ISD::SMIN, MVT::v4i64, { 2, 7, 2, 3 } },
3705 { ISD::SMIN, MVT::v8i32, { 1, 1, 1, 2 } },
3706 { ISD::SMIN, MVT::v16i16, { 1, 1, 1, 2 } },
3707 { ISD::SMIN, MVT::v32i8, { 1, 1, 1, 2 } },
3708 { ISD::SSUBSAT, MVT::v16i16, { 1 } },
3709 { ISD::SSUBSAT, MVT::v32i8, { 1 } },
3710 { ISD::UADDSAT, MVT::v16i16, { 1 } },
3711 { ISD::UADDSAT, MVT::v32i8, { 1 } },
3712 { ISD::UADDSAT, MVT::v8i32, { 3 } }, // not + pminud + paddd
3713 { ISD::UMAX, MVT::v2i64, { 2, 8, 5, 6 } },
3714 { ISD::UMAX, MVT::v4i64, { 2, 8, 5, 8 } },
3715 { ISD::UMAX, MVT::v8i32, { 1, 1, 1, 2 } },
3716 { ISD::UMAX, MVT::v16i16, { 1, 1, 1, 2 } },
3717 { ISD::UMAX, MVT::v32i8, { 1, 1, 1, 2 } },
3718 { ISD::UMIN, MVT::v2i64, { 2, 8, 5, 6 } },
3719 { ISD::UMIN, MVT::v4i64, { 2, 8, 5, 8 } },
3720 { ISD::UMIN, MVT::v8i32, { 1, 1, 1, 2 } },
3721 { ISD::UMIN, MVT::v16i16, { 1, 1, 1, 2 } },
3722 { ISD::UMIN, MVT::v32i8, { 1, 1, 1, 2 } },
3723 { ISD::USUBSAT, MVT::v16i16, { 1 } },
3724 { ISD::USUBSAT, MVT::v32i8, { 1 } },
3725 { ISD::USUBSAT, MVT::v8i32, { 2 } }, // pmaxud + psubd
3726 { ISD::FMAXNUM, MVT::f32, { 2, 7, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
3727 { ISD::FMAXNUM, MVT::v4f32, { 2, 7, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3728 { ISD::FMAXNUM, MVT::v8f32, { 3, 7, 3, 6 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3729 { ISD::FMAXNUM, MVT::f64, { 2, 7, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
3730 { ISD::FMAXNUM, MVT::v2f64, { 2, 7, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3731 { ISD::FMAXNUM, MVT::v4f64, { 3, 7, 3, 6 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3732 { ISD::FSQRT, MVT::f32, { 7, 15, 1, 1 } }, // vsqrtss
3733 { ISD::FSQRT, MVT::v4f32, { 7, 15, 1, 1 } }, // vsqrtps
3734 { ISD::FSQRT, MVT::v8f32, { 14, 21, 1, 3 } }, // vsqrtps
3735 { ISD::FSQRT, MVT::f64, { 14, 21, 1, 1 } }, // vsqrtsd
3736 { ISD::FSQRT, MVT::v2f64, { 14, 21, 1, 1 } }, // vsqrtpd
3737 { ISD::FSQRT, MVT::v4f64, { 28, 35, 1, 3 } }, // vsqrtpd
3738 };
3739 static const CostKindTblEntry AVX1CostTbl[] = {
3740 { ISD::ABS, MVT::v4i64, { 6, 8, 6, 12 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3741 { ISD::ABS, MVT::v8i32, { 3, 6, 4, 5 } },
3742 { ISD::ABS, MVT::v16i16, { 3, 6, 4, 5 } },
3743 { ISD::ABS, MVT::v32i8, { 3, 6, 4, 5 } },
3744 { ISD::BITREVERSE, MVT::v4i64, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
3745 { ISD::BITREVERSE, MVT::v2i64, { 8, 13, 10, 16 } },
3746 { ISD::BITREVERSE, MVT::v8i32, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
3747 { ISD::BITREVERSE, MVT::v4i32, { 8, 13, 10, 16 } },
3748 { ISD::BITREVERSE, MVT::v16i16, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
3749 { ISD::BITREVERSE, MVT::v8i16, { 8, 13, 10, 16 } },
3750 { ISD::BITREVERSE, MVT::v32i8, { 13, 15, 17, 26 } }, // 2 x 128-bit Op + extract/insert
3751 { ISD::BITREVERSE, MVT::v16i8, { 7, 7, 9, 13 } },
3752 { ISD::BSWAP, MVT::v4i64, { 5, 6, 5, 10 } },
3753 { ISD::BSWAP, MVT::v2i64, { 2, 2, 1, 3 } },
3754 { ISD::BSWAP, MVT::v8i32, { 5, 6, 5, 10 } },
3755 { ISD::BSWAP, MVT::v4i32, { 2, 2, 1, 3 } },
3756 { ISD::BSWAP, MVT::v16i16, { 5, 6, 5, 10 } },
3757 { ISD::BSWAP, MVT::v8i16, { 2, 2, 1, 3 } },
3758 { ISD::CTLZ, MVT::v4i64, { 29, 33, 49, 58 } }, // 2 x 128-bit Op + extract/insert
3759 { ISD::CTLZ, MVT::v2i64, { 14, 24, 24, 28 } },
3760 { ISD::CTLZ, MVT::v8i32, { 24, 28, 39, 48 } }, // 2 x 128-bit Op + extract/insert
3761 { ISD::CTLZ, MVT::v4i32, { 12, 20, 19, 23 } },
3762 { ISD::CTLZ, MVT::v16i16, { 19, 22, 29, 38 } }, // 2 x 128-bit Op + extract/insert
3763 { ISD::CTLZ, MVT::v8i16, { 9, 16, 14, 18 } },
3764 { ISD::CTLZ, MVT::v32i8, { 14, 15, 19, 28 } }, // 2 x 128-bit Op + extract/insert
3765 { ISD::CTLZ, MVT::v16i8, { 7, 12, 9, 13 } },
3766 { ISD::CTPOP, MVT::v4i64, { 14, 18, 19, 28 } }, // 2 x 128-bit Op + extract/insert
3767 { ISD::CTPOP, MVT::v2i64, { 7, 14, 10, 14 } },
3768 { ISD::CTPOP, MVT::v8i32, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
3769 { ISD::CTPOP, MVT::v4i32, { 9, 20, 14, 18 } },
3770 { ISD::CTPOP, MVT::v16i16, { 16, 21, 22, 31 } }, // 2 x 128-bit Op + extract/insert
3771 { ISD::CTPOP, MVT::v8i16, { 8, 18, 11, 15 } },
3772 { ISD::CTPOP, MVT::v32i8, { 13, 15, 16, 25 } }, // 2 x 128-bit Op + extract/insert
3773 { ISD::CTPOP, MVT::v16i8, { 6, 12, 8, 12 } },
3774 { ISD::CTTZ, MVT::v4i64, { 17, 22, 24, 33 } }, // 2 x 128-bit Op + extract/insert
3775 { ISD::CTTZ, MVT::v2i64, { 9, 19, 13, 17 } },
3776 { ISD::CTTZ, MVT::v8i32, { 21, 27, 32, 41 } }, // 2 x 128-bit Op + extract/insert
3777 { ISD::CTTZ, MVT::v4i32, { 11, 24, 17, 21 } },
3778 { ISD::CTTZ, MVT::v16i16, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
3779 { ISD::CTTZ, MVT::v8i16, { 9, 21, 14, 18 } },
3780 { ISD::CTTZ, MVT::v32i8, { 15, 18, 21, 30 } }, // 2 x 128-bit Op + extract/insert
3781 { ISD::CTTZ, MVT::v16i8, { 8, 16, 11, 15 } },
3782 { ISD::SADDSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
3783 { ISD::SADDSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
3784 { ISD::SMAX, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
3785 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 4 } },
3786 { ISD::SMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3787 { ISD::SMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3788 { ISD::SMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3789 { ISD::SMIN, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
3790 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
3791 { ISD::SMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3792 { ISD::SMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3793 { ISD::SMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3794 { ISD::SSUBSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
3795 { ISD::SSUBSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
3796 { ISD::UADDSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
3797 { ISD::UADDSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
3798 { ISD::UADDSAT, MVT::v8i32, { 8 } }, // 2 x 128-bit Op + extract/insert
3799 { ISD::UMAX, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
3800 { ISD::UMAX, MVT::v2i64, { 4, 8, 5, 7 } },
3801 { ISD::UMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3802 { ISD::UMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3803 { ISD::UMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3804 { ISD::UMIN, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
3805 { ISD::UMIN, MVT::v2i64, { 4, 8, 5, 7 } },
3806 { ISD::UMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3807 { ISD::UMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3808 { ISD::UMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3809 { ISD::USUBSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
3810 { ISD::USUBSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
3811 { ISD::USUBSAT, MVT::v8i32, { 6 } }, // 2 x 128-bit Op + extract/insert
3812 { ISD::FMAXNUM, MVT::f32, { 3, 6, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
3813 { ISD::FMAXNUM, MVT::v4f32, { 3, 6, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3814 { ISD::FMAXNUM, MVT::v8f32, { 5, 7, 3, 10 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3815 { ISD::FMAXNUM, MVT::f64, { 3, 6, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
3816 { ISD::FMAXNUM, MVT::v2f64, { 3, 6, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3817 { ISD::FMAXNUM, MVT::v4f64, { 5, 7, 3, 10 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3818 { ISD::FSQRT, MVT::f32, { 21, 21, 1, 1 } }, // vsqrtss
3819 { ISD::FSQRT, MVT::v4f32, { 21, 21, 1, 1 } }, // vsqrtps
3820 { ISD::FSQRT, MVT::v8f32, { 42, 42, 1, 3 } }, // vsqrtps
3821 { ISD::FSQRT, MVT::f64, { 27, 27, 1, 1 } }, // vsqrtsd
3822 { ISD::FSQRT, MVT::v2f64, { 27, 27, 1, 1 } }, // vsqrtpd
3823 { ISD::FSQRT, MVT::v4f64, { 54, 54, 1, 3 } }, // vsqrtpd
3824 };
3825 static const CostKindTblEntry GLMCostTbl[] = {
3826 { ISD::FSQRT, MVT::f32, { 19, 20, 1, 1 } }, // sqrtss
3827 { ISD::FSQRT, MVT::v4f32, { 37, 41, 1, 5 } }, // sqrtps
3828 { ISD::FSQRT, MVT::f64, { 34, 35, 1, 1 } }, // sqrtsd
3829 { ISD::FSQRT, MVT::v2f64, { 67, 71, 1, 5 } }, // sqrtpd
3830 };
3831 static const CostKindTblEntry SLMCostTbl[] = {
3832 { ISD::BSWAP, MVT::v2i64, { 5, 5, 1, 5 } },
3833 { ISD::BSWAP, MVT::v4i32, { 5, 5, 1, 5 } },
3834 { ISD::BSWAP, MVT::v8i16, { 5, 5, 1, 5 } },
3835 { ISD::FSQRT, MVT::f32, { 20, 20, 1, 1 } }, // sqrtss
3836 { ISD::FSQRT, MVT::v4f32, { 40, 41, 1, 5 } }, // sqrtps
3837 { ISD::FSQRT, MVT::f64, { 35, 35, 1, 1 } }, // sqrtsd
3838 { ISD::FSQRT, MVT::v2f64, { 70, 71, 1, 5 } }, // sqrtpd
3839 };
3840 static const CostKindTblEntry SSE42CostTbl[] = {
3841 { ISD::USUBSAT, MVT::v4i32, { 2 } }, // pmaxud + psubd
3842 { ISD::UADDSAT, MVT::v4i32, { 3 } }, // not + pminud + paddd
3843 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } }, // MAXSS + CMPUNORDSS + BLENDVPS
3844 { ISD::FMAXNUM, MVT::v4f32, { 4, 4, 4, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3845 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } }, // MAXSD + CMPUNORDSD + BLENDVPD
3846 { ISD::FMAXNUM, MVT::v2f64, { 4, 4, 4, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3847 { ISD::FSQRT, MVT::f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
3848 { ISD::FSQRT, MVT::v4f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
3849 };
3850 static const CostKindTblEntry SSE41CostTbl[] = {
3851 { ISD::ABS, MVT::v2i64, { 3, 4, 3, 5 } }, // BLENDVPD(X,PSUBQ(0,X),X)
3852 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 3 } },
3853 { ISD::SMAX, MVT::v4i32, { 1, 1, 1, 1 } },
3854 { ISD::SMAX, MVT::v16i8, { 1, 1, 1, 1 } },
3855 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
3856 { ISD::SMIN, MVT::v4i32, { 1, 1, 1, 1 } },
3857 { ISD::SMIN, MVT::v16i8, { 1, 1, 1, 1 } },
3858 { ISD::UMAX, MVT::v2i64, { 2, 11, 6, 7 } },
3859 { ISD::UMAX, MVT::v4i32, { 1, 1, 1, 1 } },
3860 { ISD::UMAX, MVT::v8i16, { 1, 1, 1, 1 } },
3861 { ISD::UMIN, MVT::v2i64, { 2, 11, 6, 7 } },
3862 { ISD::UMIN, MVT::v4i32, { 1, 1, 1, 1 } },
3863 { ISD::UMIN, MVT::v8i16, { 1, 1, 1, 1 } },
3864 };
3865 static const CostKindTblEntry SSSE3CostTbl[] = {
3866 { ISD::ABS, MVT::v4i32, { 1, 2, 1, 1 } },
3867 { ISD::ABS, MVT::v8i16, { 1, 2, 1, 1 } },
3868 { ISD::ABS, MVT::v16i8, { 1, 2, 1, 1 } },
3869 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 11, 21 } },
3870 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 11, 21 } },
3871 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 11, 21 } },
3872 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 10, 16 } },
3873 { ISD::BSWAP, MVT::v2i64, { 2, 3, 1, 5 } },
3874 { ISD::BSWAP, MVT::v4i32, { 2, 3, 1, 5 } },
3875 { ISD::BSWAP, MVT::v8i16, { 2, 3, 1, 5 } },
3876 { ISD::CTLZ, MVT::v2i64, { 18, 28, 28, 35 } },
3877 { ISD::CTLZ, MVT::v4i32, { 15, 20, 22, 28 } },
3878 { ISD::CTLZ, MVT::v8i16, { 13, 17, 16, 22 } },
3879 { ISD::CTLZ, MVT::v16i8, { 11, 15, 10, 16 } },
3880 { ISD::CTPOP, MVT::v2i64, { 13, 19, 12, 18 } },
3881 { ISD::CTPOP, MVT::v4i32, { 18, 24, 16, 22 } },
3882 { ISD::CTPOP, MVT::v8i16, { 13, 18, 14, 20 } },
3883 { ISD::CTPOP, MVT::v16i8, { 11, 12, 10, 16 } },
3884 { ISD::CTTZ, MVT::v2i64, { 13, 25, 15, 22 } },
3885 { ISD::CTTZ, MVT::v4i32, { 18, 26, 19, 25 } },
3886 { ISD::CTTZ, MVT::v8i16, { 13, 20, 17, 23 } },
3887 { ISD::CTTZ, MVT::v16i8, { 11, 16, 13, 19 } }
3888 };
3889 static const CostKindTblEntry SSE2CostTbl[] = {
3890 { ISD::ABS, MVT::v2i64, { 3, 6, 5, 5 } },
3891 { ISD::ABS, MVT::v4i32, { 1, 4, 4, 4 } },
3892 { ISD::ABS, MVT::v8i16, { 1, 2, 3, 3 } },
3893 { ISD::ABS, MVT::v16i8, { 1, 2, 3, 3 } },
3894 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 32, 32 } },
3895 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 30, 30 } },
3896 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 25, 25 } },
3897 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 21, 21 } },
3898 { ISD::BSWAP, MVT::v2i64, { 5, 6, 11, 11 } },
3899 { ISD::BSWAP, MVT::v4i32, { 5, 5, 9, 9 } },
3900 { ISD::BSWAP, MVT::v8i16, { 5, 5, 4, 5 } },
3901 { ISD::CTLZ, MVT::v2i64, { 10, 45, 36, 38 } },
3902 { ISD::CTLZ, MVT::v4i32, { 10, 45, 38, 40 } },
3903 { ISD::CTLZ, MVT::v8i16, { 9, 38, 32, 34 } },
3904 { ISD::CTLZ, MVT::v16i8, { 8, 39, 29, 32 } },
3905 { ISD::CTPOP, MVT::v2i64, { 12, 26, 16, 18 } },
3906 { ISD::CTPOP, MVT::v4i32, { 15, 29, 21, 23 } },
3907 { ISD::CTPOP, MVT::v8i16, { 13, 25, 18, 20 } },
3908 { ISD::CTPOP, MVT::v16i8, { 10, 21, 14, 16 } },
3909 { ISD::CTTZ, MVT::v2i64, { 14, 28, 19, 21 } },
3910 { ISD::CTTZ, MVT::v4i32, { 18, 31, 24, 26 } },
3911 { ISD::CTTZ, MVT::v8i16, { 16, 27, 21, 23 } },
3912 { ISD::CTTZ, MVT::v16i8, { 13, 23, 17, 19 } },
3913 { ISD::SADDSAT, MVT::v8i16, { 1 } },
3914 { ISD::SADDSAT, MVT::v16i8, { 1 } },
3915 { ISD::SMAX, MVT::v2i64, { 4, 8, 15, 15 } },
3916 { ISD::SMAX, MVT::v4i32, { 2, 4, 5, 5 } },
3917 { ISD::SMAX, MVT::v8i16, { 1, 1, 1, 1 } },
3918 { ISD::SMAX, MVT::v16i8, { 2, 4, 5, 5 } },
3919 { ISD::SMIN, MVT::v2i64, { 4, 8, 15, 15 } },
3920 { ISD::SMIN, MVT::v4i32, { 2, 4, 5, 5 } },
3921 { ISD::SMIN, MVT::v8i16, { 1, 1, 1, 1 } },
3922 { ISD::SMIN, MVT::v16i8, { 2, 4, 5, 5 } },
3923 { ISD::SSUBSAT, MVT::v8i16, { 1 } },
3924 { ISD::SSUBSAT, MVT::v16i8, { 1 } },
3925 { ISD::UADDSAT, MVT::v8i16, { 1 } },
3926 { ISD::UADDSAT, MVT::v16i8, { 1 } },
3927 { ISD::UMAX, MVT::v2i64, { 4, 8, 15, 15 } },
3928 { ISD::UMAX, MVT::v4i32, { 2, 5, 8, 8 } },
3929 { ISD::UMAX, MVT::v8i16, { 1, 3, 3, 3 } },
3930 { ISD::UMAX, MVT::v16i8, { 1, 1, 1, 1 } },
3931 { ISD::UMIN, MVT::v2i64, { 4, 8, 15, 15 } },
3932 { ISD::UMIN, MVT::v4i32, { 2, 5, 8, 8 } },
3933 { ISD::UMIN, MVT::v8i16, { 1, 3, 3, 3 } },
3934 { ISD::UMIN, MVT::v16i8, { 1, 1, 1, 1 } },
3935 { ISD::USUBSAT, MVT::v8i16, { 1 } },
3936 { ISD::USUBSAT, MVT::v16i8, { 1 } },
3937 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } },
3938 { ISD::FMAXNUM, MVT::v2f64, { 4, 6, 6, 6 } },
3939 { ISD::FSQRT, MVT::f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
3940 { ISD::FSQRT, MVT::v2f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
3941 };
3942 static const CostKindTblEntry SSE1CostTbl[] = {
3943 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } },
3944 { ISD::FMAXNUM, MVT::v4f32, { 4, 6, 6, 6 } },
3945 { ISD::FSQRT, MVT::f32, { 28, 30, 1, 2 } }, // Pentium III from http://www.agner.org/
3946 { ISD::FSQRT, MVT::v4f32, { 56, 56, 1, 2 } }, // Pentium III from http://www.agner.org/
3947 };
3948 static const CostKindTblEntry BMI64CostTbl[] = { // 64-bit targets
3949 { ISD::CTTZ, MVT::i64, { 1 } },
3950 };
3951 static const CostKindTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
3952 { ISD::CTTZ, MVT::i32, { 1 } },
3953 { ISD::CTTZ, MVT::i16, { 1 } },
3954 { ISD::CTTZ, MVT::i8, { 1 } },
3955 };
3956 static const CostKindTblEntry LZCNT64CostTbl[] = { // 64-bit targets
3957 { ISD::CTLZ, MVT::i64, { 1 } },
3958 };
3959 static const CostKindTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
3960 { ISD::CTLZ, MVT::i32, { 1 } },
3961 { ISD::CTLZ, MVT::i16, { 2 } },
3962 { ISD::CTLZ, MVT::i8, { 2 } },
3963 };
3964 static const CostKindTblEntry POPCNT64CostTbl[] = { // 64-bit targets
3965 { ISD::CTPOP, MVT::i64, { 1, 1, 1, 1 } }, // popcnt
3966 };
3967 static const CostKindTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
3968 { ISD::CTPOP, MVT::i32, { 1, 1, 1, 1 } }, // popcnt
3969 { ISD::CTPOP, MVT::i16, { 1, 1, 2, 2 } }, // popcnt(zext())
3970 { ISD::CTPOP, MVT::i8, { 1, 1, 2, 2 } }, // popcnt(zext())
3971 };
3972 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
3973 { ISD::ABS, MVT::i64, { 1, 2, 3, 4 } }, // SUB+CMOV
3974 { ISD::BITREVERSE, MVT::i64, { 10, 12, 20, 22 } },
3975 { ISD::BSWAP, MVT::i64, { 1, 2, 1, 2 } },
3976 { ISD::CTLZ, MVT::i64, { 4 } }, // BSR+XOR or BSR+XOR+CMOV
3977 { ISD::CTLZ_ZERO_UNDEF, MVT::i64,{ 1, 1, 1, 1 } }, // BSR+XOR
3978 { ISD::CTTZ, MVT::i64, { 3 } }, // TEST+BSF+CMOV/BRANCH
 3979 { ISD::CTTZ_ZERO_UNDEF, MVT::i64,{ 1, 1, 1, 1 } }, // BSF
3980 { ISD::CTPOP, MVT::i64, { 10, 6, 19, 19 } },
3981 { ISD::ROTL, MVT::i64, { 2, 3, 1, 3 } },
3982 { ISD::ROTR, MVT::i64, { 2, 3, 1, 3 } },
3983 { X86ISD::VROTLI, MVT::i64, { 1, 1, 1, 1 } },
3984 { ISD::FSHL, MVT::i64, { 4, 4, 1, 4 } },
3985 { ISD::SMAX, MVT::i64, { 1, 3, 2, 3 } },
3986 { ISD::SMIN, MVT::i64, { 1, 3, 2, 3 } },
3987 { ISD::UMAX, MVT::i64, { 1, 3, 2, 3 } },
3988 { ISD::UMIN, MVT::i64, { 1, 3, 2, 3 } },
3989 { ISD::SADDO, MVT::i64, { 1 } },
3990 { ISD::UADDO, MVT::i64, { 1 } },
3991 { ISD::UMULO, MVT::i64, { 2 } }, // mulq + seto
3992 };
3993 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
3994 { ISD::ABS, MVT::i32, { 1, 2, 3, 4 } }, // SUB+XOR+SRA or SUB+CMOV
3995 { ISD::ABS, MVT::i16, { 2, 2, 3, 4 } }, // SUB+XOR+SRA or SUB+CMOV
3996 { ISD::ABS, MVT::i8, { 2, 4, 4, 4 } }, // SUB+XOR+SRA
3997 { ISD::BITREVERSE, MVT::i32, { 9, 12, 17, 19 } },
3998 { ISD::BITREVERSE, MVT::i16, { 9, 12, 17, 19 } },
3999 { ISD::BITREVERSE, MVT::i8, { 7, 9, 13, 14 } },
4000 { ISD::BSWAP, MVT::i32, { 1, 1, 1, 1 } },
4001 { ISD::BSWAP, MVT::i16, { 1, 2, 1, 2 } }, // ROL
4002 { ISD::CTLZ, MVT::i32, { 4 } }, // BSR+XOR or BSR+XOR+CMOV
4003 { ISD::CTLZ, MVT::i16, { 4 } }, // BSR+XOR or BSR+XOR+CMOV
4004 { ISD::CTLZ, MVT::i8, { 4 } }, // BSR+XOR or BSR+XOR+CMOV
4005 { ISD::CTLZ_ZERO_UNDEF, MVT::i32,{ 1, 1, 1, 1 } }, // BSR+XOR
4006 { ISD::CTLZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 3, 3 } }, // BSR+XOR
4007 { ISD::CTLZ_ZERO_UNDEF, MVT::i8, { 2, 2, 3, 3 } }, // BSR+XOR
4008 { ISD::CTTZ, MVT::i32, { 3 } }, // TEST+BSF+CMOV/BRANCH
4009 { ISD::CTTZ, MVT::i16, { 3 } }, // TEST+BSF+CMOV/BRANCH
4010 { ISD::CTTZ, MVT::i8, { 3 } }, // TEST+BSF+CMOV/BRANCH
4011 { ISD::CTTZ_ZERO_UNDEF, MVT::i32,{ 1, 1, 1, 1 } }, // BSF
4012 { ISD::CTTZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 1, 1 } }, // BSF
4013 { ISD::CTTZ_ZERO_UNDEF, MVT::i8, { 2, 2, 1, 1 } }, // BSF
4014 { ISD::CTPOP, MVT::i32, { 8, 7, 15, 15 } },
4015 { ISD::CTPOP, MVT::i16, { 9, 8, 17, 17 } },
4016 { ISD::CTPOP, MVT::i8, { 7, 6, 6, 6 } },
4017 { ISD::ROTL, MVT::i32, { 2, 3, 1, 3 } },
4018 { ISD::ROTL, MVT::i16, { 2, 3, 1, 3 } },
4019 { ISD::ROTL, MVT::i8, { 2, 3, 1, 3 } },
4020 { ISD::ROTR, MVT::i32, { 2, 3, 1, 3 } },
4021 { ISD::ROTR, MVT::i16, { 2, 3, 1, 3 } },
4022 { ISD::ROTR, MVT::i8, { 2, 3, 1, 3 } },
4023 { X86ISD::VROTLI, MVT::i32, { 1, 1, 1, 1 } },
4024 { X86ISD::VROTLI, MVT::i16, { 1, 1, 1, 1 } },
4025 { X86ISD::VROTLI, MVT::i8, { 1, 1, 1, 1 } },
4026 { ISD::FSHL, MVT::i32, { 4, 4, 1, 4 } },
4027 { ISD::FSHL, MVT::i16, { 4, 4, 2, 5 } },
4028 { ISD::FSHL, MVT::i8, { 4, 4, 2, 5 } },
4029 { ISD::SMAX, MVT::i32, { 1, 2, 2, 3 } },
4030 { ISD::SMAX, MVT::i16, { 1, 4, 2, 4 } },
4031 { ISD::SMAX, MVT::i8, { 1, 4, 2, 4 } },
4032 { ISD::SMIN, MVT::i32, { 1, 2, 2, 3 } },
4033 { ISD::SMIN, MVT::i16, { 1, 4, 2, 4 } },
4034 { ISD::SMIN, MVT::i8, { 1, 4, 2, 4 } },
4035 { ISD::UMAX, MVT::i32, { 1, 2, 2, 3 } },
4036 { ISD::UMAX, MVT::i16, { 1, 4, 2, 4 } },
4037 { ISD::UMAX, MVT::i8, { 1, 4, 2, 4 } },
4038 { ISD::UMIN, MVT::i32, { 1, 2, 2, 3 } },
4039 { ISD::UMIN, MVT::i16, { 1, 4, 2, 4 } },
4040 { ISD::UMIN, MVT::i8, { 1, 4, 2, 4 } },
4041 { ISD::SADDO, MVT::i32, { 1 } },
4042 { ISD::SADDO, MVT::i16, { 1 } },
4043 { ISD::SADDO, MVT::i8, { 1 } },
4044 { ISD::UADDO, MVT::i32, { 1 } },
4045 { ISD::UADDO, MVT::i16, { 1 } },
4046 { ISD::UADDO, MVT::i8, { 1 } },
4047 { ISD::UMULO, MVT::i32, { 2 } }, // mul + seto
4048 { ISD::UMULO, MVT::i16, { 2 } },
4049 { ISD::UMULO, MVT::i8, { 2 } },
4050 };
4051
4052 Type *RetTy = ICA.getReturnType();
4053 Type *OpTy = RetTy;
4054 Intrinsic::ID IID = ICA.getID();
4055 unsigned ISD = ISD::DELETED_NODE;
4056 switch (IID) {
4057 default:
4058 break;
4059 case Intrinsic::abs:
4060 ISD = ISD::ABS;
4061 break;
4062 case Intrinsic::bitreverse:
4063 ISD = ISD::BITREVERSE;
4064 break;
4065 case Intrinsic::bswap:
4066 ISD = ISD::BSWAP;
4067 break;
4068 case Intrinsic::ctlz:
4069 ISD = ISD::CTLZ;
4070 break;
4071 case Intrinsic::ctpop:
4072 ISD = ISD::CTPOP;
4073 break;
4074 case Intrinsic::cttz:
4075 ISD = ISD::CTTZ;
4076 break;
4077 case Intrinsic::fshl:
4078 ISD = ISD::FSHL;
4079 if (!ICA.isTypeBasedOnly()) {
4080 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4081 if (Args[0] == Args[1]) {
4082 ISD = ISD::ROTL;
4083 // Handle scalar constant rotation amounts.
4084 // TODO: Handle vector + funnel-shift cases.
4085 if (isa_and_nonnull<ConstantInt>(Args[2]))
4086 ISD = X86ISD::VROTLI;
4087 }
4088 }
4089 break;
4090 case Intrinsic::fshr:
4091 // FSHR has same costs so don't duplicate.
4092 ISD = ISD::FSHL;
4093 if (!ICA.isTypeBasedOnly()) {
4094 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4095 if (Args[0] == Args[1]) {
4096 // Handle scalar constant rotation amount.
4097 // TODO: Handle vector + funnel-shift cases.
4098 ISD = ISD::ROTR;
4099 if (isa_and_nonnull<ConstantInt>(Args[2]))
4100 ISD = X86ISD::VROTLI;
4101 }
4102 }
4103 break;
4104 case Intrinsic::maxnum:
4105 case Intrinsic::minnum:
4106 // FMINNUM has same costs so don't duplicate.
4107 ISD = ISD::FMAXNUM;
4108 break;
4109 case Intrinsic::sadd_sat:
4110 ISD = ISD::SADDSAT;
4111 break;
4112 case Intrinsic::smax:
4113 ISD = ISD::SMAX;
4114 break;
4115 case Intrinsic::smin:
4116 ISD = ISD::SMIN;
4117 break;
4118 case Intrinsic::ssub_sat:
4119 ISD = ISD::SSUBSAT;
4120 break;
4121 case Intrinsic::uadd_sat:
4122 ISD = ISD::UADDSAT;
4123 break;
4124 case Intrinsic::umax:
4125 ISD = ISD::UMAX;
4126 break;
4127 case Intrinsic::umin:
4128 ISD = ISD::UMIN;
4129 break;
4130 case Intrinsic::usub_sat:
4131 ISD = ISD::USUBSAT;
4132 break;
4133 case Intrinsic::sqrt:
4134 ISD = ISD::FSQRT;
4135 break;
4136 case Intrinsic::sadd_with_overflow:
4137 case Intrinsic::ssub_with_overflow:
4138 // SSUBO has same costs so don't duplicate.
4139 ISD = ISD::SADDO;
4140 OpTy = RetTy->getContainedType(0);
4141 break;
4142 case Intrinsic::uadd_with_overflow:
4143 case Intrinsic::usub_with_overflow:
4144 // USUBO has same costs so don't duplicate.
4145 ISD = ISD::UADDO;
4146 OpTy = RetTy->getContainedType(0);
4147 break;
4148 case Intrinsic::umul_with_overflow:
4149 case Intrinsic::smul_with_overflow:
4150 // SMULO has same costs so don't duplicate.
4151 ISD = ISD::UMULO;
4152 OpTy = RetTy->getContainedType(0);
4153 break;
4154 }
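      // Illustrative note (annotation, not part of the original source): the
      // switch above only maps an intrinsic to its ISD opcode; e.g.
      // llvm.smax.v8i32 becomes ISD::SMAX and is then priced from the
      // per-subtarget tables after type legalization below.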
4155
4156 if (ISD != ISD::DELETED_NODE) {
4157 // Legalize the type.
4158 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(OpTy);
4159 MVT MTy = LT.second;
4160
4161 // Attempt to lookup cost.
4162 if (ISD == ISD::BITREVERSE && ST->hasGFNI() && ST->hasSSSE3() &&
4163 MTy.isVector()) {
4164 // With PSHUFB the code is very similar for all types. If we have integer
4165 // byte operations, we just need a GF2P8AFFINEQB for vXi8. For other types
4166 // we also need a PSHUFB.
4167 unsigned Cost = MTy.getVectorElementType() == MVT::i8 ? 1 : 2;
4168
4169 // Without byte operations, we need twice as many GF2P8AFFINEQB and PSHUFB
4170 // instructions. We also need an extract and an insert.
4171 if (!(MTy.is128BitVector() || (ST->hasAVX2() && MTy.is256BitVector()) ||
4172 (ST->hasBWI() && MTy.is512BitVector())))
4173 Cost = Cost * 2 + 2;
4174
4175 return LT.first * Cost;
4176 }
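      // Illustrative note (annotation, not part of the original source): with
      // GFNI+SSSE3, a legal MVT::v16i8 bitreverse is a single GF2P8AFFINEQB
      // (Cost = 1); other element types such as v4i32 also need a PSHUFB
      // (Cost = 2), and types wider than the available byte-op width pay 2x
      // that plus an extract/insert pair.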
4177
4178 // Without BMI/LZCNT see if we're only looking for a *_ZERO_UNDEF cost.
4179 if (((ISD == ISD::CTTZ && !ST->hasBMI()) ||
4180 (ISD == ISD::CTLZ && !ST->hasLZCNT())) &&
4181 !MTy.isVector() && !ICA.isTypeBasedOnly()) {
4182 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4183 if (auto *Cst = dyn_cast<ConstantInt>(Args[1]))
 4184       if (Cst->isAllOnesValue())
 4185         ISD = ISD == ISD::CTTZ ? ISD::CTTZ_ZERO_UNDEF : ISD::CTLZ_ZERO_UNDEF;
 4186    }
4187
4188 // FSQRT is a single instruction.
4189 if (ISD == ISD::FSQRT && CostKind == TTI::TCK_CodeSize)
4190 return LT.first;
4191
4192 auto adjustTableCost = [](int ISD, unsigned Cost,
4193 InstructionCost LegalizationCost,
4194 FastMathFlags FMF) {
 4195 // If there are no NaNs to deal with, then these are reduced to a
4196 // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we
4197 // assume is used in the non-fast case.
4198 if (ISD == ISD::FMAXNUM || ISD == ISD::FMINNUM) {
4199 if (FMF.noNaNs())
4200 return LegalizationCost * 1;
4201 }
4202 return LegalizationCost * (int)Cost;
4203 };
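      // Illustrative note (annotation, not part of the original source): e.g.
      // an AVX1 fmaxnum on v4f32 is tabled as MAXPS+CMPUNORDPS+BLENDVPS, but
      // when the call carries the nnan flag adjustTableCost collapses it to
      // LT.first, i.e. effectively a single MAXPS.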
4204
4205 if (ST->useGLMDivSqrtCosts())
4206 if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
4207 if (auto KindCost = Entry->Cost[CostKind])
4208 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4209 ICA.getFlags());
4210
4211 if (ST->useSLMArithCosts())
4212 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
4213 if (auto KindCost = Entry->Cost[CostKind])
4214 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4215 ICA.getFlags());
4216
4217 if (ST->hasVBMI2())
4218 if (const auto *Entry = CostTableLookup(AVX512VBMI2CostTbl, ISD, MTy))
4219 if (auto KindCost = Entry->Cost[CostKind])
4220 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4221 ICA.getFlags());
4222
4223 if (ST->hasBITALG())
4224 if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy))
4225 if (auto KindCost = Entry->Cost[CostKind])
4226 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4227 ICA.getFlags());
4228
4229 if (ST->hasVPOPCNTDQ())
4230 if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy))
4231 if (auto KindCost = Entry->Cost[CostKind])
4232 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4233 ICA.getFlags());
4234
4235 if (ST->hasCDI())
4236 if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
4237 if (auto KindCost = Entry->Cost[CostKind])
4238 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4239 ICA.getFlags());
4240
4241 if (ST->hasBWI())
4242 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
4243 if (auto KindCost = Entry->Cost[CostKind])
4244 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4245 ICA.getFlags());
4246
4247 if (ST->hasAVX512())
4248 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
4249 if (auto KindCost = Entry->Cost[CostKind])
4250 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4251 ICA.getFlags());
4252
4253 if (ST->hasXOP())
4254 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
4255 if (auto KindCost = Entry->Cost[CostKind])
4256 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4257 ICA.getFlags());
4258
4259 if (ST->hasAVX2())
4260 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
4261 if (auto KindCost = Entry->Cost[CostKind])
4262 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4263 ICA.getFlags());
4264
4265 if (ST->hasAVX())
4266 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
4267 if (auto KindCost = Entry->Cost[CostKind])
4268 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4269 ICA.getFlags());
4270
4271 if (ST->hasSSE42())
4272 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
4273 if (auto KindCost = Entry->Cost[CostKind])
4274 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4275 ICA.getFlags());
4276
4277 if (ST->hasSSE41())
4278 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
4279 if (auto KindCost = Entry->Cost[CostKind])
4280 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4281 ICA.getFlags());
4282
4283 if (ST->hasSSSE3())
4284 if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
4285 if (auto KindCost = Entry->Cost[CostKind])
4286 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4287 ICA.getFlags());
4288
4289 if (ST->hasSSE2())
4290 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
4291 if (auto KindCost = Entry->Cost[CostKind])
4292 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4293 ICA.getFlags());
4294
4295 if (ST->hasSSE1())
4296 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
4297 if (auto KindCost = Entry->Cost[CostKind])
4298 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4299 ICA.getFlags());
4300
4301 if (ST->hasBMI()) {
4302 if (ST->is64Bit())
4303 if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
4304 if (auto KindCost = Entry->Cost[CostKind])
4305 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4306 ICA.getFlags());
4307
4308 if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
4309 if (auto KindCost = Entry->Cost[CostKind])
4310 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4311 ICA.getFlags());
4312 }
4313
4314 if (ST->hasLZCNT()) {
4315 if (ST->is64Bit())
4316 if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
4317 if (auto KindCost = Entry->Cost[CostKind])
4318 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4319 ICA.getFlags());
4320
4321 if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
4322 if (auto KindCost = Entry->Cost[CostKind])
4323 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4324 ICA.getFlags());
4325 }
4326
4327 if (ST->hasPOPCNT()) {
4328 if (ST->is64Bit())
4329 if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
4330 if (auto KindCost = Entry->Cost[CostKind])
4331 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4332 ICA.getFlags());
4333
4334 if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
4335 if (auto KindCost = Entry->Cost[CostKind])
4336 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4337 ICA.getFlags());
4338 }
4339
4340 if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) {
4341 if (const Instruction *II = ICA.getInst()) {
4342 if (II->hasOneUse() && isa<StoreInst>(II->user_back()))
4343 return TTI::TCC_Free;
4344 if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) {
4345 if (LI->hasOneUse())
4346 return TTI::TCC_Free;
4347 }
4348 }
4349 }
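      // Illustrative note (annotation, not part of the original source): this
      // covers patterns such as a bswap whose only use is a store, or whose
      // operand is a single-use load; both fold into MOVBE and are treated as
      // free.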
4350
4351 if (ST->is64Bit())
4352 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
4353 if (auto KindCost = Entry->Cost[CostKind])
4354 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4355 ICA.getFlags());
4356
4357 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
4358 if (auto KindCost = Entry->Cost[CostKind])
4359 return adjustTableCost(Entry->ISD, *KindCost, LT.first, ICA.getFlags());
4360 }
4361
 4362   return BaseT::getIntrinsicInstrCost(ICA, CostKind);
 4363 }
4364
 4365 InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
 4366                                                TTI::TargetCostKind CostKind,
 4367                                                unsigned Index, Value *Op0,
4368 Value *Op1) {
4369 static const CostTblEntry SLMCostTbl[] = {
4370 { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 },
4371 { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 },
4372 { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 },
4373 { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 7 }
4374 };
4375
4376 assert(Val->isVectorTy() && "This must be a vector type");
4377 Type *ScalarType = Val->getScalarType();
4378 InstructionCost RegisterFileMoveCost = 0;
4379
4380 // Non-immediate extraction/insertion can be handled as a sequence of
4381 // aliased loads+stores via the stack.
4382 if (Index == -1U && (Opcode == Instruction::ExtractElement ||
4383 Opcode == Instruction::InsertElement)) {
4384 // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns:
4385 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
4386
4387 // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling.
4388 assert(isa<FixedVectorType>(Val) && "Fixed vector type expected");
4389 Align VecAlign = DL.getPrefTypeAlign(Val);
4390 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
4391
4392 // Extract - store vector to stack, load scalar.
4393 if (Opcode == Instruction::ExtractElement) {
4394 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4395 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
4396 CostKind);
4397 }
4398 // Insert - store vector to stack, store scalar, load vector.
4399 if (Opcode == Instruction::InsertElement) {
4400 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4401 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
4402 CostKind) +
4403 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind);
4404 }
4405 }
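   // Illustrative note (annotation, not part of the original source): a
   // variable-index extract from <4 x float> is thus costed as one vector
   // store plus one scalar load through a stack slot; a variable-index insert
   // additionally reloads the whole vector.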
4406
4407 if (Index != -1U && (Opcode == Instruction::ExtractElement ||
4408 Opcode == Instruction::InsertElement)) {
 4409     // Extraction of vXi1 elements is now efficiently handled by MOVMSK.
4410 if (Opcode == Instruction::ExtractElement &&
4411 ScalarType->getScalarSizeInBits() == 1 &&
4412 cast<FixedVectorType>(Val)->getNumElements() > 1)
4413 return 1;
4414
4415 // Legalize the type.
4416 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
4417
4418 // This type is legalized to a scalar type.
4419 if (!LT.second.isVector())
4420 return 0;
4421
4422 // The type may be split. Normalize the index to the new type.
4423 unsigned SizeInBits = LT.second.getSizeInBits();
4424 unsigned NumElts = LT.second.getVectorNumElements();
4425 unsigned SubNumElts = NumElts;
4426 Index = Index % NumElts;
4427
4428 // For >128-bit vectors, we need to extract higher 128-bit subvectors.
4429 // For inserts, we also need to insert the subvector back.
4430 if (SizeInBits > 128) {
4431 assert((SizeInBits % 128) == 0 && "Illegal vector");
4432 unsigned NumSubVecs = SizeInBits / 128;
4433 SubNumElts = NumElts / NumSubVecs;
4434 if (SubNumElts <= Index) {
4435 RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
4436 Index %= SubNumElts;
4437 }
4438 }
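     // Illustrative note (annotation, not part of the original source):
     // extracting element 5 of a legalized MVT::v8i32 (256-bit, SubNumElts = 4)
     // charges one register-file move for reaching the upper 128-bit lane and
     // rewrites Index to 1 within that lane.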
4439
4440 MVT MScalarTy = LT.second.getScalarType();
4441 auto IsCheapPInsrPExtrInsertPS = [&]() {
4442 // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
4443 // Also, assume insertps is relatively cheap on all >= SSE41 targets.
4444 return (MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4445 (MScalarTy.isInteger() && ST->hasSSE41()) ||
4446 (MScalarTy == MVT::f32 && ST->hasSSE41() &&
4447 Opcode == Instruction::InsertElement);
4448 };
4449
4450 if (Index == 0) {
4451 // Floating point scalars are already located in index #0.
4452 // Many insertions to #0 can fold away for scalar fp-ops, so let's assume
4453 // true for all.
4454 if (ScalarType->isFloatingPointTy() &&
4455 (Opcode != Instruction::InsertElement || !Op0 ||
4456 isa<UndefValue>(Op0)))
4457 return RegisterFileMoveCost;
4458
4459 if (Opcode == Instruction::InsertElement &&
4460 isa_and_nonnull<UndefValue>(Op0)) {
4461 // Consider the gather cost to be cheap.
4462 if (isa_and_nonnull<LoadInst>(Op1))
4463 return RegisterFileMoveCost;
4464 if (!IsCheapPInsrPExtrInsertPS()) {
4465 // mov constant-to-GPR + movd/movq GPR -> XMM.
4466 if (isa_and_nonnull<Constant>(Op1) && Op1->getType()->isIntegerTy())
4467 return 2 + RegisterFileMoveCost;
4468 // Assume movd/movq GPR -> XMM is relatively cheap on all targets.
4469 return 1 + RegisterFileMoveCost;
4470 }
4471 }
4472
4473 // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
4474 if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
4475 return 1 + RegisterFileMoveCost;
4476 }
4477
4478 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4479 assert(ISD && "Unexpected vector opcode");
4480 if (ST->useSLMArithCosts())
4481 if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
4482 return Entry->Cost + RegisterFileMoveCost;
4483
4484 // Consider cheap cases.
4485 if (IsCheapPInsrPExtrInsertPS())
4486 return 1 + RegisterFileMoveCost;
4487
4488 // For extractions we just need to shuffle the element to index 0, which
4489 // should be very cheap (assume cost = 1). For insertions we need to shuffle
 4490     // the elements to their destination. In both cases we must handle the
4491 // subvector move(s).
4492 // If the vector type is already less than 128-bits then don't reduce it.
4493 // TODO: Under what circumstances should we shuffle using the full width?
4494 InstructionCost ShuffleCost = 1;
4495 if (Opcode == Instruction::InsertElement) {
4496 auto *SubTy = cast<VectorType>(Val);
4497 EVT VT = TLI->getValueType(DL, Val);
4498 if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
4499 SubTy = FixedVectorType::get(ScalarType, SubNumElts);
4500 ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, std::nullopt,
4501 CostKind, 0, SubTy);
4502 }
4503 int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
4504 return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
4505 }
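   // Illustrative note (annotation, not part of the original source): e.g.
   // inserting an f64 at a non-zero known index is modeled above as a
   // two-source shuffle of the 128-bit sublane, with the extra GPR<->XMM move
   // charged only for integer scalars.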
4506
4507 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1) +
4508 RegisterFileMoveCost;
4509}
4510
 4511 InstructionCost
 4512 X86TTIImpl::getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts,
 4513                                      bool Insert, bool Extract,
 4514                                      TTI::TargetCostKind CostKind) {
 4515   assert(DemandedElts.getBitWidth() ==
4516 cast<FixedVectorType>(Ty)->getNumElements() &&
4517 "Vector size mismatch");
4518
4519 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4520 MVT MScalarTy = LT.second.getScalarType();
4521 unsigned LegalVectorBitWidth = LT.second.getSizeInBits();
 4522   InstructionCost Cost = 0;
 4523
4524 constexpr unsigned LaneBitWidth = 128;
4525 assert((LegalVectorBitWidth < LaneBitWidth ||
4526 (LegalVectorBitWidth % LaneBitWidth) == 0) &&
4527 "Illegal vector");
4528
4529 const int NumLegalVectors = *LT.first.getValue();
4530 assert(NumLegalVectors >= 0 && "Negative cost!");
4531
 4532   // For insertions, an ISD::BUILD_VECTOR style vector initialization can be much
4533 // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT.
4534 if (Insert) {
4535 if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4536 (MScalarTy.isInteger() && ST->hasSSE41()) ||
4537 (MScalarTy == MVT::f32 && ST->hasSSE41())) {
4538 // For types we can insert directly, insertion into 128-bit sub vectors is
4539 // cheap, followed by a cheap chain of concatenations.
4540 if (LegalVectorBitWidth <= LaneBitWidth) {
4541 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert,
4542 /*Extract*/ false, CostKind);
4543 } else {
 4544         // In each 128-bit lane, if at least one index is demanded but not
 4545         // all indices are demanded, and this lane is not the first lane of
 4546         // the legalized vector, then this lane needs an extracti128; if a
 4547         // lane has at least one demanded index, it also needs an
 4548         // inserti128.
 4549
 4550         // The following cases illustrate this. Assume we insert several
 4551         // elements into a v8i32 vector on AVX2:
 4552         // Case#1: inserting into index 1 needs vpinsrd + inserti128.
 4553         // Case#2: inserting into index 5 needs extracti128 + vpinsrd +
 4554         //         inserti128.
 4555         // Case#3: inserting into indices 4-7 needs 4*vpinsrd + inserti128.
4556 assert((LegalVectorBitWidth % LaneBitWidth) == 0 && "Illegal vector");
4557 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
4558 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
4559 unsigned NumLegalElts =
4560 LT.second.getVectorNumElements() * NumLegalVectors;
4561 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
4562 "Vector has been legalized to smaller element count");
4563 assert((NumLegalElts % NumLanesTotal) == 0 &&
4564 "Unexpected elts per lane");
4565 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
4566
4567 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
4568 auto *LaneTy =
4569 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
4570
4571 for (unsigned I = 0; I != NumLanesTotal; ++I) {
4572 APInt LaneEltMask = WidenedDemandedElts.extractBits(
4573 NumEltsPerLane, NumEltsPerLane * I);
4574 if (LaneEltMask.isZero())
4575 continue;
4576 // FIXME: we don't need to extract if all non-demanded elements
4577 // are legalization-inserted padding.
4578 if (!LaneEltMask.isAllOnes())
4579 Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
4580 CostKind, I * NumEltsPerLane, LaneTy);
4581 Cost += BaseT::getScalarizationOverhead(LaneTy, LaneEltMask, Insert,
4582 /*Extract*/ false, CostKind);
4583 }
4584
4585 APInt AffectedLanes =
4586 APIntOps::ScaleBitMask(WidenedDemandedElts, NumLanesTotal);
4587 APInt FullyAffectedLegalVectors = APIntOps::ScaleBitMask(
4588 AffectedLanes, NumLegalVectors, /*MatchAllBits=*/true);
4589 for (int LegalVec = 0; LegalVec != NumLegalVectors; ++LegalVec) {
4590 for (unsigned Lane = 0; Lane != NumLegalLanes; ++Lane) {
4591 unsigned I = NumLegalLanes * LegalVec + Lane;
 4592           // No need to insert an unaffected lane, or lane 0 of each legal vector
4593 // iff ALL lanes of that vector were affected and will be inserted.
4594 if (!AffectedLanes[I] ||
4595 (Lane == 0 && FullyAffectedLegalVectors[LegalVec]))
4596 continue;
4597 Cost += getShuffleCost(TTI::SK_InsertSubvector, Ty, std::nullopt,
4598 CostKind, I * NumEltsPerLane, LaneTy);
4599 }
4600 }
4601 }
4602 } else if (LT.second.isVector()) {
4603 // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
4604 // integer element as a SCALAR_TO_VECTOR, then we build the vector as a
4605 // series of UNPCK followed by CONCAT_VECTORS - all of these can be
4606 // considered cheap.
4607 if (Ty->isIntOrIntVectorTy())
4608 Cost += DemandedElts.popcount();
4609
4610 // Get the smaller of the legalized or original pow2-extended number of
4611 // vector elements, which represents the number of unpacks we'll end up
4612 // performing.
4613 unsigned NumElts = LT.second.getVectorNumElements();
4614 unsigned Pow2Elts =
4615 PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements());
4616 Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
4617 }
4618 }
4619
4620 if (Extract) {
4621 // vXi1 can be efficiently extracted with MOVMSK.
4622 // TODO: AVX512 predicate mask handling.
4623 // NOTE: This doesn't work well for roundtrip scalarization.
4624 if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) {
4625 unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
4626 unsigned MaxElts = ST->hasAVX2() ? 32 : 16;
4627 unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts;
4628 return MOVMSKCost;
4629 }
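     // Illustrative note (annotation, not part of the original source):
     // extracting all lanes of a <64 x i1> on an AVX2 target gives
     // MaxElts = 32, so the returned MOVMSK cost is ceil(64 / 32) = 2.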
4630
4631 if (LT.second.isVector()) {
4632 unsigned NumLegalElts =
4633 LT.second.getVectorNumElements() * NumLegalVectors;
4634 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
4635 "Vector has been legalized to smaller element count");
4636
4637 // If we're extracting elements from a 128-bit subvector lane,
4638 // we only need to extract each lane once, not for every element.
4639 if (LegalVectorBitWidth > LaneBitWidth) {
4640 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
4641 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
4642 assert((NumLegalElts % NumLanesTotal) == 0 &&
4643 "Unexpected elts per lane");
4644 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
4645
4646 // Add cost for each demanded 128-bit subvector extraction.
4647 // Luckily this is a lot easier than for insertion.
4648 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
4649 auto *LaneTy =
4650 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
4651
4652 for (unsigned I = 0; I != NumLanesTotal; ++I) {
4653 APInt LaneEltMask = WidenedDemandedElts.extractBits(
4654 NumEltsPerLane, I * NumEltsPerLane);
4655 if (LaneEltMask.isZero())
4656 continue;
4657 Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
4658 CostKind, I * NumEltsPerLane, LaneTy);
 4659           Cost += BaseT::getScalarizationOverhead(
 4660               LaneTy, LaneEltMask, /*Insert*/ false, Extract, CostKind);
4661 }
4662
4663 return Cost;
4664 }
4665 }
4666
4667 // Fallback to default extraction.
4668 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ false,
4669 Extract, CostKind);
4670 }
4671
4672 return Cost;
4673}
4674
 4675 InstructionCost
 4676 X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
 4677                                       int VF, const APInt &DemandedDstElts,
 4678                                       TTI::TargetCostKind CostKind) {
 4679   const unsigned EltTyBits = DL.getTypeSizeInBits(EltTy);
4680 // We don't differentiate element types here, only element bit width.
4681 EltTy = IntegerType::getIntNTy(EltTy->getContext(), EltTyBits);
4682
4683 auto bailout = [&]() {
4684 return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF,
4685 DemandedDstElts, CostKind);
4686 };
4687
4688 // For now, only deal with AVX512 cases.
4689 if (!ST->hasAVX512())
4690 return bailout();
4691
4692 // Do we have a native shuffle for this element type, or should we promote?
4693 unsigned PromEltTyBits = EltTyBits;
4694 switch (EltTyBits) {
4695 case 32:
4696 case 64:
4697 break; // AVX512F.
4698 case 16:
4699 if (!ST->hasBWI())
4700 PromEltTyBits = 32; // promote to i32, AVX512F.
4701 break; // AVX512BW
4702 case 8:
4703 if (!ST->hasVBMI())
4704 PromEltTyBits = 32; // promote to i32, AVX512F.
4705 break; // AVX512VBMI
4706 case 1:
4707 // There is no support for shuffling i1 elements. We *must* promote.
4708 if (ST->hasBWI()) {
4709 if (ST->hasVBMI())
4710 PromEltTyBits = 8; // promote to i8, AVX512VBMI.
4711 else
4712 PromEltTyBits = 16; // promote to i16, AVX512BW.
4713 break;
4714 }
4715 PromEltTyBits = 32; // promote to i32, AVX512F.
4716 break;
4717 default:
4718 return bailout();
4719 }
4720 auto *PromEltTy = IntegerType::getIntNTy(EltTy->getContext(), PromEltTyBits);
4721
4722 auto *SrcVecTy = FixedVectorType::get(EltTy, VF);
4723 auto *PromSrcVecTy = FixedVectorType::get(PromEltTy, VF);
4724
4725 int NumDstElements = VF * ReplicationFactor;
4726 auto *PromDstVecTy = FixedVectorType::get(PromEltTy, NumDstElements);
4727 auto *DstVecTy = FixedVectorType::get(EltTy, NumDstElements);
4728
4729 // Legalize the types.
4730 MVT LegalSrcVecTy = getTypeLegalizationCost(SrcVecTy).second;
4731 MVT LegalPromSrcVecTy = getTypeLegalizationCost(PromSrcVecTy).second;
4732 MVT LegalPromDstVecTy = getTypeLegalizationCost(PromDstVecTy).second;
4733 MVT LegalDstVecTy = getTypeLegalizationCost(DstVecTy).second;
4734 // They should have legalized into vector types.
4735 if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() ||
4736 !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector())
4737 return bailout();
4738
4739 if (PromEltTyBits != EltTyBits) {
 4740     // If we have to perform the shuffle with a wider elt type than our data type,
4741 // then we will first need to anyext (we don't care about the new bits)
4742 // the source elements, and then truncate Dst elements.
4743 InstructionCost PromotionCost;
4744 PromotionCost += getCastInstrCost(
4745 Instruction::SExt, /*Dst=*/PromSrcVecTy, /*Src=*/SrcVecTy,
 4746         TargetTransformInfo::CastContextHint::None, CostKind);
 4747     PromotionCost +=
4748 getCastInstrCost(Instruction::Trunc, /*Dst=*/DstVecTy,
4749 /*Src=*/PromDstVecTy,
 4750                          TargetTransformInfo::CastContextHint::None, CostKind);
 4751     return PromotionCost + getReplicationShuffleCost(PromEltTy,
4752 ReplicationFactor, VF,
4753 DemandedDstElts, CostKind);
4754 }
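   // Illustrative note (annotation, not part of the original source): e.g. on
   // an AVX512BW target without VBMI, replicating i1 elements promotes to i16,
   // performs the shuffle there, and truncates back; the recursive call above
   // adds the ext/trunc promotion cost on top of the shuffle cost.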
4755
4756 assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits &&
4757 LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() &&
4758 "We expect that the legalization doesn't affect the element width, "
4759 "doesn't coalesce/split elements.");
4760
4761 unsigned NumEltsPerDstVec = LegalDstVecTy.getVectorNumElements();
4762 unsigned NumDstVectors =
4763 divideCeil(DstVecTy->getNumElements(), NumEltsPerDstVec);
4764
4765 auto *SingleDstVecTy = FixedVectorType::get(EltTy, NumEltsPerDstVec);
4766
4767 // Not all the produced Dst elements may be demanded. In our case,
4768 // given that a single Dst vector is formed by a single shuffle,
 4769   // if none of the elements that will form a single Dst vector are demanded,
4770 // then we won't need to do that shuffle, so adjust the cost accordingly.
4771 APInt DemandedDstVectors = APIntOps::ScaleBitMask(
4772 DemandedDstElts.zext(NumDstVectors * NumEltsPerDstVec), NumDstVectors);
4773 unsigned NumDstVectorsDemanded = DemandedDstVectors.popcount();
4774
4775 InstructionCost SingleShuffleCost = getShuffleCost(
4776 TTI::SK_PermuteSingleSrc, SingleDstVecTy, /*Mask=*/std::nullopt, CostKind,
4777 /*Index=*/0, /*SubTp=*/nullptr);
4778 return NumDstVectorsDemanded * SingleShuffleCost;
4779}
4780
 4781 InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
 4782                                             MaybeAlign Alignment,
 4783                                             unsigned AddressSpace,
 4784                                             TTI::TargetCostKind CostKind,
 4785                                             TTI::OperandValueInfo OpInfo,
4786 const Instruction *I) {
4787 // TODO: Handle other cost kinds.
 4788   if (CostKind != TTI::TCK_RecipThroughput) {
 4789     if (auto *SI = dyn_cast_or_null<StoreInst>(I)) {
 4790       // A store instruction with index and scale addressing costs 2 uops.
4791 // Check the preceding GEP to identify non-const indices.
4792 if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) {
4793 if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
4794 return TTI::TCC_Basic * 2;
4795 }
4796 }
4797 return TTI::TCC_Basic;
4798 }
4799
4800 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
4801 "Invalid Opcode");
4802 // Type legalization can't handle structs
4803 if (TLI->getValueType(DL, Src, true) == MVT::Other)
4804 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
4805 CostKind);
4806
4807 // Legalize the type.
4808 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
4809
4810 auto *VTy = dyn_cast<FixedVectorType>(Src);
4811
 4812   InstructionCost Cost = 0;
 4813
 4814   // Add the cost of loading the constant that is being stored.
4815 if (Opcode == Instruction::Store && OpInfo.isConstant())
4816 Cost += getMemoryOpCost(Instruction::Load, Src, DL.getABITypeAlign(Src),
4817 /*AddressSpace=*/0, CostKind);
4818
4819 // Handle the simple case of non-vectors.
4820 // NOTE: this assumes that legalization never creates vector from scalars!
4821 if (!VTy || !LT.second.isVector()) {
4822 // Each load/store unit costs 1.
4823 return (LT.second.isFloatingPoint() ? Cost : 0) + LT.first * 1;
4824 }
4825
4826 bool IsLoad = Opcode == Instruction::Load;
4827
4828 Type *EltTy = VTy->getElementType();
4829
4830 const int EltTyBits = DL.getTypeSizeInBits(EltTy);
4831
4832 // Source of truth: how many elements were there in the original IR vector?
4833 const unsigned SrcNumElt = VTy->getNumElements();
4834
4835 // How far have we gotten?
4836 int NumEltRemaining = SrcNumElt;
 4837   // Note that we intentionally capture by reference, since NumEltRemaining changes.
4838 auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; };
4839
4840 const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8);
4841
4842 // Note that even if we can store 64 bits of an XMM, we still operate on XMM.
4843 const unsigned XMMBits = 128;
4844 if (XMMBits % EltTyBits != 0)
4845 // Vector size must be a multiple of the element size. I.e. no padding.
4846 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
4847 CostKind);
4848 const int NumEltPerXMM = XMMBits / EltTyBits;
4849
4850 auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM);
4851
4852 for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0;
4853 NumEltRemaining > 0; CurrOpSizeBytes /= 2) {
4854 // How many elements would a single op deal with at once?
4855 if ((8 * CurrOpSizeBytes) % EltTyBits != 0)
4856 // Vector size must be a multiple of the element size. I.e. no padding.
4857 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
4858 CostKind);
4859 int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits;
4860
4861 assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?");
4862 assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||
4863 (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&
4864 "Unless we haven't halved the op size yet, "
4865 "we have less than two op's sized units of work left.");
4866
4867 auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM
4868 ? FixedVectorType::get(EltTy, CurrNumEltPerOp)
4869 : XMMVecTy;
4870
4871 assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&
4872 "After halving sizes, the vector elt count is no longer a multiple "
4873 "of number of elements per operation?");
4874 auto *CoalescedVecTy =
4875 CurrNumEltPerOp == 1
4876 ? CurrVecTy
 4877             : FixedVectorType::get(
 4878                   IntegerType::get(Src->getContext(),
4879 EltTyBits * CurrNumEltPerOp),
4880 CurrVecTy->getNumElements() / CurrNumEltPerOp);
4881 assert(DL.getTypeSizeInBits(CoalescedVecTy) ==
4882 DL.getTypeSizeInBits(CurrVecTy) &&
 4883            "coalescing elements doesn't change vector width.");
4884
4885 while (NumEltRemaining > 0) {
 4886       assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumption?");
4887
4888 // Can we use this vector size, as per the remaining element count?
4889 // Iff the vector is naturally aligned, we can do a wide load regardless.
4890 if (NumEltRemaining < CurrNumEltPerOp &&
4891 (!IsLoad || Alignment.valueOrOne() < CurrOpSizeBytes) &&
4892 CurrOpSizeBytes != 1)
 4893         break; // Try a smaller vector size.
4894
4895 bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;
4896
4897 // If we have fully processed the previous reg, we need to replenish it.
4898 if (SubVecEltsLeft == 0) {
4899 SubVecEltsLeft += CurrVecTy->getNumElements();
4900 // And that's free only for the 0'th subvector of a legalized vector.
4901 if (!Is0thSubVec)
 4902         Cost += getShuffleCost(IsLoad ? TTI::ShuffleKind::SK_InsertSubvector
 4903                                       : TTI::ShuffleKind::SK_ExtractSubvector,
 4904                                VTy, std::nullopt, CostKind, NumEltDone(),
4905 CurrVecTy);
4906 }
4907
4908 // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
4909 // for smaller widths (32/16/8) we have to insert/extract them separately.
4910 // Again, it's free for the 0'th subreg (if op is 32/64 bit wide,
4911 // but let's pretend that it is also true for 16/8 bit wide ops...)
4912 if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) {
4913 int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM;
4914 assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "");
4915 int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp;
4916 APInt DemandedElts =
4917 APInt::getBitsSet(CoalescedVecTy->getNumElements(),
4918 CoalescedVecEltIdx, CoalescedVecEltIdx + 1);
4919 assert(DemandedElts.popcount() == 1 && "Inserting single value");
4920 Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad,
4921 !IsLoad, CostKind);
4922 }
4923
4924 // This isn't exactly right. We're using slow unaligned 32-byte accesses
4925 // as a proxy for a double-pumped AVX memory interface such as on
4926 // Sandybridge.
4927 // Sub-32-bit loads/stores will be slower either with PINSR*/PEXTR* or
4928 // will be scalarized.
4929 if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
4930 Cost += 2;
4931 else if (CurrOpSizeBytes < 4)
4932 Cost += 2;
4933 else
4934 Cost += 1;
4935
4936 SubVecEltsLeft -= CurrNumEltPerOp;
4937 NumEltRemaining -= CurrNumEltPerOp;
4938 Alignment = commonAlignment(Alignment.valueOrOne(), CurrOpSizeBytes);
4939 }
4940 }
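   // Illustrative note (annotation, not part of the original source): a
   // 16-byte-aligned load of <3 x i32> is handled by the first (128-bit)
   // iteration as a single wide load, since the "naturally aligned" check
   // above permits the over-read, giving Cost = 1.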
4941
4942 assert(NumEltRemaining <= 0 && "Should have processed all the elements.");
4943
4944 return Cost;
4945}
4946
 4947 InstructionCost
 4948 X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment,
 4949                                   unsigned AddressSpace,
 4950                                   TTI::TargetCostKind CostKind) {
 4951   bool IsLoad = (Instruction::Load == Opcode);
4952 bool IsStore = (Instruction::Store == Opcode);
4953
4954 auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy);
4955 if (!SrcVTy)
 4956     // For a scalar type, take the regular cost without the mask.
4957 return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind);
4958
4959 unsigned NumElem = SrcVTy->getNumElements();
4960 auto *MaskTy =
4961 FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
4962 if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) ||
4963 (IsStore && !isLegalMaskedStore(SrcVTy, Alignment))) {
4964 // Scalarization
4965 APInt DemandedElts = APInt::getAllOnes(NumElem);
 4966     InstructionCost MaskSplitCost = getScalarizationOverhead(
 4967         MaskTy, DemandedElts, /*Insert*/ false, /*Extract*/ true, CostKind);
4968 InstructionCost ScalarCompareCost = getCmpSelInstrCost(
4969 Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
 4970         CmpInst::BAD_ICMP_PREDICATE, CostKind);
 4971     InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
4972 InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
 4973     InstructionCost ValueSplitCost = getScalarizationOverhead(
 4974         SrcVTy, DemandedElts, IsLoad, IsStore, CostKind);
4975 InstructionCost MemopCost =
4976 NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
4977 Alignment, AddressSpace, CostKind);
4978 return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
4979 }
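   // Illustrative note (annotation, not part of the original source): when the
   // masked op is not legal, each of the NumElem lanes is charged a scalar
   // compare + branch plus a scalar memory op, on top of extracting the mask
   // bits and scalarizing the data vector.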
4980
4981 // Legalize the type.
4982 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcVTy);
4983 auto VT = TLI->getValueType(DL, SrcVTy);
 4984   InstructionCost Cost = 0;
 4985   if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
4986 LT.second.getVectorNumElements() == NumElem)
4987 // Promotion requires extend/truncate for data and a shuffle for mask.
4988 Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, std::nullopt,
4989 CostKind, 0, nullptr) +
4990 getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, std::nullopt,
4991 CostKind, 0, nullptr);
4992
4993 else if (LT.first * LT.second.getVectorNumElements() > NumElem) {
4994 auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
4995 LT.second.getVectorNumElements());
 4996     // Expanding requires filling the mask with zeroes.
4997 Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, std::nullopt,
4998 CostKind, 0, MaskTy);
4999 }
5000
5001 // Pre-AVX512 - each maskmov load costs 2 + store costs ~8.
5002 if (!ST->hasAVX512())
5003 return Cost + LT.first * (IsLoad ? 2 : 8);
5004
5005 // AVX-512 masked load/store is cheaper
5006 return Cost + LT.first;
5007}
5008
 5009 InstructionCost
 5010 X86TTIImpl::getPointersChainCost(ArrayRef<const Value *> Ptrs,
 5011                                  const Value *Base,
5012 const TTI::PointersChainInfo &Info,
5013 Type *AccessTy, TTI::TargetCostKind CostKind) {
5014 if (Info.isSameBase() && Info.isKnownStride()) {
 5015     // If all the pointers have a known stride, all the differences are
 5016     // translated into constants. X86 memory addressing allows encoding them in
 5017     // the displacement, so we just need to take the base GEP cost.
5018 if (const auto *BaseGEP = dyn_cast<GetElementPtrInst>(Base)) {
5019 SmallVector<const Value *> Indices(BaseGEP->indices());
5020 return getGEPCost(BaseGEP->getSourceElementType(),
5021 BaseGEP->getPointerOperand(), Indices, nullptr,
5022 CostKind);
5023 }
5024 return TTI::TCC_Free;
5025 }
5026 return BaseT::getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind);
5027}
5028
5030 ScalarEvolution *SE,
5031 const SCEV *Ptr) {
5032 // Address computations in vectorized code with non-consecutive addresses will
5033 // likely result in more instructions compared to scalar code where the
5034 // computation can more often be merged into the index mode. The resulting
5035 // extra micro-ops can significantly decrease throughput.
5036 const unsigned NumVectorInstToHideOverhead = 10;
5037
5038  // Cost modeling of Strided Access Computation is hidden by the indexing
5039  // modes of X86 regardless of the stride value. We don't believe that there
5040  // is a difference between constant strided access in general and a constant
5041  // stride value which is less than or equal to 64.
5042 // Even in the case of (loop invariant) stride whose value is not known at
5043 // compile time, the address computation will not incur more than one extra
5044 // ADD instruction.
5045 if (Ty->isVectorTy() && SE && !ST->hasAVX2()) {
5046 // TODO: AVX2 is the current cut-off because we don't have correct
5047 // interleaving costs for prior ISA's.
5049 return NumVectorInstToHideOverhead;
5051 return 1;
5052 }
5053
5054 return BaseT::getAddressComputationCost(Ty, SE, Ptr);
5055}
5056
5059 std::optional<FastMathFlags> FMF,
5062 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5063
5064  // We use the Intel Architecture Code Analyzer (IACA) to measure the
5065  // throughput and use it as the cost.
5066
5067 static const CostTblEntry SLMCostTbl[] = {
5068 { ISD::FADD, MVT::v2f64, 3 },
5069 { ISD::ADD, MVT::v2i64, 5 },
5070 };
5071
5072 static const CostTblEntry SSE2CostTbl[] = {
5073 { ISD::FADD, MVT::v2f64, 2 },
5074 { ISD::FADD, MVT::v2f32, 2 },
5075 { ISD::FADD, MVT::v4f32, 4 },
5076 { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
5077 { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
5078 { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
5079 { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3".
5080 { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3".
5081 { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
5082 { ISD::ADD, MVT::v2i8, 2 },
5083 { ISD::ADD, MVT::v4i8, 2 },
5084 { ISD::ADD, MVT::v8i8, 2 },
5085 { ISD::ADD, MVT::v16i8, 3 },
5086 };
5087
5088 static const CostTblEntry AVX1CostTbl[] = {
5089 { ISD::FADD, MVT::v4f64, 3 },
5090 { ISD::FADD, MVT::v4f32, 3 },
5091 { ISD::FADD, MVT::v8f32, 4 },
5092 { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
5093 { ISD::ADD, MVT::v4i64, 3 },
5094 { ISD::ADD, MVT::v8i32, 5 },
5095 { ISD::ADD, MVT::v16i16, 5 },
5096 { ISD::ADD, MVT::v32i8, 4 },
5097 };
5098
5099 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5100 assert(ISD && "Invalid opcode");
5101
5102 // Before legalizing the type, give a chance to look up illegal narrow types
5103 // in the table.
5104 // FIXME: Is there a better way to do this?
5105 EVT VT = TLI->getValueType(DL, ValTy);
5106 if (VT.isSimple()) {
5107 MVT MTy = VT.getSimpleVT();
5108 if (ST->useSLMArithCosts())
5109 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5110 return Entry->Cost;
5111
5112 if (ST->hasAVX())
5113 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5114 return Entry->Cost;
5115
5116 if (ST->hasSSE2())
5117 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5118 return Entry->Cost;
5119 }
5120
5121 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5122
5123 MVT MTy = LT.second;
5124
5125 auto *ValVTy = cast<FixedVectorType>(ValTy);
5126
5127 // Special case: vXi8 mul reductions are performed as vXi16.
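  // e.g. a v16i8 mul reduction is costed as a zext to v16i16 plus a v16i16 mul
  // reduction.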
5128 if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) {
5129 auto *WideSclTy = IntegerType::get(ValVTy->getContext(), 16);
5130 auto *WideVecTy = FixedVectorType::get(WideSclTy, ValVTy->getNumElements());
5131 return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy,
5133 CostKind) +
5134 getArithmeticReductionCost(Opcode, WideVecTy, FMF, CostKind);
5135 }
5136
5137 InstructionCost ArithmeticCost = 0;
5138 if (LT.first != 1 && MTy.isVector() &&
5139 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5140 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5141 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5142 MTy.getVectorNumElements());
5143 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5144 ArithmeticCost *= LT.first - 1;
5145 }
5146
5147 if (ST->useSLMArithCosts())
5148 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5149 return ArithmeticCost + Entry->Cost;
5150
5151 if (ST->hasAVX())
5152 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5153 return ArithmeticCost + Entry->Cost;
5154
5155 if (ST->hasSSE2())
5156 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5157 return ArithmeticCost + Entry->Cost;
5158
5159 // FIXME: These assume a naive kshift+binop lowering, which is probably
5160 // conservative in most cases.
5161 static const CostTblEntry AVX512BoolReduction[] = {
5162 { ISD::AND, MVT::v2i1, 3 },
5163 { ISD::AND, MVT::v4i1, 5 },
5164 { ISD::AND, MVT::v8i1, 7 },
5165 { ISD::AND, MVT::v16i1, 9 },
5166 { ISD::AND, MVT::v32i1, 11 },
5167 { ISD::AND, MVT::v64i1, 13 },
5168 { ISD::OR, MVT::v2i1, 3 },
5169 { ISD::OR, MVT::v4i1, 5 },
5170 { ISD::OR, MVT::v8i1, 7 },
5171 { ISD::OR, MVT::v16i1, 9 },
5172 { ISD::OR, MVT::v32i1, 11 },
5173 { ISD::OR, MVT::v64i1, 13 },
5174 };
5175
5176 static const CostTblEntry AVX2BoolReduction[] = {
5177 { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp
5178 { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp
5179 { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp
5180 { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp
5181 };
5182
5183 static const CostTblEntry AVX1BoolReduction[] = {
5184 { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp
5185 { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp
5186 { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5187 { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5188 { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp
5189 { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp
5190 { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5191 { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5192 };
5193
5194 static const CostTblEntry SSE2BoolReduction[] = {
5195 { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp
5196 { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp
5197 { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp
5198 { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp
5199 { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp
5200 { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp
5201 { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp
5202 { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp
5203 };
5204
5205 // Handle bool allof/anyof patterns.
5206 if (ValVTy->getElementType()->isIntegerTy(1)) {
5207 InstructionCost ArithmeticCost = 0;
5208 if (LT.first != 1 && MTy.isVector() &&
5209 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5210 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5211 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5212 MTy.getVectorNumElements());
5213 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5214 ArithmeticCost *= LT.first - 1;
5215 }
5216
5217 if (ST->hasAVX512())
5218 if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy))
5219 return ArithmeticCost + Entry->Cost;
5220 if (ST->hasAVX2())
5221 if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
5222 return ArithmeticCost + Entry->Cost;
5223 if (ST->hasAVX())
5224 if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
5225 return ArithmeticCost + Entry->Cost;
5226 if (ST->hasSSE2())
5227 if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
5228 return ArithmeticCost + Entry->Cost;
5229
5230 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5231 }
5232
5233 unsigned NumVecElts = ValVTy->getNumElements();
5234 unsigned ScalarSize = ValVTy->getScalarSizeInBits();
5235
5236 // Special case power of 2 reductions where the scalar type isn't changed
5237 // by type legalization.
5238 if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
5239 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5240
5241 InstructionCost ReductionCost = 0;
5242
5243 auto *Ty = ValVTy;
5244 if (LT.first != 1 && MTy.isVector() &&
5245 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5246 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5247 Ty = FixedVectorType::get(ValVTy->getElementType(),
5248 MTy.getVectorNumElements());
5249 ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5250 ReductionCost *= LT.first - 1;
5251 NumVecElts = MTy.getVectorNumElements();
5252 }
5253
5254 // Now handle reduction with the legal type, taking into account size changes
5255 // at each level.
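  // e.g. a v8i32 add reduction: extract the upper v4i32 half and add, then a
  // 128-bit permute + add, a 64-bit shuffle + add, and a final extractelement.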
5256 while (NumVecElts > 1) {
5257 // Determine the size of the remaining vector we need to reduce.
5258 unsigned Size = NumVecElts * ScalarSize;
5259 NumVecElts /= 2;
5260 // If we're reducing from 256/512 bits, use an extract_subvector.
5261 if (Size > 128) {
5262 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5263 ReductionCost +=
5265 NumVecElts, SubTy);
5266 Ty = SubTy;
5267 } else if (Size == 128) {
5268 // Reducing from 128 bits is a permute of v2f64/v2i64.
5269 FixedVectorType *ShufTy;
5270 if (ValVTy->isFloatingPointTy())
5271 ShufTy =
5272 FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2);
5273 else
5274 ShufTy =
5275 FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
5276 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5277 std::nullopt, CostKind, 0, nullptr);
5278 } else if (Size == 64) {
5279 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5280 FixedVectorType *ShufTy;
5281 if (ValVTy->isFloatingPointTy())
5282 ShufTy =
5283 FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4);
5284 else
5285 ShufTy =
5286 FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
5287 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5288 std::nullopt, CostKind, 0, nullptr);
5289 } else {
5290 // Reducing from smaller size is a shift by immediate.
5291 auto *ShiftTy = FixedVectorType::get(
5292 Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size);
5293 ReductionCost += getArithmeticInstrCost(
5294 Instruction::LShr, ShiftTy, CostKind,
5297 }
5298
5299 // Add the arithmetic op for this level.
5300 ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind);
5301 }
5302
5303 // Add the final extract element to the cost.
5304 return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5305 CostKind, 0, nullptr, nullptr);
5306}
5307
5310 FastMathFlags FMF) {
5311 IntrinsicCostAttributes ICA(IID, Ty, {Ty, Ty}, FMF);
5312 return getIntrinsicInstrCost(ICA, CostKind);
5313}
5314
5317 FastMathFlags FMF,
5319 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5320
5321 MVT MTy = LT.second;
5322
5323 int ISD;
5324 if (ValTy->isIntOrIntVectorTy()) {
5325 ISD = (IID == Intrinsic::umin || IID == Intrinsic::umax) ? ISD::UMIN
5326 : ISD::SMIN;
5327 } else {
5328 assert(ValTy->isFPOrFPVectorTy() &&
5329           "Expected floating point or integer vector type.");
5330 ISD = (IID == Intrinsic::minnum || IID == Intrinsic::maxnum)
5331 ? ISD::FMINNUM
5332 : ISD::FMINIMUM;
5333 }
5334
5335  // We use the Intel Architecture Code Analyzer (IACA) to measure the
5336  // throughput and use it as the cost.
5337
5338 static const CostTblEntry SSE2CostTbl[] = {
5339 {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw
5340 {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw
5341 {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw
5342 };
5343
5344 static const CostTblEntry SSE41CostTbl[] = {
5345 {ISD::SMIN, MVT::v2i16, 3}, // same as sse2
5346 {ISD::SMIN, MVT::v4i16, 5}, // same as sse2
5347 {ISD::UMIN, MVT::v2i16, 5}, // same as sse2
5348 {ISD::UMIN, MVT::v4i16, 7}, // same as sse2
5349 {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor
5350 {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax
5351 {ISD::SMIN, MVT::v2i8, 3}, // pminsb
5352 {ISD::SMIN, MVT::v4i8, 5}, // pminsb
5353 {ISD::SMIN, MVT::v8i8, 7}, // pminsb
5354 {ISD::SMIN, MVT::v16i8, 6},
5355 {ISD::UMIN, MVT::v2i8, 3}, // same as sse2
5356 {ISD::UMIN, MVT::v4i8, 5}, // same as sse2
5357 {ISD::UMIN, MVT::v8i8, 7}, // same as sse2
5358 {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax
5359 };
5360
5361 static const CostTblEntry AVX1CostTbl[] = {
5362 {ISD::SMIN, MVT::v16i16, 6},
5363 {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax
5364 {ISD::SMIN, MVT::v32i8, 8},
5365 {ISD::UMIN, MVT::v32i8, 8},
5366 };
5367
5368 static const CostTblEntry AVX512BWCostTbl[] = {
5369 {ISD::SMIN, MVT::v32i16, 8},
5370 {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax
5371 {ISD::SMIN, MVT::v64i8, 10},
5372 {ISD::UMIN, MVT::v64i8, 10},
5373 };
5374
5375 // Before legalizing the type, give a chance to look up illegal narrow types
5376 // in the table.
5377 // FIXME: Is there a better way to do this?
5378 EVT VT = TLI->getValueType(DL, ValTy);
5379 if (VT.isSimple()) {
5380 MVT MTy = VT.getSimpleVT();
5381 if (ST->hasBWI())
5382 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5383 return Entry->Cost;
5384
5385 if (ST->hasAVX())
5386 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5387 return Entry->Cost;
5388
5389 if (ST->hasSSE41())
5390 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5391 return Entry->Cost;
5392
5393 if (ST->hasSSE2())
5394 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5395 return Entry->Cost;
5396 }
5397
5398 auto *ValVTy = cast<FixedVectorType>(ValTy);
5399 unsigned NumVecElts = ValVTy->getNumElements();
5400
5401 auto *Ty = ValVTy;
5402 InstructionCost MinMaxCost = 0;
5403 if (LT.first != 1 && MTy.isVector() &&
5404 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5405    // Type needs to be split. We need LT.first - 1 operations.
5406 Ty = FixedVectorType::get(ValVTy->getElementType(),
5407 MTy.getVectorNumElements());
5408 MinMaxCost = getMinMaxCost(IID, Ty, CostKind, FMF);
5409 MinMaxCost *= LT.first - 1;
5410 NumVecElts = MTy.getVectorNumElements();
5411 }
5412
5413 if (ST->hasBWI())
5414 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5415 return MinMaxCost + Entry->Cost;
5416
5417 if (ST->hasAVX())
5418 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5419 return MinMaxCost + Entry->Cost;
5420
5421 if (ST->hasSSE41())
5422 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5423 return MinMaxCost + Entry->Cost;
5424
5425 if (ST->hasSSE2())
5426 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5427 return MinMaxCost + Entry->Cost;
5428
5429 unsigned ScalarSize = ValTy->getScalarSizeInBits();
5430
5431 // Special case power of 2 reductions where the scalar type isn't changed
5432 // by type legalization.
5433 if (!isPowerOf2_32(ValVTy->getNumElements()) ||
5434 ScalarSize != MTy.getScalarSizeInBits())
5435 return BaseT::getMinMaxReductionCost(IID, ValTy, FMF, CostKind);
5436
5437 // Now handle reduction with the legal type, taking into account size changes
5438 // at each level.
5439 while (NumVecElts > 1) {
5440 // Determine the size of the remaining vector we need to reduce.
5441 unsigned Size = NumVecElts * ScalarSize;
5442 NumVecElts /= 2;
5443 // If we're reducing from 256/512 bits, use an extract_subvector.
5444 if (Size > 128) {
5445 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5446 MinMaxCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
5447 CostKind, NumVecElts, SubTy);
5448 Ty = SubTy;
5449 } else if (Size == 128) {
5450 // Reducing from 128 bits is a permute of v2f64/v2i64.
5451 VectorType *ShufTy;
5452 if (ValTy->isFloatingPointTy())
5453 ShufTy =
5455 else
5456 ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
5457 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5458 std::nullopt, CostKind, 0, nullptr);
5459 } else if (Size == 64) {
5460 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5461 FixedVectorType *ShufTy;
5462 if (ValTy->isFloatingPointTy())
5463 ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4);
5464 else
5465 ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
5466 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5467 std::nullopt, CostKind, 0, nullptr);
5468 } else {
5469 // Reducing from smaller size is a shift by immediate.
5470 auto *ShiftTy = FixedVectorType::get(
5471 Type::getIntNTy(ValTy->getContext(), Size), 128 / Size);
5472 MinMaxCost += getArithmeticInstrCost(
5473 Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput,
5476 }
5477
5478 // Add the arithmetic op for this level.
5479 MinMaxCost += getMinMaxCost(IID, Ty, CostKind, FMF);
5480 }
5481
5482 // Add the final extract element to the cost.
5483 return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5484 CostKind, 0, nullptr, nullptr);
5485}
5486
5487/// Calculate the cost of materializing a 64-bit value. This helper
5488/// method might only calculate a fraction of a larger immediate. Therefore it
5489/// is valid to return a cost of ZERO.
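/// For example, 0 is free, a value that fits in a sign-extended 32-bit
/// immediate costs TCC_Basic, and anything wider costs 2 * TCC_Basic.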
5491 if (Val == 0)
5492 return TTI::TCC_Free;
5493
5494 if (isInt<32>(Val))
5495 return TTI::TCC_Basic;
5496
5497 return 2 * TTI::TCC_Basic;
5498}
5499
5502 assert(Ty->isIntegerTy());
5503
5504 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5505 if (BitSize == 0)
5506 return ~0U;
5507
5508 // Never hoist constants larger than 128bit, because this might lead to
5509 // incorrect code generation or assertions in codegen.
5510  // FIXME: Create a cost model for types larger than i128 once the codegen
5511 // issues have been fixed.
5512 if (BitSize > 128)
5513 return TTI::TCC_Free;
5514
5515 if (Imm == 0)
5516 return TTI::TCC_Free;
5517
5518 // Sign-extend all constants to a multiple of 64-bit.
5519 APInt ImmVal = Imm;
5520 if (BitSize % 64 != 0)
5521 ImmVal = Imm.sext(alignTo(BitSize, 64));
5522
5523 // Split the constant into 64-bit chunks and calculate the cost for each
5524 // chunk.
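  // e.g. an i128 immediate is costed as two 64-bit chunks; chunks that are zero
  // or fit in a sign-extended 32-bit immediate stay cheap.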
5526 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
5527 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
5528 int64_t Val = Tmp.getSExtValue();
5529 Cost += getIntImmCost(Val);
5530 }
5531 // We need at least one instruction to materialize the constant.
5532 return std::max<InstructionCost>(1, Cost);
5533}
5534
5536 const APInt &Imm, Type *Ty,
5538 Instruction *Inst) {
5539 assert(Ty->isIntegerTy());
5540
5541 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5542 // There is no cost model for constants with a bit size of 0. Return TCC_Free
5543 // here, so that constant hoisting will ignore this constant.
5544 if (BitSize == 0)
5545 return TTI::TCC_Free;
5546
5547 unsigned ImmIdx = ~0U;
5548 switch (Opcode) {
5549 default:
5550 return TTI::TCC_Free;
5551 case Instruction::GetElementPtr:
5552 // Always hoist the base address of a GetElementPtr. This prevents the
5553 // creation of new constants for every base constant that gets constant
5554 // folded with the offset.
5555 if (Idx == 0)
5556 return 2 * TTI::TCC_Basic;
5557 return TTI::TCC_Free;
5558 case Instruction::Store:
5559 ImmIdx = 0;
5560 break;
5561 case Instruction::ICmp:
5562    // This is an imperfect hack to prevent constant hoisting of
5563    // compares that might be trying to check if a 64-bit value fits in
5564    // 32 bits. The backend can optimize these cases using a right shift by 32.
5565    // Ideally we would check the compare predicate here. There are also other
5566    // similar immediates the backend can use shifts for.
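    // e.g. (X u< 0x100000000) can be lowered as a shift right by 32 plus a
    // test, so such an immediate is reported as free rather than hoisted.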
5567 if (Idx == 1 && Imm.getBitWidth() == 64) {
5568 uint64_t ImmVal = Imm.getZExtValue();
5569 if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
5570 return TTI::TCC_Free;
5571 }
5572 ImmIdx = 1;
5573 break;
5574 case Instruction::And:
5575 // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
5576 // by using a 32-bit operation with implicit zero extension. Detect such
5577 // immediates here as the normal path expects bit 31 to be sign extended.
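    // e.g. 'and i64 %x, 0xffffffff' (or any mask with the upper 32 bits clear)
    // can use a 32-bit AND that implicitly zero-extends, so the immediate is
    // free.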
5578 if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.isIntN(32))
5579 return TTI::TCC_Free;
5580 ImmIdx = 1;
5581 break;
5582 case Instruction::Add:
5583 case Instruction::Sub:
5584 // For add/sub, we can use the opposite instruction for INT32_MIN.
5585 if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.getZExtValue() == 0x80000000)
5586 return TTI::TCC_Free;
5587 ImmIdx = 1;
5588 break;
5589 case Instruction::UDiv:
5590 case Instruction::SDiv:
5591 case Instruction::URem:
5592 case Instruction::SRem:
5593 // Division by constant is typically expanded later into a different
5594 // instruction sequence. This completely changes the constants.
5595 // Report them as "free" to stop ConstantHoist from marking them as opaque.
5596 return TTI::TCC_Free;
5597 case Instruction::Mul:
5598 case Instruction::Or:
5599 case Instruction::Xor:
5600 ImmIdx = 1;
5601 break;
5602 // Always return TCC_Free for the shift value of a shift instruction.
5603 case Instruction::Shl:
5604 case Instruction::LShr:
5605 case Instruction::AShr:
5606 if (Idx == 1)
5607 return TTI::TCC_Free;
5608 break;
5609 case Instruction::Trunc:
5610 case Instruction::ZExt:
5611 case Instruction::SExt:
5612 case Instruction::IntToPtr:
5613 case Instruction::PtrToInt:
5614 case Instruction::BitCast:
5615 case Instruction::PHI:
5616 case Instruction::Call:
5617 case Instruction::Select:
5618 case Instruction::Ret:
5619 case Instruction::Load:
5620 break;
5621 }
5622
5623 if (Idx == ImmIdx) {
5624 uint64_t NumConstants = divideCeil(BitSize, 64);
5626 return (Cost <= NumConstants * TTI::TCC_Basic)
5627 ? static_cast<int>(TTI::TCC_Free)
5628 : Cost;
5629 }
5630
5631 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
5632}
5633
5635 const APInt &Imm, Type *Ty,
5637 assert(Ty->isIntegerTy());
5638
5639 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5640 // There is no cost model for constants with a bit size of 0. Return TCC_Free
5641 // here, so that constant hoisting will ignore this constant.
5642 if (BitSize == 0)
5643 return TTI::TCC_Free;
5644
5645 switch (IID) {
5646 default:
5647 return TTI::TCC_Free;
5648 case Intrinsic::sadd_with_overflow:
5649 case Intrinsic::uadd_with_overflow:
5650 case Intrinsic::ssub_with_overflow:
5651 case Intrinsic::usub_with_overflow:
5652 case Intrinsic::smul_with_overflow:
5653 case Intrinsic::umul_with_overflow:
5654 if ((Idx == 1) && Imm.getBitWidth() <= 64 && Imm.isSignedIntN(32))
5655 return TTI::TCC_Free;
5656 break;
5657 case Intrinsic::experimental_stackmap:
5658 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
5659 return TTI::TCC_Free;
5660 break;
5661 case Intrinsic::experimental_patchpoint_void:
5662 case Intrinsic::experimental_patchpoint:
5663 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
5664 return TTI::TCC_Free;
5665 break;
5666 }
5667 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
5668}
5669
5672 const Instruction *I) {
5674 return Opcode == Instruction::PHI ? 0 : 1;
5675 // Branches are assumed to be predicted.
5676 return 0;
5677}
5678
5679int X86TTIImpl::getGatherOverhead() const {
5680 // Some CPUs have more overhead for gather. The specified overhead is relative
5681 // to the Load operation. "2" is the number provided by Intel architects. This
5682 // parameter is used for cost estimation of Gather Op and comparison with
5683 // other alternatives.
5684  // TODO: Remove the explicit hasAVX512()? That would mean we would only
5685 // enable gather with a -march.
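  // Returning a very large cost (1024) below effectively disables gathers on
  // subtargets without fast gather support.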
5686 if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather()))
5687 return 2;
5688
5689 return 1024;
5690}
5691
5692int X86TTIImpl::getScatterOverhead() const {
5693 if (ST->hasAVX512())
5694 return 2;
5695
5696 return 1024;
5697}
5698
5699// Return an average cost of a Gather / Scatter instruction; may be improved later.
5700// FIXME: Add TargetCostKind support.
5701InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy,
5702 const Value *Ptr, Align Alignment,
5703 unsigned AddressSpace) {
5704
5705 assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
5706 unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
5707
5708  // Try to reduce the index size from 64 bits (the default for GEP) to 32.
5709  // This is essential for VF 16. If the index can't be reduced to 32 bits, the
5710  // operation will use 16 x 64-bit indices, which do not fit in a zmm register
5711  // and need to be split. Also check that the base pointer is the same for all
5712  // lanes, and that there's at most one variable index.
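  // e.g. for a VF=16 gather, 16 x i32 indices fit in a single zmm register,
  // while 16 x i64 indices need two zmms and force the operation to be split.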
5713 auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) {
5714 unsigned IndexSize = DL.getPointerSizeInBits();
5715 const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
5716 if (IndexSize < 64 || !GEP)
5717 return IndexSize;
5718
5719 unsigned NumOfVarIndices = 0;
5720 const Value *Ptrs = GEP->getPointerOperand();
5721 if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
5722 return IndexSize;
5723 for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I) {
5724 if (isa<Constant>(GEP->getOperand(I)))
5725 continue;
5726 Type *IndxTy = GEP->getOperand(I)->getType();
5727 if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy))
5728 IndxTy = IndexVTy->getElementType();
5729 if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
5730 !isa<SExtInst>(GEP->getOperand(I))) ||
5731 ++NumOfVarIndices > 1)
5732 return IndexSize; // 64
5733 }
5734 return (unsigned)32;
5735 };
5736
5737 // Trying to reduce IndexSize to 32 bits for vector 16.
5738 // By default the IndexSize is equal to pointer size.
5739 unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
5740 ? getIndexSizeInBits(Ptr, DL)
5742
5743 auto *IndexVTy = FixedVectorType::get(
5744 IntegerType::get(SrcVTy->getContext(), IndexSize), VF);
5745 std::pair<InstructionCost, MVT> IdxsLT = getTypeLegalizationCost(IndexVTy);
5746 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(SrcVTy);
5747 InstructionCost::CostType SplitFactor =
5748 *std::max(IdxsLT.first, SrcLT.first).getValue();
5749 if (SplitFactor > 1) {
5750 // Handle splitting of vector of pointers
5751 auto *SplitSrcTy =
5752 FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
5753 return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment,
5754 AddressSpace);
5755 }
5756
5757 // The gather / scatter cost is given by Intel architects. It is a rough
5758  // number since we are looking at one instruction at a time.
5759 const int GSOverhead = (Opcode == Instruction::Load)
5760 ? getGatherOverhead()
5761 : getScatterOverhead();
5762 return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
5763 MaybeAlign(Alignment), AddressSpace,
5765}
5766
5767/// Return the cost of full scalarization of gather / scatter operation.
5768///
5769/// Opcode - Load or Store instruction.
5770/// SrcVTy - The type of the data vector that should be gathered or scattered.
5771/// VariableMask - The mask is non-constant at compile time.
5772/// Alignment - Alignment for one element.
5773/// AddressSpace - pointer[s] address space.
5774///
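/// The returned cost is roughly: the mask extract + compare + branch per lane
/// (for a variable mask), plus VF scalar memory ops, plus the scalarization
/// overhead of extracting the addresses and inserting/extracting the data.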
5775/// FIXME: Add TargetCostKind support.
5776InstructionCost X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
5777 bool VariableMask, Align Alignment,
5778 unsigned AddressSpace) {
5779 Type *ScalarTy = SrcVTy->getScalarType();
5780 unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
5781 APInt DemandedElts = APInt::getAllOnes(VF);
5783
5784 InstructionCost MaskUnpackCost = 0;
5785 if (VariableMask) {
5786 auto *MaskTy =
5788 MaskUnpackCost = getScalarizationOverhead(
5789 MaskTy, DemandedElts, /*Insert=*/false, /*Extract=*/true, CostKind);
5790 InstructionCost ScalarCompareCost = getCmpSelInstrCost(
5791 Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()), nullptr,
5793 InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
5794 MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
5795 }
5796
5797 InstructionCost AddressUnpackCost = getScalarizationOverhead(
5799 DemandedElts, /*Insert=*/false, /*Extract=*/true, CostKind);
5800
5801 // The cost of the scalar loads/stores.
5802 InstructionCost MemoryOpCost =
5803 VF * getMemoryOpCost(Opcode, ScalarTy, MaybeAlign(Alignment),
5805
5806 // The cost of forming the vector from loaded scalars/
5807 // scalarizing the vector to perform scalar stores.
5808 InstructionCost InsertExtractCost = getScalarizationOverhead(
5809 cast<FixedVectorType>(SrcVTy), DemandedElts,
5810 /*Insert=*/Opcode == Instruction::Load,
5811 /*Extract=*/Opcode == Instruction::Store, CostKind);
5812
5813 return AddressUnpackCost + MemoryOpCost + MaskUnpackCost + InsertExtractCost;
5814}
5815
5816/// Calculate the cost of Gather / Scatter operation
5818 unsigned Opcode, Type *SrcVTy, const Value *Ptr, bool VariableMask,
5820 const Instruction *I = nullptr) {
5822 if ((Opcode == Instruction::Load &&
5823 isLegalMaskedGather(SrcVTy, Align(Alignment)) &&
5824 !forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
5825 Align(Alignment))) ||
5826 (Opcode == Instruction::Store &&
5827 isLegalMaskedScatter(SrcVTy, Align(Alignment)) &&
5828 !forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
5829 Align(Alignment))))
5830 return 1;
5831 return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask,
5832 Alignment, CostKind, I);
5833 }
5834
5835 assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
5836 PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
5837 if (!PtrTy && Ptr->getType()->isVectorTy())
5838 PtrTy = dyn_cast<PointerType>(
5839 cast<VectorType>(Ptr->getType())->getElementType());
5840 assert(PtrTy && "Unexpected type for Ptr argument");
5841 unsigned AddressSpace = PtrTy->getAddressSpace();
5842
5843 if ((Opcode == Instruction::Load &&
5844 (!isLegalMaskedGather(SrcVTy, Align(Alignment)) ||
5845 forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
5846 Align(Alignment)))) ||
5847 (Opcode == Instruction::Store &&
5848 (!isLegalMaskedScatter(SrcVTy, Align(Alignment)) ||
5849 forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
5850 Align(Alignment)))))
5851 return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment,
5852 AddressSpace);
5853
5854 return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
5855}
5856
5858 const TargetTransformInfo::LSRCost &C2) {
5859  // X86-specific here: the instruction count ("Insns") has first priority.
5860 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
5861 C1.NumIVMuls, C1.NumBaseAdds,
5862 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
5863 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
5864 C2.NumIVMuls, C2.NumBaseAdds,
5865 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
5866}
5867
5869 return ST->hasMacroFusion() || ST->hasBranchFusion();
5870}
5871
5872bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
5873 if (!ST->hasAVX())
5874 return false;
5875
5876 // The backend can't handle a single element vector.
5877 if (isa<VectorType>(DataTy) &&
5878 cast<FixedVectorType>(DataTy)->getNumElements() == 1)
5879 return false;
5880 Type *ScalarTy = DataTy->getScalarType();
5881
5882 if (ScalarTy->isPointerTy())
5883 return true;
5884
5885 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
5886 return true;
5887
5888 if (ScalarTy->isHalfTy() && ST->hasBWI())
5889 return true;
5890
5891 if (ScalarTy->isBFloatTy() && ST->hasBF16())
5892 return true;
5893
5894 if (!ScalarTy->isIntegerTy())
5895 return false;
5896
5897 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
5898 return IntWidth == 32 || IntWidth == 64 ||
5899 ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
5900}
5901
5902bool X86TTIImpl::isLegalMaskedStore(Type *DataType, Align Alignment) {
5903 return isLegalMaskedLoad(DataType, Alignment);
5904}
5905
5906bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) {
5907 unsigned DataSize = DL.getTypeStoreSize(DataType);
5908 // The only supported nontemporal loads are for aligned vectors of 16 or 32
5909 // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2
5910 // (the equivalent stores only require AVX).
5911 if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
5912 return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2();
5913
5914 return false;
5915}
5916
5917bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) {
5918 unsigned DataSize = DL.getTypeStoreSize(DataType);
5919
5920 // SSE4A supports nontemporal stores of float and double at arbitrary
5921 // alignment.
5922 if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
5923 return true;
5924
5925  // Besides the SSE4A subtarget exception above, only aligned stores are
5926  // available nontemporally on any other subtarget. And only stores with a size
5927  // of 4..32 bytes (powers of 2 only) are permitted.
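  // e.g. a 16-byte-aligned <4 x float> store can use MOVNTPS, and an aligned
  // 8-byte integer store can use MOVNTI.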
5928 if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
5929 !isPowerOf2_32(DataSize))
5930 return false;
5931
5932 // 32-byte vector nontemporal stores are supported by AVX (the equivalent
5933 // loads require AVX2).
5934 if (DataSize == 32)
5935 return ST->hasAVX();
5936 if (DataSize == 16)
5937 return ST->hasSSE1();
5938 return true;
5939}
5940
5942 ElementCount NumElements) const {
5943 // movddup
5944 return ST->hasSSE3() && !NumElements.isScalable() &&
5945 NumElements.getFixedValue() == 2 &&
5946 ElementTy == Type::getDoubleTy(ElementTy->getContext());
5947}
5948
5950 if (!isa<VectorType>(DataTy))
5951 return false;
5952
5953 if (!ST->hasAVX512())
5954 return false;
5955
5956 // The backend can't handle a single element vector.
5957 if (cast<FixedVectorType>(DataTy)->getNumElements() == 1)
5958 return false;
5959
5960 Type *ScalarTy = cast<VectorType>(DataTy)->getElementType();
5961
5962 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
5963 return true;
5964
5965 if (!ScalarTy->isIntegerTy())
5966 return false;
5967
5968 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
5969 return IntWidth == 32 || IntWidth == 64 ||
5970 ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
5971}
5972
5974 return isLegalMaskedExpandLoad(DataTy, Alignment);
5975}
5976
5977bool X86TTIImpl::supportsGather() const {
5978 // Some CPUs have better gather performance than others.
5979  // TODO: Remove the explicit ST->hasAVX512()? That would mean we would only
5980 // enable gather with a -march.
5981 return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2());
5982}
5983
5985  // Gather / Scatter for a 2-element vector is not profitable on KNL / SKX.
5986  // A 4-element gather/scatter instruction does not exist on KNL. We can extend
5987  // it to 8 elements, but zeroing the upper bits of the mask vector will add
5988  // more instructions. Right now we give the scalar cost of vector-4 for KNL.
5989  // TODO: Check, maybe the gather/scatter instruction is better in the
5990  // VariableMask case.
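  // e.g. a 4-element gather is forced to be scalarized on KNL (no VLX), but not
  // on SKX where VLX provides 128/256-bit gathers.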
5991 unsigned NumElts = cast<FixedVectorType>(VTy)->getNumElements();
5992 return NumElts == 1 ||
5993 (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX())));
5994}
5995
5997 Type *ScalarTy = DataTy->getScalarType();
5998 if (ScalarTy->isPointerTy())
5999 return true;
6000
6001 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6002 return true;
6003
6004 if (!ScalarTy->isIntegerTy())
6005 return false;
6006
6007 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6008 return IntWidth == 32 || IntWidth == 64;
6009}
6010
6012 if (!supportsGather() || !ST->preferGather())
6013 return false;
6014 return isLegalMaskedGatherScatter(DataTy, Alignment);
6015}
6016
6017bool X86TTIImpl::isLegalAltInstr(VectorType *VecTy, unsigned Opcode0,
6018 unsigned Opcode1,
6019 const SmallBitVector &OpcodeMask) const {
6020 // ADDSUBPS 4xf32 SSE3
6021 // VADDSUBPS 4xf32 AVX
6022 // VADDSUBPS 8xf32 AVX2
6023 // ADDSUBPD 2xf64 SSE3
6024 // VADDSUBPD 2xf64 AVX
6025 // VADDSUBPD 4xf64 AVX2
6026
6027 unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
6028 assert(OpcodeMask.size() == NumElements && "Mask and VecTy are incompatible");
6029 if (!isPowerOf2_32(NumElements))
6030 return false;
6031 // Check the opcode pattern. We apply the mask on the opcode arguments and
6032 // then check if it is what we expect.
6033 for (int Lane : seq<int>(0, NumElements)) {
6034 unsigned Opc = OpcodeMask.test(Lane) ? Opcode1 : Opcode0;
6035 // We expect FSub for even lanes and FAdd for odd lanes.
6036 if (Lane % 2 == 0 && Opc != Instruction::FSub)
6037 return false;
6038 if (Lane % 2 == 1 && Opc != Instruction::FAdd)
6039 return false;
6040 }
6041 // Now check that the pattern is supported by the target ISA.
6042 Type *ElemTy = cast<VectorType>(VecTy)->getElementType();
6043 if (ElemTy->isFloatTy())
6044 return ST->hasSSE3() && NumElements % 4 == 0;
6045 if (ElemTy->isDoubleTy())
6046 return ST->hasSSE3() && NumElements % 2 == 0;
6047 return false;
6048}
6049
6050bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) {
6051 // AVX2 doesn't support scatter
6052 if (!ST->hasAVX512() || !ST->preferScatter())
6053 return false;
6054 return isLegalMaskedGatherScatter(DataType, Alignment);
6055}
6056
6057bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
6058 EVT VT = TLI->getValueType(DL, DataType);
6059 return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
6060}
6061
6063 // FDIV is always expensive, even if it has a very low uop count.
6064 // TODO: Still necessary for recent CPUs with low latency/throughput fdiv?
6065 if (I->getOpcode() == Instruction::FDiv)
6066 return true;
6067
6069}
6070
6072 return false;
6073}
6074
6076 const Function *Callee) const {
6077 const TargetMachine &TM = getTLI()->getTargetMachine();
6078
6079 // Work this as a subsetting of subtarget features.
6080 const FeatureBitset &CallerBits =
6081 TM.getSubtargetImpl(*Caller)->getFeatureBits();
6082 const FeatureBitset &CalleeBits =
6083 TM.getSubtargetImpl(*Callee)->getFeatureBits();
6084
6085 // Check whether features are the same (apart from the ignore list).
6086 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
6087 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
6088 if (RealCallerBits == RealCalleeBits)
6089 return true;
6090
6091 // If the features are a subset, we need to additionally check for calls
6092 // that may become ABI-incompatible as a result of inlining.
6093 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
6094 return false;
6095
6096 for (const Instruction &I : instructions(Callee)) {
6097 if (const auto *CB = dyn_cast<CallBase>(&I)) {
6098 // Having more target features is fine for inline ASM.
6099 if (CB->isInlineAsm())
6100 continue;
6101
6103 for (Value *Arg : CB->args())
6104 Types.push_back(Arg->getType());
6105 if (!CB->getType()->isVoidTy())
6106 Types.push_back(CB->getType());
6107
6108 // Simple types are always ABI compatible.
6109 auto IsSimpleTy = [](Type *Ty) {
6110 return !Ty->isVectorTy() && !Ty->isAggregateType();
6111 };
6112 if (all_of(Types, IsSimpleTy))
6113 continue;
6114
6115 if (Function *NestedCallee = CB->getCalledFunction()) {
6116 // Assume that intrinsics are always ABI compatible.
6117 if (NestedCallee->isIntrinsic())
6118 continue;
6119
6120 // Do a precise compatibility check.
6121 if (!areTypesABICompatible(Caller, NestedCallee, Types))
6122 return false;
6123 } else {
6124 // We don't know the target features of the callee,
6125 // assume it is incompatible.
6126 return false;
6127 }
6128 }
6129 }
6130 return true;
6131}
6132
6134 const Function *Callee,
6135 const ArrayRef<Type *> &Types) const {
6136 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
6137 return false;
6138
6139 // If we get here, we know the target features match. If one function
6140 // considers 512-bit vectors legal and the other does not, consider them
6141 // incompatible.
6142 const TargetMachine &TM = getTLI()->getTargetMachine();
6143
6144 if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
6145 TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs())
6146 return true;
6147
6148 // Consider the arguments compatible if they aren't vectors or aggregates.
6149 // FIXME: Look at the size of vectors.
6150 // FIXME: Look at the element types of aggregates to see if there are vectors.
6151 return llvm::none_of(Types,
6152 [](Type *T) { return T->isVectorTy() || T->isAggregateType(); });
6153}
6154
6156X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
6158 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
6159 Options.NumLoadsPerBlock = 2;
6160 // All GPR and vector loads can be unaligned.
6161 Options.AllowOverlappingLoads = true;
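  // e.g. with overlapping loads an equality memcmp of 31 bytes can be expanded
  // into two (overlapping) 16-byte vector loads per buffer instead of a libcall.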
6162 if (IsZeroCmp) {
6163 // Only enable vector loads for equality comparison. Right now the vector
6164    // version is not as fast for a three-way compare (see #33329).
6165 const unsigned PreferredWidth = ST->getPreferVectorWidth();
6166 if (PreferredWidth >= 512 && ST->hasAVX512() && ST->hasEVEX512())
6167 Options.LoadSizes.push_back(64);
6168 if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32);
6169 if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
6170 }
6171 if (ST->is64Bit()) {
6172 Options.LoadSizes.push_back(8);
6173 }
6174 Options.LoadSizes.push_back(4);
6175 Options.LoadSizes.push_back(2);
6176 Options.LoadSizes.push_back(1);
6177 return Options;
6178}
6179
6181 return supportsGather();
6182}
6183
6185 return false;
6186}
6187
6189 // TODO: We expect this to be beneficial regardless of arch,
6190 // but there are currently some unexplained performance artifacts on Atom.
6191 // As a temporary solution, disable on Atom.
6192 return !(ST->isAtom());
6193}
6194
6195// Get estimation for interleaved load/store operations and strided load.
6196// \p Indices contains indices for strided load.
6197// \p Factor - the factor of interleaving.
6198// AVX-512 provides 3-src shuffles that significantly reduce the cost.
6200 unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
6201 ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
6202 TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
6203 // VecTy for interleave memop is <VF*Factor x Elt>.
6204 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6205 // VecTy = <12 x i32>.
6206
6207  // Calculate the number of memory operations (NumOfMemOps) required
6208  // to load/store the VecTy.
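  // e.g. a <32 x i32> VecTy (128 bytes) with a legal type of v16i32 (64 bytes)
  // needs NumOfMemOps = 2.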
6209 MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6210 unsigned VecTySize = DL.getTypeStoreSize(VecTy);
6211 unsigned LegalVTSize = LegalVT.getStoreSize();
6212 unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
6213
6214 // Get the cost of one memory operation.
6215 auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
6216 LegalVT.getVectorNumElements());
6217 InstructionCost MemOpCost;
6218 bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps;
6219 if (UseMaskedMemOp)
6220 MemOpCost = getMaskedMemoryOpCost(Opcode, SingleMemOpTy, Alignment,
6222 else
6223 MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, MaybeAlign(Alignment),
6225
6226 unsigned VF = VecTy->getNumElements() / Factor;
6227 MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);
6228
6229 InstructionCost MaskCost;
6230 if (UseMaskedMemOp) {
6231 APInt DemandedLoadStoreElts = APInt::getZero(VecTy->getNumElements());
6232 for (unsigned Index : Indices) {
6233 assert(Index < Factor && "Invalid index for interleaved memory op");
6234 for (unsigned Elm = 0; Elm < VF; Elm++)
6235 DemandedLoadStoreElts.setBit(Index + Elm * Factor);
6236 }
6237
6238 Type *I1Type = Type::getInt1Ty(VecTy->getContext());
6239
6240 MaskCost = getReplicationShuffleCost(
6241 I1Type, Factor, VF,
6242 UseMaskForGaps ? DemandedLoadStoreElts
6244 CostKind);
6245
6246 // The Gaps mask is invariant and created outside the loop, therefore the
6247    // cost of creating it is not accounted for here. However, if we have both
6248 // a MaskForGaps and some other mask that guards the execution of the
6249 // memory access, we need to account for the cost of And-ing the two masks
6250 // inside the loop.
6251 if (UseMaskForGaps) {
6252 auto *MaskVT = FixedVectorType::get(I1Type, VecTy->getNumElements());
6253 MaskCost += getArithmeticInstrCost(BinaryOperator::And, MaskVT, CostKind);
6254 }
6255 }
6256
6257 if (Opcode == Instruction::Load) {
6258 // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
6259 // contain the cost of the optimized shuffle sequence that the
6260 // X86InterleavedAccess pass will generate.
6261 // The cost of loads and stores are computed separately from the table.
6262
6263 // X86InterleavedAccess support only the following interleaved-access group.
6264 static const CostTblEntry AVX512InterleavedLoadTbl[] = {
6265 {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8
6266 {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8
6267 {3, MVT::v64i8, 22}, //(load 96i8 and) deinterleave into 3 x 32i8
6268 };
6269
6270 if (const auto *Entry =
6271 CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
6272 return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6273    // If an entry does not exist, fall back to the default implementation.
6274
6275 // Kind of shuffle depends on number of loaded values.
6276 // If we load the entire data in one register, we can use a 1-src shuffle.
6277 // Otherwise, we'll merge 2 sources in each operation.
6278 TTI::ShuffleKind ShuffleKind =
6279 (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
6280
6281 InstructionCost ShuffleCost = getShuffleCost(
6282 ShuffleKind, SingleMemOpTy, std::nullopt, CostKind, 0, nullptr);
6283
6284 unsigned NumOfLoadsInInterleaveGrp =
6285 Indices.size() ? Indices.size() : Factor;
6286 auto *ResultTy = FixedVectorType::get(VecTy->getElementType(),
6287 VecTy->getNumElements() / Factor);
6288 InstructionCost NumOfResults =
6289 getTypeLegalizationCost(ResultTy).first * NumOfLoadsInInterleaveGrp;
6290
6291    // About half of the loads may be folded into shuffles when we have only
6292 // one result. If we have more than one result, or the loads are masked,
6293 // we do not fold loads at all.
6294 unsigned NumOfUnfoldedLoads =
6295 UseMaskedMemOp || NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
6296
6297 // Get a number of shuffle operations per result.
6298 unsigned NumOfShufflesPerResult =
6299 std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
6300
6301    // The SK_PermuteTwoSrc shuffle clobbers one of the src operands.
6302 // When we have more than one destination, we need additional instructions
6303 // to keep sources.
6304 InstructionCost NumOfMoves = 0;
6305 if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
6306 NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
6307
6308 InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
6309 MaskCost + NumOfUnfoldedLoads * MemOpCost +
6310 NumOfMoves;
6311
6312 return Cost;
6313 }
6314
6315 // Store.
6316 assert(Opcode == Instruction::Store &&
6317 "Expected Store Instruction at this point");
6318 // X86InterleavedAccess support only the following interleaved-access group.
6319 static const CostTblEntry AVX512InterleavedStoreTbl[] = {
6320 {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
6321 {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
6322 {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 96i8 (and store)
6323
6324 {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store)
6325 {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
6326 {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
6327 {4, MVT::v64i8, 24} // interleave 4 x 32i8 into 256i8 (and store)
6328 };
6329
6330 if (const auto *Entry =
6331 CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
6332 return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6333  // If an entry does not exist, fall back to the default implementation.
6334
6335  // There are no strided stores at the moment, and a store can't be folded
6336  // into a shuffle.
6337 unsigned NumOfSources = Factor; // The number of values to be merged.
6338 InstructionCost ShuffleCost = getShuffleCost(
6339 TTI::SK_PermuteTwoSrc, SingleMemOpTy, std::nullopt, CostKind, 0, nullptr);
6340 unsigned NumOfShufflesPerStore = NumOfSources - 1;
6341
6342  // The SK_PermuteTwoSrc shuffle clobbers one of the src operands.
6343 // We need additional instructions to keep sources.
6344 unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
6346 MaskCost +
6347 NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
6348 NumOfMoves;
6349 return Cost;
6350}
6351
6353 unsigned Opcode, Type *BaseTy, unsigned Factor, ArrayRef<unsigned> Indices,
6354 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
6355 bool UseMaskForCond, bool UseMaskForGaps) {
6356 auto *VecTy = cast<FixedVectorType>(BaseTy);
6357
6358 auto isSupportedOnAVX512 = [&](Type *VecTy) {
6359 Type *EltTy = cast<VectorType>(VecTy)->getElementType();
6360 if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
6361 EltTy->isIntegerTy(32) || EltTy->isPointerTy())
6362 return true;
6363 if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy())
6364 return ST->hasBWI();
6365 if (EltTy->isBFloatTy())
6366 return ST->hasBF16();
6367 return false;
6368 };
6369 if (ST->hasAVX512() && isSupportedOnAVX512(VecTy))
6371 Opcode, VecTy, Factor, Indices, Alignment,
6372 AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
6373
6374 if (UseMaskForCond || UseMaskForGaps)
6375 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6376 Alignment, AddressSpace, CostKind,
6377 UseMaskForCond, UseMaskForGaps);
6378
6379 // Get estimation for interleaved load/store operations for SSE-AVX2.
6380 // As opposed to AVX-512, SSE-AVX2 do not have generic shuffles that allow
6381 // computing the cost using a generic formula as a function of generic
6382 // shuffles. We therefore use a lookup table instead, filled according to
6383 // the instruction sequences that codegen currently generates.
6384
6385 // VecTy for interleave memop is <VF*Factor x Elt>.
6386 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6387 // VecTy = <12 x i32>.
6388 MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6389
6390 // This function can be called with VecTy=<6xi128>, Factor=3, in which case
6391 // the VF=2, while v2i128 is an unsupported MVT vector type
6392 // (see MachineValueType.h::getVectorVT()).
6393 if (!LegalVT.isVector())
6394 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6395 Alignment, AddressSpace, CostKind);
6396
6397 unsigned VF = VecTy->getNumElements() / Factor;
6398 Type *ScalarTy = VecTy->getElementType();
6399 // Deduplicate entries, model floats/pointers as appropriately-sized integers.
6400 if (!ScalarTy->isIntegerTy())
6401 ScalarTy =
6402 Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy));
6403
6404 // Get the cost of all the memory operations.
6405 // FIXME: discount dead loads.
6406 InstructionCost MemOpCosts = getMemoryOpCost(
6407 Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, CostKind);
6408
6409 auto *VT = FixedVectorType::get(ScalarTy, VF);
6410 EVT ETy = TLI->getValueType(DL, VT);
6411 if (!ETy.isSimple())
6412 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6413 Alignment, AddressSpace, CostKind);
6414
6415 // TODO: Complete for other data-types and strides.
6416 // Each combination of Stride, element bit width and VF results in a different
6417  // sequence; the cost tables are therefore accessed with:
6418 // Factor (stride) and VectorType=VFxiN.
6419 // The Cost accounts only for the shuffle sequence;
6420 // The cost of the loads/stores is accounted for separately.
6421 //
6422 static const CostTblEntry AVX2InterleavedLoadTbl[] = {
6423 {2, MVT::v2i8, 2}, // (load 4i8 and) deinterleave into 2 x 2i8
6424 {2, MVT::v4i8, 2}, // (load 8i8 and) deinterleave into 2 x 4i8
6425 {2, MVT::v8i8, 2}, // (load 16i8 and) deinterleave into 2 x 8i8
6426 {2, MVT::v16i8, 4}, // (load 32i8 and) deinterleave into 2 x 16i8
6427 {2, MVT::v32i8, 6}, // (load 64i8 and) deinterleave into 2 x 32i8
6428
6429 {2, MVT::v8i16, 6}, // (load 16i16 and) deinterleave into 2 x 8i16
6430 {2, MVT::v16i16, 9}, // (load 32i16 and) deinterleave into 2 x 16i16
6431 {2, MVT::v32i16, 18}, // (load 64i16 and) deinterleave into 2 x 32i16
6432
6433 {2, MVT::v8i32, 4}, // (load 16i32 and) deinterleave into 2 x 8i32
6434 {2, MVT::v16i32, 8}, // (load 32i32 and) deinterleave into 2 x 16i32
6435 {2, MVT::v32i32, 16}, // (load 64i32 and) deinterleave into 2 x 32i32
6436
6437 {2, MVT::v4i64, 4}, // (load 8i64 and) deinterleave into 2 x 4i64
6438 {2, MVT::v8i64, 8}, // (load 16i64 and) deinterleave into 2 x 8i64
6439 {2, MVT::v16i64, 16}, // (load 32i64 and) deinterleave into 2 x 16i64
6440 {2, MVT::v32i64, 32}, // (load 64i64 and) deinterleave into 2 x 32i64
6441
6442 {3, MVT::v2i8, 3}, // (load 6i8 and) deinterleave into 3 x 2i8
6443 {3, MVT::v4i8, 3}, // (load 12i8 and) deinterleave into 3 x 4i8
6444 {3, MVT::v8i8, 6}, // (load 24i8 and) deinterleave into 3 x 8i8
6445 {3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8
6446 {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8
6447
6448 {3, MVT::v2i16, 5}, // (load 6i16 and) deinterleave into 3 x 2i16
6449 {3, MVT::v4i16, 7}, // (load 12i16 and) deinterleave into 3 x 4i16
6450 {3, MVT::v8i16, 9}, // (load 24i16 and) deinterleave into 3 x 8i16
6451 {3, MVT::v16i16, 28}, // (load 48i16 and) deinterleave into 3 x 16i16
6452 {3, MVT::v32i16, 56}, // (load 96i16 and) deinterleave into 3 x 32i16
6453
6454 {3, MVT::v2i32, 3}, // (load 6i32 and) deinterleave into 3 x 2i32
6455 {3, MVT::v4i32, 3}, // (load 12i32 and) deinterleave into 3 x 4i32
6456 {3, MVT::v8i32, 7}, // (load 24i32 and) deinterleave into 3 x 8i32
6457 {3, MVT::v16i32, 14}, // (load 48i32 and) deinterleave into 3 x 16i32
6458 {3, MVT::v32i32, 32}, // (load 96i32 and) deinterleave into 3 x 32i32
6459
6460 {3, MVT::v2i64, 1}, // (load 6i64 and) deinterleave into 3 x 2i64
6461 {3, MVT::v4i64, 5}, // (load 12i64 and) deinterleave into 3 x 4i64
6462 {3, MVT::v8i64, 10}, // (load 24i64 and) deinterleave into 3 x 8i64
6463 {3, MVT::v16i64, 20}, // (load 48i64 and) deinterleave into 3 x 16i64
6464
6465 {4, MVT::v2i8, 4}, // (load 8i8 and) deinterleave into 4 x 2i8
6466 {4, MVT::v4i8, 4}, // (load 16i8 and) deinterleave into 4 x 4i8
6467 {4, MVT::v8i8, 12}, // (load 32i8 and) deinterleave into 4 x 8i8
6468 {4, MVT::v16i8, 24}, // (load 64i8 and) deinterleave into 4 x 16i8
6469 {4, MVT::v32i8, 56}, // (load 128i8 and) deinterleave into 4 x 32i8
6470
6471 {4, MVT::v2i16, 6}, // (load 8i16 and) deinterleave into 4 x 2i16
6472 {4, MVT::v4i16, 17}, // (load 16i16 and) deinterleave into 4 x 4i16
6473 {4, MVT::v8i16, 33}, // (load 32i16 and) deinterleave into 4 x 8i16
6474 {4, MVT::v16i16, 75}, // (load 64i16 and) deinterleave into 4 x 16i16
6475 {4, MVT::v32i16, 150}, // (load 128i16 and) deinterleave into 4 x 32i16
6476
6477 {4, MVT::v2i32, 4}, // (load 8i32 and) deinterleave into 4 x 2i32
6478 {4, MVT::v4i32, 8}, // (load 16i32 and) deinterleave into 4 x 4i32
6479 {4, MVT::v8i32, 16}, // (load 32i32 and) deinterleave into 4 x 8i32
6480 {4, MVT::v16i32, 32}, // (load 64i32 and) deinterleave into 4 x 16i32
6481 {4, MVT::v32i32, 68}, // (load 128i32 and) deinterleave into 4 x 32i32
6482
6483 {4, MVT::v2i64, 6}, // (load 8i64 and) deinterleave into 4 x 2i64
6484 {4, MVT::v4i64, 8}, // (load 16i64 and) deinterleave into 4 x 4i64
6485 {4, MVT::v8i64, 20}, // (load 32i64 and) deinterleave into 4 x 8i64
6486 {4, MVT::v16i64, 40}, // (load 64i64 and) deinterleave into 4 x 16i64
6487
6488 {6, MVT::v2i8, 6}, // (load 12i8 and) deinterleave into 6 x 2i8
6489 {6, MVT::v4i8, 14}, // (load 24i8 and) deinterleave into 6 x 4i8
6490 {6, MVT::v8i8, 18}, // (load 48i8 and) deinterleave into 6 x 8i8
6491 {6, MVT::v16i8, 43}, // (load 96i8 and) deinterleave into 6 x 16i8
6492 {6, MVT::v32i8, 82}, // (load 192i8 and) deinterleave into 6 x 32i8
6493
6494 {6, MVT::v2i16, 13}, // (load 12i16 and) deinterleave into 6 x 2i16
6495 {6, MVT::v4i16, 9}, // (load 24i16 and) deinterleave into 6 x 4i16
6496 {6, MVT::v8i16, 39}, // (load 48i16 and) deinterleave into 6 x 8i16
6497 {6, MVT::v16i16, 106}, // (load 96i16 and) deinterleave into 6 x 16i16
6498 {6, MVT::v32i16, 212}, // (load 192i16 and) deinterleave into 6 x 32i16
6499
6500 {6, MVT::v2i32, 6}, // (load 12i32 and) deinterleave into 6 x 2i32
6501 {6, MVT::v4i32, 15}, // (load 24i32 and) deinterleave into 6 x 4i32
6502 {6, MVT::v8i32, 31}, // (load 48i32 and) deinterleave into 6 x 8i32
6503 {6, MVT::v16i32, 64}, // (load 96i32 and) deinterleave into 6 x 16i32
6504
6505 {6, MVT::v2i64, 6}, // (load 12i64 and) deinterleave into 6 x 2i64
6506 {6, MVT::v4i64, 18}, // (load 24i64 and) deinterleave into 6 x 4i64
6507 {6, MVT::v8i64, 36}, // (load 48i64 and) deinterleave into 6 x 8i64
6508
6509 {8, MVT::v8i32, 40} // (load 64i32 and) deinterleave into 8 x 8i32
6510 };
6511
6512 static const CostTblEntry SSSE3InterleavedLoadTbl[] = {
6513 {2, MVT::v4i16, 2}, // (load 8i16 and) deinterleave into 2 x 4i16
6514 };
6515
6516 static const CostTblEntry SSE2InterleavedLoadTbl[] = {
6517 {2, MVT::v2i16, 2}, // (load 4i16 and) deinterleave into 2 x 2i16
6518 {2, MVT::v4i16, 7}, // (load 8i16 and) deinterleave into 2 x 4i16
6519
6520 {2, MVT::v2i32, 2}, // (load 4i32 and) deinterleave into 2 x 2i32
6521 {2, MVT::v4i32, 2}, // (load 8i32 and) deinterleave into 2 x 4i32
6522
6523 {2, MVT::v2i64, 2}, // (load 4i64 and) deinterleave into 2 x 2i64
6524 };
6525
6526 static const CostTblEntry AVX2InterleavedStoreTbl[] = {
6527 {2, MVT::v16i8, 3}, // interleave 2 x 16i8 into 32i8 (and store)
6528 {2, MVT::v32i8, 4}, // interleave 2 x 32i8 into 64i8 (and store)
6529
6530 {2, MVT::v8i16, 3}, // interleave 2 x 8i16 into 16i16 (and store)
6531 {2, MVT::v16i16, 4}, // interleave 2 x 16i16 into 32i16 (and store)
6532 {2, MVT::v32i16, 8}, // interleave 2 x 32i16 into 64i16 (and store)
6533
6534 {2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32 (and store)
6535 {2, MVT::v8i32, 4}, // interleave 2 x 8i32 into 16i32 (and store)
6536 {2, MVT::v16i32, 8}, // interleave 2 x 16i32 into 32i32 (and store)
6537 {2, MVT::v32i32, 16}, // interleave 2 x 32i32 into 64i32 (and store)
6538
6539 {2, MVT::v2i64, 2}, // interleave 2 x 2i64 into 4i64 (and store)
6540 {2, MVT::v4i64, 4}, // interleave 2 x 4i64 into 8i64 (and store)
6541 {2, MVT::v8i64, 8}, // interleave 2 x 8i64 into 16i64 (and store)
6542 {2, MVT::v16i64, 16}, // interleave 2 x 16i64 into 32i64 (and store)
6543 {2, MVT::v32i64, 32}, // interleave 2 x 32i64 into 64i64 (and store)
6544
6545 {3, MVT::v2i8, 4}, // interleave 3 x 2i8 into 6i8 (and store)
6546 {3, MVT::v4i8, 4}, // interleave 3 x 4i8 into 12i8 (and store)
6547 {3, MVT::v8i8, 6}, // interleave 3 x 8i8 into 24i8 (and store)
6548 {3, MVT::v16i8, 11}, // interleave 3 x 16i8 into 48i8 (and store)
6549 {3, MVT::v32i8, 13}, // interleave 3 x 32i8 into 96i8 (and store)
6550
6551 {3, MVT::v2i16, 4}, // interleave 3 x 2i16 into 6i16 (and store)
6552 {3, MVT::v4i16, 6}, // interleave 3 x 4i16 into 12i16 (and store)
6553 {3, MVT::v8i16, 12}, // interleave 3 x 8i16 into 24i16 (and store)
6554 {3, MVT::v16i16, 27}, // interleave 3 x 16i16 into 48i16 (and store)
6555 {3, MVT::v32i16, 54}, // interleave 3 x 32i16 into 96i16 (and store)
6556
6557 {3, MVT::v2i32, 4}, // interleave 3 x 2i32 into 6i32 (and store)
6558 {3, MVT::v4i32, 5}, // interleave 3 x 4i32 into 12i32 (and store)
6559 {3, MVT::v8i32, 11}, // interleave 3 x 8i32 into 24i32 (and store)
6560 {3, MVT::v16i32, 22}, // interleave 3 x 16i32 into 48i32 (and store)
6561 {3, MVT::v32i32, 48}, // interleave 3 x 32i32 into 96i32 (and store)
6562
6563 {3, MVT::v2i64, 4}, // interleave 3 x 2i64 into 6i64 (and store)
6564 {3, MVT::v4i64, 6}, // interleave 3 x 4i64 into 12i64 (and store)
6565 {3, MVT::v8i64, 12}, // interleave 3 x 8i64 into 24i64 (and store)
6566 {3, MVT::v16i64, 24}, // interleave 3 x 16i64 into 48i64 (and store)
6567
6568 {4, MVT::v2i8, 4}, // interleave 4 x 2i8 into 8i8 (and store)
6569 {4, MVT::v4i8, 4}, // interleave 4 x 4i8 into 16i8 (and store)
6570 {4, MVT::v8i8, 4}, // interleave 4 x 8i8 into 32i8 (and store)
6571 {4, MVT::v16i8, 8}, // interleave 4 x 16i8 into 64i8 (and store)
6572 {4, MVT::v32i8, 12}, // interleave 4 x 32i8 into 128i8 (and store)
6573
6574 {4, MVT::v2i16, 2}, // interleave 4 x 2i16 into 8i16 (and store)
6575 {4, MVT::v4i16, 6}, // interleave 4 x 4i16 into 16i16 (and store)
6576 {4, MVT::v8i16, 10}, // interleave 4 x 8i16 into 32i16 (and store)
6577 {4, MVT::v16i16, 32}, // interleave 4 x 16i16 into 64i16 (and store)
6578 {4, MVT::v32i16, 64}, // interleave 4 x 32i16 into 128i16 (and store)
6579
6580 {4, MVT::v2i32, 5}, // interleave 4 x 2i32 into 8i32 (and store)
6581 {4, MVT::v4i32, 6}, // interleave 4 x 4i32 into 16i32 (and store)
6582 {4, MVT::v8i32, 16}, // interleave 4 x 8i32 into 32i32 (and store)
6583 {4, MVT::v16i32, 32}, // interleave 4 x 16i32 into 64i32 (and store)
6584 {4, MVT::v32i32, 64}, // interleave 4 x 32i32 into 128i32 (and store)
6585
6586 {4, MVT::v2i64, 6}, // interleave 4 x 2i64 into 8i64 (and store)
6587 {4, MVT::v4i64, 8}, // interleave 4 x 4i64 into 16i64 (and store)
6588 {4, MVT::v8i64, 20}, // interleave 4 x 8i64 into 32i64 (and store)
6589 {4, MVT::v16i64, 40}, // interleave 4 x 16i64 into 64i64 (and store)
6590
6591 {6, MVT::v2i8, 7}, // interleave 6 x 2i8 into 12i8 (and store)
6592 {6, MVT::v4i8, 9}, // interleave 6 x 4i8 into 24i8 (and store)
6593 {6, MVT::v8i8, 16}, // interleave 6 x 8i8 into 48i8 (and store)
6594 {6, MVT::v16i8, 27}, // interleave 6 x 16i8 into 96i8 (and store)
6595 {6, MVT::v32i8, 90}, // interleave 6 x 32i8 into 192i8 (and store)
6596
6597 {6, MVT::v2i16, 10}, // interleave 6 x 2i16 into 12i16 (and store)
6598 {6, MVT::v4i16, 15}, // interleave 6 x 4i16 into 24i16 (and store)
6599 {6, MVT::v8i16, 21}, // interleave 6 x 8i16 into 48i16 (and store)
6600 {6, MVT::v16i16, 58}, // interleave 6 x 16i16 into 96i16 (and store)
6601 {6, MVT::v32i16, 90}, // interleave 6 x 32i16 into 192i16 (and store)
6602
6603 {6, MVT::v2i32, 9}, // interleave 6 x 2i32 into 12i32 (and store)
6604 {6, MVT::v4i32, 12}, // interleave 6 x 4i32 into 24i32 (and store)
6605 {6, MVT::v8i32, 33}, // interleave 6 x 8i32 into 48i32 (and store)
6606 {6, MVT::v16i32, 66}, // interleave 6 x 16i32 into 96i32 (and store)
6607
6608 {6, MVT::v2i64, 8}, // interleave 6 x 2i64 into 12i64 (and store)
6609 {6, MVT::v4i64, 15}, // interleave 6 x 4i64 into 24i64 (and store)
6610 {6, MVT::v8i64, 30}, // interleave 6 x 8i64 into 48i64 (and store)
6611 };
6612
6613 static const CostTblEntry SSE2InterleavedStoreTbl[] = {
6614 {2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8 (and store)
6615 {2, MVT::v4i8, 1}, // interleave 2 x 4i8 into 8i8 (and store)
6616 {2, MVT::v8i8, 1}, // interleave 2 x 8i8 into 16i8 (and store)
6617
6618 {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16 (and store)
6619 {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16 (and store)
6620
6621 {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32 (and store)
6622 };
6623
6624 if (Opcode == Instruction::Load) {
6625 auto GetDiscountedCost = [Factor, NumMembers = Indices.size(),
6626 MemOpCosts](const CostTblEntry *Entry) {
6627 // NOTE: this is just an approximation!
6628 // It can over/under-estimate the cost!
6629 return MemOpCosts + divideCeil(NumMembers * Entry->Cost, Factor);
6630 };
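// Illustrative walk-through of GetDiscountedCost (hypothetical request, real
// table data): on an AVX2 target with Factor = 3, legalized type MVT::v8i32,
// and only NumMembers = 2 of the 3 interleave members actually used, the
// lookup hits the AVX2 entry {3, MVT::v8i32, 7}, so the returned cost is
//   MemOpCosts + divideCeil(2 * 7, 3) = MemOpCosts + 5,
// i.e. the whole-group shuffle cost is scaled down for the unused member, as
// the approximation note above describes.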
6631
6632 if (ST->hasAVX2())
6633 if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor,
6634 ETy.getSimpleVT()))
6635 return GetDiscountedCost(Entry);
6636
6637 if (ST->hasSSSE3())
6638 if (const auto *Entry = CostTableLookup(SSSE3InterleavedLoadTbl, Factor,
6639 ETy.getSimpleVT()))
6640 return GetDiscountedCost(Entry);
6641
6642 if (ST->hasSSE2())
6643 if (const auto *Entry = CostTableLookup(SSE2InterleavedLoadTbl, Factor,
6644 ETy.getSimpleVT()))
6645 return GetDiscountedCost(Entry);
6646 } else {
6647 assert(Opcode == Instruction::Store &&
6648 "Expected Store Instruction at this point");
6649 assert((!Indices.size() || Indices.size() == Factor) &&
6650 "Interleaved store only supports fully-interleaved groups.");
6651 if (ST->hasAVX2())
6652 if (const auto *Entry = CostTableLookup(AVX2InterleavedStoreTbl, Factor,
6653 ETy.getSimpleVT()))
6654 return MemOpCosts + Entry->Cost;
6655
6656 if (ST->hasSSE2())
6657 if (const auto *Entry = CostTableLookup(SSE2InterleavedStoreTbl, Factor,
6658 ETy.getSimpleVT()))
6659 return MemOpCosts + Entry->Cost;
6660 }
6661
6662 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6663 Alignment, AddressSpace, CostKind,
6664 UseMaskForCond, UseMaskForGaps);
6665}
6666
6667InstructionCost X86TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
6668 int64_t BaseOffset,
6669 bool HasBaseReg, int64_t Scale,
6670 unsigned AddrSpace) const {
6671 // Scaling factors are not free at all.
6672 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
6673 // will take 2 allocations in the out of order engine instead of 1
6674 // for plain addressing mode, i.e. inst (reg1).
6675 // E.g.,
6676 // vaddps (%rsi,%rdx), %ymm0, %ymm1
6677 // Requires two allocations (one for the load, one for the computation)
6678 // whereas:
6679 // vaddps (%rsi), %ymm0, %ymm1
6680 // Requires just 1 allocation, i.e., freeing allocations for other operations
6681 // and having less micro operations to execute.
6682 //
6683 // For some X86 architectures, this is even worse because for instance for
6684 // stores, the complex addressing mode forces the instruction to use the
6685 // "load" ports instead of the dedicated "store" port.
6686 // E.g., on Haswell:
6687 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
6688 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
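// Illustrative (hypothetical) outcomes of the legality check below:
//   addressing mode legal, AM.Scale == 0 (plain (%reg) form)     -> cost 0
//   addressing mode legal, AM.Scale != 0 ((%base,%index,scale))  -> cost 1
//   addressing mode rejected by isLegalAddressingMode()          -> cost -1
// so a non-zero scale is charged one extra unit, and an unencodable mode is
// reported as a negative (unsupported) cost.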
6689 TargetLoweringBase::AddrMode AM;
6690 AM.BaseGV = BaseGV;
6691 AM.BaseOffs = BaseOffset;
6692 AM.HasBaseReg = HasBaseReg;
6693 AM.Scale = Scale;
6694 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
6695 // Scale represents reg2 * scale, thus account for 1
6696 // as soon as we use a second register.
6697 return AM.Scale != 0;
6698 return -1;
6699}