//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//
/// A note about the cost model numbers used below: the numbers correspond to
/// some "generic" X86 CPU rather than a specific CPU model. Usually the
/// numbers correspond to the CPU where the feature first appeared. For
/// example, if we do Subtarget.hasSSE42() in the lookups below, the cost is
/// based on Nehalem, as that was the first CPU to support that feature level
/// and thus most likely has the worst case cost, although we may discard an
/// outlying worst cost from one CPU (e.g. Atom).
///
/// Some examples of other technologies/CPUs:
///   SSE 3   - Pentium4 / Athlon64
///   SSE 4.1 - Penryn
///   SSE 4.2 - Nehalem / Silvermont
///   AVX     - Sandy Bridge / Jaguar / Bulldozer
///   AVX2    - Haswell / Ryzen
///   AVX-512 - Xeon Phi / Skylake
///
/// And some examples of instruction target dependent costs (latency)
///                   divss   sqrtss   rsqrtss
///   AMD K7          11-16   19       3
///   Piledriver      9-24    13-15    5
///   Jaguar          14      16       2
///   Pentium II,III  18      30       2
///   Nehalem         7-14    7-18     3
///   Haswell         10-13   11       5
///
/// Interpreting the 4 TargetCostKind types:
/// TCK_RecipThroughput and TCK_Latency should try to match the worst case
/// values reported by the CPU scheduler models (and llvm-mca).
/// TCK_CodeSize should match the instruction count (e.g. divss = 1), NOT the
/// actual encoding size of the instruction.
/// TCK_SizeAndLatency should match the worst case micro-op counts reported
/// by the CPU scheduler models (and llvm-mca), to ensure that they are
/// compatible with the MicroOpBufferSize and LoopMicroOpBufferSize values
/// which are often used as the cost thresholds where TCK_SizeAndLatency is
/// requested.
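///
/// For example (an illustrative reading added by the editor, not an
/// exhaustive spec): a table entry of the form
///   { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }
/// encodes { recip-throughput, latency, code-size, size-and-latency }, so a
/// TCK_Latency query on a legalized v16i8 shift would return 7.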
//===----------------------------------------------------------------------===//

#include "X86TargetTransformInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "x86tti"

//===----------------------------------------------------------------------===//
//
// X86 cost model.
//
//===----------------------------------------------------------------------===//

// Helper struct to store/access costs for each cost kind.
// TODO: Move this to allow other targets to use it?
struct CostKindCosts {
  unsigned RecipThroughputCost = ~0U;
  unsigned LatencyCost = ~0U;
  unsigned CodeSizeCost = ~0U;
  unsigned SizeAndLatencyCost = ~0U;

  std::optional<unsigned>
  operator[](TargetTransformInfo::TargetCostKind Kind) const {
    unsigned Cost = ~0U;
    switch (Kind) {
    case TTI::TCK_RecipThroughput:
      Cost = RecipThroughputCost;
      break;
    case TTI::TCK_Latency:
      Cost = LatencyCost;
      break;
    case TTI::TCK_CodeSize:
      Cost = CodeSizeCost;
      break;
    case TTI::TCK_SizeAndLatency:
      Cost = SizeAndLatencyCost;
      break;
    }
    if (Cost == ~0U)
      return std::nullopt;
    return Cost;
  }
};
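
// Usage sketch (illustrative, not part of the upstream file): given
//   static const CostKindTblEntry Tbl[] = {
//     { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } } };
// a CostTableLookup(Tbl, ISD::SHL, MVT::v16i8) hit queried via
// Entry->Cost[TTI::TCK_Latency] yields 7, while a slot left at ~0U makes
// operator[] return std::nullopt so callers can fall back to default costs.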
using CostKindTblEntry = CostTblEntryT<CostKindCosts>;
using TypeConversionCostKindTblEntry = TypeConversionCostTblEntryT<CostKindCosts>;

TargetTransformInfo::PopcntSupportKind
X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  // TODO: Currently the __builtin_popcount() implementation using SSE3
  //   instructions is inefficient. Once the problem is fixed, we should
  //   call ST->hasSSE3() instead of ST->hasPOPCNT().
  return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
}

std::optional<unsigned> X86TTIImpl::getCacheSize(
    TargetTransformInfo::CacheLevel Level) const {
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 32 * 1024; // 32 KByte
  case TargetTransformInfo::CacheLevel::L2:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 256 * 1024; // 256 KByte
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

std::optional<unsigned> X86TTIImpl::getCacheAssociativity(
    TargetTransformInfo::CacheLevel Level) const {
  //   - Penryn
  //   - Nehalem
  //   - Westmere
  //   - Sandy Bridge
  //   - Ivy Bridge
  //   - Haswell
  //   - Broadwell
  //   - Skylake
  //   - Kabylake
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    [[fallthrough]];
  case TargetTransformInfo::CacheLevel::L2:
    return 8;
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
  bool Vector = (ClassID == 1);
  if (Vector && !ST->hasSSE1())
    return 0;

  if (ST->is64Bit()) {
    if (Vector && ST->hasAVX512())
      return 32;
    return 16;
  }
  return 8;
}
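
// For instance (restating the logic above, editor's note): a 64-bit target
// with AVX-512 reports 32 vector registers (zmm0-zmm31) for ClassID == 1, a
// 32-bit target reports 8, and a target without SSE reports 0.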

TypeSize
X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  unsigned PreferVectorWidth = ST->getPreferVectorWidth();
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
  case TargetTransformInfo::RGK_FixedWidthVector:
    if (ST->hasAVX512() && ST->hasEVEX512() && PreferVectorWidth >= 512)
      return TypeSize::getFixed(512);
    if (ST->hasAVX() && PreferVectorWidth >= 256)
      return TypeSize::getFixed(256);
    if (ST->hasSSE1() && PreferVectorWidth >= 128)
      return TypeSize::getFixed(128);
    return TypeSize::getFixed(0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }

  llvm_unreachable("Unsupported register kind");
}

unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
  return getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
      .getFixedValue();
}

unsigned X86TTIImpl::getMaxInterleaveFactor(ElementCount VF) {
  // If the loop will not be vectorized, don't interleave the loop.
  // Let the regular unroller unroll the loop instead, which saves the
  // overflow check and memory check cost.
  if (VF.isScalar())
    return 1;

  if (ST->isAtom())
    return 1;

  // Sandybridge and Haswell have multiple execution ports and pipelined
  // vector units.
  if (ST->hasAVX())
    return 4;

  return 2;
}

InstructionCost X86TTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args,
    const Instruction *CxtI) {

  // vXi8 multiplications are always promoted to vXi16.
  // Sub-128-bit types can be extended/packed more efficiently.
  if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
      Ty->getPrimitiveSizeInBits() <= 64 && Ty->getScalarSizeInBits() == 8) {
    Type *WideVecTy =
        VectorType::getExtendedElementVectorType(cast<VectorType>(Ty));
    return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
                            TTI::CastContextHint::None, CostKind) +
           getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
                            TTI::CastContextHint::None, CostKind) +
           getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info);
  }
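
  // Worked example (a sketch of the promotion above, editor's note): costing
  // 'mul <8 x i8>' is modelled as zext <8 x i8> to <8 x i16>, a <8 x i16>
  // mul, and a trunc back to <8 x i8>, i.e. the sum of those three queries
  // rather than a scalarized per-lane estimate.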

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() &&
      (LT.second.getScalarType() == MVT::i32 ||
       LT.second.getScalarType() == MVT::i64)) {
    // Check if the operands can be represented as a smaller datatype.
    bool Op1Signed = false, Op2Signed = false;
    unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
    unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
    unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
    bool SignedMode = Op1Signed || Op2Signed;

    // If both vXi32 are representable as i15 and at least one is constant,
    // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we
    // can treat this as PMADDWD which has the same costs as a vXi16 multiply.
    if (OpMinSize <= 15 && !ST->isPMADDWDSlow() &&
        LT.second.getScalarType() == MVT::i32) {
      bool Op1Constant =
          isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]);
      bool Op2Constant =
          isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]);
      bool Op1Sext = isa<SExtInst>(Args[0]) &&
                     (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41()));
      bool Op2Sext = isa<SExtInst>(Args[1]) &&
                     (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41()));

      bool IsZeroExtended = !Op1Signed || !Op2Signed;
      bool IsConstant = Op1Constant || Op2Constant;
      bool IsSext = Op1Sext || Op2Sext;
      if (IsConstant || IsZeroExtended || IsSext)
        LT.second =
            MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements());
    }

    // Check if the vXi32 operands can be shrunk into a smaller datatype.
    // This should match the codegen from reduceVMULWidth.
    // TODO: Make this generic (!ST->SSE41 || ST->isPMULLDSlow()).
    if (ST->useSLMArithCosts() && LT.second == MVT::v4i32) {
      if (OpMinSize <= 7)
        return LT.first * 3; // pmullw/sext
      if (!SignedMode && OpMinSize <= 8)
        return LT.first * 3; // pmullw/zext
      if (OpMinSize <= 15)
        return LT.first * 5; // pmullw/pmulhw/pshuf
      if (!SignedMode && OpMinSize <= 16)
        return LT.first * 5; // pmullw/pmulhw/pshuf
    }

    // If both vXi64 are representable as (unsigned) i32, then we can perform
    // the multiply with a single PMULUDQ instruction.
    // TODO: Add (SSE41+) PMULDQ handling for signed extensions.
    if (!SignedMode && OpMinSize <= 32 && LT.second.getScalarType() == MVT::i64)
      ISD = X86ISD::PMULUDQ;
  }
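
  // Illustration (a sketch, editor's note): for 'mul <2 x i64> %a, %b' where
  // both operands are zero-extended from i32, OpMinSize <= 32 holds, so the
  // cost comes from the X86ISD::PMULUDQ table entries below rather than the
  // generic 3*pmuludq/3*shift/2*add v2i64 multiply sequence.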

  // Vector multiply by pow2 will be simplified to shifts.
  // Vector multiply by -pow2 will be simplified to shifts/negates.
  if (ISD == ISD::MUL && Op2Info.isConstant() &&
      (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2())) {
    InstructionCost Cost =
        getArithmeticInstrCost(Instruction::Shl, Ty, CostKind,
                               Op1Info.getNoProps(), Op2Info.getNoProps());
    if (Op2Info.isNegatedPowerOf2())
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
    return Cost;
  }
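
  // E.g. (illustrative): 'mul %x, 8' is costed as 'shl %x, 3', and
  // 'mul %x, -8' as that shift plus a subtract-from-zero to negate.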

  // On X86, vector signed division by constants power-of-two are
  // normally expanded to the sequence SRA + SRL + ADD + SRA.
  // The OperandValue properties may not be the same as that of the previous
  // operation; conservatively assume OP_None.
  if ((ISD == ISD::SDIV || ISD == ISD::SREM) &&
      Op2Info.isConstant() && Op2Info.isPowerOf2()) {
    InstructionCost Cost =
        2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());
    Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());
    Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());

    if (ISD == ISD::SREM) {
      // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
      Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                     Op1Info.getNoProps(), Op2Info.getNoProps());
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
                                     Op1Info.getNoProps(), Op2Info.getNoProps());
    }

    return Cost;
  }
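
  // Worked example (a sketch, for 32-bit lanes, editor's note): 'sdiv %x, 4'
  // expands to
  //   sra t0, x, 31    ; sign mask
  //   srl t1, t0, 30   ; low log2(4) bits become the rounding bias
  //   add t2, x, t1
  //   sra res, t2, 2
  // which is why the cost above is 2*AShr + LShr + Add.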

  // Vector unsigned division/remainder will be simplified to shifts/masks.
  if ((ISD == ISD::UDIV || ISD == ISD::UREM) &&
      Op2Info.isConstant() && Op2Info.isPowerOf2()) {
    if (ISD == ISD::UDIV)
      return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());
    // UREM
    return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
                                  Op1Info.getNoProps(), Op2Info.getNoProps());
  }
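
  // E.g. (illustrative): 'udiv %x, 16' is costed as 'lshr %x, 4', and
  // 'urem %x, 16' as 'and %x, 15'.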

  static const CostKindTblEntry GFNIUniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRA, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SHL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRA, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SHL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRA, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasGFNI())
    if (const auto *Entry =
            CostTableLookup(GFNIUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
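
  // A note on the lookup pattern used throughout (descriptive aside, not
  // upstream text): LT.first counts how many legalized ops the original type
  // splits into and *KindCost is the per-op cost, so e.g. a shift on a type
  // that legalizes to two registers pays twice the per-register entry.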

  static const CostKindTblEntry AVX512BWUniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8,  { 1, 7, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8,  { 1, 7, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8,  { 1, 8, 4, 5 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8,  { 1, 8, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8,  { 1, 8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8,  { 1, 9, 4, 5 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v64i8,  { 1, 8, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v64i8,  { 1, 8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8,  { 1, 9, 4, 6 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // psllw
    { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw
    { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // psraw
    { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // psllw
    { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw
    { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // psraw
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512UniformConstCostTable[] = {
    { ISD::SHL,  MVT::v64i8,  { 2, 12,  5,  6 } }, // psllw + pand.
    { ISD::SRL,  MVT::v64i8,  { 2, 12,  5,  6 } }, // psrlw + pand.
    { ISD::SRA,  MVT::v64i8,  { 3, 10, 12, 12 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL,  MVT::v16i16, { 2, 7, 4, 4 } }, // psllw + split.
    { ISD::SRL,  MVT::v16i16, { 2, 7, 4, 4 } }, // psrlw + split.
    { ISD::SRA,  MVT::v16i16, { 2, 7, 4, 4 } }, // psraw + split.

    { ISD::SHL,  MVT::v8i32,  { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL,  MVT::v8i32,  { 1, 1, 1, 1 } }, // psrld
    { ISD::SRA,  MVT::v8i32,  { 1, 1, 1, 1 } }, // psrad
    { ISD::SHL,  MVT::v16i32, { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL,  MVT::v16i32, { 1, 1, 1, 1 } }, // psrld
    { ISD::SRA,  MVT::v16i32, { 1, 1, 1, 1 } }, // psrad

    { ISD::SRA,  MVT::v2i64,  { 1, 1, 1, 1 } }, // psraq
    { ISD::SHL,  MVT::v4i64,  { 1, 1, 1, 1 } }, // psllq
    { ISD::SRL,  MVT::v4i64,  { 1, 1, 1, 1 } }, // psrlq
    { ISD::SRA,  MVT::v4i64,  { 1, 1, 1, 1 } }, // psraq
    { ISD::SHL,  MVT::v8i64,  { 1, 1, 1, 1 } }, // psllq
    { ISD::SRL,  MVT::v8i64,  { 1, 1, 1, 1 } }, // psrlq
    { ISD::SRA,  MVT::v8i64,  { 1, 1, 1, 1 } }, // psraq

    { ISD::SDIV, MVT::v16i32, { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v16i32, { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v16i32, { 7 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX512())
    if (const auto *Entry =
            CostTableLookup(AVX512UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2UniformConstCostTable[] = {
    { ISD::SHL,  MVT::v16i8,  { 1,  8, 2, 3 } }, // psllw + pand.
    { ISD::SRL,  MVT::v16i8,  { 1,  8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA,  MVT::v16i8,  { 2, 10, 5, 6 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL,  MVT::v32i8,  { 2,  8, 2, 4 } }, // psllw + pand.
    { ISD::SRL,  MVT::v32i8,  { 2,  8, 2, 4 } }, // psrlw + pand.
    { ISD::SRA,  MVT::v32i8,  { 3, 10, 5, 9 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL,  MVT::v8i16,  { 1, 1, 1, 1 } }, // psllw
    { ISD::SRL,  MVT::v8i16,  { 1, 1, 1, 1 } }, // psrlw
    { ISD::SRA,  MVT::v8i16,  { 1, 1, 1, 1 } }, // psraw
    { ISD::SHL,  MVT::v16i16, { 2, 2, 1, 2 } }, // psllw
    { ISD::SRL,  MVT::v16i16, { 2, 2, 1, 2 } }, // psrlw
    { ISD::SRA,  MVT::v16i16, { 2, 2, 1, 2 } }, // psraw

    { ISD::SHL,  MVT::v4i32,  { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL,  MVT::v4i32,  { 1, 1, 1, 1 } }, // psrld
    { ISD::SRA,  MVT::v4i32,  { 1, 1, 1, 1 } }, // psrad
    { ISD::SHL,  MVT::v8i32,  { 2, 2, 1, 2 } }, // pslld
    { ISD::SRL,  MVT::v8i32,  { 2, 2, 1, 2 } }, // psrld
    { ISD::SRA,  MVT::v8i32,  { 2, 2, 1, 2 } }, // psrad

    { ISD::SHL,  MVT::v2i64,  { 1, 1, 1, 1 } }, // psllq
    { ISD::SRL,  MVT::v2i64,  { 1, 1, 1, 1 } }, // psrlq
    { ISD::SRA,  MVT::v2i64,  { 2, 3, 3, 3 } }, // psrad + shuffle.
    { ISD::SHL,  MVT::v4i64,  { 2, 2, 1, 2 } }, // psllq
    { ISD::SRL,  MVT::v4i64,  { 2, 2, 1, 2 } }, // psrlq
    { ISD::SRA,  MVT::v4i64,  { 4, 4, 3, 6 } }, // psrad + shuffle + split.

    { ISD::SDIV, MVT::v8i32,  { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v8i32,  { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,  { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v8i32,  { 7 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX2())
    if (const auto *Entry =
            CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVXUniformConstCostTable[] = {
    { ISD::SHL,  MVT::v16i8,  {  2, 7,  2,  3 } }, // psllw + pand.
    { ISD::SRL,  MVT::v16i8,  {  2, 7,  2,  3 } }, // psrlw + pand.
    { ISD::SRA,  MVT::v16i8,  {  3, 9,  5,  6 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL,  MVT::v32i8,  {  4, 7,  7,  8 } }, // 2*(psllw + pand) + split.
    { ISD::SRL,  MVT::v32i8,  {  4, 7,  7,  8 } }, // 2*(psrlw + pand) + split.
    { ISD::SRA,  MVT::v32i8,  {  7, 7, 12, 13 } }, // 2*(psrlw, pand, pxor, psubb) + split.

    { ISD::SHL,  MVT::v8i16,  { 1, 2, 1, 1 } }, // psllw.
    { ISD::SRL,  MVT::v8i16,  { 1, 2, 1, 1 } }, // psrlw.
    { ISD::SRA,  MVT::v8i16,  { 1, 2, 1, 1 } }, // psraw.
    { ISD::SHL,  MVT::v16i16, { 3, 6, 4, 5 } }, // psllw + split.
    { ISD::SRL,  MVT::v16i16, { 3, 6, 4, 5 } }, // psrlw + split.
    { ISD::SRA,  MVT::v16i16, { 3, 6, 4, 5 } }, // psraw + split.

    { ISD::SHL,  MVT::v4i32,  { 1, 2, 1, 1 } }, // pslld.
    { ISD::SRL,  MVT::v4i32,  { 1, 2, 1, 1 } }, // psrld.
    { ISD::SRA,  MVT::v4i32,  { 1, 2, 1, 1 } }, // psrad.
    { ISD::SHL,  MVT::v8i32,  { 3, 6, 4, 5 } }, // pslld + split.
    { ISD::SRL,  MVT::v8i32,  { 3, 6, 4, 5 } }, // psrld + split.
    { ISD::SRA,  MVT::v8i32,  { 3, 6, 4, 5 } }, // psrad + split.

    { ISD::SHL,  MVT::v2i64,  { 1, 2, 1, 1 } }, // psllq.
    { ISD::SRL,  MVT::v2i64,  { 1, 2, 1, 1 } }, // psrlq.
    { ISD::SRA,  MVT::v2i64,  { 2, 3, 3, 3 } }, // psrad + shuffle.
    { ISD::SHL,  MVT::v4i64,  { 3, 6, 4, 5 } }, // 2 x psllq + split.
    { ISD::SRL,  MVT::v4i64,  { 3, 6, 4, 5 } }, // 2 x psrlq + split.
    { ISD::SRA,  MVT::v4i64,  { 5, 7, 8, 9 } }, // 2 x psrad + shuffle + split.

    { ISD::SDIV, MVT::v8i32,  { 14 } }, // 2*pmuludq sequence + split.
    { ISD::SREM, MVT::v8i32,  { 18 } }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::UDIV, MVT::v8i32,  { 12 } }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32,  { 16 } }, // 2*pmuludq+mul+sub sequence + split.
  };

  // XOP has faster vXi8 shifts.
  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(AVXUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE2UniformConstCostTable[] = {
    { ISD::SHL,  MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
    { ISD::SRL,  MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
    { ISD::SRA,  MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL,  MVT::v8i16, { 1, 1, 1, 1 } }, // psllw.
    { ISD::SRL,  MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw.
    { ISD::SRA,  MVT::v8i16, { 1, 1, 1, 1 } }, // psraw.

    { ISD::SHL,  MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL,  MVT::v4i32, { 1, 1, 1, 1 } }, // psrld.
    { ISD::SRA,  MVT::v4i32, { 1, 1, 1, 1 } }, // psrad.

    { ISD::SHL,  MVT::v2i64, { 1, 1, 1, 1 } }, // psllq.
    { ISD::SRL,  MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq.
    { ISD::SRA,  MVT::v2i64, { 3, 5, 6, 6 } }, // 2 x psrad + shuffle.

    { ISD::SDIV, MVT::v4i32, { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32, { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v4i32, { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32, { 7 } }, // pmuludq+mul+sub sequence
  };

  // XOP has faster vXi8 shifts.
  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasSSE2() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWConstCostTable[] = {
    { ISD::SDIV, MVT::v64i8,  { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v64i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8,  { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v64i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v32i16, {  6 } }, // vpmulhw sequence
    { ISD::SREM, MVT::v32i16, {  8 } }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16, {  6 } }, // vpmulhuw sequence
    { ISD::UREM, MVT::v32i16, {  8 } }, // vpmulhuw+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512ConstCostTable[] = {
    { ISD::SDIV, MVT::v64i8,  { 28 } }, // 4*ext+4*pmulhw sequence
    { ISD::SREM, MVT::v64i8,  { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8,  { 28 } }, // 4*ext+4*pmulhw sequence
    { ISD::UREM, MVT::v64i8,  { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v32i16, { 12 } }, // 2*vpmulhw sequence
    { ISD::SREM, MVT::v32i16, { 16 } }, // 2*vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16, { 12 } }, // 2*vpmulhuw sequence
    { ISD::UREM, MVT::v32i16, { 16 } }, // 2*vpmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v16i32, { 15 } }, // vpmuldq sequence
    { ISD::SREM, MVT::v16i32, { 17 } }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, { 15 } }, // vpmuludq sequence
    { ISD::UREM, MVT::v16i32, { 17 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasAVX512())
    if (const auto *Entry =
            CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2ConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8,  { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v32i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i8,  { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v32i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v16i16, {  6 } }, // vpmulhw sequence
    { ISD::SREM, MVT::v16i16, {  8 } }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i16, {  6 } }, // vpmulhuw sequence
    { ISD::UREM, MVT::v16i16, {  8 } }, // vpmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v8i32,  { 15 } }, // vpmuldq sequence
    { ISD::SREM, MVT::v8i32,  { 19 } }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,  { 15 } }, // vpmuludq sequence
    { ISD::UREM, MVT::v8i32,  { 19 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVXConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8,  { 30 } }, // 4*ext+4*pmulhw sequence + split.
    { ISD::SREM, MVT::v32i8,  { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v32i8,  { 30 } }, // 4*ext+4*pmulhw sequence + split.
    { ISD::UREM, MVT::v32i8,  { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.

    { ISD::SDIV, MVT::v16i16, { 14 } }, // 2*pmulhw sequence + split.
    { ISD::SREM, MVT::v16i16, { 18 } }, // 2*pmulhw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v16i16, { 14 } }, // 2*pmulhuw sequence + split.
    { ISD::UREM, MVT::v16i16, { 18 } }, // 2*pmulhuw+mul+sub sequence + split.

    { ISD::SDIV, MVT::v8i32,  { 32 } }, // vpmuludq sequence
    { ISD::SREM, MVT::v8i32,  { 38 } }, // vpmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,  { 32 } }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32,  { 42 } }, // 2*pmuludq+mul+sub sequence + split.
  };

  if (Op2Info.isConstant() && ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVXConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE41ConstCostTable[] = {
    { ISD::SDIV, MVT::v4i32, { 15 } }, // vpmuludq sequence
    { ISD::SREM, MVT::v4i32, { 20 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasSSE41())
    if (const auto *Entry =
            CostTableLookup(SSE41ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE2ConstCostTable[] = {
    { ISD::SDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v8i16, {  6 } }, // pmulhw sequence
    { ISD::SREM, MVT::v8i16, {  8 } }, // pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v8i16, {  6 } }, // pmulhuw sequence
    { ISD::UREM, MVT::v8i16, {  8 } }, // pmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v4i32, { 19 } }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32, { 24 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v4i32, { 15 } }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32, { 20 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWUniformCostTable[] = {
    { ISD::SHL, MVT::v16i8,  { 3,  5,  5,  7 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8,  { 3, 10,  5,  8 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8,  { 4, 12,  8, 12 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8,  { 4,  7,  6,  8 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8,  { 4,  8,  7,  9 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8,  { 5, 10, 10, 13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v64i8,  { 4,  7,  6,  8 } }, // psllw + pand.
    { ISD::SRL, MVT::v64i8,  { 4,  8,  7, 10 } }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8,  { 5, 10, 10, 15 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v32i16, { 2,  4,  2,  3 } }, // psllw
    { ISD::SRL, MVT::v32i16, { 2,  4,  2,  3 } }, // psrlw
    { ISD::SRA, MVT::v32i16, { 2,  4,  2,  3 } }, // psraw
  };

  if (ST->hasBWI() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX512BWUniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512UniformCostTable[] = {
    { ISD::SHL, MVT::v32i16, { 5, 10, 5, 7 } }, // psllw + split.
    { ISD::SRL, MVT::v32i16, { 5, 10, 5, 7 } }, // psrlw + split.
    { ISD::SRA, MVT::v32i16, { 5, 10, 5, 7 } }, // psraw + split.

    { ISD::SHL, MVT::v16i32, { 2,  4, 2, 3 } }, // pslld
    { ISD::SRL, MVT::v16i32, { 2,  4, 2, 3 } }, // psrld
    { ISD::SRA, MVT::v16i32, { 2,  4, 2, 3 } }, // psrad

    { ISD::SRA, MVT::v2i64,  { 1,  2, 1, 2 } }, // psraq
    { ISD::SHL, MVT::v4i64,  { 1,  4, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v4i64,  { 1,  4, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v4i64,  { 1,  4, 1, 2 } }, // psraq
    { ISD::SHL, MVT::v8i64,  { 1,  4, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v8i64,  { 1,  4, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v8i64,  { 1,  4, 1, 2 } }, // psraq
  };

  if (ST->hasAVX512() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX512UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i8,  { 3, 5,  5,  7 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8,  { 3, 9,  5,  8 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8,  { 4, 5,  9, 13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8,  { 4, 7,  6,  8 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8,  { 4, 8,  7,  9 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8,  { 6, 9, 11, 16 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v8i16,  { 1, 2, 1, 2 } }, // psllw.
    { ISD::SRL, MVT::v8i16,  { 1, 2, 1, 2 } }, // psrlw.
    { ISD::SRA, MVT::v8i16,  { 1, 2, 1, 2 } }, // psraw.
    { ISD::SHL, MVT::v16i16, { 2, 4, 2, 3 } }, // psllw.
    { ISD::SRL, MVT::v16i16, { 2, 4, 2, 3 } }, // psrlw.
    { ISD::SRA, MVT::v16i16, { 2, 4, 2, 3 } }, // psraw.

    { ISD::SHL, MVT::v4i32,  { 1, 2, 1, 2 } }, // pslld
    { ISD::SRL, MVT::v4i32,  { 1, 2, 1, 2 } }, // psrld
    { ISD::SRA, MVT::v4i32,  { 1, 2, 1, 2 } }, // psrad
    { ISD::SHL, MVT::v8i32,  { 2, 4, 2, 3 } }, // pslld
    { ISD::SRL, MVT::v8i32,  { 2, 4, 2, 3 } }, // psrld
    { ISD::SRA, MVT::v8i32,  { 2, 4, 2, 3 } }, // psrad

    { ISD::SHL, MVT::v2i64,  { 1, 2, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v2i64,  { 1, 2, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v2i64,  { 2, 4, 5, 7 } }, // 2 x psrad + shuffle.
    { ISD::SHL, MVT::v4i64,  { 2, 4, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v4i64,  { 2, 4, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v4i64,  { 4, 6, 5, 9 } }, // 2 x psrad + shuffle.
  };

  if (ST->hasAVX2() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVXUniformCostTable[] = {
    { ISD::SHL, MVT::v16i8,  {  4,  4,  6,  8 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8,  {  4,  8,  5,  8 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8,  {  6,  6,  9, 13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8,  {  7,  8, 11, 14 } }, // psllw + pand + split.
    { ISD::SRL, MVT::v32i8,  {  7,  9, 10, 14 } }, // psrlw + pand + split.
    { ISD::SRA, MVT::v32i8,  { 10, 11, 16, 21 } }, // psrlw, pand, pxor, psubb + split.

    { ISD::SHL, MVT::v8i16,  { 1, 3, 1, 2 } }, // psllw.
    { ISD::SRL, MVT::v8i16,  { 1, 3, 1, 2 } }, // psrlw.
    { ISD::SRA, MVT::v8i16,  { 1, 3, 1, 2 } }, // psraw.
    { ISD::SHL, MVT::v16i16, { 3, 7, 5, 7 } }, // psllw + split.
    { ISD::SRL, MVT::v16i16, { 3, 7, 5, 7 } }, // psrlw + split.
    { ISD::SRA, MVT::v16i16, { 3, 7, 5, 7 } }, // psraw + split.

    { ISD::SHL, MVT::v4i32,  { 1, 3, 1, 2 } }, // pslld.
    { ISD::SRL, MVT::v4i32,  { 1, 3, 1, 2 } }, // psrld.
    { ISD::SRA, MVT::v4i32,  { 1, 3, 1, 2 } }, // psrad.
    { ISD::SHL, MVT::v8i32,  { 3, 7, 5, 7 } }, // pslld + split.
    { ISD::SRL, MVT::v8i32,  { 3, 7, 5, 7 } }, // psrld + split.
    { ISD::SRA, MVT::v8i32,  { 3, 7, 5, 7 } }, // psrad + split.

    { ISD::SHL, MVT::v2i64,  { 1, 3,  1,  2 } }, // psllq.
    { ISD::SRL, MVT::v2i64,  { 1, 3,  1,  2 } }, // psrlq.
    { ISD::SRA, MVT::v2i64,  { 3, 4,  5,  7 } }, // 2 x psrad + shuffle.
    { ISD::SHL, MVT::v4i64,  { 3, 7,  4,  6 } }, // psllq + split.
    { ISD::SRL, MVT::v4i64,  { 3, 7,  4,  6 } }, // psrlq + split.
    { ISD::SRA, MVT::v4i64,  { 6, 7, 10, 13 } }, // 2 x (2 x psrad + shuffle) + split.
  };

  // XOP has faster vXi8 shifts.
  if (ST->hasAVX() && Op2Info.isUniform() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(AVXUniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i8, {  9, 10, 6,  9 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, {  9, 13, 5,  9 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 11, 15, 9, 13 } }, // pcmpgtb sequence.

    { ISD::SHL, MVT::v8i16, { 2, 2, 1, 2 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 2, 2, 1, 2 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 2, 2, 1, 2 } }, // psraw.

    { ISD::SHL, MVT::v4i32, { 2, 2, 1, 2 } }, // pslld
    { ISD::SRL, MVT::v4i32, { 2, 2, 1, 2 } }, // psrld.
    { ISD::SRA, MVT::v4i32, { 2, 2, 1, 2 } }, // psrad.

    { ISD::SHL, MVT::v2i64, { 2, 2, 1, 2 } }, // psllq.
    { ISD::SRL, MVT::v2i64, { 2, 2, 1, 2 } }, // psrlq.
    { ISD::SRA, MVT::v2i64, { 5, 9, 5, 7 } }, // 2*psrlq + xor + sub.
  };

  if (ST->hasSSE2() && Op2Info.isUniform() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512DQCostTable[] = {
    { ISD::MUL, MVT::v2i64, { 2, 15, 1, 3 } }, // pmullq
    { ISD::MUL, MVT::v4i64, { 2, 15, 1, 3 } }, // pmullq
    { ISD::MUL, MVT::v8i64, { 3, 15, 1, 3 } }  // pmullq
  };

  // Look for AVX512DQ lowering tricks for custom cases.
  if (ST->hasDQI())
    if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWCostTable[] = {
    { ISD::SHL, MVT::v16i8,  {  4,  8,  4,  5 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v16i8,  {  4,  8,  4,  5 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v16i8,  {  4,  8,  4,  5 } }, // extend/vpsravw/pack sequence.
    { ISD::SHL, MVT::v32i8,  {  4, 23, 11, 16 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v32i8,  {  4, 30, 12, 18 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v32i8,  {  6, 13, 24, 30 } }, // extend/vpsravw/pack sequence.
    { ISD::SHL, MVT::v64i8,  {  6, 19, 13, 15 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v64i8,  {  7, 27, 15, 18 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v64i8,  { 15, 15, 30, 30 } }, // extend/vpsravw/pack sequence.

    { ISD::SHL, MVT::v8i16,  { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v8i16,  { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v8i16,  { 1, 1, 1, 1 } }, // vpsravw
    { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsravw
    { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsravw

    { ISD::ADD, MVT::v64i8,  { 1, 1, 1, 1 } }, // paddb
    { ISD::ADD, MVT::v32i16, { 1, 1, 1, 1 } }, // paddw

    { ISD::ADD, MVT::v32i8,  { 1, 1, 1, 1 } }, // paddb
    { ISD::ADD, MVT::v16i16, { 1, 1, 1, 1 } }, // paddw
    { ISD::ADD, MVT::v8i32,  { 1, 1, 1, 1 } }, // paddd
    { ISD::ADD, MVT::v4i64,  { 1, 1, 1, 1 } }, // paddq

    { ISD::SUB, MVT::v64i8,  { 1, 1, 1, 1 } }, // psubb
    { ISD::SUB, MVT::v32i16, { 1, 1, 1, 1 } }, // psubw

    { ISD::MUL, MVT::v64i8,  { 5, 10, 10, 11 } },
    { ISD::MUL, MVT::v32i16, { 1,  5,  1,  1 } }, // pmullw

    { ISD::SUB, MVT::v32i8,  { 1, 1, 1, 1 } }, // psubb
    { ISD::SUB, MVT::v16i16, { 1, 1, 1, 1 } }, // psubw
    { ISD::SUB, MVT::v8i32,  { 1, 1, 1, 1 } }, // psubd
    { ISD::SUB, MVT::v4i64,  { 1, 1, 1, 1 } }, // psubq
  };

  // Look for AVX512BW lowering tricks for custom cases.
  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512CostTable[] = {
    { ISD::SHL,     MVT::v64i8,  { 15, 19, 27, 33 } }, // vpblendv+split sequence.
    { ISD::SRL,     MVT::v64i8,  { 15, 19, 30, 36 } }, // vpblendv+split sequence.
    { ISD::SRA,     MVT::v64i8,  { 37, 37, 51, 63 } }, // vpblendv+split sequence.

    { ISD::SHL,     MVT::v32i16, { 11, 16, 11, 15 } }, // 2*extend/vpsllvd/pack sequence.
    { ISD::SRL,     MVT::v32i16, { 11, 16, 11, 15 } }, // 2*extend/vpsrlvd/pack sequence.
    { ISD::SRA,     MVT::v32i16, { 11, 16, 11, 15 } }, // 2*extend/vpsravd/pack sequence.

    { ISD::SHL,     MVT::v4i32,  { 1, 1, 1, 1 } },
    { ISD::SRL,     MVT::v4i32,  { 1, 1, 1, 1 } },
    { ISD::SRA,     MVT::v4i32,  { 1, 1, 1, 1 } },
    { ISD::SHL,     MVT::v8i32,  { 1, 1, 1, 1 } },
    { ISD::SRL,     MVT::v8i32,  { 1, 1, 1, 1 } },
    { ISD::SRA,     MVT::v8i32,  { 1, 1, 1, 1 } },
    { ISD::SHL,     MVT::v16i32, { 1, 1, 1, 1 } },
    { ISD::SRL,     MVT::v16i32, { 1, 1, 1, 1 } },
    { ISD::SRA,     MVT::v16i32, { 1, 1, 1, 1 } },

    { ISD::SHL,     MVT::v2i64,  { 1, 1, 1, 1 } },
    { ISD::SRL,     MVT::v2i64,  { 1, 1, 1, 1 } },
    { ISD::SRA,     MVT::v2i64,  { 1, 1, 1, 1 } },
    { ISD::SHL,     MVT::v4i64,  { 1, 1, 1, 1 } },
    { ISD::SRL,     MVT::v4i64,  { 1, 1, 1, 1 } },
    { ISD::SRA,     MVT::v4i64,  { 1, 1, 1, 1 } },
    { ISD::SHL,     MVT::v8i64,  { 1, 1, 1, 1 } },
    { ISD::SRL,     MVT::v8i64,  { 1, 1, 1, 1 } },
    { ISD::SRA,     MVT::v8i64,  { 1, 1, 1, 1 } },

    { ISD::ADD,     MVT::v64i8,  { 3, 7, 5, 5 } }, // 2*paddb + split
    { ISD::ADD,     MVT::v32i16, { 3, 7, 5, 5 } }, // 2*paddw + split

    { ISD::SUB,     MVT::v64i8,  { 3, 7, 5, 5 } }, // 2*psubb + split
    { ISD::SUB,     MVT::v32i16, { 3, 7, 5, 5 } }, // 2*psubw + split

    { ISD::AND,     MVT::v32i8,  { 1, 1, 1, 1 } },
    { ISD::AND,     MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::AND,     MVT::v8i32,  { 1, 1, 1, 1 } },
    { ISD::AND,     MVT::v4i64,  { 1, 1, 1, 1 } },

    { ISD::OR,      MVT::v32i8,  { 1, 1, 1, 1 } },
    { ISD::OR,      MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::OR,      MVT::v8i32,  { 1, 1, 1, 1 } },
    { ISD::OR,      MVT::v4i64,  { 1, 1, 1, 1 } },

    { ISD::XOR,     MVT::v32i8,  { 1, 1, 1, 1 } },
    { ISD::XOR,     MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::XOR,     MVT::v8i32,  { 1, 1, 1, 1 } },
    { ISD::XOR,     MVT::v4i64,  { 1, 1, 1, 1 } },

    { ISD::MUL,     MVT::v16i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL,     MVT::v8i32,  { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL,     MVT::v4i32,  { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL,     MVT::v8i64,  { 6,  9, 8, 8 } }, // 3*pmuludq/3*shift/2*add
    { ISD::MUL,     MVT::i64,    { 1 } }, // Skylake from http://www.agner.org/

    { X86ISD::PMULUDQ, MVT::v8i64, { 1, 5, 1, 1 } },

    { ISD::FNEG,    MVT::v8f64,  {  1,  1, 1, 2 } }, // Skylake from http://www.agner.org/
    { ISD::FADD,    MVT::v8f64,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FADD,    MVT::v4f64,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB,    MVT::v8f64,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB,    MVT::v4f64,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL,    MVT::v8f64,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL,    MVT::v4f64,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL,    MVT::v2f64,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL,    MVT::f64,    {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/

    { ISD::FDIV,    MVT::f64,    {  4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV,    MVT::v2f64,  {  4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV,    MVT::v4f64,  {  8, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV,    MVT::v8f64,  { 16, 23, 1, 3 } }, // Skylake from http://www.agner.org/

    { ISD::FNEG,    MVT::v16f32, {  1,  1, 1, 2 } }, // Skylake from http://www.agner.org/
    { ISD::FADD,    MVT::v16f32, {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FADD,    MVT::v8f32,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB,    MVT::v16f32, {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB,    MVT::v8f32,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL,    MVT::v16f32, {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL,    MVT::v8f32,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL,    MVT::v4f32,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL,    MVT::f32,    {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/

    { ISD::FDIV,    MVT::f32,    {  3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV,    MVT::v4f32,  {  3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV,    MVT::v8f32,  {  5, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV,    MVT::v16f32, { 10, 18, 1, 3 } }, // Skylake from http://www.agner.org/
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2ShiftCostTable[] = {
    // Shifts on vXi64/vXi32 are legal on AVX2, even though we mark them as
    // custom so that we can detect the cases where the shift amount is a
    // scalar.
    { ISD::SHL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsravd (Haswell from agner.org)
    { ISD::SHL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsravd (Haswell from agner.org)
    { ISD::SHL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsrlvq (Haswell from agner.org)
    { ISD::SHL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsrlvq (Haswell from agner.org)
  };

  if (ST->hasAVX512()) {
    if (ISD == ISD::SHL && LT.second == MVT::v32i16 && Op2Info.isConstant())
      // On AVX512, a packed v32i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());
  }

  // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
  if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
    if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
        Op2Info.isConstant())
      // On AVX2, a packed v16i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());

    if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
  }

  static const CostKindTblEntry XOPShiftCostTable[] = {
    // 128bit shifts take 1cy, but right shifts require negation beforehand.
    { ISD::SHL, MVT::v16i8,  { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v16i8,  { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v16i8,  { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v8i16,  { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v8i16,  { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v8i16,  { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v4i32,  { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v4i32,  { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v4i32,  { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v2i64,  { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v2i64,  { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v2i64,  { 2, 3, 1, 1 } },
    // 256bit shifts require splitting if AVX2 didn't catch them above.
    { ISD::SHL, MVT::v32i8,  { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v32i8,  { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v32i8,  { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v16i16, { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v16i16, { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v16i16, { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v8i32,  { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v8i32,  { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v8i32,  { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v4i64,  { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v4i64,  { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v4i64,  { 6, 7, 5, 6 } },
  };

  // Look for XOP lowering tricks.
  if (ST->hasXOP()) {
    // If the right shift is constant then we'll fold the negation so
    // it's as cheap as a left shift.
    int ShiftISD = ISD;
    if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) && Op2Info.isConstant())
      ShiftISD = ISD::SHL;
    if (const auto *Entry =
            CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
  }

  if (ISD == ISD::SHL && !Op2Info.isUniform() && Op2Info.isConstant()) {
    MVT VT = LT.second;
    // Vector shift left by non uniform constant can be lowered
    // into vector multiply.
    if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
        ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
      ISD = ISD::MUL;
  }
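
  // E.g. (illustrative): 'shl <4 x i32> %x, <i32 1, i32 2, i32 3, i32 4>'
  // on SSE2 is costed as a v4i32 multiply by <2, 4, 8, 16>, matching the
  // build_vector-of-constants lowering.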

  static const CostKindTblEntry GLMCostTable[] = {
    { ISD::FDIV, MVT::f32,   { 18, 19, 1, 1 } }, // divss
    { ISD::FDIV, MVT::v4f32, { 35, 36, 1, 1 } }, // divps
    { ISD::FDIV, MVT::f64,   { 33, 34, 1, 1 } }, // divsd
    { ISD::FDIV, MVT::v2f64, { 65, 66, 1, 1 } }, // divpd
  };

  if (ST->useGLMDivSqrtCosts())
    if (const auto *Entry = CostTableLookup(GLMCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SLMCostTable[] = {
    { ISD::MUL,  MVT::v4i32, { 11, 11, 1, 7 } }, // pmulld
    { ISD::MUL,  MVT::v8i16, {  2,  5, 1, 1 } }, // pmullw
    { ISD::FMUL, MVT::f64,   {  2,  5, 1, 1 } }, // mulsd
    { ISD::FMUL, MVT::f32,   {  1,  4, 1, 1 } }, // mulss
    { ISD::FMUL, MVT::v2f64, {  4,  7, 1, 1 } }, // mulpd
    { ISD::FMUL, MVT::v4f32, {  2,  5, 1, 1 } }, // mulps
    { ISD::FDIV, MVT::f32,   { 17, 19, 1, 1 } }, // divss
    { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 6 } }, // divps
    { ISD::FDIV, MVT::f64,   { 32, 34, 1, 1 } }, // divsd
    { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 6 } }, // divpd
    { ISD::FADD, MVT::v2f64, {  2,  4, 1, 1 } }, // addpd
    { ISD::FSUB, MVT::v2f64, {  2,  4, 1, 1 } }, // subpd
    // v2i64/v4i64 mul is custom lowered as a series of long:
    // multiplies(3), shifts(3) and adds(2)
    // slm muldq version throughput is 2 and addq throughput 4
    // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
    // 2X4 (addq throughput) = 17
    { ISD::MUL,  MVT::v2i64, { 17, 22, 9, 9 } },
    // slm addq/subq throughput is 4
    { ISD::ADD,  MVT::v2i64, {  4,  2, 1, 2 } },
    { ISD::SUB,  MVT::v2i64, {  4,  2, 1, 2 } },
  };

  if (ST->useSLMArithCosts())
    if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2CostTable[] = {
    { ISD::SHL,  MVT::v16i8,  {  6, 21, 11, 16 } }, // vpblendvb sequence.
    { ISD::SHL,  MVT::v32i8,  {  6, 23, 11, 22 } }, // vpblendvb sequence.
    { ISD::SHL,  MVT::v8i16,  {  5, 18,  5, 10 } }, // extend/vpsllvd/pack sequence.
    { ISD::SHL,  MVT::v16i16, {  8, 10, 10, 14 } }, // extend/vpsllvd/pack sequence.

    { ISD::SRL,  MVT::v16i8,  {  6, 27, 12, 18 } }, // vpblendvb sequence.
    { ISD::SRL,  MVT::v32i8,  {  8, 30, 12, 24 } }, // vpblendvb sequence.
    { ISD::SRL,  MVT::v8i16,  {  5, 11,  5, 10 } }, // extend/vpsrlvd/pack sequence.
    { ISD::SRL,  MVT::v16i16, {  8, 10, 10, 14 } }, // extend/vpsrlvd/pack sequence.

    { ISD::SRA,  MVT::v16i8,  { 17, 17, 24, 30 } }, // vpblendvb sequence.
    { ISD::SRA,  MVT::v32i8,  { 18, 20, 24, 43 } }, // vpblendvb sequence.
    { ISD::SRA,  MVT::v8i16,  {  5, 11,  5, 10 } }, // extend/vpsravd/pack sequence.
    { ISD::SRA,  MVT::v16i16, {  8, 10, 10, 14 } }, // extend/vpsravd/pack sequence.
    { ISD::SRA,  MVT::v2i64,  {  4,  5,  5,  5 } }, // srl/xor/sub sequence.
    { ISD::SRA,  MVT::v4i64,  {  8,  8,  5,  9 } }, // srl/xor/sub sequence.

    { ISD::SUB,  MVT::v32i8,  { 1, 1, 1, 2 } }, // psubb
    { ISD::ADD,  MVT::v32i8,  { 1, 1, 1, 2 } }, // paddb
    { ISD::SUB,  MVT::v16i16, { 1, 1, 1, 2 } }, // psubw
    { ISD::ADD,  MVT::v16i16, { 1, 1, 1, 2 } }, // paddw
    { ISD::SUB,  MVT::v8i32,  { 1, 1, 1, 2 } }, // psubd
    { ISD::ADD,  MVT::v8i32,  { 1, 1, 1, 2 } }, // paddd
    { ISD::SUB,  MVT::v4i64,  { 1, 1, 1, 2 } }, // psubq
    { ISD::ADD,  MVT::v4i64,  { 1, 1, 1, 2 } }, // paddq

    { ISD::MUL,  MVT::v16i8,  { 5, 18,  6, 12 } }, // extend/pmullw/pack
    { ISD::MUL,  MVT::v32i8,  { 6, 11, 10, 19 } }, // unpack/pmullw
    { ISD::MUL,  MVT::v16i16, { 2,  5,  1,  2 } }, // pmullw
    { ISD::MUL,  MVT::v8i32,  { 4, 10,  1,  2 } }, // pmulld
    { ISD::MUL,  MVT::v4i32,  { 2, 10,  1,  2 } }, // pmulld
    { ISD::MUL,  MVT::v4i64,  { 6, 10,  8, 13 } }, // 3*pmuludq/3*shift/2*add
    { ISD::MUL,  MVT::v2i64,  { 6, 10,  8,  8 } }, // 3*pmuludq/3*shift/2*add

    { X86ISD::PMULUDQ, MVT::v4i64, { 1, 5, 1, 1 } },

    { ISD::FNEG, MVT::v4f64,  { 1, 1, 1, 2 } }, // vxorpd
    { ISD::FNEG, MVT::v8f32,  { 1, 1, 1, 2 } }, // vxorps

    { ISD::FADD, MVT::f64,    { 1, 4, 1, 1 } }, // vaddsd
    { ISD::FADD, MVT::f32,    { 1, 4, 1, 1 } }, // vaddss
    { ISD::FADD, MVT::v2f64,  { 1, 4, 1, 1 } }, // vaddpd
    { ISD::FADD, MVT::v4f32,  { 1, 4, 1, 1 } }, // vaddps
    { ISD::FADD, MVT::v4f64,  { 1, 4, 1, 2 } }, // vaddpd
    { ISD::FADD, MVT::v8f32,  { 1, 4, 1, 2 } }, // vaddps

    { ISD::FSUB, MVT::f64,    { 1, 4, 1, 1 } }, // vsubsd
    { ISD::FSUB, MVT::f32,    { 1, 4, 1, 1 } }, // vsubss
    { ISD::FSUB, MVT::v2f64,  { 1, 4, 1, 1 } }, // vsubpd
    { ISD::FSUB, MVT::v4f32,  { 1, 4, 1, 1 } }, // vsubps
    { ISD::FSUB, MVT::v4f64,  { 1, 4, 1, 2 } }, // vsubpd
    { ISD::FSUB, MVT::v8f32,  { 1, 4, 1, 2 } }, // vsubps

    { ISD::FMUL, MVT::f64,    { 1, 5, 1, 1 } }, // vmulsd
    { ISD::FMUL, MVT::f32,    { 1, 5, 1, 1 } }, // vmulss
    { ISD::FMUL, MVT::v2f64,  { 1, 5, 1, 1 } }, // vmulpd
    { ISD::FMUL, MVT::v4f32,  { 1, 5, 1, 1 } }, // vmulps
    { ISD::FMUL, MVT::v4f64,  { 1, 5, 1, 2 } }, // vmulpd
    { ISD::FMUL, MVT::v8f32,  { 1, 5, 1, 2 } }, // vmulps

    { ISD::FDIV, MVT::f32,    {  7, 13, 1, 1 } }, // vdivss
    { ISD::FDIV, MVT::v4f32,  {  7, 13, 1, 1 } }, // vdivps
    { ISD::FDIV, MVT::v8f32,  { 14, 21, 1, 3 } }, // vdivps
    { ISD::FDIV, MVT::f64,    { 14, 20, 1, 1 } }, // vdivsd
    { ISD::FDIV, MVT::v2f64,  { 14, 20, 1, 1 } }, // vdivpd
    { ISD::FDIV, MVT::v4f64,  { 28, 35, 1, 3 } }, // vdivpd
  };

  // Look for AVX2 lowering tricks for custom cases.
  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
1164
1165 static const CostKindTblEntry AVX1CostTable[] = {
1166 // We don't have to scalarize unsupported ops. We can issue two half-sized
1167 // operations and we only need to extract the upper YMM half.
1168 // Two ops + 1 extract + 1 insert = 4.
1169 { ISD::MUL, MVT::v32i8, { 12, 13, 22, 23 } }, // unpack/pmullw + split
1170 { ISD::MUL, MVT::v16i16, { 4, 8, 5, 6 } }, // pmullw + split
1171 { ISD::MUL, MVT::v8i32, { 5, 8, 5, 10 } }, // pmulld + split
1172 { ISD::MUL, MVT::v4i32, { 2, 5, 1, 3 } }, // pmulld
1173 { ISD::MUL, MVT::v4i64, { 12, 15, 19, 20 } },
1174
1175 { ISD::AND, MVT::v32i8, { 1, 1, 1, 2 } }, // vandps
1176 { ISD::AND, MVT::v16i16, { 1, 1, 1, 2 } }, // vandps
1177 { ISD::AND, MVT::v8i32, { 1, 1, 1, 2 } }, // vandps
1178 { ISD::AND, MVT::v4i64, { 1, 1, 1, 2 } }, // vandps
1179
1180 { ISD::OR, MVT::v32i8, { 1, 1, 1, 2 } }, // vorps
1181 { ISD::OR, MVT::v16i16, { 1, 1, 1, 2 } }, // vorps
1182 { ISD::OR, MVT::v8i32, { 1, 1, 1, 2 } }, // vorps
1183 { ISD::OR, MVT::v4i64, { 1, 1, 1, 2 } }, // vorps
1184
1185 { ISD::XOR, MVT::v32i8, { 1, 1, 1, 2 } }, // vxorps
1186 { ISD::XOR, MVT::v16i16, { 1, 1, 1, 2 } }, // vxorps
1187 { ISD::XOR, MVT::v8i32, { 1, 1, 1, 2 } }, // vxorps
1188 { ISD::XOR, MVT::v4i64, { 1, 1, 1, 2 } }, // vxorps
1189
1190 { ISD::SUB, MVT::v32i8, { 4, 2, 5, 6 } }, // psubb + split
1191 { ISD::ADD, MVT::v32i8, { 4, 2, 5, 6 } }, // paddb + split
1192 { ISD::SUB, MVT::v16i16, { 4, 2, 5, 6 } }, // psubw + split
1193 { ISD::ADD, MVT::v16i16, { 4, 2, 5, 6 } }, // paddw + split
1194 { ISD::SUB, MVT::v8i32, { 4, 2, 5, 6 } }, // psubd + split
1195 { ISD::ADD, MVT::v8i32, { 4, 2, 5, 6 } }, // paddd + split
1196 { ISD::SUB, MVT::v4i64, { 4, 2, 5, 6 } }, // psubq + split
1197 { ISD::ADD, MVT::v4i64, { 4, 2, 5, 6 } }, // paddq + split
1198 { ISD::SUB, MVT::v2i64, { 1, 1, 1, 1 } }, // psubq
1199 { ISD::ADD, MVT::v2i64, { 1, 1, 1, 1 } }, // paddq
1200
1201 { ISD::SHL, MVT::v16i8, { 10, 21,11,17 } }, // pblendvb sequence.
1202 { ISD::SHL, MVT::v32i8, { 22, 22,27,40 } }, // pblendvb sequence + split.
1203 { ISD::SHL, MVT::v8i16, { 6, 9,11,11 } }, // pblendvb sequence.
1204 { ISD::SHL, MVT::v16i16, { 13, 16,24,25 } }, // pblendvb sequence + split.
1205 { ISD::SHL, MVT::v4i32, { 3, 11, 4, 6 } }, // pslld/paddd/cvttps2dq/pmulld
1206 { ISD::SHL, MVT::v8i32, { 9, 11,12,17 } }, // pslld/paddd/cvttps2dq/pmulld + split
1207 { ISD::SHL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
1208 { ISD::SHL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.
1209
1210 { ISD::SRL, MVT::v16i8, { 11, 27,12,18 } }, // pblendvb sequence.
1211 { ISD::SRL, MVT::v32i8, { 23, 23,30,43 } }, // pblendvb sequence + split.
1212 { ISD::SRL, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
1213 { ISD::SRL, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
1214 { ISD::SRL, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
1215 { ISD::SRL, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
1216 { ISD::SRL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
1217 { ISD::SRL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.
1218
1219 { ISD::SRA, MVT::v16i8, { 21, 22,24,36 } }, // pblendvb sequence.
1220 { ISD::SRA, MVT::v32i8, { 44, 45,51,76 } }, // pblendvb sequence + split.
1221 { ISD::SRA, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
1222 { ISD::SRA, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
1223 { ISD::SRA, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
1224 { ISD::SRA, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
1225 { ISD::SRA, MVT::v2i64, { 5, 6,10,14 } }, // Shift each lane + blend.
1226 { ISD::SRA, MVT::v4i64, { 12, 12,22,30 } }, // Shift each lane + blend + split.
1227
1228 { ISD::FNEG, MVT::v4f64, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
1229 { ISD::FNEG, MVT::v8f32, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
1230
1231 { ISD::FADD, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1232 { ISD::FADD, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1233 { ISD::FADD, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1234 { ISD::FADD, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1235 { ISD::FADD, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1236 { ISD::FADD, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1237
1238 { ISD::FSUB, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1239 { ISD::FSUB, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1240 { ISD::FSUB, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1241 { ISD::FSUB, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1242 { ISD::FSUB, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1243 { ISD::FSUB, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1244
1245 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1246 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1247 { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1248 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1249 { ISD::FMUL, MVT::v4f64, { 4, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
1250 { ISD::FMUL, MVT::v8f32, { 2, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
1251
1252 { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
1253 { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
1254 { ISD::FDIV, MVT::v8f32, { 28, 29, 1, 3 } }, // SNB from http://www.agner.org/
1255 { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
1256 { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
1257 { ISD::FDIV, MVT::v4f64, { 44, 45, 1, 3 } }, // SNB from http://www.agner.org/
1258 };
1259
1260 if (ST->hasAVX())
1261 if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
1262 if (auto KindCost = Entry->Cost[CostKind])
1263 return LT.first * *KindCost;
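  // For example, a TCK_Latency query for an fdiv on <8 x float> legalizes to
  // LT = (1, MVT::v8f32), hits the { 28, 29, 1, 3 } entry above and returns
  // 1 * 29. Entries store ~0U for cost kinds they do not cover, in which case
  // Cost[CostKind] is std::nullopt and the lookup falls through to the older
  // feature-level tables below.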
1264
1265 static const CostKindTblEntry SSE42CostTable[] = {
1266 { ISD::FADD, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1267 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1268 { ISD::FADD, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1269 { ISD::FADD, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1270
1271 { ISD::FSUB, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1272 { ISD::FSUB, MVT::f32 , { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1273 { ISD::FSUB, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1274 { ISD::FSUB, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1275
1276 { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1277 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1278 { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1279 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1280
1281 { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1282 { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1283 { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1284 { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1285
1286 { ISD::MUL, MVT::v2i64, { 6, 10,10,10 } } // 3*pmuludq/3*shift/2*add
1287 };
1288
1289 if (ST->hasSSE42())
1290 if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
1291 if (auto KindCost = Entry->Cost[CostKind])
1292 return LT.first * *KindCost;
1293
1294 static const CostKindTblEntry SSE41CostTable[] = {
1295 { ISD::SHL, MVT::v16i8, { 15, 24,17,22 } }, // pblendvb sequence.
1296 { ISD::SHL, MVT::v8i16, { 11, 14,11,11 } }, // pblendvb sequence.
1297 { ISD::SHL, MVT::v4i32, { 14, 20, 4,10 } }, // pslld/paddd/cvttps2dq/pmulld
1298
1299 { ISD::SRL, MVT::v16i8, { 16, 27,18,24 } }, // pblendvb sequence.
1300 { ISD::SRL, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1301 { ISD::SRL, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1302 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1303
1304 { ISD::SRA, MVT::v16i8, { 38, 41,30,36 } }, // pblendvb sequence.
1305 { ISD::SRA, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1306 { ISD::SRA, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1307 { ISD::SRA, MVT::v2i64, { 8, 17, 5, 7 } }, // splat+shuffle sequence.
1308
1309 { ISD::MUL, MVT::v16i8, { 5, 18,10,12 } }, // 2*unpack/2*pmullw/2*and/pack
1310 { ISD::MUL, MVT::v4i32, { 2, 11, 1, 1 } } // pmulld (Nehalem from agner.org)
1311 };
1312
1313 if (ST->hasSSE41())
1314 if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
1315 if (auto KindCost = Entry->Cost[CostKind])
1316 return LT.first * *KindCost;
1317
1318 static const CostKindTblEntry SSE2CostTable[] = {
1319 // We don't correctly identify costs of casts because they are marked as
1320 // custom.
1321 { ISD::SHL, MVT::v16i8, { 13, 21,26,28 } }, // cmpgtb sequence.
1322 { ISD::SHL, MVT::v8i16, { 24, 27,16,20 } }, // cmpgtw sequence.
1323 { ISD::SHL, MVT::v4i32, { 17, 19,10,12 } }, // pslld/paddd/cvttps2dq/pmuludq.
1324 { ISD::SHL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1325
1326 { ISD::SRL, MVT::v16i8, { 14, 28,27,30 } }, // cmpgtb sequence.
1327 { ISD::SRL, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1328 { ISD::SRL, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1329 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1330
1331 { ISD::SRA, MVT::v16i8, { 27, 30,54,54 } }, // unpacked cmpgtb sequence.
1332 { ISD::SRA, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1333 { ISD::SRA, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1334 { ISD::SRA, MVT::v2i64, { 8, 11,12,16 } }, // srl/xor/sub splat+shuffle sequence.
1335
1336 { ISD::AND, MVT::v16i8, { 1, 1, 1, 1 } }, // pand
1337 { ISD::AND, MVT::v8i16, { 1, 1, 1, 1 } }, // pand
1338 { ISD::AND, MVT::v4i32, { 1, 1, 1, 1 } }, // pand
1339 { ISD::AND, MVT::v2i64, { 1, 1, 1, 1 } }, // pand
1340
1341 { ISD::OR, MVT::v16i8, { 1, 1, 1, 1 } }, // por
1342 { ISD::OR, MVT::v8i16, { 1, 1, 1, 1 } }, // por
1343 { ISD::OR, MVT::v4i32, { 1, 1, 1, 1 } }, // por
1344 { ISD::OR, MVT::v2i64, { 1, 1, 1, 1 } }, // por
1345
1346 { ISD::XOR, MVT::v16i8, { 1, 1, 1, 1 } }, // pxor
1347 { ISD::XOR, MVT::v8i16, { 1, 1, 1, 1 } }, // pxor
1348 { ISD::XOR, MVT::v4i32, { 1, 1, 1, 1 } }, // pxor
1349 { ISD::XOR, MVT::v2i64, { 1, 1, 1, 1 } }, // pxor
1350
1351 { ISD::ADD, MVT::v2i64, { 1, 2, 1, 2 } }, // paddq
1352 { ISD::SUB, MVT::v2i64, { 1, 2, 1, 2 } }, // psubq
1353
1354 { ISD::MUL, MVT::v16i8, { 5, 18,12,12 } }, // 2*unpack/2*pmullw/2*and/pack
1355 { ISD::MUL, MVT::v8i16, { 1, 5, 1, 1 } }, // pmullw
1356 { ISD::MUL, MVT::v4i32, { 6, 8, 7, 7 } }, // 3*pmuludq/4*shuffle
1357 { ISD::MUL, MVT::v2i64, { 7, 10,10,10 } }, // 3*pmuludq/3*shift/2*add
1358
1359 { X86ISD::PMULUDQ, MVT::v2i64, { 1, 5, 1, 1 } },
1360
1361 { ISD::FDIV, MVT::f32, { 23, 23, 1, 1 } }, // Pentium IV from http://www.agner.org/
1362 { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 1 } }, // Pentium IV from http://www.agner.org/
1363 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // Pentium IV from http://www.agner.org/
1364 { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 1 } }, // Pentium IV from http://www.agner.org/
1365
1366 { ISD::FNEG, MVT::f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1367 { ISD::FNEG, MVT::f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1368 { ISD::FNEG, MVT::v4f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1369 { ISD::FNEG, MVT::v2f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1370
1371 { ISD::FADD, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1372 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1373 { ISD::FADD, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1374
1375 { ISD::FSUB, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1376 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1377 { ISD::FSUB, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1378
1379 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1380 { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1381 };
1382
1383 if (ST->hasSSE2())
1384 if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
1385 if (auto KindCost = Entry->Cost[CostKind])
1386 return LT.first * *KindCost;
1387
1388 static const CostKindTblEntry SSE1CostTable[] = {
1389 { ISD::FDIV, MVT::f32, { 17, 18, 1, 1 } }, // Pentium III from http://www.agner.org/
1390 { ISD::FDIV, MVT::v4f32, { 34, 48, 1, 1 } }, // Pentium III from http://www.agner.org/
1391
1392 { ISD::FNEG, MVT::f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1393 { ISD::FNEG, MVT::v4f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1394
1395 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1396 { ISD::FADD, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1397
1398 { ISD::FSUB, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1399 { ISD::FSUB, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1400
1401 { ISD::FMUL, MVT::f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1402 { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1403 };
1404
1405 if (ST->hasSSE1())
1406 if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
1407 if (auto KindCost = Entry->Cost[CostKind])
1408 return LT.first * *KindCost;
1409
1410 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
1411 { ISD::ADD, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1412 { ISD::SUB, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1413 { ISD::MUL, MVT::i64, { 2, 6, 1, 2 } },
1414 };
1415
1416 if (ST->is64Bit())
1417 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
1418 if (auto KindCost = Entry->Cost[CostKind])
1419 return LT.first * *KindCost;
1420
1421 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
1422 { ISD::ADD, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1423 { ISD::ADD, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1424 { ISD::ADD, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1425
1426 { ISD::SUB, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1427 { ISD::SUB, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1428 { ISD::SUB, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1429
1430 { ISD::MUL, MVT::i8, { 3, 4, 1, 1 } },
1431 { ISD::MUL, MVT::i16, { 2, 4, 1, 1 } },
1432 { ISD::MUL, MVT::i32, { 1, 4, 1, 1 } },
1433
1434 { ISD::FNEG, MVT::f64, { 2, 2, 1, 3 } }, // (x87)
1435 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1436 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1437 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // (x87)
1438 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // (x87)
1439 };
1440
1441 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
1442 if (auto KindCost = Entry->Cost[CostKind])
1443 return LT.first * *KindCost;
1444
1445 // It is not a good idea to vectorize division. We have to scalarize it and
1446  // in the process we will often end up having to spill regular
1447  // registers. The overhead of division is going to dominate most kernels
1448  // anyway, so try hard to prevent vectorization of division - it is
1449 // generally a bad idea. Assume somewhat arbitrarily that we have to be able
1450 // to hide "20 cycles" for each lane.
1451 if (CostKind == TTI::TCK_RecipThroughput && LT.second.isVector() &&
1452 (ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
1453 ISD == ISD::UREM)) {
1454    InstructionCost ScalarCost =
1455        getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
1456                               Op1Info.getNoProps(), Op2Info.getNoProps());
1457 return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
1458 }
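  // For example, an sdiv on <8 x i32> with SSE2 legalizes to 2 x v4i32, so
  // with a scalar sdiv cost of S this returns 20 * 2 * 4 * S - high enough
  // that the vectorizers will almost always keep such division scalar.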
1459
1460 // Handle some basic single instruction code size cases.
1461 if (CostKind == TTI::TCK_CodeSize) {
1462 switch (ISD) {
1463 case ISD::FADD:
1464 case ISD::FSUB:
1465 case ISD::FMUL:
1466 case ISD::FDIV:
1467 case ISD::FNEG:
1468 case ISD::AND:
1469 case ISD::OR:
1470 case ISD::XOR:
1471 return LT.first;
1472 break;
1473 }
1474 }
1475
1476 // Fallback to the default implementation.
1477 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1478 Args, CxtI);
1479}
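// As a worked example of the tables above: on plain SSE2 an fdiv on
// <4 x float> is costed at 39 reciprocal throughput (SSE2CostTable), while
// four scalar f32 divides cost 4 * 23 = 92, so even expensive vector FP
// division can still be preferable to its scalarized form.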
1480
1481 InstructionCost X86TTIImpl::getAltInstrCost(
1482     VectorType *VecTy, unsigned Opcode0,
1483     unsigned Opcode1, const SmallBitVector &OpcodeMask,
1484     TTI::TargetCostKind CostKind) const {
1485   if (isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask))
1486     return TTI::TCC_Basic;
1487   return InstructionCost::getInvalid();
1488 }
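// A sketch of the intent (assuming SLP-style usage): for an alternating
// fsub/fadd pattern on <4 x float> that X86 can lower as a single addsubps,
// isLegalAltInstr() succeeds and the whole pattern is costed as one basic
// instruction; otherwise the invalid cost tells the caller to cost the two
// opcodes separately.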
1489
1490 InstructionCost X86TTIImpl::getShuffleCost(
1491     TTI::ShuffleKind Kind, VectorType *BaseTp, ArrayRef<int> Mask,
1492     TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
1493     ArrayRef<const Value *> Args, const Instruction *CxtI) {
1494 // 64-bit packed float vectors (v2f32) are widened to type v4f32.
1495 // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
1496 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(BaseTp);
1497
1498 Kind = improveShuffleKindFromMask(Kind, Mask, BaseTp, Index, SubTp);
1499
1500 // Recognize a basic concat_vector shuffle.
1501 if (Kind == TTI::SK_PermuteTwoSrc &&
1502 Mask.size() == (2 * BaseTp->getElementCount().getKnownMinValue()) &&
1503 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
1504     return getShuffleCost(TTI::SK_InsertSubvector,
1505                           VectorType::getDoubleElementsVectorType(BaseTp), Mask,
1506                           CostKind, Mask.size() / 2, BaseTp);
1507
1508 // Treat Transpose as 2-op shuffles - there's no difference in lowering.
1509 if (Kind == TTI::SK_Transpose)
1510 Kind = TTI::SK_PermuteTwoSrc;
1511
1512 if (Kind == TTI::SK_Broadcast) {
1513 // For Broadcasts we are splatting the first element from the first input
1514     // register, so we only need to reference that input, and all the output
1515     // registers are the same.
1516 LT.first = 1;
1517
1518 // If we're broadcasting a load then AVX/AVX2 can do this for free.
1519 using namespace PatternMatch;
1520 if (!Args.empty() && match(Args[0], m_OneUse(m_Load(m_Value()))) &&
1521 (ST->hasAVX2() ||
1522 (ST->hasAVX() && LT.second.getScalarSizeInBits() >= 32)))
1523 return TTI::TCC_Free;
1524 }
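  // For example, broadcasting a one-use scalar load to <8 x float> folds into
  // a single memory-operand vbroadcastss on AVX; AVX2 extends the folded-load
  // broadcast forms down to 8-bit and 16-bit elements, hence the TCC_Free.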
1525
1526 // Treat <X x bfloat> shuffles as <X x half>.
1527 if (LT.second.isVector() && LT.second.getScalarType() == MVT::bf16)
1528 LT.second = LT.second.changeVectorElementType(MVT::f16);
1529
1530 // Subvector extractions are free if they start at the beginning of a
1531 // vector and cheap if the subvectors are aligned.
1532 if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
1533 int NumElts = LT.second.getVectorNumElements();
1534 if ((Index % NumElts) == 0)
1535 return 0;
1536 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1537 if (SubLT.second.isVector()) {
1538 int NumSubElts = SubLT.second.getVectorNumElements();
1539 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1540 return SubLT.first;
1541 // Handle some cases for widening legalization. For now we only handle
1542 // cases where the original subvector was naturally aligned and evenly
1543 // fit in its legalized subvector type.
1544 // FIXME: Remove some of the alignment restrictions.
1545 // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
1546 // vectors.
1547 int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
1548 if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
1549 (NumSubElts % OrigSubElts) == 0 &&
1550 LT.second.getVectorElementType() ==
1551 SubLT.second.getVectorElementType() &&
1552 LT.second.getVectorElementType().getSizeInBits() ==
1553                 SubLT.second.getVectorElementType().getSizeInBits()) {
1554        assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
1555 "Unexpected number of elements!");
1556 auto *VecTy = FixedVectorType::get(BaseTp->getElementType(),
1557 LT.second.getVectorNumElements());
1558 auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
1559 SubLT.second.getVectorNumElements());
1560 int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
1561 InstructionCost ExtractCost =
1562 getShuffleCost(TTI::SK_ExtractSubvector, VecTy, std::nullopt,
1563 CostKind, ExtractIndex, SubTy);
1564
1565 // If the original size is 32-bits or more, we can use pshufd. Otherwise
1566 // if we have SSSE3 we can use pshufb.
1567 if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
1568 return ExtractCost + 1; // pshufd or pshufb
1569
1570 assert(SubTp->getPrimitiveSizeInBits() == 16 &&
1571 "Unexpected vector size");
1572
1573 return ExtractCost + 2; // worst case pshufhw + pshufd
1574 }
1575 }
1576 // If the extract subvector is not optimal, treat it as single op shuffle.
1577     Kind = TTI::SK_PermuteSingleSrc;
1578   }
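  // For example, with AVX a <4 x i32> subvector extract from <8 x i32> is
  // free at index 0 and costs SubLT.first (one vextractf128) at index 4; a
  // misaligned index is recosted as a single-source permute instead.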
1579
1580 // Subvector insertions are cheap if the subvectors are aligned.
1581 // Note that in general, the insertion starting at the beginning of a vector
1582 // isn't free, because we need to preserve the rest of the wide vector.
1583 if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
1584 int NumElts = LT.second.getVectorNumElements();
1585 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1586 if (SubLT.second.isVector()) {
1587 int NumSubElts = SubLT.second.getVectorNumElements();
1588 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1589 return SubLT.first;
1590 }
1591
1592 // If the insertion isn't aligned, treat it like a 2-op shuffle.
1593 Kind = TTI::SK_PermuteTwoSrc;
1594 }
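  // For example, inserting <4 x float> into <8 x float> at index 4 costs
  // SubLT.first (one vinsertf128 on AVX), while a misaligned insertion is
  // treated as a full two-source permute below.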
1595
1596 // Handle some common (illegal) sub-vector types as they are often very cheap
1597 // to shuffle even on targets without PSHUFB.
1598 EVT VT = TLI->getValueType(DL, BaseTp);
1599 if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
1600 !ST->hasSSSE3()) {
1601 static const CostTblEntry SSE2SubVectorShuffleTbl[] = {
1602 {TTI::SK_Broadcast, MVT::v4i16, 1}, // pshuflw
1603 {TTI::SK_Broadcast, MVT::v2i16, 1}, // pshuflw
1604 {TTI::SK_Broadcast, MVT::v8i8, 2}, // punpck/pshuflw
1605 {TTI::SK_Broadcast, MVT::v4i8, 2}, // punpck/pshuflw
1606 {TTI::SK_Broadcast, MVT::v2i8, 1}, // punpck
1607
1608 {TTI::SK_Reverse, MVT::v4i16, 1}, // pshuflw
1609 {TTI::SK_Reverse, MVT::v2i16, 1}, // pshuflw
1610 {TTI::SK_Reverse, MVT::v4i8, 3}, // punpck/pshuflw/packus
1611 {TTI::SK_Reverse, MVT::v2i8, 1}, // punpck
1612
1613 {TTI::SK_Splice, MVT::v4i16, 2}, // punpck+psrldq
1614 {TTI::SK_Splice, MVT::v2i16, 2}, // punpck+psrldq
1615 {TTI::SK_Splice, MVT::v4i8, 2}, // punpck+psrldq
1616 {TTI::SK_Splice, MVT::v2i8, 2}, // punpck+psrldq
1617
1618 {TTI::SK_PermuteTwoSrc, MVT::v4i16, 2}, // punpck/pshuflw
1619 {TTI::SK_PermuteTwoSrc, MVT::v2i16, 2}, // punpck/pshuflw
1620 {TTI::SK_PermuteTwoSrc, MVT::v8i8, 7}, // punpck/pshuflw
1621 {TTI::SK_PermuteTwoSrc, MVT::v4i8, 4}, // punpck/pshuflw
1622 {TTI::SK_PermuteTwoSrc, MVT::v2i8, 2}, // punpck
1623
1624 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw
1625 {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw
1626 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 5}, // punpck/pshuflw
1627 {TTI::SK_PermuteSingleSrc, MVT::v4i8, 3}, // punpck/pshuflw
1628 {TTI::SK_PermuteSingleSrc, MVT::v2i8, 1}, // punpck
1629 };
1630
1631 if (ST->hasSSE2())
1632 if (const auto *Entry =
1633 CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
1634 return Entry->Cost;
1635 }
1636
1637 // We are going to permute multiple sources and the result will be in multiple
1638   // destinations. We provide an accurate cost only for splits where the element
1639   // type remains the same.
1640 if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
1641 MVT LegalVT = LT.second;
1642 if (LegalVT.isVector() &&
1643 LegalVT.getVectorElementType().getSizeInBits() ==
1644             BaseTp->getElementType()->getPrimitiveSizeInBits() &&
1645         LegalVT.getVectorNumElements() <
1646 cast<FixedVectorType>(BaseTp)->getNumElements()) {
1647 unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
1648 unsigned LegalVTSize = LegalVT.getStoreSize();
1649 // Number of source vectors after legalization:
1650 unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
1651 // Number of destination vectors after legalization:
1652 InstructionCost NumOfDests = LT.first;
1653
1654 auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
1655 LegalVT.getVectorNumElements());
1656
1657 if (!Mask.empty() && NumOfDests.isValid()) {
1658         // Try to perform a better estimation of the permutation.
1659         // 1. Split the source/destination vectors into real registers.
1660         // 2. Do the mask analysis to identify which real registers are
1661         // permuted. If more than one source register is used to build a
1662         // destination register, the cost for this destination register
1663         // is (Number_of_source_registers - 1) * Cost_PermuteTwoSrc. If only
1664         // one source register is used, build the mask and calculate the cost
1665         // as the cost of PermuteSingleSrc.
1666         // Also, for the single-register permute we try to identify if the
1667         // destination register is just a copy of the source register or a
1668         // copy of the previous destination register (the cost is
1669         // TTI::TCC_Basic). If the source register is just reused, the cost
1670         // for this operation is 0.
1671 NumOfDests =
1672             getTypeLegalizationCost(
1673                 FixedVectorType::get(BaseTp->getElementType(), Mask.size()))
1674 .first;
1675 unsigned E = *NumOfDests.getValue();
1676 unsigned NormalizedVF =
1677 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
1678 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
1679 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
1680 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
1681 copy(Mask, NormalizedMask.begin());
1682 unsigned PrevSrcReg = 0;
1683 ArrayRef<int> PrevRegMask;
1684       InstructionCost Cost = 0;
1685       processShuffleMasks(
1686           NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
1687 [this, SingleOpTy, CostKind, &PrevSrcReg, &PrevRegMask,
1688 &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
1689 if (!ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size())) {
1690 // Check if the previous register can be just copied to the next
1691 // one.
1692 if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
1693 PrevRegMask != RegMask)
1694                 Cost += getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
1695                                        RegMask, CostKind, 0, nullptr);
1696 else
1697 // Just a copy of previous destination register.
1698                 Cost += TTI::TCC_Basic;
1699               return;
1700 }
1701 if (SrcReg != DestReg &&
1702 any_of(RegMask, [](int I) { return I != PoisonMaskElem; })) {
1703 // Just a copy of the source register.
1704               Cost += TTI::TCC_Basic;
1705             }
1706 PrevSrcReg = SrcReg;
1707 PrevRegMask = RegMask;
1708 },
1709 [this, SingleOpTy, CostKind, &Cost](ArrayRef<int> RegMask,
1710 unsigned /*Unused*/,
1711 unsigned /*Unused*/) {
1712 Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask,
1713 CostKind, 0, nullptr);
1714 });
1715 return Cost;
1716 }
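      // For example, a <16 x i32> single-source permute on AVX2 splits into
      // two v8i32 registers: a destination register built from one source
      // register costs one single-source permute (or TTI::TCC_Basic when it
      // is a plain copy), while one that mixes several source registers pays
      // one two-source permute per extra source register.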
1717
1718 InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
1719 return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
1720 std::nullopt, CostKind, 0, nullptr);
1721 }
1722
1723 return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
1724 }
1725
1726 // For 2-input shuffles, we must account for splitting the 2 inputs into many.
1727 if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
1728 // We assume that source and destination have the same vector type.
1729 InstructionCost NumOfDests = LT.first;
1730 InstructionCost NumOfShufflesPerDest = LT.first * 2 - 1;
1731 LT.first = NumOfDests * NumOfShufflesPerDest;
1732 }
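  // For example, a two-source permute of <16 x i32> on AVX2 has LT.first == 2:
  // two destination registers at (2 * 2 - 1) = 3 shuffles each, so the
  // per-register table costs below are scaled by 6.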
1733
1734 static const CostTblEntry AVX512VBMIShuffleTbl[] = {
1735 {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
1736 {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb
1737
1738 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
1739 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb
1740
1741 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b
1742 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b
1743 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 2} // vpermt2b
1744 };
1745
1746 if (ST->hasVBMI())
1747 if (const auto *Entry =
1748 CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
1749 return LT.first * Entry->Cost;
1750
1751 static const CostTblEntry AVX512BWShuffleTbl[] = {
1752 {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
1753 {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw
1754 {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
1755
1756 {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw
1757 {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw
1758 {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw
1759 {TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2
1760
1761 {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw
1762 {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw
1763 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw
1764 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw
1765 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16
1766
1767 {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w
1768 {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w
1769 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w
1770 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w
1771 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
1772
1773 {TTI::SK_Select, MVT::v32i16, 1}, // vblendmw
1774 {TTI::SK_Select, MVT::v64i8, 1}, // vblendmb
1775
1776 {TTI::SK_Splice, MVT::v32i16, 2}, // vshufi64x2 + palignr
1777 {TTI::SK_Splice, MVT::v32f16, 2}, // vshufi64x2 + palignr
1778 {TTI::SK_Splice, MVT::v64i8, 2}, // vshufi64x2 + palignr
1779 };
1780
1781 if (ST->hasBWI())
1782 if (const auto *Entry =
1783 CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
1784 return LT.first * Entry->Cost;
1785
1786 static const CostKindTblEntry AVX512ShuffleTbl[] = {
1787 {TTI::SK_Broadcast, MVT::v8f64, { 1, 1, 1, 1 } }, // vbroadcastsd
1788 {TTI::SK_Broadcast, MVT::v16f32, { 1, 1, 1, 1 } }, // vbroadcastss
1789 {TTI::SK_Broadcast, MVT::v8i64, { 1, 1, 1, 1 } }, // vpbroadcastq
1790 {TTI::SK_Broadcast, MVT::v16i32, { 1, 1, 1, 1 } }, // vpbroadcastd
1791 {TTI::SK_Broadcast, MVT::v32i16, { 1, 1, 1, 1 } }, // vpbroadcastw
1792 {TTI::SK_Broadcast, MVT::v32f16, { 1, 1, 1, 1 } }, // vpbroadcastw
1793 {TTI::SK_Broadcast, MVT::v64i8, { 1, 1, 1, 1 } }, // vpbroadcastb
1794
1795 {TTI::SK_Reverse, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
1796 {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1797 {TTI::SK_Reverse, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
1798 {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1799 {TTI::SK_Reverse, MVT::v32i16, { 7, 7, 7, 7 } }, // per mca
1800 {TTI::SK_Reverse, MVT::v32f16, { 7, 7, 7, 7 } }, // per mca
1801 {TTI::SK_Reverse, MVT::v64i8, { 7, 7, 7, 7 } }, // per mca
1802
1803 {TTI::SK_Splice, MVT::v8f64, { 1, 1, 1, 1 } }, // vpalignd
1804 {TTI::SK_Splice, MVT::v4f64, { 1, 1, 1, 1 } }, // vpalignd
1805 {TTI::SK_Splice, MVT::v16f32, { 1, 1, 1, 1 } }, // vpalignd
1806 {TTI::SK_Splice, MVT::v8f32, { 1, 1, 1, 1 } }, // vpalignd
1807 {TTI::SK_Splice, MVT::v8i64, { 1, 1, 1, 1 } }, // vpalignd
1808 {TTI::SK_Splice, MVT::v4i64, { 1, 1, 1, 1 } }, // vpalignd
1809 {TTI::SK_Splice, MVT::v16i32, { 1, 1, 1, 1 } }, // vpalignd
1810 {TTI::SK_Splice, MVT::v8i32, { 1, 1, 1, 1 } }, // vpalignd
1811 {TTI::SK_Splice, MVT::v32i16, { 4, 4, 4, 4 } }, // split + palignr
1812 {TTI::SK_Splice, MVT::v32f16, { 4, 4, 4, 4 } }, // split + palignr
1813 {TTI::SK_Splice, MVT::v64i8, { 4, 4, 4, 4 } }, // split + palignr
1814
1815 {TTI::SK_PermuteSingleSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
1816 {TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermpd
1817 {TTI::SK_PermuteSingleSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermpd
1818 {TTI::SK_PermuteSingleSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1819 {TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermps
1820 {TTI::SK_PermuteSingleSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermps
1821 {TTI::SK_PermuteSingleSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
1822 {TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermq
1823 {TTI::SK_PermuteSingleSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermq
1824 {TTI::SK_PermuteSingleSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1825 {TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermd
1826 {TTI::SK_PermuteSingleSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermd
1827 {TTI::SK_PermuteSingleSrc, MVT::v16i8, { 1, 3, 1, 1 } }, // pshufb
1828
1829 {TTI::SK_PermuteTwoSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermt2pd
1830 {TTI::SK_PermuteTwoSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermt2ps
1831 {TTI::SK_PermuteTwoSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermt2q
1832 {TTI::SK_PermuteTwoSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermt2d
1833 {TTI::SK_PermuteTwoSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermt2pd
1834 {TTI::SK_PermuteTwoSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermt2ps
1835 {TTI::SK_PermuteTwoSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermt2q
1836 {TTI::SK_PermuteTwoSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermt2d
1837 {TTI::SK_PermuteTwoSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermt2pd
1838 {TTI::SK_PermuteTwoSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermt2ps
1839 {TTI::SK_PermuteTwoSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermt2q
1840 {TTI::SK_PermuteTwoSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermt2d
1841
1842 // FIXME: This just applies the type legalization cost rules above
1843 // assuming these completely split.
1844 {TTI::SK_PermuteSingleSrc, MVT::v32i16, { 14, 14, 14, 14 } },
1845 {TTI::SK_PermuteSingleSrc, MVT::v32f16, { 14, 14, 14, 14 } },
1846 {TTI::SK_PermuteSingleSrc, MVT::v64i8, { 14, 14, 14, 14 } },
1847 {TTI::SK_PermuteTwoSrc, MVT::v32i16, { 42, 42, 42, 42 } },
1848 {TTI::SK_PermuteTwoSrc, MVT::v32f16, { 42, 42, 42, 42 } },
1849 {TTI::SK_PermuteTwoSrc, MVT::v64i8, { 42, 42, 42, 42 } },
1850
1851 {TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vpternlogq
1852 {TTI::SK_Select, MVT::v32f16, { 1, 1, 1, 1 } }, // vpternlogq
1853 {TTI::SK_Select, MVT::v64i8, { 1, 1, 1, 1 } }, // vpternlogq
1854 {TTI::SK_Select, MVT::v8f64, { 1, 1, 1, 1 } }, // vblendmpd
1855 {TTI::SK_Select, MVT::v16f32, { 1, 1, 1, 1 } }, // vblendmps
1856 {TTI::SK_Select, MVT::v8i64, { 1, 1, 1, 1 } }, // vblendmq
1857 {TTI::SK_Select, MVT::v16i32, { 1, 1, 1, 1 } }, // vblendmd
1858 };
1859
1860 if (ST->hasAVX512())
1861 if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
1862 if (auto KindCost = Entry->Cost[CostKind])
1863 return LT.first * *KindCost;
1864
1865 static const CostTblEntry AVX2ShuffleTbl[] = {
1866 {TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastpd
1867 {TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastps
1868 {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq
1869 {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd
1870 {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
1871 {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw
1872 {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb
1873
1874 {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd
1875 {TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps
1876 {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq
1877 {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd
1878 {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
1879 {TTI::SK_Reverse, MVT::v16f16, 2}, // vperm2i128 + pshufb
1880 {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb
1881
1882 {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
1883 {TTI::SK_Select, MVT::v16f16, 1}, // vpblendvb
1884 {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb
1885
1886 {TTI::SK_Splice, MVT::v8i32, 2}, // vperm2i128 + vpalignr
1887 {TTI::SK_Splice, MVT::v8f32, 2}, // vperm2i128 + vpalignr
1888 {TTI::SK_Splice, MVT::v16i16, 2}, // vperm2i128 + vpalignr
1889 {TTI::SK_Splice, MVT::v16f16, 2}, // vperm2i128 + vpalignr
1890 {TTI::SK_Splice, MVT::v32i8, 2}, // vperm2i128 + vpalignr
1891
1892 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
1893 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
1894 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
1895 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
1896 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
1897 // + vpblendvb
1898 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 4}, // vperm2i128 + 2*vpshufb
1899 // + vpblendvb
1900 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb
1901 // + vpblendvb
1902
1903 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd
1904 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3}, // 2*vpermps + vblendps
1905 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd
1906 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd
1907 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
1908 // + vpblendvb
1909 {TTI::SK_PermuteTwoSrc, MVT::v16f16, 7}, // 2*vperm2i128 + 4*vpshufb
1910 // + vpblendvb
1911 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb
1912 // + vpblendvb
1913 };
1914
1915 if (ST->hasAVX2())
1916 if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
1917 return LT.first * Entry->Cost;
1918
1919 static const CostTblEntry XOPShuffleTbl[] = {
1920 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd
1921 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps
1922 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd
1923 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps
1924 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
1925 // + vinsertf128
1926 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm
1927 // + vinsertf128
1928
1929 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
1930 // + vinsertf128
1931 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm
1932 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm
1933 // + vinsertf128
1934 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm
1935 };
1936
1937 if (ST->hasXOP())
1938 if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
1939 return LT.first * Entry->Cost;
1940
1941 static const CostTblEntry AVX1ShuffleTbl[] = {
1942 {TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1943 {TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1944 {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1945 {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1946 {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
1947 {TTI::SK_Broadcast, MVT::v16f16, 3}, // vpshuflw + vpshufd + vinsertf128
1948 {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128
1949
1950 {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1951 {TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1952 {TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1953 {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1954 {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
1955 // + vinsertf128
1956 {TTI::SK_Reverse, MVT::v16f16, 4}, // vextractf128 + 2*pshufb
1957 // + vinsertf128
1958 {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb
1959 // + vinsertf128
1960
1961 {TTI::SK_Select, MVT::v4i64, 1}, // vblendpd
1962 {TTI::SK_Select, MVT::v4f64, 1}, // vblendpd
1963 {TTI::SK_Select, MVT::v8i32, 1}, // vblendps
1964 {TTI::SK_Select, MVT::v8f32, 1}, // vblendps
1965 {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
1966 {TTI::SK_Select, MVT::v16f16, 3}, // vpand + vpandn + vpor
1967 {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor
1968
1969 {TTI::SK_Splice, MVT::v4i64, 2}, // vperm2f128 + shufpd
1970 {TTI::SK_Splice, MVT::v4f64, 2}, // vperm2f128 + shufpd
1971 {TTI::SK_Splice, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1972 {TTI::SK_Splice, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1973 {TTI::SK_Splice, MVT::v16i16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
1974 {TTI::SK_Splice, MVT::v16f16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
1975 {TTI::SK_Splice, MVT::v32i8, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
1976
1977 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd
1978 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd
1979 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1980 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1981 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
1982 // + 2*por + vinsertf128
1983 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 8}, // vextractf128 + 4*pshufb
1984 // + 2*por + vinsertf128
1985 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb
1986 // + 2*por + vinsertf128
1987
1988 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd
1989 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd
1990 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1991 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1992 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
1993 // + 4*por + vinsertf128
1994 {TTI::SK_PermuteTwoSrc, MVT::v16f16, 15}, // 2*vextractf128 + 8*pshufb
1995 // + 4*por + vinsertf128
1996 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb
1997 // + 4*por + vinsertf128
1998 };
1999
2000 if (ST->hasAVX())
2001 if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
2002 return LT.first * Entry->Cost;
2003
2004 static const CostTblEntry SSE41ShuffleTbl[] = {
2005 {TTI::SK_Select, MVT::v2i64, 1}, // pblendw
2006 {TTI::SK_Select, MVT::v2f64, 1}, // movsd
2007 {TTI::SK_Select, MVT::v4i32, 1}, // pblendw
2008 {TTI::SK_Select, MVT::v4f32, 1}, // blendps
2009 {TTI::SK_Select, MVT::v8i16, 1}, // pblendw
2010 {TTI::SK_Select, MVT::v8f16, 1}, // pblendw
2011 {TTI::SK_Select, MVT::v16i8, 1} // pblendvb
2012 };
2013
2014 if (ST->hasSSE41())
2015 if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
2016 return LT.first * Entry->Cost;
2017
2018 static const CostTblEntry SSSE3ShuffleTbl[] = {
2019 {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
2020 {TTI::SK_Broadcast, MVT::v8f16, 1}, // pshufb
2021 {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb
2022
2023 {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
2024 {TTI::SK_Reverse, MVT::v8f16, 1}, // pshufb
2025 {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb
2026
2027 {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
2028 {TTI::SK_Select, MVT::v8f16, 3}, // 2*pshufb + por
2029 {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por
2030
2031 {TTI::SK_Splice, MVT::v4i32, 1}, // palignr
2032 {TTI::SK_Splice, MVT::v4f32, 1}, // palignr
2033 {TTI::SK_Splice, MVT::v8i16, 1}, // palignr
2034 {TTI::SK_Splice, MVT::v8f16, 1}, // palignr
2035 {TTI::SK_Splice, MVT::v16i8, 1}, // palignr
2036
2037 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
2038 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // pshufb
2039 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
2040
2041 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
2042 {TTI::SK_PermuteTwoSrc, MVT::v8f16, 3}, // 2*pshufb + por
2043 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
2044 };
2045
2046 if (ST->hasSSSE3())
2047 if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
2048 return LT.first * Entry->Cost;
2049
2050 static const CostTblEntry SSE2ShuffleTbl[] = {
2051 {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd
2052 {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
2053 {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
2054 {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
2055 {TTI::SK_Broadcast, MVT::v8f16, 2}, // pshuflw + pshufd
2056 {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd
2057
2058 {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
2059 {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
2060 {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
2061 {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
2062 {TTI::SK_Reverse, MVT::v8f16, 3}, // pshuflw + pshufhw + pshufd
2063 {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
2064 // + 2*pshufd + 2*unpck + packus
2065
2066 {TTI::SK_Select, MVT::v2i64, 1}, // movsd
2067 {TTI::SK_Select, MVT::v2f64, 1}, // movsd
2068 {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
2069 {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
2070 {TTI::SK_Select, MVT::v8f16, 3}, // pand + pandn + por
2071 {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por
2072
2073 {TTI::SK_Splice, MVT::v2i64, 1}, // shufpd
2074 {TTI::SK_Splice, MVT::v2f64, 1}, // shufpd
2075 {TTI::SK_Splice, MVT::v4i32, 2}, // 2*{unpck,movsd,pshufd}
2076 {TTI::SK_Splice, MVT::v8i16, 3}, // psrldq + psrlldq + por
2077 {TTI::SK_Splice, MVT::v8f16, 3}, // psrldq + psrlldq + por
2078 {TTI::SK_Splice, MVT::v16i8, 3}, // psrldq + psrlldq + por
2079
2080 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
2081 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd
2082 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
2083 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
2084 // + pshufd/unpck
2085 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 5}, // 2*pshuflw + 2*pshufhw
2086 // + pshufd/unpck
2087 { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
2088 // + 2*pshufd + 2*unpck + 2*packus
2089
2090 { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd
2091 { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd
2092 { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd}
2093 { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute
2094 { TTI::SK_PermuteTwoSrc, MVT::v8f16, 8 }, // blend+permute
2095 { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
2096 };
2097
2098 static const CostTblEntry SSE3BroadcastLoadTbl[] = {
2099 {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup
2100 };
2101
2102 if (ST->hasSSE2()) {
2103 bool IsLoad =
2104 llvm::any_of(Args, [](const auto &V) { return isa<LoadInst>(V); });
2105 if (ST->hasSSE3() && IsLoad)
2106 if (const auto *Entry =
2107 CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) {
2108         assert(isLegalBroadcastLoad(BaseTp->getElementType(),
2109                                     LT.second.getVectorElementCount()) &&
2110 "Table entry missing from isLegalBroadcastLoad()");
2111 return LT.first * Entry->Cost;
2112 }
2113
2114 if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
2115 return LT.first * Entry->Cost;
2116 }
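  // The movddup entry models, e.g., broadcasting a double loaded from memory:
  // SSE3's movddup performs the load and the splat in a single instruction,
  // so the shuffle adds nothing on top of the load's own cost.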
2117
2118 static const CostTblEntry SSE1ShuffleTbl[] = {
2119 { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
2120 { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
2121 { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps
2122 { TTI::SK_Splice, MVT::v4f32, 2 }, // 2*shufps
2123 { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
2124 { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
2125 };
2126
2127 if (ST->hasSSE1())
2128 if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
2129 return LT.first * Entry->Cost;
2130
2131 return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
2132}
2133
2134 InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
2135                                              Type *Src,
2136                                              TTI::CastContextHint CCH,
2137                                              TTI::TargetCostKind CostKind,
2138                                              const Instruction *I) {
2139 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2140 assert(ISD && "Invalid opcode");
2141
2142 // The cost tables include both specific, custom (non-legal) src/dst type
2143   // conversions and generic, legalized types. We test for custom conversions
2144   // first, before falling back to legalization.
2145   // FIXME: Need a better design of the cost table to handle non-simple types
2146   // with potentially massive combinations (elem_num x src_type x dst_type).
2147 static const TypeConversionCostKindTblEntry AVX512BWConversionTbl[]{
2148 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, { 1, 1, 1, 1 } },
2149 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, { 1, 1, 1, 1 } },
2150
2151 // Mask sign extend has an instruction.
2152 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 1, 1, 1, 1 } },
2153 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, { 1, 1, 1, 1 } },
2154 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 1, 1, 1, 1 } },
2155 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, { 1, 1, 1, 1 } },
2156 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 1, 1, 1, 1 } },
2157 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, { 1, 1, 1, 1 } },
2158 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 1, 1, 1, 1 } },
2159 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, { 1, 1, 1, 1 } },
2160 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 1, 1, 1, 1 } },
2161 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, { 1, 1, 1, 1 } },
2162 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 1, 1, 1, 1 } },
2163 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 1, 1, 1, 1 } },
2164 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2165 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, { 1, 1, 1, 1 } },
2166 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, { 1, 1, 1, 1 } },
2167 { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, { 1, 1, 1, 1 } },
2168 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, { 1, 1, 1, 1 } },
2169
2170 // Mask zero extend is a sext + shift.
2171 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 2, 1, 1, 1 } },
2172 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, { 2, 1, 1, 1 } },
2173 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 2, 1, 1, 1 } },
2174 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, { 2, 1, 1, 1 } },
2175 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 2, 1, 1, 1 } },
2176 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, { 2, 1, 1, 1 } },
2177 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 2, 1, 1, 1 } },
2178 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, { 2, 1, 1, 1 } },
2179 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 2, 1, 1, 1 } },
2180 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, { 2, 1, 1, 1 } },
2181 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 2, 1, 1, 1 } },
2182 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 2, 1, 1, 1 } },
2183 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 2, 1, 1, 1 } },
2184 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, { 2, 1, 1, 1 } },
2185 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, { 2, 1, 1, 1 } },
2186 { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, { 2, 1, 1, 1 } },
2187 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, { 2, 1, 1, 1 } },
2188
2189 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 2, 1, 1, 1 } },
2190 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, { 2, 1, 1, 1 } },
2191 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } },
2192 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, { 2, 1, 1, 1 } },
2193 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } },
2194 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, { 2, 1, 1, 1 } },
2195 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 2, 1, 1, 1 } },
2196 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, { 2, 1, 1, 1 } },
2197 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 2, 1, 1, 1 } },
2198 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, { 2, 1, 1, 1 } },
2199 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 2, 1, 1, 1 } },
2200 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 2, 1, 1, 1 } },
2201 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 2, 1, 1, 1 } },
2202 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, { 2, 1, 1, 1 } },
2203 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, { 2, 1, 1, 1 } },
2204 { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, { 2, 1, 1, 1 } },
2205 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, { 2, 1, 1, 1 } },
2206
2207 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, { 2, 1, 1, 1 } },
2208 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } }, // widen to zmm
2209 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, { 2, 1, 1, 1 } }, // vpmovwb
2210 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, { 2, 1, 1, 1 } }, // vpmovwb
2211 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, { 2, 1, 1, 1 } }, // vpmovwb
2212 };
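  // Reading these entries: for example, { ISD::ZERO_EXTEND, MVT::v32i16,
  // MVT::v32i8, { 1, 1, 1, 1 } } says that with AVX512BW a zero extension
  // from 32 x i8 to 32 x i16 is a single vpmovzxbw for every cost kind.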
2213
2214 static const TypeConversionCostKindTblEntry AVX512DQConversionTbl[] = {
2215 // Mask sign extend has an instruction.
2216 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } },
2217 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, { 1, 1, 1, 1 } },
2218 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } },
2219 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } },
2220 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } },
2221 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i1, { 1, 1, 1, 1 } },
2222 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, { 1, 1, 1, 1 } },
2223 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, { 1, 1, 1, 1 } },
2224
2225 // Mask zero extend is a sext + shift.
2226 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1, } },
2227 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, { 2, 1, 1, 1, } },
2228 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1, } },
2229 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1, } },
2230 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1, } },
2231 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i1, { 2, 1, 1, 1, } },
2232 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, { 2, 1, 1, 1, } },
2233 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, { 2, 1, 1, 1, } },
2234
2235 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } },
2236 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, { 2, 1, 1, 1 } },
2237 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } },
2238 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } },
2239 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2240 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 2, 1, 1, 1 } },
2241 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, { 2, 1, 1, 1 } },
2242 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i64, { 2, 1, 1, 1 } },
2243
2244 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, { 1, 1, 1, 1 } },
2245 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, { 1, 1, 1, 1 } },
2246
2247 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, { 1, 1, 1, 1 } },
2248 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, { 1, 1, 1, 1 } },
2249
2250 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, { 1, 1, 1, 1 } },
2251 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, { 1, 1, 1, 1 } },
2252
2253 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, { 1, 1, 1, 1 } },
2254 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, { 1, 1, 1, 1 } },
2255 };
2256
2257 // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
2258 // 256-bit wide vectors.
2259
2260 static const TypeConversionCostKindTblEntry AVX512FConversionTbl[] = {
2261 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, { 1, 1, 1, 1 } },
2262 { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, { 3, 1, 1, 1 } },
2263 { ISD::FP_EXTEND, MVT::v16f64, MVT::v16f32, { 4, 1, 1, 1 } }, // 2*vcvtps2pd+vextractf64x4
2264 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, { 1, 1, 1, 1 } },
2265
2266 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2267 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2268 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2269 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2270 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2271 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2272 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2273 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2274 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2275 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2276 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2277 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2278 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } }, // zmm vpsllq+vptestmq
2279 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } }, // zmm vpsllq+vptestmq
2280 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2281 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, { 2, 1, 1, 1 } }, // vpmovdb
2282 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, { 2, 1, 1, 1 } }, // vpmovdb
2283 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2284 { ISD::TRUNCATE, MVT::v32i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2285 { ISD::TRUNCATE, MVT::v64i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2286 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdw
2287 { ISD::TRUNCATE, MVT::v32i16, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdw
2288 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, { 2, 1, 1, 1 } }, // vpmovqb
2289 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, { 1, 1, 1, 1 } }, // vpshufb
2290 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2291 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2292 { ISD::TRUNCATE, MVT::v32i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2293 { ISD::TRUNCATE, MVT::v64i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2294 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2295 { ISD::TRUNCATE, MVT::v16i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2296 { ISD::TRUNCATE, MVT::v32i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2297 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, { 1, 1, 1, 1 } }, // vpmovqd
2298 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } }, // zmm vpmovqd
2299 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, { 5, 1, 1, 1 } },// 2*vpmovqd+concat+vpmovdb
2300
2301 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 3, 1, 1, 1 } }, // extend to v16i32
2302 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, { 8, 1, 1, 1 } },
2303 { ISD::TRUNCATE, MVT::v64i8, MVT::v32i16, { 8, 1, 1, 1 } },
2304
2305 // Sign extend is zmm vpternlogd+vptruncdb.
2306 // Zero extend is zmm broadcast load+vptruncdw.
2307 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 3, 1, 1, 1 } },
2308 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 4, 1, 1, 1 } },
2309 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 3, 1, 1, 1 } },
2310 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 4, 1, 1, 1 } },
2311 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 3, 1, 1, 1 } },
2312 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 4, 1, 1, 1 } },
2313 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 3, 1, 1, 1 } },
2314 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 4, 1, 1, 1 } },
2315
2316 // Sign extend is zmm vpternlogd+vptruncdw.
2317 // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
2318 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 3, 1, 1, 1 } },
2319 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 4, 1, 1, 1 } },
2320 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 3, 1, 1, 1 } },
2321 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 4, 1, 1, 1 } },
2322 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 3, 1, 1, 1 } },
2323 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 4, 1, 1, 1 } },
2324 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 3, 1, 1, 1 } },
2325 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2326
2327 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2328 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2329 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2330 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2331 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2332 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2333 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } }, // zmm vpternlogq
2334 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } }, // zmm vpternlogq+psrlq
2335 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } }, // zmm vpternlogq
2336 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } }, // zmm vpternlogq+psrlq
2337
2338 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, { 1, 1, 1, 1 } }, // vpternlogd
2339 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2340 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, { 1, 1, 1, 1 } }, // vpternlogq
2341 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2342
2343 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, { 1, 1, 1, 1 } },
2344 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, { 1, 1, 1, 1 } },
2345 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, { 1, 1, 1, 1 } },
2346 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, { 1, 1, 1, 1 } },
2347 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, { 1, 1, 1, 1 } },
2348 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, { 1, 1, 1, 1 } },
2349 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, { 1, 1, 1, 1 } },
2350 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, { 1, 1, 1, 1 } },
2351 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, { 1, 1, 1, 1 } },
2352 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, { 1, 1, 1, 1 } },
2353
2354 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, { 3, 1, 1, 1 } }, // FIXME: May not be right
2355 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, { 3, 1, 1, 1 } }, // FIXME: May not be right
2356
2357 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, { 4, 1, 1, 1 } },
2358 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, { 3, 1, 1, 1 } },
2359 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v16i8, { 2, 1, 1, 1 } },
2360 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, { 1, 1, 1, 1 } },
2361 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, { 2, 1, 1, 1 } },
2362 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, { 1, 1, 1, 1 } },
2363 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 1, 1, 1, 1 } },
2364 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, { 1, 1, 1, 1 } },
2365
2366 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, { 4, 1, 1, 1 } },
2367 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, { 3, 1, 1, 1 } },
2368 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v16i8, { 2, 1, 1, 1 } },
2369 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, { 1, 1, 1, 1 } },
2370 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, { 2, 1, 1, 1 } },
2371 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, { 1, 1, 1, 1 } },
2372 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, { 1, 1, 1, 1 } },
2373 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, { 1, 1, 1, 1 } },
2374 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, {26, 1, 1, 1 } },
2375 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, { 5, 1, 1, 1 } },
2376
2377 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, { 2, 1, 1, 1 } },
2378 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f64, { 7, 1, 1, 1 } },
2379 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f64, {15, 1, 1, 1 } },
2380 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f32, {11, 1, 1, 1 } },
2381 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f64, {31, 1, 1, 1 } },
2382 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, { 3, 1, 1, 1 } },
2383 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f64, { 7, 1, 1, 1 } },
2384 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f32, { 5, 1, 1, 1 } },
2385 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f64, {15, 1, 1, 1 } },
2386 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2387 { ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f64, { 3, 1, 1, 1 } },
2388
2389 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2390 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, { 3, 1, 1, 1 } },
2391 { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, { 3, 1, 1, 1 } },
2392 { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, { 1, 1, 1, 1 } },
2393 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, { 3, 1, 1, 1 } },
2394 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, { 3, 1, 1, 1 } },
2395 };
2396
2397 static const TypeConversionCostKindTblEntry AVX512BWVLConversionTbl[] = {
2398 // Mask sign extend has an instruction.
2399 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 1, 1, 1, 1 } },
2400 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, { 1, 1, 1, 1 } },
2401 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 1, 1, 1, 1 } },
2402 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, { 1, 1, 1, 1 } },
2403 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 1, 1, 1, 1 } },
2404 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, { 1, 1, 1, 1 } },
2405 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 1, 1, 1, 1 } },
2406 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, { 1, 1, 1, 1 } },
2407 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 1, 1, 1, 1 } },
2408 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, { 1, 1, 1, 1 } },
2409 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 1, 1, 1, 1 } },
2410 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 1, 1, 1, 1 } },
2411 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2412 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, { 1, 1, 1, 1 } },
2413 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1, { 1, 1, 1, 1 } },
2414 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v64i1, { 1, 1, 1, 1 } },
2415 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1, { 1, 1, 1, 1 } },
2416
2417 // Mask zero extend is a sext + shift.
2418 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 2, 1, 1, 1 } },
2419 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, { 2, 1, 1, 1 } },
2420 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 2, 1, 1, 1 } },
2421 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, { 2, 1, 1, 1 } },
2422 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 2, 1, 1, 1 } },
2423 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, { 2, 1, 1, 1 } },
2424 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 2, 1, 1, 1 } },
2425 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, { 2, 1, 1, 1 } },
2426 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 2, 1, 1, 1 } },
2427 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, { 2, 1, 1, 1 } },
2428 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 2, 1, 1, 1 } },
2429 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 2, 1, 1, 1 } },
2430 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 2, 1, 1, 1 } },
2431 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, { 2, 1, 1, 1 } },
2432 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1, { 2, 1, 1, 1 } },
2433 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v64i1, { 2, 1, 1, 1 } },
2434 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1, { 2, 1, 1, 1 } },
2435
2436 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 2, 1, 1, 1 } },
2437 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, { 2, 1, 1, 1 } },
2438 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } },
2439 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, { 2, 1, 1, 1 } },
2440 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } },
2441 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, { 2, 1, 1, 1 } },
2442 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 2, 1, 1, 1 } },
2443 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, { 2, 1, 1, 1 } },
2444 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 2, 1, 1, 1 } },
2445 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, { 2, 1, 1, 1 } },
2446 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 2, 1, 1, 1 } },
2447 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 2, 1, 1, 1 } },
2448 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 2, 1, 1, 1 } },
2449 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, { 2, 1, 1, 1 } },
2450 { ISD::TRUNCATE, MVT::v32i1, MVT::v16i16, { 2, 1, 1, 1 } },
2451 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i8, { 2, 1, 1, 1 } },
2452 { ISD::TRUNCATE, MVT::v64i1, MVT::v16i16, { 2, 1, 1, 1 } },
2453
2454 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } },
2455 };
2456
2457 static const TypeConversionCostKindTblEntry AVX512DQVLConversionTbl[] = {
2458 // Mask sign extend has an instruction.
2459 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } },
2460 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, { 1, 1, 1, 1 } },
2461 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } },
2462 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i1, { 1, 1, 1, 1 } },
2463 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } },
2464 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i1, { 1, 1, 1, 1 } },
2465 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, { 1, 1, 1, 1 } },
2466 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } },
2467
2468 // Mask zero extend is a sext + shift.
2469 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } },
2470 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, { 2, 1, 1, 1 } },
2471 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } },
2472 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i1, { 2, 1, 1, 1 } },
2473 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } },
2474 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i1, { 2, 1, 1, 1 } },
2475 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, { 2, 1, 1, 1 } },
2476 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } },
2477
2478 { ISD::TRUNCATE, MVT::v16i1, MVT::v4i64, { 2, 1, 1, 1 } },
2479 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, { 2, 1, 1, 1 } },
2480 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } },
2481 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, { 2, 1, 1, 1 } },
2482 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } },
2483 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } },
2484 { ISD::TRUNCATE, MVT::v8i1, MVT::v4i64, { 2, 1, 1, 1 } },
2485 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2486
2487 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, { 1, 1, 1, 1 } },
2488 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, { 1, 1, 1, 1 } },
2489 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, { 1, 1, 1, 1 } },
2490 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, { 1, 1, 1, 1 } },
2491
2492 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, { 1, 1, 1, 1 } },
2493 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 1, 1, 1, 1 } },
2494 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, { 1, 1, 1, 1 } },
2495 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, { 1, 1, 1, 1 } },
2496
2497 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v4f32, { 1, 1, 1, 1 } },
2498 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, { 1, 1, 1, 1 } },
2499 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, { 1, 1, 1, 1 } },
2500 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, { 1, 1, 1, 1 } },
2501
2502 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v4f32, { 1, 1, 1, 1 } },
2503 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, { 1, 1, 1, 1 } },
2504 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, { 1, 1, 1, 1 } },
2505 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, { 1, 1, 1, 1 } },
2506 };
2507
2508 static const TypeConversionCostKindTblEntry AVX512VLConversionTbl[] = {
2509 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2510 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2511 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2512 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 8, 1, 1, 1 } }, // split+2*v8i8
2513 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2514 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2515 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2516 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 8, 1, 1, 1 } }, // split+2*v8i16
2517 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2518 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2519 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2520 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2521 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2522 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2523 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } }, // vpmovqd
2524 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, { 2, 1, 1, 1 } }, // vpmovqb
2525 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, { 2, 1, 1, 1 } }, // vpmovqw
2526 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, { 2, 1, 1, 1 } }, // vpmovdb
2527
2528 // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
2529 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
2530 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 5, 1, 1, 1 } },
2531 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 6, 1, 1, 1 } },
2532 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 5, 1, 1, 1 } },
2533 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 6, 1, 1, 1 } },
2534 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 5, 1, 1, 1 } },
2535 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 6, 1, 1, 1 } },
2536 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, {10, 1, 1, 1 } },
2537 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, {12, 1, 1, 1 } },
2538
2539 // sign extend is vpcmpeq+maskedmove+vpmovdw
2540 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
2541 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 4, 1, 1, 1 } },
2542 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 5, 1, 1, 1 } },
2543 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 4, 1, 1, 1 } },
2544 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 5, 1, 1, 1 } },
2545 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 4, 1, 1, 1 } },
2546 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 5, 1, 1, 1 } },
2547 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, {10, 1, 1, 1 } },
2548 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, {12, 1, 1, 1 } },
2549
2550 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, { 1, 1, 1, 1 } }, // vpternlogd
2551 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2552 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } }, // vpternlogd
2553 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2554 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } }, // vpternlogd
2555 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2556 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, { 1, 1, 1, 1 } }, // vpternlogd
2557 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2558
2559 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } }, // vpternlogq
2560 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2561 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } }, // vpternlogq
2562 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2563
2564 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 1, 1, 1, 1 } },
2565 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 1, 1, 1, 1 } },
2566 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 1, 1, 1, 1 } },
2567 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 1, 1, 1, 1 } },
2568 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 1, 1, 1, 1 } },
2569 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 1, 1, 1, 1 } },
2570 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 1, 1, 1, 1 } },
2571 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 1, 1, 1, 1 } },
2572 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 1, 1, 1, 1 } },
2573 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 1, 1, 1, 1 } },
2574 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 1, 1, 1, 1 } },
2575 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 1, 1, 1, 1 } },
2576
2577 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2578 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 1, 1, 1, 1 } },
2579 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2580 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 1, 1, 1, 1 } },
2581
2582 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 1, 1, 1, 1 } },
2583 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 1, 1, 1, 1 } },
2584 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2585 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 1, 1, 1, 1 } },
2586 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2587 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 1, 1, 1, 1 } },
2588 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 1, 1, 1, 1 } },
2589 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 1, 1, 1, 1 } },
2590 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 1, 1, 1, 1 } },
2591 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 1, 1, 1, 1 } },
2592 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, { 5, 1, 1, 1 } },
2593 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 5, 1, 1, 1 } },
2594 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, { 5, 1, 1, 1 } },
2595
2596 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2597 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, { 2, 1, 1, 1 } },
2598 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f32, { 5, 1, 1, 1 } },
2599
2600 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 1, 1, 1, 1 } },
2601 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 1, 1, 1, 1 } },
2602 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 1, 1, 1, 1 } },
2603 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 1, 1, 1, 1 } },
2604 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 1, 1, 1, 1 } },
2605 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 1, 1, 1, 1 } },
2606 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2607 };
2608
2609 static const TypeConversionCostKindTblEntry AVX2ConversionTbl[] = {
2610 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 3, 1, 1, 1 } },
2611 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 3, 1, 1, 1 } },
2612 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 3, 1, 1, 1 } },
2613 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 3, 1, 1, 1 } },
2614 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2615 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2616
2617 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 2, 1, 1, 1 } },
2618 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 2, 1, 1, 1 } },
2619 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 2, 1, 1, 1 } },
2620 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 2, 1, 1, 1 } },
2621 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 2, 1, 1, 1 } },
2622 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 2, 1, 1, 1 } },
2623 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 2, 1, 1, 1 } },
2624 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 2, 1, 1, 1 } },
2625 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 2, 1, 1, 1 } },
2626 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 2, 1, 1, 1 } },
2627 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, { 3, 1, 1, 1 } },
2628 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, { 3, 1, 1, 1 } },
2629 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 2, 1, 1, 1 } },
2630 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 2, 1, 1, 1 } },
2631
2632 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2633
2634 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 4, 1, 1, 1 } },
2635 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 4, 1, 1, 1 } },
2636 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, { 1, 1, 1, 1 } },
2637 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 1, 1, 1, 1 } },
2638 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 1, 1, 1, 1 } },
2639 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, { 4, 1, 1, 1 } },
2640 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, { 4, 1, 1, 1 } },
2641 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 1, 1, 1, 1 } },
2642 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, { 1, 1, 1, 1 } },
2643 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, { 5, 1, 1, 1 } },
2644 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } },
2645 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 2, 1, 1, 1 } },
2646
2647 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, { 3, 1, 1, 1 } },
2648 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, { 3, 1, 1, 1 } },
2649
2650 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, { 1, 1, 1, 1 } },
2651 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, { 1, 1, 1, 1 } },
2652 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, { 1, 1, 1, 1 } },
2653 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 3, 1, 1, 1 } },
2654
2655 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 3, 1, 1, 1 } },
2656 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 3, 1, 1, 1 } },
2657 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, { 1, 1, 1, 1 } },
2658 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 3, 1, 1, 1 } },
2659 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2660 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 4, 1, 1, 1 } },
2661 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 3, 1, 1, 1 } },
2662 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, { 4, 1, 1, 1 } },
2663
2664 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 2, 1, 1, 1 } },
2665 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 2, 1, 1, 1 } },
2666 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 2, 1, 1, 1 } },
2667 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 2, 1, 1, 1 } },
2668 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 1, 1, 1, 1 } },
2669 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, { 1, 1, 1, 1 } },
2670 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 3, 1, 1, 1 } },
2671
2672 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 2, 1, 1, 1 } },
2673 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 2, 1, 1, 1 } },
2674 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 2, 1, 1, 1 } },
2675 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 2, 1, 1, 1 } },
2676 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 2, 1, 1, 1 } },
2677 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, { 1, 1, 1, 1 } },
2678 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 2, 1, 1, 1 } },
2679 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2680 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 2, 1, 1, 1 } },
2681 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, { 4, 1, 1, 1 } },
2682 };
2683
2684 static const TypeConversionCostKindTblEntry AVXConversionTbl[] = {
2685 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 4, 1, 1, 1 } },
2686 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 4, 1, 1, 1 } },
2687 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 4, 1, 1, 1 } },
2688 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 4, 1, 1, 1 } },
2689 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2690 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2691
2692 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 3, 1, 1, 1 } },
2693 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 3, 1, 1, 1 } },
2694 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 3, 1, 1, 1 } },
2695 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 3, 1, 1, 1 } },
2696 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 3, 1, 1, 1 } },
2697 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 3, 1, 1, 1 } },
2698 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 3, 1, 1, 1 } },
2699 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 3, 1, 1, 1 } },
2700 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 3, 1, 1, 1 } },
2701 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 3, 1, 1, 1 } },
2702 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 3, 1, 1, 1 } },
2703 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 3, 1, 1, 1 } },
2704
2705 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 4, 1, 1, 1 } },
2706 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 5, 1, 1, 1 } },
2707 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 4, 1, 1, 1 } },
2708 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 9, 1, 1, 1 } },
2709 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, {11, 1, 1, 1 } },
2710
2711 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 6, 1, 1, 1 } },
2712 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 6, 1, 1, 1 } },
2713 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } }, // and+extract+packuswb
2714 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, { 5, 1, 1, 1 } },
2715 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 5, 1, 1, 1 } },
2716 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, { 5, 1, 1, 1 } },
2717 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, { 3, 1, 1, 1 } }, // and+extract+2*packusdw
2718 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 2, 1, 1, 1 } },
2719
2720 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, { 3, 1, 1, 1 } },
2721 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, { 3, 1, 1, 1 } },
2722 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, { 8, 1, 1, 1 } },
2723 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 4, 1, 1, 1 } },
2724 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v16i8, { 2, 1, 1, 1 } },
2725 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 4, 1, 1, 1 } },
2726 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v8i16, { 2, 1, 1, 1 } },
2727 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2728 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, { 2, 1, 1, 1 } },
2729 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 4, 1, 1, 1 } },
2730 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, { 5, 1, 1, 1 } },
2731 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, { 8, 1, 1, 1 } },
2732
2733 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, { 7, 1, 1, 1 } },
2734 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, { 7, 1, 1, 1 } },
2735 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, { 6, 1, 1, 1 } },
2736 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 4, 1, 1, 1 } },
2737 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v16i8, { 2, 1, 1, 1 } },
2738 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 4, 1, 1, 1 } },
2739 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v8i16, { 2, 1, 1, 1 } },
2740 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 4, 1, 1, 1 } },
2741 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, { 4, 1, 1, 1 } },
2742 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 5, 1, 1, 1 } },
2743 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 6, 1, 1, 1 } },
2744 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 8, 1, 1, 1 } },
2745 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, {10, 1, 1, 1 } },
2746 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, {10, 1, 1, 1 } },
2747 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, {18, 1, 1, 1 } },
2748 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 5, 1, 1, 1 } },
2749 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, {10, 1, 1, 1 } },
2750
2751 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2752 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f64, { 2, 1, 1, 1 } },
2753 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v8f32, { 2, 1, 1, 1 } },
2754 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v4f64, { 2, 1, 1, 1 } },
2755 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, { 2, 1, 1, 1 } },
2756 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f64, { 2, 1, 1, 1 } },
2757 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, { 2, 1, 1, 1 } },
2758 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v4f64, { 2, 1, 1, 1 } },
2759 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, { 2, 1, 1, 1 } },
2760 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, { 2, 1, 1, 1 } },
2761 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 5, 1, 1, 1 } },
2762
2763 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2764 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f64, { 2, 1, 1, 1 } },
2765 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v8f32, { 2, 1, 1, 1 } },
2766 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v4f64, { 2, 1, 1, 1 } },
2767 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, { 2, 1, 1, 1 } },
2768 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f64, { 2, 1, 1, 1 } },
2769 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, { 2, 1, 1, 1 } },
2770 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v4f64, { 2, 1, 1, 1 } },
2771 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 3, 1, 1, 1 } },
2772 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2773 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 6, 1, 1, 1 } },
2774 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 7, 1, 1, 1 } },
2775 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, { 7, 1, 1, 1 } },
2776
2777 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, { 1, 1, 1, 1 } },
2778 { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, { 1, 1, 1, 1 } },
2779 };
2780
2781 static const TypeConversionCostKindTblEntry SSE41ConversionTbl[] = {
2782 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, { 1, 1, 1, 1 } },
2783 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, { 1, 1, 1, 1 } },
2784 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, { 1, 1, 1, 1 } },
2785 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, { 1, 1, 1, 1 } },
2786 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
2787 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
2788 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, { 1, 1, 1, 1 } },
2789 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, { 1, 1, 1, 1 } },
2790 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
2791 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
2792 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
2793 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
2794
2795 // These truncates end up widening elements.
2796 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 1, 1, 1, 1 } }, // PMOVZXBQ
2797 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 1, 1, 1, 1 } }, // PMOVZXWQ
2798 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 1, 1, 1, 1 } }, // PMOVZXBD
2799
2800 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 2, 1, 1, 1 } },
2801 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 2, 1, 1, 1 } },
2802 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 2, 1, 1, 1 } },
2803
2804 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, { 1, 1, 1, 1 } },
2805 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, { 1, 1, 1, 1 } },
2806 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, { 1, 1, 1, 1 } },
2807 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, { 1, 1, 1, 1 } },
2808 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, { 1, 1, 1, 1 } },
2809 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2810 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, { 1, 1, 1, 1 } },
2811 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2812 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, { 1, 1, 1, 1 } },
2813 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, { 1, 1, 1, 1 } },
2814 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2815
2816 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, { 1, 1, 1, 1 } },
2817 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, { 1, 1, 1, 1 } },
2818 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 4, 1, 1, 1 } },
2819 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 4, 1, 1, 1 } },
2820 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, { 1, 1, 1, 1 } },
2821 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2822 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, { 1, 1, 1, 1 } },
2823 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2824 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 3, 1, 1, 1 } },
2825 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 3, 1, 1, 1 } },
2826 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, { 2, 1, 1, 1 } },
2827 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, {12, 1, 1, 1 } },
2828 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, {22, 1, 1, 1 } },
2829 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 4, 1, 1, 1 } },
2830
2831 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, { 1, 1, 1, 1 } },
2832 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, { 1, 1, 1, 1 } },
2833 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, { 1, 1, 1, 1 } },
2834 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, { 1, 1, 1, 1 } },
2835 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, { 2, 1, 1, 1 } },
2836 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, { 2, 1, 1, 1 } },
2837 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, { 1, 1, 1, 1 } },
2838 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, { 1, 1, 1, 1 } },
2839 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, { 1, 1, 1, 1 } },
2840 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, { 1, 1, 1, 1 } },
2841
2842 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, { 1, 1, 1, 1 } },
2843 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
2844 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, { 1, 1, 1, 1 } },
2845 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 4, 1, 1, 1 } },
2846 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, { 2, 1, 1, 1 } },
2847 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, { 2, 1, 1, 1 } },
2848 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, { 1, 1, 1, 1 } },
2849 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, { 1, 1, 1, 1 } },
2850 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 4, 1, 1, 1 } },
2851 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2852 };
2853
2854 static const TypeConversionCostKindTblEntry SSE2ConversionTbl[] = {
2855 // These are somewhat magic numbers, justified by comparing the
2856 // output of llvm-mca across our supported scheduler models and
2857 // taking the worst case scenario; see the note after this table.
2858 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, { 3, 1, 1, 1 } },
2859 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, { 3, 1, 1, 1 } },
2860 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, { 3, 1, 1, 1 } },
2861 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, { 3, 1, 1, 1 } },
2862 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, { 3, 1, 1, 1 } },
2863 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 4, 1, 1, 1 } },
2864 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, { 3, 1, 1, 1 } },
2865 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 4, 1, 1, 1 } },
2866 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, { 3, 1, 1, 1 } },
2867 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, { 4, 1, 1, 1 } },
2868 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, { 8, 1, 1, 1 } },
2869 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, { 8, 1, 1, 1 } },
2870
2871 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, { 3, 1, 1, 1 } },
2872 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, { 3, 1, 1, 1 } },
2873 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 8, 1, 1, 1 } },
2874 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 9, 1, 1, 1 } },
2875 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 4, 1, 1, 1 } },
2876 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, { 4, 1, 1, 1 } },
2877 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, { 4, 1, 1, 1 } },
2878 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 4, 1, 1, 1 } },
2879 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 7, 1, 1, 1 } },
2880 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, { 7, 1, 1, 1 } },
2881 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 5, 1, 1, 1 } },
2882 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, {15, 1, 1, 1 } },
2883 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, {18, 1, 1, 1 } },
2884
2885 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, { 4, 1, 1, 1 } },
2886 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
2887 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, { 4, 1, 1, 1 } },
2888 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, { 4, 1, 1, 1 } },
2889 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, { 6, 1, 1, 1 } },
2890 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, { 6, 1, 1, 1 } },
2891 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, { 5, 1, 1, 1 } },
2892 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, { 5, 1, 1, 1 } },
2893 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, { 4, 1, 1, 1 } },
2894 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2895
2896 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, { 4, 1, 1, 1 } },
2897 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
2898 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, { 4, 1, 1, 1 } },
2899 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, {15, 1, 1, 1 } },
2900 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, { 6, 1, 1, 1 } },
2901 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, { 6, 1, 1, 1 } },
2902 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, { 5, 1, 1, 1 } },
2903 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, { 5, 1, 1, 1 } },
2904 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 8, 1, 1, 1 } },
2905 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 8, 1, 1, 1 } },
2906
2907 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, { 4, 1, 1, 1 } },
2908 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, { 4, 1, 1, 1 } },
2909 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, { 2, 1, 1, 1 } },
2910 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, { 3, 1, 1, 1 } },
2911 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
2912 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, { 2, 1, 1, 1 } },
2913 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, { 2, 1, 1, 1 } },
2914 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, { 3, 1, 1, 1 } },
2915 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
2916 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, { 2, 1, 1, 1 } },
2917 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
2918 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, { 2, 1, 1, 1 } },
2919
2920 // These truncates are really widening elements.
2921 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 1, 1, 1, 1 } }, // PSHUFD
2922 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } }, // PUNPCKLWD+DQ
2923 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // PUNPCKLBW+WD+PSHUFD
2924 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 1, 1, 1, 1 } }, // PUNPCKLWD
2925 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } }, // PUNPCKLBW+WD
2926 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 1, 1, 1, 1 } }, // PUNPCKLBW
2927
2928 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, { 2, 1, 1, 1 } }, // PAND+PACKUSWB
2929 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 3, 1, 1, 1 } },
2930 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 3, 1, 1, 1 } }, // PAND+2*PACKUSWB
2931 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 7, 1, 1, 1 } },
2932 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, { 1, 1, 1, 1 } },
2933 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 3, 1, 1, 1 } },
2934 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 5, 1, 1, 1 } },
2935 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, {10, 1, 1, 1 } },
2936 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 4, 1, 1, 1 } }, // PAND+3*PACKUSWB
2937 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, { 2, 1, 1, 1 } }, // PSHUFD+PSHUFLW
2938 { ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, { 1, 1, 1, 1 } }, // PSHUFD
2939 };
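// Note on validating the SSE2 table above (an illustrative workflow, not part
// of this file; the file name cvt.s and the CPUs chosen are arbitrary): feed
// the expected lowering of an entry to llvm-mca once per scheduler model and
// keep the worst case, e.g. for the scalar i32 SINT_TO_FP entry:
//
//   $ cat cvt.s
//   cvtsi2ss %edi, %xmm0
//   $ llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=atom cvt.s
//   $ llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=slm cvt.s
//
// The reported reciprocal throughput and latency should bound the first two
// values of the corresponding { RThru, Lat, CodeSize, SizeAndLat } tuple.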
2940
2941 // Attempt to map directly to (simple) MVT types to let us match custom entries.
2942 EVT SrcTy = TLI->getValueType(DL, Src);
2943 EVT DstTy = TLI->getValueType(DL, Dst);
2944
2945 // The function getSimpleVT only handles simple value types.
2946 if (SrcTy.isSimple() && DstTy.isSimple()) {
2947 MVT SimpleSrcTy = SrcTy.getSimpleVT();
2948 MVT SimpleDstTy = DstTy.getSimpleVT();
2949
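// For reference, ConvertCostTableLookup (CostTable.h) is a linear scan for a
// matching (ISD, Dst, Src) triple - a sketch of the matching predicate, not
// the exact library code:
//
//   const auto *Entry = llvm::find_if(Tbl, [&](const auto &E) {
//     return E.ISD == ISD && E.Dst == SimpleDstTy.SimpleTy &&
//            E.Src == SimpleSrcTy.SimpleTy;
//   });
//
// The lookups below therefore run from the most specific feature set
// (AVX512BW) down to the least specific one (SSE2); the first hit wins.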
2950 if (ST->useAVX512Regs()) {
2951 if (ST->hasBWI())
2952 if (const auto *Entry = ConvertCostTableLookup(
2953 AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2954 if (auto KindCost = Entry->Cost[CostKind])
2955 return *KindCost;
2956
2957 if (ST->hasDQI())
2958 if (const auto *Entry = ConvertCostTableLookup(
2959 AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2960 if (auto KindCost = Entry->Cost[CostKind])
2961 return *KindCost;
2962
2963 if (ST->hasAVX512())
2964 if (const auto *Entry = ConvertCostTableLookup(
2965 AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2966 if (auto KindCost = Entry->Cost[CostKind])
2967 return *KindCost;
2968 }
2969
2970 if (ST->hasBWI())
2971 if (const auto *Entry = ConvertCostTableLookup(
2972 AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2973 if (auto KindCost = Entry->Cost[CostKind])
2974 return *KindCost;
2975
2976 if (ST->hasDQI())
2977 if (const auto *Entry = ConvertCostTableLookup(
2978 AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2979 if (auto KindCost = Entry->Cost[CostKind])
2980 return *KindCost;
2981
2982 if (ST->hasAVX512())
2983 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
2984 SimpleDstTy, SimpleSrcTy))
2985 if (auto KindCost = Entry->Cost[CostKind])
2986 return *KindCost;
2987
2988 if (ST->hasAVX2()) {
2989 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
2990 SimpleDstTy, SimpleSrcTy))
2991 if (auto KindCost = Entry->Cost[CostKind])
2992 return *KindCost;
2993 }
2994
2995 if (ST->hasAVX()) {
2996 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
2997 SimpleDstTy, SimpleSrcTy))
2998 if (auto KindCost = Entry->Cost[CostKind])
2999 return *KindCost;
3000 }
3001
3002 if (ST->hasSSE41()) {
3003 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3004 SimpleDstTy, SimpleSrcTy))
3005 if (auto KindCost = Entry->Cost[CostKind])
3006 return *KindCost;
3007 }
3008
3009 if (ST->hasSSE2()) {
3010 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3011 SimpleDstTy, SimpleSrcTy))
3012 if (auto KindCost = Entry->Cost[CostKind])
3013 return *KindCost;
3014 }
3015 }
3016
3017 // Fall back to legalized types.
3018 std::pair<InstructionCost, MVT> LTSrc = getTypeLegalizationCost(Src);
3019 std::pair<InstructionCost, MVT> LTDest = getTypeLegalizationCost(Dst);
3020
3021 // If we're truncating to the same legalized type - just assume it's free.
3022 if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second)
3023 return TTI::TCC_Free;
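// For example (an illustrative IR case): in "%t = trunc i64 %x to i33" the
// i33 result promotes back to i64 during legalization, so source and
// destination share a legalized type and the truncate folds away.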
3024
3025 if (ST->useAVX512Regs()) {
3026 if (ST->hasBWI())
3027 if (const auto *Entry = ConvertCostTableLookup(
3028 AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second))
3029 if (auto KindCost = Entry->Cost[CostKind])
3030 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3031
3032 if (ST->hasDQI())
3033 if (const auto *Entry = ConvertCostTableLookup(
3034 AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second))
3035 if (auto KindCost = Entry->Cost[CostKind])
3036 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3037
3038 if (ST->hasAVX512())
3039 if (const auto *Entry = ConvertCostTableLookup(
3040 AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second))
3041 if (auto KindCost = Entry->Cost[CostKind])
3042 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3043 }
3044
3045 if (ST->hasBWI())
3046 if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
3047 LTDest.second, LTSrc.second))
3048 if (auto KindCost = Entry->Cost[CostKind])
3049 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3050
3051 if (ST->hasDQI())
3052 if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
3053 LTDest.second, LTSrc.second))
3054 if (auto KindCost = Entry->Cost[CostKind])
3055 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3056
3057 if (ST->hasAVX512())
3058 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3059 LTDest.second, LTSrc.second))
3060 if (auto KindCost = Entry->Cost[CostKind])
3061 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3062
3063 if (ST->hasAVX2())
3064 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3065 LTDest.second, LTSrc.second))
3066 if (auto KindCost = Entry->Cost[CostKind])
3067 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3068
3069 if (ST->hasAVX())
3070 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3071 LTDest.second, LTSrc.second))
3072 if (auto KindCost = Entry->Cost[CostKind])
3073 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3074
3075 if (ST->hasSSE41())
3076 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3077 LTDest.second, LTSrc.second))
3078 if (auto KindCost = Entry->Cost[CostKind])
3079 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3080
3081 if (ST->hasSSE2())
3082 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3083 LTDest.second, LTSrc.second))
3084 if (auto KindCost = Entry->Cost[CostKind])
3085 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3086
3087 // Fallback for i8/i16 sitofp/uitofp cases: extend the source to i32 first,
3088 // then convert from i32 (see the example after this block).
3089 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3090 1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) {
3091 Type *ExtSrc = Src->getWithNewBitWidth(32);
3092 unsigned ExtOpc =
3093 (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt;
3094
3095 // For scalar loads the extend would be free.
3096 InstructionCost ExtCost = 0;
3097 if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0))))
3098 ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind);
3099
3100 return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc,
3101 TTI::CastContextHint::None, CostKind);
3102 }
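// For example (illustrative IR, hypothetical values):
//   %f = sitofp i16 %x to float
// is costed as the pair
//   %e = sext i16 %x to i32        ; free when %x comes straight from a load
//   %f = sitofp i32 %e to float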
3103
3104 // Fallback for i8/i16 fptosi/fptoui cases: convert via an i32 fptosi, then
3105 // truncate the result (see the example after this block).
3106 if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) &&
3107 1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) {
3108 Type *TruncDst = Dst->getWithNewBitWidth(32);
3109 return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) +
3110 getCastInstrCost(Instruction::Trunc, Dst, TruncDst,
3111 TTI::CastContextHint::None, CostKind);
3112 }
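// For example (illustrative IR, hypothetical values):
//   %c = fptosi float %f to i8
// is costed as
//   %w = fptosi float %f to i32
//   %c = trunc i32 %w to i8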
3113
3114 // TODO: Allow non-throughput costs that aren't binary.
3115 auto AdjustCost = [&CostKind](InstructionCost Cost,
3116 InstructionCost N = 1) -> InstructionCost {
3117 if (CostKind != TTI::TCK_RecipThroughput)
3118 return Cost == 0 ? 0 : N;
3119 return Cost * N;
3120 };
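// Worked examples of AdjustCost (values are illustrative):
//   AdjustCost(0)    -> 0 for every cost kind (free stays free)
//   AdjustCost(7)    -> 7 for TCK_RecipThroughput, 1 otherwise
//   AdjustCost(7, 2) -> 14 for TCK_RecipThroughput, 2 otherwise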
3121 return AdjustCost(
3122 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3123}
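// Illustrative caller-side usage (hypothetical SrcVecTy/DstVecTy; assumes a
// TargetTransformInfo instance for the function being costed):
//
//   InstructionCost C = TTI.getCastInstrCost(
//       Instruction::SExt, DstVecTy, SrcVecTy,
//       TTI::CastContextHint::None, TTI::TCK_RecipThroughput);
//
// This is how the vectorizers reach the tables above when comparing VFs.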
3124
3125 InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
3126 Type *CondTy,
3127 CmpInst::Predicate VecPred,
3128 TTI::TargetCostKind CostKind,
3129 const Instruction *I) {
3130 // Early out if this type isn't scalar/vector integer/float.
3131 if (!(ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy()))
3132 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3133 I);
3134
3135 // Legalize the type.
3136 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3137
3138 MVT MTy = LT.second;
3139
3140 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3141 assert(ISD && "Invalid opcode");
3142
3143 InstructionCost ExtraCost = 0;
3144 if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
3145 // Some vector comparison predicates cost extra instructions (worked example after this block).
3146 // TODO: Adjust ExtraCost based on CostKind?
3147 // TODO: Should we invert this and assume worst case cmp costs
3148 // and reduce for particular predicates?
3149 if (MTy.isVector() &&
3150 !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
3151 (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
3152 ST->hasBWI())) {
3153 // Fall back to the predicate of I if a specific predicate wasn't specified.
3154 CmpInst::Predicate Pred = VecPred;
3155 if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE ||
3156 Pred == CmpInst::BAD_FCMP_PREDICATE))
3157 Pred = cast<CmpInst>(I)->getPredicate();
3158
3159 bool CmpWithConstant = false;
3160 if (auto *CmpInstr = dyn_cast_or_null<CmpInst>(I))
3161 CmpWithConstant = isa<Constant>(CmpInstr->getOperand(1));
3162
3163 switch (Pred) {
3164 case CmpInst::Predicate::ICMP_NE:
3165 // xor(cmpeq(x,y),-1)
3166 ExtraCost = CmpWithConstant ? 0 : 1;
3167 break;
3168 case CmpInst::Predicate::ICMP_SGE:
3169 case CmpInst::Predicate::ICMP_SLE:
3170 // xor(cmpgt(x,y),-1)
3171 ExtraCost = CmpWithConstant ? 0 : 1;
3172 break;
3173 case CmpInst::Predicate::ICMP_ULT:
3174 case CmpInst::Predicate::ICMP_UGT:
3175 // cmpgt(xor(x,signbit),xor(y,signbit))
3176 // xor(cmpeq(pmaxu(x,y),x),-1)
3177 ExtraCost = CmpWithConstant ? 1 : 2;
3178 break;
3179 case CmpInst::Predicate::ICMP_ULE:
3180 case CmpInst::Predicate::ICMP_UGE:
3181 if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
3182 (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
3183 // cmpeq(psubus(x,y),0)
3184 // cmpeq(pminu(x,y),x)
3185 ExtraCost = 1;
3186 } else {
3187 // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
3188 ExtraCost = CmpWithConstant ? 2 : 3;
3189 }
3190 break;
3191 case CmpInst::Predicate::FCMP_ONE:
3192 case CmpInst::Predicate::FCMP_UEQ:
3193 // Without AVX we need to expand FCMP_ONE/FCMP_UEQ cases.
3194 // Use FCMP_UEQ expansion - FCMP_ONE should be the same.
3195 if (CondTy && !ST->hasAVX())
3196 return getCmpSelInstrCost(Opcode, ValTy, CondTy,
3197 CmpInst::Predicate::FCMP_UNO, CostKind) +
3198 getCmpSelInstrCost(Opcode, ValTy, CondTy,
3199 CmpInst::Predicate::FCMP_OEQ, CostKind) +
3200 getArithmeticInstrCost(Instruction::Or, CondTy, CostKind);
3201
3202 break;
3203 case CmpInst::Predicate::BAD_ICMP_PREDICATE:
3204 case CmpInst::Predicate::BAD_FCMP_PREDICATE:
3205 // Assume worst case scenario and add the maximum extra cost.
3206 ExtraCost = 3;
3207 break;
3208 default:
3209 break;
3210 }
3211 }
3212 }
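// As a concrete example of the ExtraCost logic (illustrative asm for a
// pre-SSE4.1 v4i32 "icmp ugt"): there is no unsigned vector compare, so the
// lowering biases both sides by the sign bit and uses the signed compare:
//
//   pxor    %xmm2, %xmm0    ; xmm2 = splat(0x80000000): flip sign of x
//   pxor    %xmm2, %xmm1    ; flip sign of y
//   pcmpgtd %xmm1, %xmm0    ; signed compare now yields the unsigned result
//
// i.e. two instructions on top of the base pcmpgtd, matching the
// "ExtraCost = CmpWithConstant ? 1 : 2" case (a constant RHS folds one xor).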
3213
3214 static const CostKindTblEntry SLMCostTbl[] = {
3215 // slm pcmpeq/pcmpgt throughput is 2
3216 { ISD::SETCC, MVT::v2i64, { 2, 5, 1, 2 } },
3217 // slm pblendvb/blendvpd/blendvps throughput is 4
3218 { ISD::SELECT, MVT::v2f64, { 4, 4, 1, 3 } }, // vblendvpd
3219 { ISD::SELECT, MVT::v4f32, { 4, 4, 1, 3 } }, // vblendvps
3220 { ISD::SELECT, MVT::v2i64, { 4, 4, 1, 3 } }, // pblendvb
3221 { ISD::SELECT, MVT::v8i32, { 4, 4, 1, 3 } }, // pblendvb
3222 { ISD::SELECT, MVT::v8i16, { 4, 4, 1, 3 } }, // pblendvb
3223 { ISD::SELECT, MVT::v16i8, { 4, 4, 1, 3 } }, // pblendvb
3224 };
3225
3226 static const CostKindTblEntry AVX512BWCostTbl[] = {
3227 { ISD::SETCC, MVT::v32i16, { 1, 1, 1, 1 } },
3228 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 1 } },
3229 { ISD::SETCC, MVT::v64i8, { 1, 1, 1, 1 } },
3230 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 1 } },
3231
3232 { ISD::SELECT, MVT::v32i16, { 1, 1, 1, 1 } },
3233 { ISD::SELECT, MVT::v64i8, { 1, 1, 1, 1 } },
3234 };
3235
3236 static const CostKindTblEntry AVX512CostTbl[] = {
3237 { ISD::SETCC, MVT::v8f64, { 1, 4, 1, 1 } },
3238 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 1 } },
3239 { ISD::SETCC, MVT::v16f32, { 1, 4, 1, 1 } },
3240 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 1 } },
3241
3242 { ISD::SETCC, MVT::v8i64, { 1, 1, 1, 1 } },
3243 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 1 } },
3244 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3245 { ISD::SETCC, MVT::v16i32, { 1, 1, 1, 1 } },
3246 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 1 } },
3247 { ISD::SETCC, MVT::v32i16, { 3, 7, 5, 5 } },
3248 { ISD::SETCC, MVT::v64i8, { 3, 7, 5, 5 } },
3249
3250 { ISD::SELECT, MVT::v8i64, { 1, 1, 1, 1 } },
3251 { ISD::SELECT, MVT::v4i64, { 1, 1, 1, 1 } },
3252 { ISD::SELECT, MVT::v2i64, { 1, 1, 1, 1 } },
3253 { ISD::SELECT, MVT::v16i32, { 1, 1, 1, 1 } },
3254 { ISD::SELECT, MVT::v8i32, { 1, 1, 1, 1 } },
3255 { ISD::SELECT, MVT::v4i32, { 1, 1, 1, 1 } },
3256 { ISD::SELECT, MVT::v8f64, { 1, 1, 1, 1 } },
3257 { ISD::SELECT, MVT::v4f64, { 1, 1, 1, 1 } },
3258 { ISD::SELECT, MVT::v2f64, { 1, 1, 1, 1 } },
3259 { ISD::SELECT, MVT::f64, { 1, 1, 1, 1 } },
3260 { ISD::SELECT, MVT::v16f32, { 1, 1, 1, 1 } },
3261 { ISD::SELECT, MVT::v8f32 , { 1, 1, 1, 1 } },
3262 { ISD::SELECT, MVT::v4f32, { 1, 1, 1, 1 } },
3263 { ISD::SELECT, MVT::f32 , { 1, 1, 1, 1 } },
3264
3265 { ISD::SELECT, MVT::v32i16, { 2, 2, 4, 4 } },
3266 { ISD::SELECT, MVT::v16i16, { 1, 1, 1, 1 } },
3267 { ISD::SELECT, MVT::v8i16, { 1, 1, 1, 1 } },
3268 { ISD::SELECT, MVT::v64i8, { 2, 2, 4, 4 } },
3269 { ISD::SELECT, MVT::v32i8, { 1, 1, 1, 1 } },
3270 { ISD::SELECT, MVT::v16i8, { 1, 1, 1, 1 } },
3271 };
3272
3273 static const CostKindTblEntry AVX2CostTbl[] = {
3274 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 2 } },
3275 { ISD::SETCC, MVT::v2f64, { 1, 4, 1, 1 } },
3276 { ISD::SETCC, MVT::f64, { 1, 4, 1, 1 } },
3277 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 2 } },
3278 { ISD::SETCC, MVT::v4f32, { 1, 4, 1, 1 } },
3279 { ISD::SETCC, MVT::f32, { 1, 4, 1, 1 } },
3280
3281 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 2 } },
3282 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 2 } },
3283 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 2 } },
3284 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 2 } },
3285
3286 { ISD::SELECT, MVT::v4f64, { 2, 2, 1, 2 } }, // vblendvpd
3287 { ISD::SELECT, MVT::v8f32, { 2, 2, 1, 2 } }, // vblendvps
3288 { ISD::SELECT, MVT::v4i64, { 2, 2, 1, 2 } }, // pblendvb
3289 { ISD::SELECT, MVT::v8i32, { 2, 2, 1, 2 } }, // pblendvb
3290 { ISD::SELECT, MVT::v16i16, { 2, 2, 1, 2 } }, // pblendvb
3291 { ISD::SELECT, MVT::v32i8, { 2, 2, 1, 2 } }, // pblendvb
3292 };
3293
3294 static const CostKindTblEntry XOPCostTbl[] = {
3295 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3296 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3297 };
3298
3299 static const CostKindTblEntry AVX1CostTbl[] = {
3300 { ISD::SETCC, MVT::v4f64, { 2, 3, 1, 2 } },
3301 { ISD::SETCC, MVT::v2f64, { 1, 3, 1, 1 } },
3302 { ISD::SETCC, MVT::f64, { 1, 3, 1, 1 } },
3303 { ISD::SETCC, MVT::v8f32, { 2, 3, 1, 2 } },
3304 { ISD::SETCC, MVT::v4f32, { 1, 3, 1, 1 } },
3305 { ISD::SETCC, MVT::f32, { 1, 3, 1, 1 } },
3306
3307 // AVX1 has no 256-bit integer compares; they split into 128-bit halves.
3308 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3309 { ISD::SETCC, MVT::v8i32, { 4, 2, 5, 6 } },
3310 { ISD::SETCC, MVT::v16i16, { 4, 2, 5, 6 } },
3311 { ISD::SETCC, MVT::v32i8, { 4, 2, 5, 6 } },
3312
3313 { ISD::SELECT, MVT::v4f64, { 3, 3, 1, 2 } }, // vblendvpd
3314 { ISD::SELECT, MVT::v8f32, { 3, 3, 1, 2 } }, // vblendvps
3315 { ISD::SELECT, MVT::v4i64, { 3, 3, 1, 2 } }, // vblendvpd
3316 { ISD::SELECT, MVT::v8i32, { 3, 3, 1, 2 } }, // vblendvps
3317 { ISD::SELECT, MVT::v16i16, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3318 { ISD::SELECT, MVT::v32i8, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3319 };
3320
3321 static const CostKindTblEntry SSE42CostTbl[] = {
3322 { ISD::SETCC, MVT::v2i64, { 1, 2, 1, 2 } },
3323 };
3324
3325 static const CostKindTblEntry SSE41CostTbl[] = {
3326 { ISD::SETCC, MVT::v2f64, { 1, 5, 1, 1 } },
3327 { ISD::SETCC, MVT::v4f32, { 1, 5, 1, 1 } },
3328
3329 { ISD::SELECT, MVT::v2f64, { 2, 2, 1, 2 } }, // blendvpd
3330 { ISD::SELECT, MVT::f64, { 2, 2, 1, 2 } }, // blendvpd
3331 { ISD::SELECT, MVT::v4f32, { 2, 2, 1, 2 } }, // blendvps
3332 { ISD::SELECT, MVT::f32 , { 2, 2, 1, 2 } }, // blendvps
3333 { ISD::SELECT, MVT::v2i64, { 2, 2, 1, 2 } }, // pblendvb
3334 { ISD::SELECT, MVT::v4i32, { 2, 2, 1, 2 } }, // pblendvb
3335 { ISD::SELECT, MVT::v8i16, { 2, 2, 1, 2 } }, // pblendvb
3336 { ISD::SELECT, MVT::v16i8, { 2, 2, 1, 2 } }, // pblendvb
3337 };
3338
3339 static const CostKindTblEntry SSE2CostTbl[] = {
3340 { ISD::SETCC, MVT::v2f64, { 2, 5, 1, 1 } },
3341 { ISD::SETCC, MVT::f64, { 1, 5, 1, 1 } },
3342
3343 { ISD::SETCC, MVT::v2i64, { 5, 4, 5, 5 } }, // pcmpeqd/pcmpgtd expansion
3344 { ISD::SETCC, MVT::v4i32, { 1, 1, 1, 1 } },
3345 { ISD::SETCC, MVT::v8i16, { 1, 1, 1, 1 } },
3346 { ISD::SETCC, MVT::v16i8, { 1, 1, 1, 1 } },
3347
3348 { ISD::SELECT, MVT::v2f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3349 { ISD::SELECT, MVT::f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3350 { ISD::SELECT, MVT::v2i64, { 2, 2, 3, 3 } }, // pand + pandn + por
3351 { ISD::SELECT, MVT::v4i32, { 2, 2, 3, 3 } }, // pand + pandn + por
3352 { ISD::SELECT, MVT::v8i16, { 2, 2, 3, 3 } }, // pand + pandn + por
3353 { ISD::SELECT, MVT::v16i8, { 2, 2, 3, 3 } }, // pand + pandn + por
3354 };
3355
3356 static const CostKindTblEntry SSE1CostTbl[] = {
3357 { ISD::SETCC, MVT::v4f32, { 2, 5, 1, 1 } },
3358 { ISD::SETCC, MVT::f32, { 1, 5, 1, 1 } },
3359
3360 { ISD::SELECT, MVT::v4f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3361 { ISD::SELECT, MVT::f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3362 };
3363
3364 if (ST->useSLMArithCosts())
3365 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
3366 if (auto KindCost = Entry->Cost[CostKind])
3367 return LT.first * (ExtraCost + *KindCost);
3368
3369 if (ST->hasBWI())
3370 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
3371 if (auto KindCost = Entry->Cost[CostKind])
3372 return LT.first * (ExtraCost + *KindCost);
3373
3374 if (ST->hasAVX512())
3375 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
3376 if (auto KindCost = Entry->Cost[CostKind])
3377 return LT.first * (ExtraCost + *KindCost);
3378
3379 if (ST->hasAVX2())
3380 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
3381 if (auto KindCost = Entry->Cost[CostKind])
3382 return LT.first * (ExtraCost + *KindCost);
3383
3384 if (ST->hasXOP())
3385 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
3386 if (auto KindCost = Entry->Cost[CostKind])
3387 return LT.first * (ExtraCost + *KindCost);
3388
3389 if (ST->hasAVX())
3390 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
3391 if (auto KindCost = Entry->Cost[CostKind])
3392 return LT.first * (ExtraCost + *KindCost);
3393
3394 if (ST->hasSSE42())
3395 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
3396 if (auto KindCost = Entry->Cost[CostKind])
3397 return LT.first * (ExtraCost + *KindCost);
3398
3399 if (ST->hasSSE41())
3400 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
3401 if (auto KindCost = Entry->Cost[CostKind])
3402 return LT.first * (ExtraCost + *KindCost);
3403
3404 if (ST->hasSSE2())
3405 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
3406 if (auto KindCost = Entry->Cost[CostKind])
3407 return LT.first * (ExtraCost + *KindCost);
3408
3409 if (ST->hasSSE1())
3410 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
3411 if (auto KindCost = Entry->Cost[CostKind])
3412 return LT.first * (ExtraCost + *KindCost);
3413
3414 // Assume a 3cy latency for fp select ops.
3415 if (CostKind == TTI::TCK_Latency && Opcode == Instruction::Select)
3416 if (ValTy->getScalarType()->isFloatingPointTy())
3417 return 3;
3418
3419 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
3420}
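// Illustrative caller-side usage (hypothetical VecTy/CondTy): querying the
// latency of a vector select with the predicate left unspecified, which takes
// the conservative BAD_*_PREDICATE paths above:
//
//   InstructionCost C = TTI.getCmpSelInstrCost(
//       Instruction::Select, VecTy, CondTy, CmpInst::BAD_ICMP_PREDICATE,
//       TTI::TCK_Latency);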
3421
3422 unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; }
3423
3424 InstructionCost
3425 X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
3426 TTI::TargetCostKind CostKind) {
3427 // Costs should match the codegen from:
3428 // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
3429 // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
3430 // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
3431 // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
3432 // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
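// (Illustratively, a single entry can be checked by running llc over the
// matching test - the file and attribute here are examples, not a build step:
//    llc -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq \
//        llvm/test/CodeGen/X86/vector-popcnt-512.ll -o -
// and counting the instructions emitted for the intrinsic in question.)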
3433
3434 // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
3435 // specialized in these tables yet.
3436 static const CostKindTblEntry AVX512VBMI2CostTbl[] = {
3437 { ISD::FSHL, MVT::v8i64, { 1, 1, 1, 1 } },
3438 { ISD::FSHL, MVT::v4i64, { 1, 1, 1, 1 } },
3439 { ISD::FSHL, MVT::v2i64, { 1, 1, 1, 1 } },
3440 { ISD::FSHL, MVT::v16i32, { 1, 1, 1, 1 } },
3441 { ISD::FSHL, MVT::v8i32, { 1, 1, 1, 1 } },
3442 { ISD::FSHL, MVT::v4i32, { 1, 1, 1, 1 } },
3443 { ISD::FSHL, MVT::v32i16, { 1, 1, 1, 1 } },
3444 { ISD::FSHL, MVT::v16i16, { 1, 1, 1, 1 } },
3445 { ISD::FSHL, MVT::v8i16, { 1, 1, 1, 1 } },
3446 { ISD::ROTL, MVT::v32i16, { 1, 1, 1, 1 } },
3447 { ISD::ROTL, MVT::v16i16, { 1, 1, 1, 1 } },
3448 { ISD::ROTL, MVT::v8i16, { 1, 1, 1, 1 } },
3449 { ISD::ROTR, MVT::v32i16, { 1, 1, 1, 1 } },
3450 { ISD::ROTR, MVT::v16i16, { 1, 1, 1, 1 } },
3451 { ISD::ROTR, MVT::v8i16, { 1, 1, 1, 1 } },
3452 { X86ISD::VROTLI, MVT::v32i16, { 1, 1, 1, 1 } },
3453 { X86ISD::VROTLI, MVT::v16i16, { 1, 1, 1, 1 } },
3454 { X86ISD::VROTLI, MVT::v8i16, { 1, 1, 1, 1 } },
3455 };
3456 static const CostKindTblEntry AVX512BITALGCostTbl[] = {
3457 { ISD::CTPOP, MVT::v32i16, { 1, 1, 1, 1 } },
3458 { ISD::CTPOP, MVT::v64i8, { 1, 1, 1, 1 } },
3459 { ISD::CTPOP, MVT::v16i16, { 1, 1, 1, 1 } },
3460 { ISD::CTPOP, MVT::v32i8, { 1, 1, 1, 1 } },
3461 { ISD::CTPOP, MVT::v8i16, { 1, 1, 1, 1 } },
3462 { ISD::CTPOP, MVT::v16i8, { 1, 1, 1, 1 } },
3463 };
3464 static const CostKindTblEntry AVX512VPOPCNTDQCostTbl[] = {
3465 { ISD::CTPOP, MVT::v8i64, { 1, 1, 1, 1 } },
3466 { ISD::CTPOP, MVT::v16i32, { 1, 1, 1, 1 } },
3467 { ISD::CTPOP, MVT::v4i64, { 1, 1, 1, 1 } },
3468 { ISD::CTPOP, MVT::v8i32, { 1, 1, 1, 1 } },
3469 { ISD::CTPOP, MVT::v2i64, { 1, 1, 1, 1 } },
3470 { ISD::CTPOP, MVT::v4i32, { 1, 1, 1, 1 } },
3471 };
3472 static const CostKindTblEntry AVX512CDCostTbl[] = {
3473 { ISD::CTLZ, MVT::v8i64, { 1, 5, 1, 1 } },
3474 { ISD::CTLZ, MVT::v16i32, { 1, 5, 1, 1 } },
3475 { ISD::CTLZ, MVT::v32i16, { 18, 27, 23, 27 } },
3476 { ISD::CTLZ, MVT::v64i8, { 3, 16, 9, 11 } },
3477 { ISD::CTLZ, MVT::v4i64, { 1, 5, 1, 1 } },
3478 { ISD::CTLZ, MVT::v8i32, { 1, 5, 1, 1 } },
3479 { ISD::CTLZ, MVT::v16i16, { 8, 19, 11, 13 } },
3480 { ISD::CTLZ, MVT::v32i8, { 2, 11, 9, 10 } },
3481 { ISD::CTLZ, MVT::v2i64, { 1, 5, 1, 1 } },
3482 { ISD::CTLZ, MVT::v4i32, { 1, 5, 1, 1 } },
3483 { ISD::CTLZ, MVT::v8i16, { 3, 15, 4, 6 } },
3484 { ISD::CTLZ, MVT::v16i8, { 2, 10, 9, 10 } },
3485
3486 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3487 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3488 { ISD::CTTZ, MVT::v4i64, { 1, 8, 6, 6 } },
3489 { ISD::CTTZ, MVT::v8i32, { 1, 8, 6, 6 } },
3490 { ISD::CTTZ, MVT::v2i64, { 1, 8, 6, 6 } },
3491 { ISD::CTTZ, MVT::v4i32, { 1, 8, 6, 6 } },
3492 };
3493 static const CostKindTblEntry AVX512BWCostTbl[] = {
3494 { ISD::ABS, MVT::v32i16, { 1, 1, 1, 1 } },
3495 { ISD::ABS, MVT::v64i8, { 1, 1, 1, 1 } },
3496 { ISD::BITREVERSE, MVT::v2i64, { 3, 10, 10, 11 } },
3497 { ISD::BITREVERSE, MVT::v4i64, { 3, 11, 10, 11 } },
3498 { ISD::BITREVERSE, MVT::v8i64, { 3, 12, 10, 14 } },
3499 { ISD::BITREVERSE, MVT::v4i32, { 3, 10, 10, 11 } },
3500 { ISD::BITREVERSE, MVT::v8i32, { 3, 11, 10, 11 } },
3501 { ISD::BITREVERSE, MVT::v16i32, { 3, 12, 10, 14 } },
3502 { ISD::BITREVERSE, MVT::v8i16, { 3, 10, 10, 11 } },
3503 { ISD::BITREVERSE, MVT::v16i16, { 3, 11, 10, 11 } },
3504 { ISD::BITREVERSE, MVT::v32i16, { 3, 12, 10, 14 } },
3505 { ISD::BITREVERSE, MVT::v16i8, { 2, 5, 9, 9 } },
3506 { ISD::BITREVERSE, MVT::v32i8, { 2, 5, 9, 9 } },
3507 { ISD::BITREVERSE, MVT::v64i8, { 2, 5, 9, 12 } },
3508 { ISD::BSWAP, MVT::v2i64, { 1, 1, 1, 2 } },
3509 { ISD::BSWAP, MVT::v4i64, { 1, 1, 1, 2 } },
3510 { ISD::BSWAP, MVT::v8i64, { 1, 1, 1, 2 } },
3511 { ISD::BSWAP, MVT::v4i32, { 1, 1, 1, 2 } },
3512 { ISD::BSWAP, MVT::v8i32, { 1, 1, 1, 2 } },
3513 { ISD::BSWAP, MVT::v16i32, { 1, 1, 1, 2 } },
3514 { ISD::BSWAP, MVT::v8i16, { 1, 1, 1, 2 } },
3515 { ISD::BSWAP, MVT::v16i16, { 1, 1, 1, 2 } },
3516 { ISD::BSWAP, MVT::v32i16, { 1, 1, 1, 2 } },
3517 { ISD::CTLZ, MVT::v8i64, { 8, 22, 23, 23 } },
3518 { ISD::CTLZ, MVT::v16i32, { 8, 23, 25, 25 } },
3519 { ISD::CTLZ, MVT::v32i16, { 4, 15, 15, 16 } },
3520 { ISD::CTLZ, MVT::v64i8, { 3, 12, 10, 9 } },
3521 { ISD::CTPOP, MVT::v2i64, { 3, 7, 10, 10 } },
3522 { ISD::CTPOP, MVT::v4i64, { 3, 7, 10, 10 } },
3523 { ISD::CTPOP, MVT::v8i64, { 3, 8, 10, 12 } },
3524 { ISD::CTPOP, MVT::v4i32, { 7, 11, 14, 14 } },
3525 { ISD::CTPOP, MVT::v8i32, { 7, 11, 14, 14 } },
3526 { ISD::CTPOP, MVT::v16i32, { 7, 12, 14, 16 } },
3527 { ISD::CTPOP, MVT::v8i16, { 2, 7, 11, 11 } },
3528 { ISD::CTPOP, MVT::v16i16, { 2, 7, 11, 11 } },
3529 { ISD::CTPOP, MVT::v32i16, { 3, 7, 11, 13 } },
3530 { ISD::CTPOP, MVT::v16i8, { 2, 4, 8, 8 } },
3531 { ISD::CTPOP, MVT::v32i8, { 2, 4, 8, 8 } },
3532 { ISD::CTPOP, MVT::v64i8, { 2, 5, 8, 10 } },
3533 { ISD::CTTZ, MVT::v8i16, { 3, 9, 14, 14 } },
3534 { ISD::CTTZ, MVT::v16i16, { 3, 9, 14, 14 } },
3535 { ISD::CTTZ, MVT::v32i16, { 3, 10, 14, 16 } },
3536 { ISD::CTTZ, MVT::v16i8, { 2, 6, 11, 11 } },
3537 { ISD::CTTZ, MVT::v32i8, { 2, 6, 11, 11 } },
3538 { ISD::CTTZ, MVT::v64i8, { 3, 7, 11, 13 } },
3539 { ISD::ROTL, MVT::v32i16, { 2, 8, 6, 8 } },
3540 { ISD::ROTL, MVT::v16i16, { 2, 8, 6, 7 } },
3541 { ISD::ROTL, MVT::v8i16, { 2, 7, 6, 7 } },
3542 { ISD::ROTL, MVT::v64i8, { 5, 6, 11, 12 } },
3543 { ISD::ROTL, MVT::v32i8, { 5, 15, 7, 10 } },
3544 { ISD::ROTL, MVT::v16i8, { 5, 15, 7, 10 } },
3545 { ISD::ROTR, MVT::v32i16, { 2, 8, 6, 8 } },
3546 { ISD::ROTR, MVT::v16i16, { 2, 8, 6, 7 } },
3547 { ISD::ROTR, MVT::v8i16, { 2, 7, 6, 7 } },
3548 { ISD::ROTR, MVT::v64i8, { 5, 6, 12, 14 } },
3549 { ISD::ROTR, MVT::v32i8, { 5, 14, 6, 9 } },
3550 { ISD::ROTR, MVT::v16i8, { 5, 14, 6, 9 } },
3551 { X86ISD::VROTLI, MVT::v32i16, { 2, 5, 3, 3 } },
3552 { X86ISD::VROTLI, MVT::v16i16, { 1, 5, 3, 3 } },
3553 { X86ISD::VROTLI, MVT::v8i16, { 1, 5, 3, 3 } },
3554 { X86ISD::VROTLI, MVT::v64i8, { 2, 9, 3, 4 } },
3555 { X86ISD::VROTLI, MVT::v32i8, { 1, 9, 3, 4 } },
3556 { X86ISD::VROTLI, MVT::v16i8, { 1, 8, 3, 4 } },
3557 { ISD::SADDSAT, MVT::v32i16, { 1 } },
3558 { ISD::SADDSAT, MVT::v64i8, { 1 } },
3559 { ISD::SMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3560 { ISD::SMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3561 { ISD::SMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3562 { ISD::SMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3563 { ISD::SSUBSAT, MVT::v32i16, { 1 } },
3564 { ISD::SSUBSAT, MVT::v64i8, { 1 } },
3565 { ISD::UADDSAT, MVT::v32i16, { 1 } },
3566 { ISD::UADDSAT, MVT::v64i8, { 1 } },
3567 { ISD::UMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3568 { ISD::UMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3569 { ISD::UMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3570 { ISD::UMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3571 { ISD::USUBSAT, MVT::v32i16, { 1 } },
3572 { ISD::USUBSAT, MVT::v64i8, { 1 } },
3573 };
3574 static const CostKindTblEntry AVX512CostTbl[] = {
3575 { ISD::ABS, MVT::v8i64, { 1, 1, 1, 1 } },
3576 { ISD::ABS, MVT::v4i64, { 1, 1, 1, 1 } },
3577 { ISD::ABS, MVT::v2i64, { 1, 1, 1, 1 } },
3578 { ISD::ABS, MVT::v16i32, { 1, 1, 1, 1 } },
3579 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 1 } },
3580 { ISD::ABS, MVT::v32i16, { 2, 7, 4, 4 } },
3581 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 1 } },
3582 { ISD::ABS, MVT::v64i8, { 2, 7, 4, 4 } },
3583 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 1 } },
3584 { ISD::BITREVERSE, MVT::v8i64, { 9, 13, 20, 20 } },
3585 { ISD::BITREVERSE, MVT::v16i32, { 9, 13, 20, 20 } },
3586 { ISD::BITREVERSE, MVT::v32i16, { 9, 13, 20, 20 } },
3587 { ISD::BITREVERSE, MVT::v64i8, { 6, 11, 17, 17 } },
3588 { ISD::BSWAP, MVT::v8i64, { 4, 7, 5, 5 } },
3589 { ISD::BSWAP, MVT::v16i32, { 4, 7, 5, 5 } },
3590 { ISD::BSWAP, MVT::v32i16, { 4, 7, 5, 5 } },
3591 { ISD::CTLZ, MVT::v8i64, { 10, 28, 32, 32 } },
3592 { ISD::CTLZ, MVT::v16i32, { 12, 30, 38, 38 } },
3593 { ISD::CTLZ, MVT::v32i16, { 8, 15, 29, 29 } },
3594 { ISD::CTLZ, MVT::v64i8, { 6, 11, 19, 19 } },
3595 { ISD::CTPOP, MVT::v8i64, { 16, 16, 19, 19 } },
3596 { ISD::CTPOP, MVT::v16i32, { 24, 19, 27, 27 } },
3597 { ISD::CTPOP, MVT::v32i16, { 18, 15, 22, 22 } },
3598 { ISD::CTPOP, MVT::v64i8, { 12, 11, 16, 16 } },
3599 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3600 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3601 { ISD::CTTZ, MVT::v32i16, { 7, 17, 27, 27 } },
3602 { ISD::CTTZ, MVT::v64i8, { 6, 13, 21, 21 } },
3603 { ISD::ROTL, MVT::v8i64, { 1, 1, 1, 1 } },
3604 { ISD::ROTL, MVT::v4i64, { 1, 1, 1, 1 } },
3605 { ISD::ROTL, MVT::v2i64, { 1, 1, 1, 1 } },
3606 { ISD::ROTL, MVT::v16i32, { 1, 1, 1, 1 } },
3607 { ISD::ROTL, MVT::v8i32, { 1, 1, 1, 1 } },
3608 { ISD::ROTL, MVT::v4i32, { 1, 1, 1, 1 } },
3609 { ISD::ROTR, MVT::v8i64, { 1, 1, 1, 1 } },
3610 { ISD::ROTR, MVT::v4i64, { 1, 1, 1, 1 } },
3611 { ISD::ROTR, MVT::v2i64, { 1, 1, 1, 1 } },
3612 { ISD::ROTR, MVT::v16i32, { 1, 1, 1, 1 } },
3613 { ISD::ROTR, MVT::v8i32, { 1, 1, 1, 1 } },
3614 { ISD::ROTR, MVT::v4i32, { 1, 1, 1, 1 } },
3615 { X86ISD::VROTLI, MVT::v8i64, { 1, 1, 1, 1 } },
3616 { X86ISD::VROTLI, MVT::v4i64, { 1, 1, 1, 1 } },
3617 { X86ISD::VROTLI, MVT::v2i64, { 1, 1, 1, 1 } },
3618 { X86ISD::VROTLI, MVT::v16i32, { 1, 1, 1, 1 } },
3619 { X86ISD::VROTLI, MVT::v8i32, { 1, 1, 1, 1 } },
3620 { X86ISD::VROTLI, MVT::v4i32, { 1, 1, 1, 1 } },
3621 { ISD::SMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3622 { ISD::SMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3623 { ISD::SMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3624 { ISD::SMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3625 { ISD::SMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3626 { ISD::SMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3627 { ISD::SMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3628 { ISD::SMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3629 { ISD::SMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3630 { ISD::SMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3631 { ISD::SMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3632 { ISD::SMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3633 { ISD::UMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3634 { ISD::UMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3635 { ISD::UMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3636 { ISD::UMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3637 { ISD::UMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3638 { ISD::UMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3639 { ISD::UMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3640 { ISD::UMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3641 { ISD::UMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3642 { ISD::UMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3643 { ISD::UMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3644 { ISD::UMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3645 { ISD::USUBSAT, MVT::v16i32, { 2 } }, // pmaxud + psubd
3646 { ISD::USUBSAT, MVT::v2i64, { 2 } }, // pmaxuq + psubq
3647 { ISD::USUBSAT, MVT::v4i64, { 2 } }, // pmaxuq + psubq
3648 { ISD::USUBSAT, MVT::v8i64, { 2 } }, // pmaxuq + psubq
3649 { ISD::UADDSAT, MVT::v16i32, { 3 } }, // not + pminud + paddd
3650 { ISD::UADDSAT, MVT::v2i64, { 3 } }, // not + pminuq + paddq
3651 { ISD::UADDSAT, MVT::v4i64, { 3 } }, // not + pminuq + paddq
3652 { ISD::UADDSAT, MVT::v8i64, { 3 } }, // not + pminuq + paddq
3653 { ISD::SADDSAT, MVT::v32i16, { 2 } },
3654 { ISD::SADDSAT, MVT::v64i8, { 2 } },
3655 { ISD::SSUBSAT, MVT::v32i16, { 2 } },
3656 { ISD::SSUBSAT, MVT::v64i8, { 2 } },
3657 { ISD::UADDSAT, MVT::v32i16, { 2 } },
3658 { ISD::UADDSAT, MVT::v64i8, { 2 } },
3659 { ISD::USUBSAT, MVT::v32i16, { 2 } },
3660 { ISD::USUBSAT, MVT::v64i8, { 2 } },
3661 { ISD::FMAXNUM, MVT::f32, { 2, 2, 3, 3 } },
3662 { ISD::FMAXNUM, MVT::v4f32, { 1, 1, 3, 3 } },
3663 { ISD::FMAXNUM, MVT::v8f32, { 2, 2, 3, 3 } },
3664 { ISD::FMAXNUM, MVT::v16f32, { 4, 4, 3, 3 } },
3665 { ISD::FMAXNUM, MVT::f64, { 2, 2, 3, 3 } },
3666 { ISD::FMAXNUM, MVT::v2f64, { 1, 1, 3, 3 } },
3667 { ISD::FMAXNUM, MVT::v4f64, { 2, 2, 3, 3 } },
3668 { ISD::FMAXNUM, MVT::v8f64, { 3, 3, 3, 3 } },
3669 { ISD::FSQRT, MVT::f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3670 { ISD::FSQRT, MVT::v4f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3671 { ISD::FSQRT, MVT::v8f32, { 6, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3672 { ISD::FSQRT, MVT::v16f32, { 12, 20, 1, 3 } }, // Skylake from http://www.agner.org/
3673 { ISD::FSQRT, MVT::f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3674 { ISD::FSQRT, MVT::v2f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3675 { ISD::FSQRT, MVT::v4f64, { 12, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3676 { ISD::FSQRT, MVT::v8f64, { 24, 32, 1, 3 } }, // Skylake from http://www.agner.org/
3677 };
3678 static const CostKindTblEntry XOPCostTbl[] = {
3679 { ISD::BITREVERSE, MVT::v4i64, { 3, 6, 5, 6 } },
3680 { ISD::BITREVERSE, MVT::v8i32, { 3, 6, 5, 6 } },
3681 { ISD::BITREVERSE, MVT::v16i16, { 3, 6, 5, 6 } },
3682 { ISD::BITREVERSE, MVT::v32i8, { 3, 6, 5, 6 } },
3683 { ISD::BITREVERSE, MVT::v2i64, { 2, 7, 1, 1 } },
3684 { ISD::BITREVERSE, MVT::v4i32, { 2, 7, 1, 1 } },
3685 { ISD::BITREVERSE, MVT::v8i16, { 2, 7, 1, 1 } },
3686 { ISD::BITREVERSE, MVT::v16i8, { 2, 7, 1, 1 } },
3687 { ISD::BITREVERSE, MVT::i64, { 2, 2, 3, 4 } },
3688 { ISD::BITREVERSE, MVT::i32, { 2, 2, 3, 4 } },
3689 { ISD::BITREVERSE, MVT::i16, { 2, 2, 3, 4 } },
3690 { ISD::BITREVERSE, MVT::i8, { 2, 2, 3, 4 } },
3691 // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
3692 { ISD::ROTL, MVT::v4i64, { 4, 7, 5, 6 } },
3693 { ISD::ROTL, MVT::v8i32, { 4, 7, 5, 6 } },
3694 { ISD::ROTL, MVT::v16i16, { 4, 7, 5, 6 } },
3695 { ISD::ROTL, MVT::v32i8, { 4, 7, 5, 6 } },
3696 { ISD::ROTL, MVT::v2i64, { 1, 3, 1, 1 } },
3697 { ISD::ROTL, MVT::v4i32, { 1, 3, 1, 1 } },
3698 { ISD::ROTL, MVT::v8i16, { 1, 3, 1, 1 } },
3699 { ISD::ROTL, MVT::v16i8, { 1, 3, 1, 1 } },
3700 { ISD::ROTR, MVT::v4i64, { 4, 7, 8, 9 } },
3701 { ISD::ROTR, MVT::v8i32, { 4, 7, 8, 9 } },
3702 { ISD::ROTR, MVT::v16i16, { 4, 7, 8, 9 } },
3703 { ISD::ROTR, MVT::v32i8, { 4, 7, 8, 9 } },
3704 { ISD::ROTR, MVT::v2i64, { 1, 3, 3, 3 } },
3705 { ISD::ROTR, MVT::v4i32, { 1, 3, 3, 3 } },
3706 { ISD::ROTR, MVT::v8i16, { 1, 3, 3, 3 } },
3707 { ISD::ROTR, MVT::v16i8, { 1, 3, 3, 3 } },
3708 { X86ISD::VROTLI, MVT::v4i64, { 4, 7, 5, 6 } },
3709 { X86ISD::VROTLI, MVT::v8i32, { 4, 7, 5, 6 } },
3710 { X86ISD::VROTLI, MVT::v16i16, { 4, 7, 5, 6 } },
3711 { X86ISD::VROTLI, MVT::v32i8, { 4, 7, 5, 6 } },
3712 { X86ISD::VROTLI, MVT::v2i64, { 1, 3, 1, 1 } },
3713 { X86ISD::VROTLI, MVT::v4i32, { 1, 3, 1, 1 } },
3714 { X86ISD::VROTLI, MVT::v8i16, { 1, 3, 1, 1 } },
3715 { X86ISD::VROTLI, MVT::v16i8, { 1, 3, 1, 1 } },
3716 };
3717 static const CostKindTblEntry AVX2CostTbl[] = {
3718 { ISD::ABS, MVT::v2i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3719 { ISD::ABS, MVT::v4i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3720 { ISD::ABS, MVT::v4i32, { 1, 1, 1, 1 } },
3721 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 2 } },
3722 { ISD::ABS, MVT::v8i16, { 1, 1, 1, 1 } },
3723 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 2 } },
3724 { ISD::ABS, MVT::v16i8, { 1, 1, 1, 1 } },
3725 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 2 } },
3726 { ISD::BITREVERSE, MVT::v2i64, { 3, 11, 10, 11 } },
3727 { ISD::BITREVERSE, MVT::v4i64, { 5, 11, 10, 17 } },
3728 { ISD::BITREVERSE, MVT::v4i32, { 3, 11, 10, 11 } },
3729 { ISD::BITREVERSE, MVT::v8i32, { 5, 11, 10, 17 } },
3730 { ISD::BITREVERSE, MVT::v8i16, { 3, 11, 10, 11 } },
3731 { ISD::BITREVERSE, MVT::v16i16, { 5, 11, 10, 17 } },
3732 { ISD::BITREVERSE, MVT::v16i8, { 3, 6, 9, 9 } },
3733 { ISD::BITREVERSE, MVT::v32i8, { 4, 5, 9, 15 } },
3734 { ISD::BSWAP, MVT::v2i64, { 1, 2, 1, 2 } },
3735 { ISD::BSWAP, MVT::v4i64, { 1, 3, 1, 2 } },
3736 { ISD::BSWAP, MVT::v4i32, { 1, 2, 1, 2 } },
3737 { ISD::BSWAP, MVT::v8i32, { 1, 3, 1, 2 } },
3738 { ISD::BSWAP, MVT::v8i16, { 1, 2, 1, 2 } },
3739 { ISD::BSWAP, MVT::v16i16, { 1, 3, 1, 2 } },
3740 { ISD::CTLZ, MVT::v2i64, { 7, 18, 24, 25 } },
3741 { ISD::CTLZ, MVT::v4i64, { 14, 18, 24, 44 } },
3742 { ISD::CTLZ, MVT::v4i32, { 5, 16, 19, 20 } },
3743 { ISD::CTLZ, MVT::v8i32, { 10, 16, 19, 34 } },
3744 { ISD::CTLZ, MVT::v8i16, { 4, 13, 14, 15 } },
3745 { ISD::CTLZ, MVT::v16i16, { 6, 14, 14, 24 } },
3746 { ISD::CTLZ, MVT::v16i8, { 3, 12, 9, 10 } },
3747 { ISD::CTLZ, MVT::v32i8, { 4, 12, 9, 14 } },
3748 { ISD::CTPOP, MVT::v2i64, { 3, 9, 10, 10 } },
3749 { ISD::CTPOP, MVT::v4i64, { 4, 9, 10, 14 } },
3750 { ISD::CTPOP, MVT::v4i32, { 7, 12, 14, 14 } },
3751 { ISD::CTPOP, MVT::v8i32, { 7, 12, 14, 18 } },
3752 { ISD::CTPOP, MVT::v8i16, { 3, 7, 11, 11 } },
3753 { ISD::CTPOP, MVT::v16i16, { 6, 8, 11, 18 } },
3754 { ISD::CTPOP, MVT::v16i8, { 2, 5, 8, 8 } },
3755 { ISD::CTPOP, MVT::v32i8, { 3, 5, 8, 12 } },
3756 { ISD::CTTZ, MVT::v2i64, { 4, 11, 13, 13 } },
3757 { ISD::CTTZ, MVT::v4i64, { 5, 11, 13, 20 } },
3758 { ISD::CTTZ, MVT::v4i32, { 7, 14, 17, 17 } },
3759 { ISD::CTTZ, MVT::v8i32, { 7, 15, 17, 24 } },
3760 { ISD::CTTZ, MVT::v8i16, { 4, 9, 14, 14 } },
3761 { ISD::CTTZ, MVT::v16i16, { 6, 9, 14, 24 } },
3762 { ISD::CTTZ, MVT::v16i8, { 3, 7, 11, 11 } },
3763 { ISD::CTTZ, MVT::v32i8, { 5, 7, 11, 18 } },
3764 { ISD::SADDSAT, MVT::v16i16, { 1 } },
3765 { ISD::SADDSAT, MVT::v32i8, { 1 } },
3766 { ISD::SMAX, MVT::v2i64, { 2, 7, 2, 3 } },
3767 { ISD::SMAX, MVT::v4i64, { 2, 7, 2, 3 } },
3768 { ISD::SMAX, MVT::v8i32, { 1, 1, 1, 2 } },
3769 { ISD::SMAX, MVT::v16i16, { 1, 1, 1, 2 } },
3770 { ISD::SMAX, MVT::v32i8, { 1, 1, 1, 2 } },
3771 { ISD::SMIN, MVT::v2i64, { 2, 7, 2, 3 } },
3772 { ISD::SMIN, MVT::v4i64, { 2, 7, 2, 3 } },
3773 { ISD::SMIN, MVT::v8i32, { 1, 1, 1, 2 } },
3774 { ISD::SMIN, MVT::v16i16, { 1, 1, 1, 2 } },
3775 { ISD::SMIN, MVT::v32i8, { 1, 1, 1, 2 } },
3776 { ISD::SSUBSAT, MVT::v16i16, { 1 } },
3777 { ISD::SSUBSAT, MVT::v32i8, { 1 } },
3778 { ISD::UADDSAT, MVT::v16i16, { 1 } },
3779 { ISD::UADDSAT, MVT::v32i8, { 1 } },
3780 { ISD::UADDSAT, MVT::v8i32, { 3 } }, // not + pminud + paddd
3781 { ISD::UMAX, MVT::v2i64, { 2, 8, 5, 6 } },
3782 { ISD::UMAX, MVT::v4i64, { 2, 8, 5, 8 } },
3783 { ISD::UMAX, MVT::v8i32, { 1, 1, 1, 2 } },
3784 { ISD::UMAX, MVT::v16i16, { 1, 1, 1, 2 } },
3785 { ISD::UMAX, MVT::v32i8, { 1, 1, 1, 2 } },
3786 { ISD::UMIN, MVT::v2i64, { 2, 8, 5, 6 } },
3787 { ISD::UMIN, MVT::v4i64, { 2, 8, 5, 8 } },
3788 { ISD::UMIN, MVT::v8i32, { 1, 1, 1, 2 } },
3789 { ISD::UMIN, MVT::v16i16, { 1, 1, 1, 2 } },
3790 { ISD::UMIN, MVT::v32i8, { 1, 1, 1, 2 } },
3791 { ISD::USUBSAT, MVT::v16i16, { 1 } },
3792 { ISD::USUBSAT, MVT::v32i8, { 1 } },
3793 { ISD::USUBSAT, MVT::v8i32, { 2 } }, // pmaxud + psubd
3794 { ISD::FMAXNUM, MVT::f32, { 2, 7, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
3795 { ISD::FMAXNUM, MVT::v4f32, { 2, 7, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3796 { ISD::FMAXNUM, MVT::v8f32, { 3, 7, 3, 6 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3797 { ISD::FMAXNUM, MVT::f64, { 2, 7, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
3798 { ISD::FMAXNUM, MVT::v2f64, { 2, 7, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3799 { ISD::FMAXNUM, MVT::v4f64, { 3, 7, 3, 6 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3800 { ISD::FSQRT, MVT::f32, { 7, 15, 1, 1 } }, // vsqrtss
3801 { ISD::FSQRT, MVT::v4f32, { 7, 15, 1, 1 } }, // vsqrtps
3802 { ISD::FSQRT, MVT::v8f32, { 14, 21, 1, 3 } }, // vsqrtps
3803 { ISD::FSQRT, MVT::f64, { 14, 21, 1, 1 } }, // vsqrtsd
3804 { ISD::FSQRT, MVT::v2f64, { 14, 21, 1, 1 } }, // vsqrtpd
3805 { ISD::FSQRT, MVT::v4f64, { 28, 35, 1, 3 } }, // vsqrtpd
3806 };
3807 static const CostKindTblEntry AVX1CostTbl[] = {
3808 { ISD::ABS, MVT::v4i64, { 6, 8, 6, 12 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3809 { ISD::ABS, MVT::v8i32, { 3, 6, 4, 5 } },
3810 { ISD::ABS, MVT::v16i16, { 3, 6, 4, 5 } },
3811 { ISD::ABS, MVT::v32i8, { 3, 6, 4, 5 } },
3812 { ISD::BITREVERSE, MVT::v4i64, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
3813 { ISD::BITREVERSE, MVT::v2i64, { 8, 13, 10, 16 } },
3814 { ISD::BITREVERSE, MVT::v8i32, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
3815 { ISD::BITREVERSE, MVT::v4i32, { 8, 13, 10, 16 } },
3816 { ISD::BITREVERSE, MVT::v16i16, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
3817 { ISD::BITREVERSE, MVT::v8i16, { 8, 13, 10, 16 } },
3818 { ISD::BITREVERSE, MVT::v32i8, { 13, 15, 17, 26 } }, // 2 x 128-bit Op + extract/insert
3819 { ISD::BITREVERSE, MVT::v16i8, { 7, 7, 9, 13 } },
3820 { ISD::BSWAP, MVT::v4i64, { 5, 6, 5, 10 } },
3821 { ISD::BSWAP, MVT::v2i64, { 2, 2, 1, 3 } },
3822 { ISD::BSWAP, MVT::v8i32, { 5, 6, 5, 10 } },
3823 { ISD::BSWAP, MVT::v4i32, { 2, 2, 1, 3 } },
3824 { ISD::BSWAP, MVT::v16i16, { 5, 6, 5, 10 } },
3825 { ISD::BSWAP, MVT::v8i16, { 2, 2, 1, 3 } },
3826 { ISD::CTLZ, MVT::v4i64, { 29, 33, 49, 58 } }, // 2 x 128-bit Op + extract/insert
3827 { ISD::CTLZ, MVT::v2i64, { 14, 24, 24, 28 } },
3828 { ISD::CTLZ, MVT::v8i32, { 24, 28, 39, 48 } }, // 2 x 128-bit Op + extract/insert
3829 { ISD::CTLZ, MVT::v4i32, { 12, 20, 19, 23 } },
3830 { ISD::CTLZ, MVT::v16i16, { 19, 22, 29, 38 } }, // 2 x 128-bit Op + extract/insert
3831 { ISD::CTLZ, MVT::v8i16, { 9, 16, 14, 18 } },
3832 { ISD::CTLZ, MVT::v32i8, { 14, 15, 19, 28 } }, // 2 x 128-bit Op + extract/insert
3833 { ISD::CTLZ, MVT::v16i8, { 7, 12, 9, 13 } },
3834 { ISD::CTPOP, MVT::v4i64, { 14, 18, 19, 28 } }, // 2 x 128-bit Op + extract/insert
3835 { ISD::CTPOP, MVT::v2i64, { 7, 14, 10, 14 } },
3836 { ISD::CTPOP, MVT::v8i32, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
3837 { ISD::CTPOP, MVT::v4i32, { 9, 20, 14, 18 } },
3838 { ISD::CTPOP, MVT::v16i16, { 16, 21, 22, 31 } }, // 2 x 128-bit Op + extract/insert
3839 { ISD::CTPOP, MVT::v8i16, { 8, 18, 11, 15 } },
3840 { ISD::CTPOP, MVT::v32i8, { 13, 15, 16, 25 } }, // 2 x 128-bit Op + extract/insert
3841 { ISD::CTPOP, MVT::v16i8, { 6, 12, 8, 12 } },
3842 { ISD::CTTZ, MVT::v4i64, { 17, 22, 24, 33 } }, // 2 x 128-bit Op + extract/insert
3843 { ISD::CTTZ, MVT::v2i64, { 9, 19, 13, 17 } },
3844 { ISD::CTTZ, MVT::v8i32, { 21, 27, 32, 41 } }, // 2 x 128-bit Op + extract/insert
3845 { ISD::CTTZ, MVT::v4i32, { 11, 24, 17, 21 } },
3846 { ISD::CTTZ, MVT::v16i16, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
3847 { ISD::CTTZ, MVT::v8i16, { 9, 21, 14, 18 } },
3848 { ISD::CTTZ, MVT::v32i8, { 15, 18, 21, 30 } }, // 2 x 128-bit Op + extract/insert
3849 { ISD::CTTZ, MVT::v16i8, { 8, 16, 11, 15 } },
3850 { ISD::SADDSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
3851 { ISD::SADDSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
3852 { ISD::SMAX, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
3853 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 4 } },
3854 { ISD::SMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3855 { ISD::SMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3856 { ISD::SMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3857 { ISD::SMIN, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
3858 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
3859 { ISD::SMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3860 { ISD::SMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3861 { ISD::SMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3862 { ISD::SSUBSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
3863 { ISD::SSUBSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
3864 { ISD::UADDSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
3865 { ISD::UADDSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
3866 { ISD::UADDSAT, MVT::v8i32, { 8 } }, // 2 x 128-bit Op + extract/insert
3867 { ISD::UMAX, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
3868 { ISD::UMAX, MVT::v2i64, { 4, 8, 5, 7 } },
3869 { ISD::UMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3870 { ISD::UMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3871 { ISD::UMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3872 { ISD::UMIN, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
3873 { ISD::UMIN, MVT::v2i64, { 4, 8, 5, 7 } },
3874 { ISD::UMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3875 { ISD::UMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3876 { ISD::UMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3877 { ISD::USUBSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
3878 { ISD::USUBSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
3879 { ISD::USUBSAT, MVT::v8i32, { 6 } }, // 2 x 128-bit Op + extract/insert
3880 { ISD::FMAXNUM, MVT::f32, { 3, 6, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
3881 { ISD::FMAXNUM, MVT::v4f32, { 3, 6, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3882 { ISD::FMAXNUM, MVT::v8f32, { 5, 7, 3, 10 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3883 { ISD::FMAXNUM, MVT::f64, { 3, 6, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
3884 { ISD::FMAXNUM, MVT::v2f64, { 3, 6, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3885 { ISD::FMAXNUM, MVT::v4f64, { 5, 7, 3, 10 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3886 { ISD::FSQRT, MVT::f32, { 21, 21, 1, 1 } }, // vsqrtss
3887 { ISD::FSQRT, MVT::v4f32, { 21, 21, 1, 1 } }, // vsqrtps
3888 { ISD::FSQRT, MVT::v8f32, { 42, 42, 1, 3 } }, // vsqrtps
3889 { ISD::FSQRT, MVT::f64, { 27, 27, 1, 1 } }, // vsqrtsd
3890 { ISD::FSQRT, MVT::v2f64, { 27, 27, 1, 1 } }, // vsqrtpd
3891 { ISD::FSQRT, MVT::v4f64, { 54, 54, 1, 3 } }, // vsqrtpd
3892 };
3893 static const CostKindTblEntry GFNICostTbl[] = {
3894 { ISD::BITREVERSE, MVT::i8, { 3, 3, 3, 4 } }, // gf2p8affineqb
3895 { ISD::BITREVERSE, MVT::i16, { 3, 3, 4, 6 } }, // gf2p8affineqb
3896 { ISD::BITREVERSE, MVT::i32, { 3, 3, 4, 5 } }, // gf2p8affineqb
3897 { ISD::BITREVERSE, MVT::i64, { 3, 3, 4, 6 } }, // gf2p8affineqb
3898 { ISD::BITREVERSE, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
3899 { ISD::BITREVERSE, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
3900 { ISD::BITREVERSE, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
3901 { ISD::BITREVERSE, MVT::v8i16, { 1, 8, 2, 4 } }, // gf2p8affineqb
3902 { ISD::BITREVERSE, MVT::v16i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
3903 { ISD::BITREVERSE, MVT::v32i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
3904 { ISD::BITREVERSE, MVT::v4i32, { 1, 8, 2, 4 } }, // gf2p8affineqb
3905 { ISD::BITREVERSE, MVT::v8i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
3906 { ISD::BITREVERSE, MVT::v16i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
3907 { ISD::BITREVERSE, MVT::v2i64, { 1, 8, 2, 4 } }, // gf2p8affineqb
3908 { ISD::BITREVERSE, MVT::v4i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
3909 { ISD::BITREVERSE, MVT::v8i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
3910 { X86ISD::VROTLI, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
3911 { X86ISD::VROTLI, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
3912 { X86ISD::VROTLI, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
3913 };
3914 static const CostKindTblEntry GLMCostTbl[] = {
3915 { ISD::FSQRT, MVT::f32, { 19, 20, 1, 1 } }, // sqrtss
3916 { ISD::FSQRT, MVT::v4f32, { 37, 41, 1, 5 } }, // sqrtps
3917 { ISD::FSQRT, MVT::f64, { 34, 35, 1, 1 } }, // sqrtsd
3918 { ISD::FSQRT, MVT::v2f64, { 67, 71, 1, 5 } }, // sqrtpd
3919 };
3920 static const CostKindTblEntry SLMCostTbl[] = {
3921 { ISD::BSWAP, MVT::v2i64, { 5, 5, 1, 5 } },
3922 { ISD::BSWAP, MVT::v4i32, { 5, 5, 1, 5 } },
3923 { ISD::BSWAP, MVT::v8i16, { 5, 5, 1, 5 } },
3924 { ISD::FSQRT, MVT::f32, { 20, 20, 1, 1 } }, // sqrtss
3925 { ISD::FSQRT, MVT::v4f32, { 40, 41, 1, 5 } }, // sqrtps
3926 { ISD::FSQRT, MVT::f64, { 35, 35, 1, 1 } }, // sqrtsd
3927 { ISD::FSQRT, MVT::v2f64, { 70, 71, 1, 5 } }, // sqrtpd
3928 };
3929 static const CostKindTblEntry SSE42CostTbl[] = {
3930 { ISD::USUBSAT, MVT::v4i32, { 2 } }, // pmaxud + psubd
3931 { ISD::UADDSAT, MVT::v4i32, { 3 } }, // not + pminud + paddd
3932 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } }, // MAXSS + CMPUNORDSS + BLENDVPS
3933 { ISD::FMAXNUM, MVT::v4f32, { 4, 4, 4, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3934 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } }, // MAXSD + CMPUNORDSD + BLENDVPD
3935 { ISD::FMAXNUM, MVT::v2f64, { 4, 4, 4, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3936 { ISD::FSQRT, MVT::f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
3937 { ISD::FSQRT, MVT::v4f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
3938 };
3939 static const CostKindTblEntry SSE41CostTbl[] = {
3940 { ISD::ABS, MVT::v2i64, { 3, 4, 3, 5 } }, // BLENDVPD(X,PSUBQ(0,X),X)
3941 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 3 } },
3942 { ISD::SMAX, MVT::v4i32, { 1, 1, 1, 1 } },
3943 { ISD::SMAX, MVT::v16i8, { 1, 1, 1, 1 } },
3944 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
3945 { ISD::SMIN, MVT::v4i32, { 1, 1, 1, 1 } },
3946 { ISD::SMIN, MVT::v16i8, { 1, 1, 1, 1 } },
3947 { ISD::UMAX, MVT::v2i64, { 2, 11, 6, 7 } },
3948 { ISD::UMAX, MVT::v4i32, { 1, 1, 1, 1 } },
3949 { ISD::UMAX, MVT::v8i16, { 1, 1, 1, 1 } },
3950 { ISD::UMIN, MVT::v2i64, { 2, 11, 6, 7 } },
3951 { ISD::UMIN, MVT::v4i32, { 1, 1, 1, 1 } },
3952 { ISD::UMIN, MVT::v8i16, { 1, 1, 1, 1 } },
3953 };
3954 static const CostKindTblEntry SSSE3CostTbl[] = {
3955 { ISD::ABS, MVT::v4i32, { 1, 2, 1, 1 } },
3956 { ISD::ABS, MVT::v8i16, { 1, 2, 1, 1 } },
3957 { ISD::ABS, MVT::v16i8, { 1, 2, 1, 1 } },
3958 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 11, 21 } },
3959 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 11, 21 } },
3960 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 11, 21 } },
3961 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 10, 16 } },
3962 { ISD::BSWAP, MVT::v2i64, { 2, 3, 1, 5 } },
3963 { ISD::BSWAP, MVT::v4i32, { 2, 3, 1, 5 } },
3964 { ISD::BSWAP, MVT::v8i16, { 2, 3, 1, 5 } },
3965 { ISD::CTLZ, MVT::v2i64, { 18, 28, 28, 35 } },
3966 { ISD::CTLZ, MVT::v4i32, { 15, 20, 22, 28 } },
3967 { ISD::CTLZ, MVT::v8i16, { 13, 17, 16, 22 } },
3968 { ISD::CTLZ, MVT::v16i8, { 11, 15, 10, 16 } },
3969 { ISD::CTPOP, MVT::v2i64, { 13, 19, 12, 18 } },
3970 { ISD::CTPOP, MVT::v4i32, { 18, 24, 16, 22 } },
3971 { ISD::CTPOP, MVT::v8i16, { 13, 18, 14, 20 } },
3972 { ISD::CTPOP, MVT::v16i8, { 11, 12, 10, 16 } },
3973 { ISD::CTTZ, MVT::v2i64, { 13, 25, 15, 22 } },
3974 { ISD::CTTZ, MVT::v4i32, { 18, 26, 19, 25 } },
3975 { ISD::CTTZ, MVT::v8i16, { 13, 20, 17, 23 } },
3976 { ISD::CTTZ, MVT::v16i8, { 11, 16, 13, 19 } }
3977 };
3978 static const CostKindTblEntry SSE2CostTbl[] = {
3979 { ISD::ABS, MVT::v2i64, { 3, 6, 5, 5 } },
3980 { ISD::ABS, MVT::v4i32, { 1, 4, 4, 4 } },
3981 { ISD::ABS, MVT::v8i16, { 1, 2, 3, 3 } },
3982 { ISD::ABS, MVT::v16i8, { 1, 2, 3, 3 } },
3983 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 32, 32 } },
3984 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 30, 30 } },
3985 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 25, 25 } },
3986 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 21, 21 } },
3987 { ISD::BSWAP, MVT::v2i64, { 5, 6, 11, 11 } },
3988 { ISD::BSWAP, MVT::v4i32, { 5, 5, 9, 9 } },
3989 { ISD::BSWAP, MVT::v8i16, { 5, 5, 4, 5 } },
3990 { ISD::CTLZ, MVT::v2i64, { 10, 45, 36, 38 } },
3991 { ISD::CTLZ, MVT::v4i32, { 10, 45, 38, 40 } },
3992 { ISD::CTLZ, MVT::v8i16, { 9, 38, 32, 34 } },
3993 { ISD::CTLZ, MVT::v16i8, { 8, 39, 29, 32 } },
3994 { ISD::CTPOP, MVT::v2i64, { 12, 26, 16, 18 } },
3995 { ISD::CTPOP, MVT::v4i32, { 15, 29, 21, 23 } },
3996 { ISD::CTPOP, MVT::v8i16, { 13, 25, 18, 20 } },
3997 { ISD::CTPOP, MVT::v16i8, { 10, 21, 14, 16 } },
3998 { ISD::CTTZ, MVT::v2i64, { 14, 28, 19, 21 } },
3999 { ISD::CTTZ, MVT::v4i32, { 18, 31, 24, 26 } },
4000 { ISD::CTTZ, MVT::v8i16, { 16, 27, 21, 23 } },
4001 { ISD::CTTZ, MVT::v16i8, { 13, 23, 17, 19 } },
4002 { ISD::SADDSAT, MVT::v8i16, { 1 } },
4003 { ISD::SADDSAT, MVT::v16i8, { 1 } },
4004 { ISD::SMAX, MVT::v2i64, { 4, 8, 15, 15 } },
4005 { ISD::SMAX, MVT::v4i32, { 2, 4, 5, 5 } },
4006 { ISD::SMAX, MVT::v8i16, { 1, 1, 1, 1 } },
4007 { ISD::SMAX, MVT::v16i8, { 2, 4, 5, 5 } },
4008 { ISD::SMIN, MVT::v2i64, { 4, 8, 15, 15 } },
4009 { ISD::SMIN, MVT::v4i32, { 2, 4, 5, 5 } },
4010 { ISD::SMIN, MVT::v8i16, { 1, 1, 1, 1 } },
4011 { ISD::SMIN, MVT::v16i8, { 2, 4, 5, 5 } },
4012 { ISD::SSUBSAT, MVT::v8i16, { 1 } },
4013 { ISD::SSUBSAT, MVT::v16i8, { 1 } },
4014 { ISD::UADDSAT, MVT::v8i16, { 1 } },
4015 { ISD::UADDSAT, MVT::v16i8, { 1 } },
4016 { ISD::UMAX, MVT::v2i64, { 4, 8, 15, 15 } },
4017 { ISD::UMAX, MVT::v4i32, { 2, 5, 8, 8 } },
4018 { ISD::UMAX, MVT::v8i16, { 1, 3, 3, 3 } },
4019 { ISD::UMAX, MVT::v16i8, { 1, 1, 1, 1 } },
4020 { ISD::UMIN, MVT::v2i64, { 4, 8, 15, 15 } },
4021 { ISD::UMIN, MVT::v4i32, { 2, 5, 8, 8 } },
4022 { ISD::UMIN, MVT::v8i16, { 1, 3, 3, 3 } },
4023 { ISD::UMIN, MVT::v16i8, { 1, 1, 1, 1 } },
4024 { ISD::USUBSAT, MVT::v8i16, { 1 } },
4025 { ISD::USUBSAT, MVT::v16i8, { 1 } },
4026 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } },
4027 { ISD::FMAXNUM, MVT::v2f64, { 4, 6, 6, 6 } },
4028 { ISD::FSQRT, MVT::f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
4029 { ISD::FSQRT, MVT::v2f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
4030 };
4031 static const CostKindTblEntry SSE1CostTbl[] = {
4032 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } },
4033 { ISD::FMAXNUM, MVT::v4f32, { 4, 6, 6, 6 } },
4034 { ISD::FSQRT, MVT::f32, { 28, 30, 1, 2 } }, // Pentium III from http://www.agner.org/
4035 { ISD::FSQRT, MVT::v4f32, { 56, 56, 1, 2 } }, // Pentium III from http://www.agner.org/
4036 };
4037 static const CostKindTblEntry BMI64CostTbl[] = { // 64-bit targets
4038 { ISD::CTTZ, MVT::i64, { 1 } },
4039 };
4040 static const CostKindTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
4041 { ISD::CTTZ, MVT::i32, { 1 } },
4042 { ISD::CTTZ, MVT::i16, { 1 } },
4043 { ISD::CTTZ, MVT::i8, { 1 } },
4044 };
4045 static const CostKindTblEntry LZCNT64CostTbl[] = { // 64-bit targets
4046 { ISD::CTLZ, MVT::i64, { 1 } },
4047 };
4048 static const CostKindTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
4049 { ISD::CTLZ, MVT::i32, { 1 } },
4050 { ISD::CTLZ, MVT::i16, { 2 } },
4051 { ISD::CTLZ, MVT::i8, { 2 } },
4052 };
4053 static const CostKindTblEntry POPCNT64CostTbl[] = { // 64-bit targets
4054 { ISD::CTPOP, MVT::i64, { 1, 1, 1, 1 } }, // popcnt
4055 };
4056 static const CostKindTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
4057 { ISD::CTPOP, MVT::i32, { 1, 1, 1, 1 } }, // popcnt
4058 { ISD::CTPOP, MVT::i16, { 1, 1, 2, 2 } }, // popcnt(zext())
4059 { ISD::CTPOP, MVT::i8, { 1, 1, 2, 2 } }, // popcnt(zext())
4060 };
4061 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
4062 { ISD::ABS, MVT::i64, { 1, 2, 3, 4 } }, // SUB+CMOV
4063 { ISD::BITREVERSE, MVT::i64, { 10, 12, 20, 22 } },
4064 { ISD::BSWAP, MVT::i64, { 1, 2, 1, 2 } },
4065 { ISD::CTLZ, MVT::i64, { 4 } }, // BSR+XOR or BSR+XOR+CMOV
4066 { ISD::CTLZ_ZERO_UNDEF, MVT::i64,{ 1, 1, 1, 1 } }, // BSR+XOR
4067 { ISD::CTTZ, MVT::i64, { 3 } }, // TEST+BSF+CMOV/BRANCH
4067 { ISD::CTTZ_ZERO_UNDEF, MVT::i64,{ 1, 1, 1, 1 } }, // BSF
4069 { ISD::CTPOP, MVT::i64, { 10, 6, 19, 19 } },
4070 { ISD::ROTL, MVT::i64, { 2, 3, 1, 3 } },
4071 { ISD::ROTR, MVT::i64, { 2, 3, 1, 3 } },
4072 { X86ISD::VROTLI, MVT::i64, { 1, 1, 1, 1 } },
4073 { ISD::FSHL, MVT::i64, { 4, 4, 1, 4 } },
4074 { ISD::SMAX, MVT::i64, { 1, 3, 2, 3 } },
4075 { ISD::SMIN, MVT::i64, { 1, 3, 2, 3 } },
4076 { ISD::UMAX, MVT::i64, { 1, 3, 2, 3 } },
4077 { ISD::UMIN, MVT::i64, { 1, 3, 2, 3 } },
4078 { ISD::SADDO, MVT::i64, { 1 } },
4079 { ISD::UADDO, MVT::i64, { 1 } },
4080 { ISD::UMULO, MVT::i64, { 2 } }, // mulq + seto
4081 };
4082 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
4083 { ISD::ABS, MVT::i32, { 1, 2, 3, 4 } }, // SUB+XOR+SRA or SUB+CMOV
4084 { ISD::ABS, MVT::i16, { 2, 2, 3, 4 } }, // SUB+XOR+SRA or SUB+CMOV
4085 { ISD::ABS, MVT::i8, { 2, 4, 4, 4 } }, // SUB+XOR+SRA
4086 { ISD::BITREVERSE, MVT::i32, { 9, 12, 17, 19 } },
4087 { ISD::BITREVERSE, MVT::i16, { 9, 12, 17, 19 } },
4088 { ISD::BITREVERSE, MVT::i8, { 7, 9, 13, 14 } },
4089 { ISD::BSWAP, MVT::i32, { 1, 1, 1, 1 } },
4090 { ISD::BSWAP, MVT::i16, { 1, 2, 1, 2 } }, // ROL
4091 { ISD::CTLZ, MVT::i32, { 4 } }, // BSR+XOR or BSR+XOR+CMOV
4092 { ISD::CTLZ, MVT::i16, { 4 } }, // BSR+XOR or BSR+XOR+CMOV
4093 { ISD::CTLZ, MVT::i8, { 4 } }, // BSR+XOR or BSR+XOR+CMOV
4094 { ISD::CTLZ_ZERO_UNDEF, MVT::i32,{ 1, 1, 1, 1 } }, // BSR+XOR
4095 { ISD::CTLZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 3, 3 } }, // BSR+XOR
4096 { ISD::CTLZ_ZERO_UNDEF, MVT::i8, { 2, 2, 3, 3 } }, // BSR+XOR
4097 { ISD::CTTZ, MVT::i32, { 3 } }, // TEST+BSF+CMOV/BRANCH
4098 { ISD::CTTZ, MVT::i16, { 3 } }, // TEST+BSF+CMOV/BRANCH
4099 { ISD::CTTZ, MVT::i8, { 3 } }, // TEST+BSF+CMOV/BRANCH
4100 { ISD::CTTZ_ZERO_UNDEF, MVT::i32,{ 1, 1, 1, 1 } }, // BSF
4101 { ISD::CTTZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 1, 1 } }, // BSF
4102 { ISD::CTTZ_ZERO_UNDEF, MVT::i8, { 2, 2, 1, 1 } }, // BSF
4103 { ISD::CTPOP, MVT::i32, { 8, 7, 15, 15 } },
4104 { ISD::CTPOP, MVT::i16, { 9, 8, 17, 17 } },
4105 { ISD::CTPOP, MVT::i8, { 7, 6, 6, 6 } },
4106 { ISD::ROTL, MVT::i32, { 2, 3, 1, 3 } },
4107 { ISD::ROTL, MVT::i16, { 2, 3, 1, 3 } },
4108 { ISD::ROTL, MVT::i8, { 2, 3, 1, 3 } },
4109 { ISD::ROTR, MVT::i32, { 2, 3, 1, 3 } },
4110 { ISD::ROTR, MVT::i16, { 2, 3, 1, 3 } },
4111 { ISD::ROTR, MVT::i8, { 2, 3, 1, 3 } },
4112 { X86ISD::VROTLI, MVT::i32, { 1, 1, 1, 1 } },
4113 { X86ISD::VROTLI, MVT::i16, { 1, 1, 1, 1 } },
4114 { X86ISD::VROTLI, MVT::i8, { 1, 1, 1, 1 } },
4115 { ISD::FSHL, MVT::i32, { 4, 4, 1, 4 } },
4116 { ISD::FSHL, MVT::i16, { 4, 4, 2, 5 } },
4117 { ISD::FSHL, MVT::i8, { 4, 4, 2, 5 } },
4118 { ISD::SMAX, MVT::i32, { 1, 2, 2, 3 } },
4119 { ISD::SMAX, MVT::i16, { 1, 4, 2, 4 } },
4120 { ISD::SMAX, MVT::i8, { 1, 4, 2, 4 } },
4121 { ISD::SMIN, MVT::i32, { 1, 2, 2, 3 } },
4122 { ISD::SMIN, MVT::i16, { 1, 4, 2, 4 } },
4123 { ISD::SMIN, MVT::i8, { 1, 4, 2, 4 } },
4124 { ISD::UMAX, MVT::i32, { 1, 2, 2, 3 } },
4125 { ISD::UMAX, MVT::i16, { 1, 4, 2, 4 } },
4126 { ISD::UMAX, MVT::i8, { 1, 4, 2, 4 } },
4127 { ISD::UMIN, MVT::i32, { 1, 2, 2, 3 } },
4128 { ISD::UMIN, MVT::i16, { 1, 4, 2, 4 } },
4129 { ISD::UMIN, MVT::i8, { 1, 4, 2, 4 } },
4130 { ISD::SADDO, MVT::i32, { 1 } },
4131 { ISD::SADDO, MVT::i16, { 1 } },
4132 { ISD::SADDO, MVT::i8, { 1 } },
4133 { ISD::UADDO, MVT::i32, { 1 } },
4134 { ISD::UADDO, MVT::i16, { 1 } },
4135 { ISD::UADDO, MVT::i8, { 1 } },
4136 { ISD::UMULO, MVT::i32, { 2 } }, // mul + seto
4137 { ISD::UMULO, MVT::i16, { 2 } },
4138 { ISD::UMULO, MVT::i8, { 2 } },
4139 };
4140
4141 Type *RetTy = ICA.getReturnType();
4142 Type *OpTy = RetTy;
4143 Intrinsic::ID IID = ICA.getID();
4144 unsigned ISD = ISD::DELETED_NODE;
4145 switch (IID) {
4146 default:
4147 break;
4148 case Intrinsic::abs:
4149 ISD = ISD::ABS;
4150 break;
4151 case Intrinsic::bitreverse:
4152 ISD = ISD::BITREVERSE;
4153 break;
4154 case Intrinsic::bswap:
4155 ISD = ISD::BSWAP;
4156 break;
4157 case Intrinsic::ctlz:
4158 ISD = ISD::CTLZ;
4159 break;
4160 case Intrinsic::ctpop:
4161 ISD = ISD::CTPOP;
4162 break;
4163 case Intrinsic::cttz:
4164 ISD = ISD::CTTZ;
4165 break;
4166 case Intrinsic::fshl:
4167 ISD = ISD::FSHL;
4168 if (!ICA.isTypeBasedOnly()) {
4169 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4170 if (Args[0] == Args[1]) {
4171 ISD = ISD::ROTL;
4172 // Handle uniform constant rotation amounts.
4173 // TODO: Handle funnel-shift cases.
4174 const APInt *Amt;
4175 if (Args[2] &&
4176 PatternMatch::match(Args[2], PatternMatch::m_APInt(Amt)))
4177 ISD = X86ISD::VROTLI;
4178 }
4179 }
4180 break;
4181 case Intrinsic::fshr:
4182 // FSHR has the same costs, so don't duplicate.
4183 ISD = ISD::FSHL;
4184 if (!ICA.isTypeBasedOnly()) {
4185 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4186 if (Args[0] == Args[1]) {
4187 ISD = ISD::ROTR;
4188 // Handle uniform constant rotation amount.
4189 // TODO: Handle funnel-shift cases.
4190 const APInt *Amt;
4191 if (Args[2] &&
4192 PatternMatch::match(Args[2], PatternMatch::m_APInt(Amt)))
4193 ISD = X86ISD::VROTLI;
4194 }
4195 }
4196 break;
4197 case Intrinsic::lrint:
4198 case Intrinsic::llrint:
4199 // X86 can use the CVTP2SI instructions to lower lrint/llrint calls, which
4200 // have the same costs as the CVTTP2SI (fptosi) instructions
4201 if (!ICA.isTypeBasedOnly()) {
4202 const SmallVectorImpl<Type *> &ArgTys = ICA.getArgTypes();
4203 return getCastInstrCost(Instruction::FPToSI, RetTy, ArgTys[0],
4204 TTI::CastContextHint::None, CostKind);
4205 }
4206 break;
4207 case Intrinsic::maxnum:
4208 case Intrinsic::minnum:
4209 // FMINNUM has the same costs, so don't duplicate.
4210 ISD = ISD::FMAXNUM;
4211 break;
4212 case Intrinsic::sadd_sat:
4213 ISD = ISD::SADDSAT;
4214 break;
4215 case Intrinsic::smax:
4216 ISD = ISD::SMAX;
4217 break;
4218 case Intrinsic::smin:
4219 ISD = ISD::SMIN;
4220 break;
4221 case Intrinsic::ssub_sat:
4222 ISD = ISD::SSUBSAT;
4223 break;
4224 case Intrinsic::uadd_sat:
4225 ISD = ISD::UADDSAT;
4226 break;
4227 case Intrinsic::umax:
4228 ISD = ISD::UMAX;
4229 break;
4230 case Intrinsic::umin:
4231 ISD = ISD::UMIN;
4232 break;
4233 case Intrinsic::usub_sat:
4234 ISD = ISD::USUBSAT;
4235 break;
4236 case Intrinsic::sqrt:
4237 ISD = ISD::FSQRT;
4238 break;
4239 case Intrinsic::sadd_with_overflow:
4240 case Intrinsic::ssub_with_overflow:
4241 // SSUBO has the same costs, so don't duplicate.
4242 ISD = ISD::SADDO;
4243 OpTy = RetTy->getContainedType(0);
4244 break;
4245 case Intrinsic::uadd_with_overflow:
4246 case Intrinsic::usub_with_overflow:
4247 // USUBO has the same costs, so don't duplicate.
4248 ISD = ISD::UADDO;
4249 OpTy = RetTy->getContainedType(0);
4250 break;
4251 case Intrinsic::umul_with_overflow:
4252 case Intrinsic::smul_with_overflow:
4253 // SMULO has the same costs, so don't duplicate.
4254 ISD = ISD::UMULO;
4255 OpTy = RetTy->getContainedType(0);
4256 break;
4257 }
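 // e.g. a funnel shift such as %r = call i32 @llvm.fshl.i32(i32 %x, i32 %x, i32 7)
 // has matching first two operands, so it is costed as ISD::ROTL, and the
 // constant rotation amount refines it further to X86ISD::VROTLI.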
4258
4259 if (ISD != ISD::DELETED_NODE) {
4260 // Legalize the type.
4261 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(OpTy);
4262 MVT MTy = LT.second;
4263
4264 // Without BMI/LZCNT see if we're only looking for a *_ZERO_UNDEF cost.
4265 if (((ISD == ISD::CTTZ && !ST->hasBMI()) ||
4266 (ISD == ISD::CTLZ && !ST->hasLZCNT())) &&
4267 !MTy.isVector() && !ICA.isTypeBasedOnly()) {
4268 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4269 if (auto *Cst = dyn_cast<ConstantInt>(Args[1]))
4270 if (Cst->isAllOnesValue())
4271 ISD = ISD == ISD::CTTZ ? ISD::CTTZ_ZERO_UNDEF : ISD::CTLZ_ZERO_UNDEF;
4272 }
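 // For example, @llvm.cttz.i32(i32 %x, i1 true) promises the input is
 // non-zero (or the result is poison), so even without BMI it can use the
 // cheaper CTTZ_ZERO_UNDEF (plain BSF) entries in the tables above.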
4273
4274 // FSQRT is a single instruction.
4275 if (ISD == ISD::FSQRT && CostKind == TTI::TCK_CodeSize)
4276 return LT.first;
4277
4278 auto adjustTableCost = [](int ISD, unsigned Cost,
4279 InstructionCost LegalizationCost,
4280 FastMathFlags FMF) {
4281 // If there are no NaNs to deal with, then these are reduced to a
4282 // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we
4283 // assume is used in the non-fast case.
4284 if (ISD == ISD::FMAXNUM || ISD == ISD::FMINNUM) {
4285 if (FMF.noNaNs())
4286 return LegalizationCost * 1;
4287 }
4288 return LegalizationCost * (int)Cost;
4289 };
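 // Worked example: a ctpop of <16 x i32> on an AVX2-only target legalizes
 // to two v8i32 ops (LT.first == 2); the AVX2 table entry { 7, 12, 14, 18 }
 // then yields 2 * 7 = 14 for TCK_RecipThroughput.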
4290
4291 if (ST->useGLMDivSqrtCosts())
4292 if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
4293 if (auto KindCost = Entry->Cost[CostKind])
4294 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4295 ICA.getFlags());
4296
4297 if (ST->useSLMArithCosts())
4298 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
4299 if (auto KindCost = Entry->Cost[CostKind])
4300 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4301 ICA.getFlags());
4302
4303 if (ST->hasVBMI2())
4304 if (const auto *Entry = CostTableLookup(AVX512VBMI2CostTbl, ISD, MTy))
4305 if (auto KindCost = Entry->Cost[CostKind])
4306 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4307 ICA.getFlags());
4308
4309 if (ST->hasBITALG())
4310 if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy))
4311 if (auto KindCost = Entry->Cost[CostKind])
4312 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4313 ICA.getFlags());
4314
4315 if (ST->hasVPOPCNTDQ())
4316 if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy))
4317 if (auto KindCost = Entry->Cost[CostKind])
4318 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4319 ICA.getFlags());
4320
4321 if (ST->hasGFNI())
4322 if (const auto *Entry = CostTableLookup(GFNICostTbl, ISD, MTy))
4323 if (auto KindCost = Entry->Cost[CostKind])
4324 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4325 ICA.getFlags());
4326
4327 if (ST->hasCDI())
4328 if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
4329 if (auto KindCost = Entry->Cost[CostKind])
4330 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4331 ICA.getFlags());
4332
4333 if (ST->hasBWI())
4334 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
4335 if (auto KindCost = Entry->Cost[CostKind])
4336 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4337 ICA.getFlags());
4338
4339 if (ST->hasAVX512())
4340 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
4341 if (auto KindCost = Entry->Cost[CostKind])
4342 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4343 ICA.getFlags());
4344
4345 if (ST->hasXOP())
4346 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
4347 if (auto KindCost = Entry->Cost[CostKind])
4348 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4349 ICA.getFlags());
4350
4351 if (ST->hasAVX2())
4352 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
4353 if (auto KindCost = Entry->Cost[CostKind])
4354 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4355 ICA.getFlags());
4356
4357 if (ST->hasAVX())
4358 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
4359 if (auto KindCost = Entry->Cost[CostKind])
4360 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4361 ICA.getFlags());
4362
4363 if (ST->hasSSE42())
4364 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
4365 if (auto KindCost = Entry->Cost[CostKind])
4366 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4367 ICA.getFlags());
4368
4369 if (ST->hasSSE41())
4370 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
4371 if (auto KindCost = Entry->Cost[CostKind])
4372 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4373 ICA.getFlags());
4374
4375 if (ST->hasSSSE3())
4376 if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
4377 if (auto KindCost = Entry->Cost[CostKind])
4378 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4379 ICA.getFlags());
4380
4381 if (ST->hasSSE2())
4382 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
4383 if (auto KindCost = Entry->Cost[CostKind])
4384 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4385 ICA.getFlags());
4386
4387 if (ST->hasSSE1())
4388 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
4389 if (auto KindCost = Entry->Cost[CostKind])
4390 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4391 ICA.getFlags());
4392
4393 if (ST->hasBMI()) {
4394 if (ST->is64Bit())
4395 if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
4396 if (auto KindCost = Entry->Cost[CostKind])
4397 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4398 ICA.getFlags());
4399
4400 if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
4401 if (auto KindCost = Entry->Cost[CostKind])
4402 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4403 ICA.getFlags());
4404 }
4405
4406 if (ST->hasLZCNT()) {
4407 if (ST->is64Bit())
4408 if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
4409 if (auto KindCost = Entry->Cost[CostKind])
4410 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4411 ICA.getFlags());
4412
4413 if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
4414 if (auto KindCost = Entry->Cost[CostKind])
4415 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4416 ICA.getFlags());
4417 }
4418
4419 if (ST->hasPOPCNT()) {
4420 if (ST->is64Bit())
4421 if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
4422 if (auto KindCost = Entry->Cost[CostKind])
4423 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4424 ICA.getFlags());
4425
4426 if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
4427 if (auto KindCost = Entry->Cost[CostKind])
4428 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4429 ICA.getFlags());
4430 }
4431
4432 if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) {
4433 if (const Instruction *II = ICA.getInst()) {
4434 if (II->hasOneUse() && isa<StoreInst>(II->user_back()))
4435 return TTI::TCC_Free;
4436 if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) {
4437 if (LI->hasOneUse())
4438 return TTI::TCC_Free;
4439 }
4440 }
4441 }
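 // e.g. a bswap whose sole use is a store (or whose operand is a
 // single-use load) folds into one MOVBE on fast-MOVBE targets, which is
 // why it is treated as free above.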
4442
4443 if (ST->is64Bit())
4444 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
4445 if (auto KindCost = Entry->Cost[CostKind])
4446 return adjustTableCost(Entry->ISD, *KindCost, LT.first,
4447 ICA.getFlags());
4448
4449 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
4450 if (auto KindCost = Entry->Cost[CostKind])
4451 return adjustTableCost(Entry->ISD, *KindCost, LT.first, ICA.getFlags());
4452 }
4453
4454 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
4455}
4456
4457InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
4458 TTI::TargetCostKind CostKind,
4459 unsigned Index, Value *Op0,
4460 Value *Op1) {
4461 static const CostTblEntry SLMCostTbl[] = {
4462 { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 },
4463 { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 },
4464 { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 },
4465 { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 7 }
4466 };
4467
4468 assert(Val->isVectorTy() && "This must be a vector type");
4469 Type *ScalarType = Val->getScalarType();
4470 InstructionCost RegisterFileMoveCost = 0;
4471
4472 // Non-immediate extraction/insertion can be handled as a sequence of
4473 // aliased loads+stores via the stack.
4474 if (Index == -1U && (Opcode == Instruction::ExtractElement ||
4475 Opcode == Instruction::InsertElement)) {
4476 // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns:
4477 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
4478
4479 // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling.
4480 assert(isa<FixedVectorType>(Val) && "Fixed vector type expected");
4481 Align VecAlign = DL.getPrefTypeAlign(Val);
4482 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
4483
4484 // Extract - store vector to stack, load scalar.
4485 if (Opcode == Instruction::ExtractElement) {
4486 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4487 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
4488 CostKind);
4489 }
4490 // Insert - store vector to stack, store scalar, load vector.
4491 if (Opcode == Instruction::InsertElement) {
4492 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4493 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
4494 CostKind) +
4495 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind);
4496 }
4497 }
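 // e.g. extracting a runtime-indexed lane from <4 x float> is costed as
 // one vector store plus one scalar float load through a stack slot.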
4498
4499 if (Index != -1U && (Opcode == Instruction::ExtractElement ||
4500 Opcode == Instruction::InsertElement)) {
4501 // Extraction of vXi1 elements is now efficiently handled by MOVMSK.
4502 if (Opcode == Instruction::ExtractElement &&
4503 ScalarType->getScalarSizeInBits() == 1 &&
4504 cast<FixedVectorType>(Val)->getNumElements() > 1)
4505 return 1;
4506
4507 // Legalize the type.
4508 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
4509
4510 // This type is legalized to a scalar type.
4511 if (!LT.second.isVector())
4512 return 0;
4513
4514 // The type may be split. Normalize the index to the new type.
4515 unsigned SizeInBits = LT.second.getSizeInBits();
4516 unsigned NumElts = LT.second.getVectorNumElements();
4517 unsigned SubNumElts = NumElts;
4518 Index = Index % NumElts;
4519
4520 // For >128-bit vectors, we need to extract higher 128-bit subvectors.
4521 // For inserts, we also need to insert the subvector back.
4522 if (SizeInBits > 128) {
4523 assert((SizeInBits % 128) == 0 && "Illegal vector");
4524 unsigned NumSubVecs = SizeInBits / 128;
4525 SubNumElts = NumElts / NumSubVecs;
4526 if (SubNumElts <= Index) {
4527 RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
4528 Index %= SubNumElts;
4529 }
4530 }
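 // e.g. for element 9 of a legal 512-bit v16i32: NumSubVecs == 4 and
 // SubNumElts == 4, so lane #2 must be extracted (and re-inserted for an
 // insert), and the index is renormalized to 9 % 4 == 1 within that lane.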
4531
4532 MVT MScalarTy = LT.second.getScalarType();
4533 auto IsCheapPInsrPExtrInsertPS = [&]() {
4534 // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
4535 // Also, assume insertps is relatively cheap on all >= SSE41 targets.
4536 return (MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4537 (MScalarTy.isInteger() && ST->hasSSE41()) ||
4538 (MScalarTy == MVT::f32 && ST->hasSSE41() &&
4539 Opcode == Instruction::InsertElement);
4540 };
4541
4542 if (Index == 0) {
4543 // Floating point scalars are already located in index #0.
4544 // Many insertions to #0 can fold away for scalar fp-ops, so let's assume
4545 // true for all.
4546 if (ScalarType->isFloatingPointTy() &&
4547 (Opcode != Instruction::InsertElement || !Op0 ||
4548 isa<UndefValue>(Op0)))
4549 return RegisterFileMoveCost;
4550
4551 if (Opcode == Instruction::InsertElement &&
4552 isa_and_nonnull<UndefValue>(Op0)) {
4553 // Consider the gather cost to be cheap.
4554 if (isa_and_nonnull<LoadInst>(Op1))
4555 return RegisterFileMoveCost;
4556 if (!IsCheapPInsrPExtrInsertPS()) {
4557 // mov constant-to-GPR + movd/movq GPR -> XMM.
4558 if (isa_and_nonnull<Constant>(Op1) && Op1->getType()->isIntegerTy())
4559 return 2 + RegisterFileMoveCost;
4560 // Assume movd/movq GPR -> XMM is relatively cheap on all targets.
4561 return 1 + RegisterFileMoveCost;
4562 }
4563 }
4564
4565 // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
4566 if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
4567 return 1 + RegisterFileMoveCost;
4568 }
4569
4570 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4571 assert(ISD && "Unexpected vector opcode");
4572 if (ST->useSLMArithCosts())
4573 if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
4574 return Entry->Cost + RegisterFileMoveCost;
4575
4576 // Consider cheap cases.
4577 if (IsCheapPInsrPExtrInsertPS())
4578 return 1 + RegisterFileMoveCost;
4579
4580 // For extractions we just need to shuffle the element to index 0, which
4581 // should be very cheap (assume cost = 1). For insertions we need to shuffle
4582 // the element to its destination. In both cases we must handle the
4583 // subvector move(s).
4584 // If the vector type is already less than 128-bits then don't reduce it.
4585 // TODO: Under what circumstances should we shuffle using the full width?
4586 InstructionCost ShuffleCost = 1;
4587 if (Opcode == Instruction::InsertElement) {
4588 auto *SubTy = cast<VectorType>(Val);
4589 EVT VT = TLI->getValueType(DL, Val);
4590 if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
4591 SubTy = FixedVectorType::get(ScalarType, SubNumElts);
4592 ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, std::nullopt,
4593 CostKind, 0, SubTy);
4594 }
4595 int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
4596 return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
4597 }
4598
4599 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1) +
4600 RegisterFileMoveCost;
4601}
4602
4603InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
4604 const APInt &DemandedElts,
4605 bool Insert, bool Extract,
4606 TTI::TargetCostKind CostKind) {
4607 assert(DemandedElts.getBitWidth() ==
4608 cast<FixedVectorType>(Ty)->getNumElements() &&
4609 "Vector size mismatch");
4610
4611 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4612 MVT MScalarTy = LT.second.getScalarType();
4613 unsigned LegalVectorBitWidth = LT.second.getSizeInBits();
4614 InstructionCost Cost = 0;
4615
4616 constexpr unsigned LaneBitWidth = 128;
4617 assert((LegalVectorBitWidth < LaneBitWidth ||
4618 (LegalVectorBitWidth % LaneBitWidth) == 0) &&
4619 "Illegal vector");
4620
4621 const int NumLegalVectors = *LT.first.getValue();
4622 assert(NumLegalVectors >= 0 && "Negative cost!");
4623
4624 // For insertions, an ISD::BUILD_VECTOR style vector initialization can be much
4625 // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT.
4626 if (Insert) {
4627 if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4628 (MScalarTy.isInteger() && ST->hasSSE41()) ||
4629 (MScalarTy == MVT::f32 && ST->hasSSE41())) {
4630 // For types we can insert directly, insertion into 128-bit sub vectors is
4631 // cheap, followed by a cheap chain of concatenations.
4632 if (LegalVectorBitWidth <= LaneBitWidth) {
4633 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert,
4634 /*Extract*/ false, CostKind);
4635 } else {
4636 // In each 128-bit lane, if at least one index is demanded but not all
4637 // indices are demanded, and this 128-bit lane is not the first 128-bit
4638 // lane of the legalized vector, then this lane needs an extracti128; if
4639 // in each 128-bit lane there is at least one demanded index, that lane
4640 // needs an inserti128.
4641
4642 // The following cases illustrate this. Assume we insert several
4643 // elements into a v8i32 vector on AVX2:
4644 // Case#1: inserting into the 1st index needs vpinsrd + inserti128.
4645 // Case#2: inserting into the 5th index needs extracti128 + vpinsrd +
4646 // inserti128.
4647 // Case#3: inserting into indices 4,5,6,7 needs 4*vpinsrd + inserti128.
4648 assert((LegalVectorBitWidth % LaneBitWidth) == 0 && "Illegal vector");
4649 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
4650 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
4651 unsigned NumLegalElts =
4652 LT.second.getVectorNumElements() * NumLegalVectors;
4653 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
4654 "Vector has been legalized to smaller element count");
4655 assert((NumLegalElts % NumLanesTotal) == 0 &&
4656 "Unexpected elts per lane");
4657 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
4658
4659 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
4660 auto *LaneTy =
4661 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
4662
4663 for (unsigned I = 0; I != NumLanesTotal; ++I) {
4664 APInt LaneEltMask = WidenedDemandedElts.extractBits(
4665 NumEltsPerLane, NumEltsPerLane * I);
4666 if (LaneEltMask.isZero())
4667 continue;
4668 // FIXME: we don't need to extract if all non-demanded elements
4669 // are legalization-inserted padding.
4670 if (!LaneEltMask.isAllOnes())
4671 Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
4672 CostKind, I * NumEltsPerLane, LaneTy);
4673 Cost += BaseT::getScalarizationOverhead(LaneTy, LaneEltMask, Insert,
4674 /*Extract*/ false, CostKind);
4675 }
4676
4677 APInt AffectedLanes =
4678 APIntOps::ScaleBitMask(WidenedDemandedElts, NumLanesTotal);
4679 APInt FullyAffectedLegalVectors = APIntOps::ScaleBitMask(
4680 AffectedLanes, NumLegalVectors, /*MatchAllBits=*/true);
4681 for (int LegalVec = 0; LegalVec != NumLegalVectors; ++LegalVec) {
4682 for (unsigned Lane = 0; Lane != NumLegalLanes; ++Lane) {
4683 unsigned I = NumLegalLanes * LegalVec + Lane;
4684 // No need to insert unaffected lane; or lane 0 of each legal vector
4685 // iff ALL lanes of that vector were affected and will be inserted.
4686 if (!AffectedLanes[I] ||
4687 (Lane == 0 && FullyAffectedLegalVectors[LegalVec]))
4688 continue;
4689 Cost += getShuffleCost(TTI::SK_InsertSubvector, Ty, std::nullopt,
4690 CostKind, I * NumEltsPerLane, LaneTy);
4691 }
4692 }
4693 }
4694 } else if (LT.second.isVector()) {
4695 // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
4696 // integer element as a SCALAR_TO_VECTOR, then we build the vector as a
4697 // series of UNPCK followed by CONCAT_VECTORS - all of these can be
4698 // considered cheap.
4699 if (Ty->isIntOrIntVectorTy())
4700 Cost += DemandedElts.popcount();
4701
4702 // Get the smaller of the legalized or original pow2-extended number of
4703 // vector elements, which represents the number of unpacks we'll end up
4704 // performing.
4705 unsigned NumElts = LT.second.getVectorNumElements();
4706 unsigned Pow2Elts =
4707 PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements());
4708 Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
4709 }
4710 }
4711
4712 if (Extract) {
4713 // vXi1 can be efficiently extracted with MOVMSK.
4714 // TODO: AVX512 predicate mask handling.
4715 // NOTE: This doesn't work well for roundtrip scalarization.
4716 if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) {
4717 unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
4718 unsigned MaxElts = ST->hasAVX2() ? 32 : 16;
4719 unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts;
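 // e.g. scalarizing a <32 x i1> takes one VPMOVMSKB on AVX2 but two
 // 16-byte PMOVMSKBs without it.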
4720 return MOVMSKCost;
4721 }
4722
4723 if (LT.second.isVector()) {
4724 unsigned NumLegalElts =
4725 LT.second.getVectorNumElements() * NumLegalVectors;
4726 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
4727 "Vector has been legalized to smaller element count");
4728
4729 // If we're extracting elements from a 128-bit subvector lane,
4730 // we only need to extract each lane once, not for every element.
4731 if (LegalVectorBitWidth > LaneBitWidth) {
4732 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
4733 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
4734 assert((NumLegalElts % NumLanesTotal) == 0 &&
4735 "Unexpected elts per lane");
4736 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
4737
4738 // Add cost for each demanded 128-bit subvector extraction.
4739 // Luckily this is a lot easier than for insertion.
4740 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
4741 auto *LaneTy =
4742 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
4743
4744 for (unsigned I = 0; I != NumLanesTotal; ++I) {
4745 APInt LaneEltMask = WidenedDemandedElts.extractBits(
4746 NumEltsPerLane, I * NumEltsPerLane);
4747 if (LaneEltMask.isZero())
4748 continue;
4749 Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
4750 CostKind, I * NumEltsPerLane, LaneTy);
4751 Cost += BaseT::getScalarizationOverhead(
4752 LaneTy, LaneEltMask, /*Insert*/ false, Extract, CostKind);
4753 }
4754
4755 return Cost;
4756 }
4757 }
4758
4759 // Fallback to default extraction.
4760 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ false,
4761 Extract, CostKind);
4762 }
4763
4764 return Cost;
4765}
4766
4767InstructionCost
4768X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
4769 int VF, const APInt &DemandedDstElts,
4770 TTI::TargetCostKind CostKind) {
4771 const unsigned EltTyBits = DL.getTypeSizeInBits(EltTy);
4772 // We don't differentiate element types here, only element bit width.
4773 EltTy = IntegerType::getIntNTy(EltTy->getContext(), EltTyBits);
4774
4775 auto bailout = [&]() {
4776 return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF,
4777 DemandedDstElts, CostKind);
4778 };
4779
4780 // For now, only deal with AVX512 cases.
4781 if (!ST->hasAVX512())
4782 return bailout();
4783
4784 // Do we have a native shuffle for this element type, or should we promote?
4785 unsigned PromEltTyBits = EltTyBits;
4786 switch (EltTyBits) {
4787 case 32:
4788 case 64:
4789 break; // AVX512F.
4790 case 16:
4791 if (!ST->hasBWI())
4792 PromEltTyBits = 32; // promote to i32, AVX512F.
4793 break; // AVX512BW
4794 case 8:
4795 if (!ST->hasVBMI())
4796 PromEltTyBits = 32; // promote to i32, AVX512F.
4797 break; // AVX512VBMI
4798 case 1:
4799 // There is no support for shuffling i1 elements. We *must* promote.
4800 if (ST->hasBWI()) {
4801 if (ST->hasVBMI())
4802 PromEltTyBits = 8; // promote to i8, AVX512VBMI.
4803 else
4804 PromEltTyBits = 16; // promote to i16, AVX512BW.
4805 break;
4806 }
4807 PromEltTyBits = 32; // promote to i32, AVX512F.
4808 break;
4809 default:
4810 return bailout();
4811 }
4812 auto *PromEltTy = IntegerType::getIntNTy(EltTy->getContext(), PromEltTyBits);
4813
4814 auto *SrcVecTy = FixedVectorType::get(EltTy, VF);
4815 auto *PromSrcVecTy = FixedVectorType::get(PromEltTy, VF);
4816
4817 int NumDstElements = VF * ReplicationFactor;
4818 auto *PromDstVecTy = FixedVectorType::get(PromEltTy, NumDstElements);
4819 auto *DstVecTy = FixedVectorType::get(EltTy, NumDstElements);
4820
4821 // Legalize the types.
4822 MVT LegalSrcVecTy = getTypeLegalizationCost(SrcVecTy).second;
4823 MVT LegalPromSrcVecTy = getTypeLegalizationCost(PromSrcVecTy).second;
4824 MVT LegalPromDstVecTy = getTypeLegalizationCost(PromDstVecTy).second;
4825 MVT LegalDstVecTy = getTypeLegalizationCost(DstVecTy).second;
4826 // They should have legalized into vector types.
4827 if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() ||
4828 !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector())
4829 return bailout();
4830
4831 if (PromEltTyBits != EltTyBits) {
4832 // If we have to perform the shuffle with wider elt type than our data type,
4833 // then we will first need to anyext (we don't care about the new bits)
4834 // the source elements, and then truncate Dst elements.
4835 InstructionCost PromotionCost;
4836 PromotionCost += getCastInstrCost(
4837 Instruction::SExt, /*Dst=*/PromSrcVecTy, /*Src=*/SrcVecTy,
4838 TTI::CastContextHint::None, CostKind);
4839 PromotionCost +=
4840 getCastInstrCost(Instruction::Trunc, /*Dst=*/DstVecTy,
4841 /*Src=*/PromDstVecTy,
4842 TTI::CastContextHint::None, CostKind);
4843 return PromotionCost + getReplicationShuffleCost(PromEltTy,
4844 ReplicationFactor, VF,
4845 DemandedDstElts, CostKind);
4846 }
4847
4848 assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits &&
4849 LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() &&
4850 "We expect that the legalization doesn't affect the element width, "
4851 "doesn't coalesce/split elements.");
4852
4853 unsigned NumEltsPerDstVec = LegalDstVecTy.getVectorNumElements();
4854 unsigned NumDstVectors =
4855 divideCeil(DstVecTy->getNumElements(), NumEltsPerDstVec);
4856
4857 auto *SingleDstVecTy = FixedVectorType::get(EltTy, NumEltsPerDstVec);
4858
4859 // Not all the produced Dst elements may be demanded. In our case,
4860 // given that a single Dst vector is formed by a single shuffle,
4861 // if all elements that will form a single Dst vector aren't demanded,
4862 // then we won't need to do that shuffle, so adjust the cost accordingly.
4863 APInt DemandedDstVectors = APIntOps::ScaleBitMask(
4864 DemandedDstElts.zext(NumDstVectors * NumEltsPerDstVec), NumDstVectors);
4865 unsigned NumDstVectorsDemanded = DemandedDstVectors.popcount();
4866
4867 InstructionCost SingleShuffleCost = getShuffleCost(
4868 TTI::SK_PermuteSingleSrc, SingleDstVecTy, /*Mask=*/std::nullopt, CostKind,
4869 /*Index=*/0, /*SubTp=*/nullptr);
4870 return NumDstVectorsDemanded * SingleShuffleCost;
4871}
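// Replication example for the cost above (a sketch with assumed shapes): for
// ReplicationFactor = 3 and VF = 4 each source element is repeated three
// times, i.e. a single-source permute mask of <0,0,0,1,1,1,2,2,2,3,3,3>, and
// we charge one SK_PermuteSingleSrc per demanded legal destination vector.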
4872
4873 InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
4874 MaybeAlign Alignment,
4875 unsigned AddressSpace,
4876 TTI::TargetCostKind CostKind,
4877 TTI::OperandValueInfo OpInfo,
4878 const Instruction *I) {
4879 // TODO: Handle other cost kinds.
4880 if (CostKind != TTI::TCK_RecipThroughput) {
4881 if (auto *SI = dyn_cast_or_null<StoreInst>(I)) {
4882 // Store instruction with index and scale costs 2 Uops.
4883 // Check the preceding GEP to identify non-const indices.
4884 if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) {
4885 if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
4886 return TTI::TCC_Basic * 2;
4887 }
4888 }
4889 return TTI::TCC_Basic;
4890 }
4891
4892 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
4893 "Invalid Opcode");
4894 // Type legalization can't handle structs
4895 if (TLI->getValueType(DL, Src, true) == MVT::Other)
4896 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
4897 CostKind);
4898
4899 // Legalize the type.
4900 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
4901
4902 auto *VTy = dyn_cast<FixedVectorType>(Src);
4903
4904 InstructionCost Cost = 0;
4905
4906 // Add a cost for constant load to vector.
4907 if (Opcode == Instruction::Store && OpInfo.isConstant())
4908 Cost += getMemoryOpCost(Instruction::Load, Src, DL.getABITypeAlign(Src),
4909 /*AddressSpace=*/0, CostKind);
4910
4911 // Handle the simple case of non-vectors.
4912 // NOTE: this assumes that legalization never creates vector from scalars!
4913 if (!VTy || !LT.second.isVector()) {
4914 // Each load/store unit costs 1.
4915 return (LT.second.isFloatingPoint() ? Cost : 0) + LT.first * 1;
4916 }
4917
4918 bool IsLoad = Opcode == Instruction::Load;
4919
4920 Type *EltTy = VTy->getElementType();
4921
4922 const int EltTyBits = DL.getTypeSizeInBits(EltTy);
4923
4924 // Source of truth: how many elements were there in the original IR vector?
4925 const unsigned SrcNumElt = VTy->getNumElements();
4926
4927 // How far have we gotten?
4928 int NumEltRemaining = SrcNumElt;
4929 // Note that we intentionally capture by-reference, NumEltRemaining changes.
4930 auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; };
4931
4932 const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8);
4933
4934 // Note that even if we can store 64 bits of an XMM, we still operate on XMM.
4935 const unsigned XMMBits = 128;
4936 if (XMMBits % EltTyBits != 0)
4937 // Vector size must be a multiple of the element size. I.e. no padding.
4938 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
4939 CostKind);
4940 const int NumEltPerXMM = XMMBits / EltTyBits;
4941
4942 auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM);
4943
4944 for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0;
4945 NumEltRemaining > 0; CurrOpSizeBytes /= 2) {
4946 // How many elements would a single op deal with at once?
4947 if ((8 * CurrOpSizeBytes) % EltTyBits != 0)
4948 // Vector size must be a multiple of the element size. I.e. no padding.
4949 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
4950 CostKind);
4951 int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits;
4952
4953 assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?");
4954 assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||
4955 (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&
4956 "Unless we haven't halved the op size yet, "
4957 "we have less than two op's sized units of work left.");
4958
4959 auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM
4960 ? FixedVectorType::get(EltTy, CurrNumEltPerOp)
4961 : XMMVecTy;
4962
4963 assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&
4964 "After halving sizes, the vector elt count is no longer a multiple "
4965 "of number of elements per operation?");
4966 auto *CoalescedVecTy =
4967 CurrNumEltPerOp == 1
4968 ? CurrVecTy
4969 : FixedVectorType::get(
4970 IntegerType::get(Src->getContext(),
4971 EltTyBits * CurrNumEltPerOp),
4972 CurrVecTy->getNumElements() / CurrNumEltPerOp);
4973 assert(DL.getTypeSizeInBits(CoalescedVecTy) ==
4974 DL.getTypeSizeInBits(CurrVecTy) &&
4975 "coalescing elements doesn't change vector width.");
4976
4977 while (NumEltRemaining > 0) {
4978 assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumption?");
4979
4980 // Can we use this vector size, as per the remaining element count?
4981 // Iff the vector is naturally aligned, we can do a wide load regardless.
4982 if (NumEltRemaining < CurrNumEltPerOp &&
4983 (!IsLoad || Alignment.valueOrOne() < CurrOpSizeBytes) &&
4984 CurrOpSizeBytes != 1)
4985 break; // Try a smaller vector size.
4986
4987 bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;
4988
4989 // If we have fully processed the previous reg, we need to replenish it.
4990 if (SubVecEltsLeft == 0) {
4991 SubVecEltsLeft += CurrVecTy->getNumElements();
4992 // And that's free only for the 0'th subvector of a legalized vector.
4993 if (!Is0thSubVec)
4994 Cost += getShuffleCost(IsLoad ? TTI::ShuffleKind::SK_InsertSubvector
4995 : TTI::ShuffleKind::SK_ExtractSubvector,
4996 VTy, std::nullopt, CostKind, NumEltDone(),
4997 CurrVecTy);
4998 }
4999
5000 // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
5001 // for smaller widths (32/16/8) we have to insert/extract them separately.
5002 // Again, it's free for the 0'th subreg (if op is 32/64 bit wide,
5003 // but let's pretend that it is also true for 16/8 bit wide ops...)
5004 if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) {
5005 int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM;
5006 assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "");
5007 int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp;
5008 APInt DemandedElts =
5009 APInt::getBitsSet(CoalescedVecTy->getNumElements(),
5010 CoalescedVecEltIdx, CoalescedVecEltIdx + 1);
5011 assert(DemandedElts.popcount() == 1 && "Inserting single value");
5012 Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad,
5013 !IsLoad, CostKind);
5014 }
5015
5016 // This isn't exactly right. We're using slow unaligned 32-byte accesses
5017 // as a proxy for a double-pumped AVX memory interface such as on
5018 // Sandybridge.
5019 // Sub-32-bit loads/stores will be slower either with PINSR*/PEXTR* or
5020 // will be scalarized.
5021 if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
5022 Cost += 2;
5023 else if (CurrOpSizeBytes < 4)
5024 Cost += 2;
5025 else
5026 Cost += 1;
5027
5028 SubVecEltsLeft -= CurrNumEltPerOp;
5029 NumEltRemaining -= CurrNumEltPerOp;
5030 Alignment = commonAlignment(Alignment.valueOrOne(), CurrOpSizeBytes);
5031 }
5032 }
5033
5034 assert(NumEltRemaining <= 0 && "Should have processed all the elements.");
5035
5036 return Cost;
5037}
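// Rough walk-through of the halving loop above (illustrative only): an
// unaligned store of <3 x i32> legalizes to v4i32; the 16-byte op is too wide
// for the 3 remaining elements, so we charge one 8-byte store (MOVQ-like,
// cost 1) for elements 0-1, then one 4-byte store (MOVD-like, cost 1) for
// element 2 plus its subvector extract - roughly 3 units in total.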
5038
5039 InstructionCost
5040 X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment,
5041 unsigned AddressSpace,
5042 TTI::TargetCostKind CostKind) {
5043 bool IsLoad = (Instruction::Load == Opcode);
5044 bool IsStore = (Instruction::Store == Opcode);
5045
5046 auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy);
5047 if (!SrcVTy)
5048 // For a scalar, take the regular cost without the mask.
5049 return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind);
5050
5051 unsigned NumElem = SrcVTy->getNumElements();
5052 auto *MaskTy =
5053 FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
5054 if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) ||
5055 (IsStore && !isLegalMaskedStore(SrcVTy, Alignment))) {
5056 // Scalarization
5057 APInt DemandedElts = APInt::getAllOnes(NumElem);
5058 InstructionCost MaskSplitCost = getScalarizationOverhead(
5059 MaskTy, DemandedElts, /*Insert*/ false, /*Extract*/ true, CostKind);
5060 InstructionCost ScalarCompareCost = getCmpSelInstrCost(
5061 Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
5062 CmpInst::BAD_ICMP_PREDICATE, CostKind);
5063 InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
5064 InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
5065 InstructionCost ValueSplitCost = getScalarizationOverhead(
5066 SrcVTy, DemandedElts, IsLoad, IsStore, CostKind);
5067 InstructionCost MemopCost =
5068 NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
5069 Alignment, AddressSpace, CostKind);
5070 return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
5071 }
5072
5073 // Legalize the type.
5074 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcVTy);
5075 auto VT = TLI->getValueType(DL, SrcVTy);
5076 InstructionCost Cost = 0;
5077 if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
5078 LT.second.getVectorNumElements() == NumElem)
5079 // Promotion requires extend/truncate for data and a shuffle for mask.
5080 Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, std::nullopt,
5081 CostKind, 0, nullptr) +
5082 getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, std::nullopt,
5083 CostKind, 0, nullptr);
5084
5085 else if (LT.first * LT.second.getVectorNumElements() > NumElem) {
5086 auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
5087 LT.second.getVectorNumElements());
5088 // Expanding requires filling the mask with zeroes.
5089 Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, std::nullopt,
5090 CostKind, 0, MaskTy);
5091 }
5092
5093 // Pre-AVX512 - each maskmov load costs ~2 and each maskmov store costs ~8.
5094 if (!ST->hasAVX512())
5095 return Cost + LT.first * (IsLoad ? 2 : 8);
5096
5097 // AVX-512 masked load/store is cheaper
5098 return Cost + LT.first;
5099}
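// For instance (a sketch assuming the masked op is legal): a masked load of
// <8 x float> on AVX legalizes with LT.first == 1, so the pre-AVX512 path
// returns Cost + 1 * 2, while the corresponding masked store returns
// Cost + 1 * 8.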
5100
5101 InstructionCost
5102 X86TTIImpl::getPointersChainCost(ArrayRef<const Value *> Ptrs,
5103 const Value *Base,
5104 const TTI::PointersChainInfo &Info,
5105 Type *AccessTy, TTI::TargetCostKind CostKind) {
5106 if (Info.isSameBase() && Info.isKnownStride()) {
5107 // If all the pointers have known stride all the differences are translated
5108 // into constants. X86 memory addressing allows encoding it into
5109 // displacement. So we just need to take the base GEP cost.
5110 if (const auto *BaseGEP = dyn_cast<GetElementPtrInst>(Base)) {
5111 SmallVector<const Value *> Indices(BaseGEP->indices());
5112 return getGEPCost(BaseGEP->getSourceElementType(),
5113 BaseGEP->getPointerOperand(), Indices, nullptr,
5114 CostKind);
5115 }
5116 return TTI::TCC_Free;
5117 }
5118 return BaseT::getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind);
5119}
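// E.g. a same-base chain p, p+4, p+8, ... with a known stride folds the
// per-pointer differences into addressing-mode displacements ([base + 4],
// [base + 8], ...), so only the base GEP itself is charged.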
5120
5121 InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty,
5122 ScalarEvolution *SE,
5123 const SCEV *Ptr) {
5124 // Address computations in vectorized code with non-consecutive addresses will
5125 // likely result in more instructions compared to scalar code where the
5126 // computation can more often be merged into the index mode. The resulting
5127 // extra micro-ops can significantly decrease throughput.
5128 const unsigned NumVectorInstToHideOverhead = 10;
5129
5130 // Cost modeling of Strided Access Computation is hidden by the indexing
5131 // modes of X86 regardless of the stride value. We don't believe that there
5132 // is a difference between constant strided access in general and a constant
5133 // stride value which is less than or equal to 64.
5134 // Even in the case of (loop invariant) stride whose value is not known at
5135 // compile time, the address computation will not incur more than one extra
5136 // ADD instruction.
5137 if (Ty->isVectorTy() && SE && !ST->hasAVX2()) {
5138 // TODO: AVX2 is the current cut-off because we don't have correct
5139 // interleaving costs for prior ISAs.
5140 if (!BaseT::isStridedAccess(Ptr))
5141 return NumVectorInstToHideOverhead;
5142 if (!BaseT::getConstantStrideStep(SE, Ptr))
5143 return 1;
5144 }
5145
5146 return BaseT::getAddressComputationCost(Ty, SE, Ptr);
5147}
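// In other words (a sketch of the policy above): on a pre-AVX2 target a
// non-strided vector address stream is charged NumVectorInstToHideOverhead
// (10), a stride that is loop-invariant but unknown at compile time is
// charged a single extra ADD (1), and everything else falls through to the
// base implementation.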
5148
5149 InstructionCost
5150 X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
5151 std::optional<FastMathFlags> FMF,
5152 TTI::TargetCostKind CostKind) {
5153 if (TTI::requiresOrderedReduction(FMF))
5154 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5155
5156 // We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
5157 // and use that as the cost.
5158
5159 static const CostTblEntry SLMCostTbl[] = {
5160 { ISD::FADD, MVT::v2f64, 3 },
5161 { ISD::ADD, MVT::v2i64, 5 },
5162 };
5163
5164 static const CostTblEntry SSE2CostTbl[] = {
5165 { ISD::FADD, MVT::v2f64, 2 },
5166 { ISD::FADD, MVT::v2f32, 2 },
5167 { ISD::FADD, MVT::v4f32, 4 },
5168 { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
5169 { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
5170 { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
5171 { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3".
5172 { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3".
5173 { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
5174 { ISD::ADD, MVT::v2i8, 2 },
5175 { ISD::ADD, MVT::v4i8, 2 },
5176 { ISD::ADD, MVT::v8i8, 2 },
5177 { ISD::ADD, MVT::v16i8, 3 },
5178 };
5179
5180 static const CostTblEntry AVX1CostTbl[] = {
5181 { ISD::FADD, MVT::v4f64, 3 },
5182 { ISD::FADD, MVT::v4f32, 3 },
5183 { ISD::FADD, MVT::v8f32, 4 },
5184 { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
5185 { ISD::ADD, MVT::v4i64, 3 },
5186 { ISD::ADD, MVT::v8i32, 5 },
5187 { ISD::ADD, MVT::v16i16, 5 },
5188 { ISD::ADD, MVT::v32i8, 4 },
5189 };
5190
5191 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5192 assert(ISD && "Invalid opcode");
5193
5194 // Before legalizing the type, give a chance to look up illegal narrow types
5195 // in the table.
5196 // FIXME: Is there a better way to do this?
5197 EVT VT = TLI->getValueType(DL, ValTy);
5198 if (VT.isSimple()) {
5199 MVT MTy = VT.getSimpleVT();
5200 if (ST->useSLMArithCosts())
5201 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5202 return Entry->Cost;
5203
5204 if (ST->hasAVX())
5205 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5206 return Entry->Cost;
5207
5208 if (ST->hasSSE2())
5209 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5210 return Entry->Cost;
5211 }
5212
5213 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5214
5215 MVT MTy = LT.second;
5216
5217 auto *ValVTy = cast<FixedVectorType>(ValTy);
5218
5219 // Special case: vXi8 mul reductions are performed as vXi16.
5220 if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) {
5221 auto *WideSclTy = IntegerType::get(ValVTy->getContext(), 16);
5222 auto *WideVecTy = FixedVectorType::get(WideSclTy, ValVTy->getNumElements());
5223 return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy,
5224 TTI::CastContextHint::None,
5225 CostKind) +
5226 getArithmeticReductionCost(Opcode, WideVecTy, FMF, CostKind);
5227 }
5228
5229 InstructionCost ArithmeticCost = 0;
5230 if (LT.first != 1 && MTy.isVector() &&
5231 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5232 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5233 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5234 MTy.getVectorNumElements());
5235 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5236 ArithmeticCost *= LT.first - 1;
5237 }
5238
5239 if (ST->useSLMArithCosts())
5240 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5241 return ArithmeticCost + Entry->Cost;
5242
5243 if (ST->hasAVX())
5244 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5245 return ArithmeticCost + Entry->Cost;
5246
5247 if (ST->hasSSE2())
5248 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5249 return ArithmeticCost + Entry->Cost;
5250
5251 // FIXME: These assume a naive kshift+binop lowering, which is probably
5252 // conservative in most cases.
5253 static const CostTblEntry AVX512BoolReduction[] = {
5254 { ISD::AND, MVT::v2i1, 3 },
5255 { ISD::AND, MVT::v4i1, 5 },
5256 { ISD::AND, MVT::v8i1, 7 },
5257 { ISD::AND, MVT::v16i1, 9 },
5258 { ISD::AND, MVT::v32i1, 11 },
5259 { ISD::AND, MVT::v64i1, 13 },
5260 { ISD::OR, MVT::v2i1, 3 },
5261 { ISD::OR, MVT::v4i1, 5 },
5262 { ISD::OR, MVT::v8i1, 7 },
5263 { ISD::OR, MVT::v16i1, 9 },
5264 { ISD::OR, MVT::v32i1, 11 },
5265 { ISD::OR, MVT::v64i1, 13 },
5266 };
5267
5268 static const CostTblEntry AVX2BoolReduction[] = {
5269 { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp
5270 { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp
5271 { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp
5272 { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp
5273 };
5274
5275 static const CostTblEntry AVX1BoolReduction[] = {
5276 { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp
5277 { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp
5278 { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5279 { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5280 { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp
5281 { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp
5282 { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5283 { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5284 };
5285
5286 static const CostTblEntry SSE2BoolReduction[] = {
5287 { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp
5288 { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp
5289 { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp
5290 { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp
5291 { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp
5292 { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp
5293 { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp
5294 { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp
5295 };
5296
5297 // Handle bool allof/anyof patterns.
5298 if (ValVTy->getElementType()->isIntegerTy(1)) {
5299 InstructionCost ArithmeticCost = 0;
5300 if (LT.first != 1 && MTy.isVector() &&
5301 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5302 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5303 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5304 MTy.getVectorNumElements());
5305 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5306 ArithmeticCost *= LT.first - 1;
5307 }
5308
5309 if (ST->hasAVX512())
5310 if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy))
5311 return ArithmeticCost + Entry->Cost;
5312 if (ST->hasAVX2())
5313 if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
5314 return ArithmeticCost + Entry->Cost;
5315 if (ST->hasAVX())
5316 if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
5317 return ArithmeticCost + Entry->Cost;
5318 if (ST->hasSSE2())
5319 if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
5320 return ArithmeticCost + Entry->Cost;
5321
5322 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5323 }
5324
5325 unsigned NumVecElts = ValVTy->getNumElements();
5326 unsigned ScalarSize = ValVTy->getScalarSizeInBits();
5327
5328 // Special case power of 2 reductions where the scalar type isn't changed
5329 // by type legalization.
5330 if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
5331 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5332
5333 InstructionCost ReductionCost = 0;
5334
5335 auto *Ty = ValVTy;
5336 if (LT.first != 1 && MTy.isVector() &&
5337 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5338 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5339 Ty = FixedVectorType::get(ValVTy->getElementType(),
5340 MTy.getVectorNumElements());
5341 ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5342 ReductionCost *= LT.first - 1;
5343 NumVecElts = MTy.getVectorNumElements();
5344 }
5345
5346 // Now handle reduction with the legal type, taking into account size changes
5347 // at each level.
5348 while (NumVecElts > 1) {
5349 // Determine the size of the remaining vector we need to reduce.
5350 unsigned Size = NumVecElts * ScalarSize;
5351 NumVecElts /= 2;
5352 // If we're reducing from 256/512 bits, use an extract_subvector.
5353 if (Size > 128) {
5354 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5355 ReductionCost +=
5356 getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt, CostKind,
5357 NumVecElts, SubTy);
5358 Ty = SubTy;
5359 } else if (Size == 128) {
5360 // Reducing from 128 bits is a permute of v2f64/v2i64.
5361 FixedVectorType *ShufTy;
5362 if (ValVTy->isFloatingPointTy())
5363 ShufTy =
5364 FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2);
5365 else
5366 ShufTy =
5367 FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
5368 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5369 std::nullopt, CostKind, 0, nullptr);
5370 } else if (Size == 64) {
5371 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5372 FixedVectorType *ShufTy;
5373 if (ValVTy->isFloatingPointTy())
5374 ShufTy =
5375 FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4);
5376 else
5377 ShufTy =
5378 FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
5379 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5380 std::nullopt, CostKind, 0, nullptr);
5381 } else {
5382 // Reducing from smaller size is a shift by immediate.
5383 auto *ShiftTy = FixedVectorType::get(
5384 Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size);
5385 ReductionCost += getArithmeticInstrCost(
5386 Instruction::LShr, ShiftTy, CostKind,
5387 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
5388 {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None});
5389 }
5390
5391 // Add the arithmetic op for this level.
5392 ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind);
5393 }
5394
5395 // Add the final extract element to the cost.
5396 return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5397 CostKind, 0, nullptr, nullptr);
5398}
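// Sketch of the halving loop above (assumed type, no table hit): an fmul
// reduction of <4 x float> on SSE2 pays a 128-bit permute plus an fmul, then
// a 64-bit shuffle plus an fmul, and finally one extractelement of lane 0.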
5399
5400 InstructionCost X86TTIImpl::getMinMaxCost(Intrinsic::ID IID, Type *Ty,
5401 TTI::TargetCostKind CostKind,
5402 FastMathFlags FMF) {
5403 IntrinsicCostAttributes ICA(IID, Ty, {Ty, Ty}, FMF);
5404 return getIntrinsicInstrCost(ICA, CostKind);
5405}
5406
5407 InstructionCost
5408 X86TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *ValTy,
5409 FastMathFlags FMF,
5410 TTI::TargetCostKind CostKind) {
5411 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5412
5413 MVT MTy = LT.second;
5414
5415 int ISD;
5416 if (ValTy->isIntOrIntVectorTy()) {
5417 ISD = (IID == Intrinsic::umin || IID == Intrinsic::umax) ? ISD::UMIN
5418 : ISD::SMIN;
5419 } else {
5420 assert(ValTy->isFPOrFPVectorTy() &&
5421 "Expected floating point or integer vector type.");
5422 ISD = (IID == Intrinsic::minnum || IID == Intrinsic::maxnum)
5423 ? ISD::FMINNUM
5424 : ISD::FMINIMUM;
5425 }
5426
5427 // We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
5428 // and use that as the cost.
5429
5430 static const CostTblEntry SSE2CostTbl[] = {
5431 {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw
5432 {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw
5433 {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw
5434 };
5435
5436 static const CostTblEntry SSE41CostTbl[] = {
5437 {ISD::SMIN, MVT::v2i16, 3}, // same as sse2
5438 {ISD::SMIN, MVT::v4i16, 5}, // same as sse2
5439 {ISD::UMIN, MVT::v2i16, 5}, // same as sse2
5440 {ISD::UMIN, MVT::v4i16, 7}, // same as sse2
5441 {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor
5442 {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax
5443 {ISD::SMIN, MVT::v2i8, 3}, // pminsb
5444 {ISD::SMIN, MVT::v4i8, 5}, // pminsb
5445 {ISD::SMIN, MVT::v8i8, 7}, // pminsb
5446 {ISD::SMIN, MVT::v16i8, 6},
5447 {ISD::UMIN, MVT::v2i8, 3}, // same as sse2
5448 {ISD::UMIN, MVT::v4i8, 5}, // same as sse2
5449 {ISD::UMIN, MVT::v8i8, 7}, // same as sse2
5450 {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax
5451 };
5452
5453 static const CostTblEntry AVX1CostTbl[] = {
5454 {ISD::SMIN, MVT::v16i16, 6},
5455 {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax
5456 {ISD::SMIN, MVT::v32i8, 8},
5457 {ISD::UMIN, MVT::v32i8, 8},
5458 };
5459
5460 static const CostTblEntry AVX512BWCostTbl[] = {
5461 {ISD::SMIN, MVT::v32i16, 8},
5462 {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax
5463 {ISD::SMIN, MVT::v64i8, 10},
5464 {ISD::UMIN, MVT::v64i8, 10},
5465 };
5466
5467 // Before legalizing the type, give a chance to look up illegal narrow types
5468 // in the table.
5469 // FIXME: Is there a better way to do this?
5470 EVT VT = TLI->getValueType(DL, ValTy);
5471 if (VT.isSimple()) {
5472 MVT MTy = VT.getSimpleVT();
5473 if (ST->hasBWI())
5474 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5475 return Entry->Cost;
5476
5477 if (ST->hasAVX())
5478 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5479 return Entry->Cost;
5480
5481 if (ST->hasSSE41())
5482 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5483 return Entry->Cost;
5484
5485 if (ST->hasSSE2())
5486 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5487 return Entry->Cost;
5488 }
5489
5490 auto *ValVTy = cast<FixedVectorType>(ValTy);
5491 unsigned NumVecElts = ValVTy->getNumElements();
5492
5493 auto *Ty = ValVTy;
5494 InstructionCost MinMaxCost = 0;
5495 if (LT.first != 1 && MTy.isVector() &&
5496 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5497 // Type needs to be split. We need LT.first - 1 operations.
5498 Ty = FixedVectorType::get(ValVTy->getElementType(),
5499 MTy.getVectorNumElements());
5500 MinMaxCost = getMinMaxCost(IID, Ty, CostKind, FMF);
5501 MinMaxCost *= LT.first - 1;
5502 NumVecElts = MTy.getVectorNumElements();
5503 }
5504
5505 if (ST->hasBWI())
5506 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5507 return MinMaxCost + Entry->Cost;
5508
5509 if (ST->hasAVX())
5510 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5511 return MinMaxCost + Entry->Cost;
5512
5513 if (ST->hasSSE41())
5514 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5515 return MinMaxCost + Entry->Cost;
5516
5517 if (ST->hasSSE2())
5518 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5519 return MinMaxCost + Entry->Cost;
5520
5521 unsigned ScalarSize = ValTy->getScalarSizeInBits();
5522
5523 // Special case power of 2 reductions where the scalar type isn't changed
5524 // by type legalization.
5525 if (!isPowerOf2_32(ValVTy->getNumElements()) ||
5526 ScalarSize != MTy.getScalarSizeInBits())
5527 return BaseT::getMinMaxReductionCost(IID, ValTy, FMF, CostKind);
5528
5529 // Now handle reduction with the legal type, taking into account size changes
5530 // at each level.
5531 while (NumVecElts > 1) {
5532 // Determine the size of the remaining vector we need to reduce.
5533 unsigned Size = NumVecElts * ScalarSize;
5534 NumVecElts /= 2;
5535 // If we're reducing from 256/512 bits, use an extract_subvector.
5536 if (Size > 128) {
5537 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5538 MinMaxCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
5539 CostKind, NumVecElts, SubTy);
5540 Ty = SubTy;
5541 } else if (Size == 128) {
5542 // Reducing from 128 bits is a permute of v2f64/v2i64.
5543 VectorType *ShufTy;
5544 if (ValTy->isFloatingPointTy())
5545 ShufTy =
5546 FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2);
5547 else
5548 ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
5549 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5550 std::nullopt, CostKind, 0, nullptr);
5551 } else if (Size == 64) {
5552 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5553 FixedVectorType *ShufTy;
5554 if (ValTy->isFloatingPointTy())
5555 ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4);
5556 else
5557 ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
5558 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5559 std::nullopt, CostKind, 0, nullptr);
5560 } else {
5561 // Reducing from smaller size is a shift by immediate.
5562 auto *ShiftTy = FixedVectorType::get(
5563 Type::getIntNTy(ValTy->getContext(), Size), 128 / Size);
5564 MinMaxCost += getArithmeticInstrCost(
5565 Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput,
5566 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
5567 {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None});
5568 }
5569
5570 // Add the arithmetic op for this level.
5571 MinMaxCost += getMinMaxCost(IID, Ty, CostKind, FMF);
5572 }
5573
5574 // Add the final extract element to the cost.
5575 return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5576 CostKind, 0, nullptr, nullptr);
5577}
5578
5579 /// Calculate the cost of materializing a 64-bit value. This helper
5580 /// method might only calculate a fraction of a larger immediate. Therefore it
5581 /// is valid to return a cost of ZERO.
5582 InstructionCost X86TTIImpl::getIntImmCost(int64_t Val) {
5583 if (Val == 0)
5584 return TTI::TCC_Free;
5585
5586 if (isInt<32>(Val))
5587 return TTI::TCC_Basic;
5588
5589 return 2 * TTI::TCC_Basic;
5590}
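// Examples: 0 is TCC_Free; 42 or -1 (sign-extendable to 32 bits) is
// TCC_Basic, i.e. one MOV; 0x100000000 is 2 * TCC_Basic (a MOVABSQ-class
// materialization).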
5591
5592 InstructionCost X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
5593 TTI::TargetCostKind CostKind) {
5594 assert(Ty->isIntegerTy());
5595
5596 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5597 if (BitSize == 0)
5598 return ~0U;
5599
5600 // Never hoist constants larger than 128 bits, because this might lead to
5601 // incorrect code generation or assertions in codegen.
5602 // FIXME: Create a cost model for types larger than i128 once the codegen
5603 // issues have been fixed.
5604 if (BitSize > 128)
5605 return TTI::TCC_Free;
5606
5607 if (Imm == 0)
5608 return TTI::TCC_Free;
5609
5610 // Sign-extend all constants to a multiple of 64-bit.
5611 APInt ImmVal = Imm;
5612 if (BitSize % 64 != 0)
5613 ImmVal = Imm.sext(alignTo(BitSize, 64));
5614
5615 // Split the constant into 64-bit chunks and calculate the cost for each
5616 // chunk.
5617 InstructionCost Cost = 0;
5618 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
5619 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
5620 int64_t Val = Tmp.getSExtValue();
5621 Cost += getIntImmCost(Val);
5622 }
5623 // We need at least one instruction to materialize the constant.
5624 return std::max<InstructionCost>(1, Cost);
5625}
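// E.g. an i128 immediate equal to 2^64 splits into chunks 0 (TCC_Free) and
// 1 (TCC_Basic), so the returned cost is max(1, 0 + 1) = 1.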
5626
5627 InstructionCost X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
5628 const APInt &Imm, Type *Ty,
5629 TTI::TargetCostKind CostKind,
5630 Instruction *Inst) {
5631 assert(Ty->isIntegerTy());
5632
5633 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5634 // There is no cost model for constants with a bit size of 0. Return TCC_Free
5635 // here, so that constant hoisting will ignore this constant.
5636 if (BitSize == 0)
5637 return TTI::TCC_Free;
5638
5639 unsigned ImmIdx = ~0U;
5640 switch (Opcode) {
5641 default:
5642 return TTI::TCC_Free;
5643 case Instruction::GetElementPtr:
5644 // Always hoist the base address of a GetElementPtr. This prevents the
5645 // creation of new constants for every base constant that gets constant
5646 // folded with the offset.
5647 if (Idx == 0)
5648 return 2 * TTI::TCC_Basic;
5649 return TTI::TCC_Free;
5650 case Instruction::Store:
5651 ImmIdx = 0;
5652 break;
5653 case Instruction::ICmp:
5654 // This is an imperfect hack to prevent constant hoisting of
5655 // compares that might be trying to check if a 64-bit value fits in
5656 // 32-bits. The backend can optimize these cases using a right shift by 32.
5657 // Ideally we would check the compare predicate here. There are also other
5658 // similar immediates the backend can use shifts for.
5659 if (Idx == 1 && Imm.getBitWidth() == 64) {
5660 uint64_t ImmVal = Imm.getZExtValue();
5661 if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
5662 return TTI::TCC_Free;
5663 }
5664 ImmIdx = 1;
5665 break;
5666 case Instruction::And:
5667 // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
5668 // by using a 32-bit operation with implicit zero extension. Detect such
5669 // immediates here as the normal path expects bit 31 to be sign extended.
5670 if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.isIntN(32))
5671 return TTI::TCC_Free;
5672 ImmIdx = 1;
5673 break;
5674 case Instruction::Add:
5675 case Instruction::Sub:
5676 // For add/sub, we can use the opposite instruction for INT32_MIN.
5677 if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.getZExtValue() == 0x80000000)
5678 return TTI::TCC_Free;
5679 ImmIdx = 1;
5680 break;
5681 case Instruction::UDiv:
5682 case Instruction::SDiv:
5683 case Instruction::URem:
5684 case Instruction::SRem:
5685 // Division by constant is typically expanded later into a different
5686 // instruction sequence. This completely changes the constants.
5687 // Report them as "free" to stop ConstantHoist from marking them as opaque.
5688 return TTI::TCC_Free;
5689 case Instruction::Mul:
5690 case Instruction::Or:
5691 case Instruction::Xor:
5692 ImmIdx = 1;
5693 break;
5694 // Always return TCC_Free for the shift value of a shift instruction.
5695 case Instruction::Shl:
5696 case Instruction::LShr:
5697 case Instruction::AShr:
5698 if (Idx == 1)
5699 return TTI::TCC_Free;
5700 break;
5701 case Instruction::Trunc:
5702 case Instruction::ZExt:
5703 case Instruction::SExt:
5704 case Instruction::IntToPtr:
5705 case Instruction::PtrToInt:
5706 case Instruction::BitCast:
5707 case Instruction::PHI:
5708 case Instruction::Call:
5709 case Instruction::Select:
5710 case Instruction::Ret:
5711 case Instruction::Load:
5712 break;
5713 }
5714
5715 if (Idx == ImmIdx) {
5716 uint64_t NumConstants = divideCeil(BitSize, 64);
5717 InstructionCost Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
5718 return (Cost <= NumConstants * TTI::TCC_Basic)
5719 ? static_cast<int>(TTI::TCC_Free)
5720 : Cost;
5721 }
5722
5723 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
5724}
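// Illustrative queries (hypothetical IR, immediate as operand Idx == 1):
//   and i64 %x, 4294967295      ; 32-bit zero-extendable mask -> TCC_Free
//   icmp ult i64 %y, 4294967296 ; 0x100000000 -> TCC_Free (shift trick)
// whereas a GEP base constant (Idx == 0) is always kept hoistable at
// 2 * TCC_Basic.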
5725
5726 InstructionCost X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
5727 const APInt &Imm, Type *Ty,
5728 TTI::TargetCostKind CostKind) {
5729 assert(Ty->isIntegerTy());
5730
5731 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5732 // There is no cost model for constants with a bit size of 0. Return TCC_Free
5733 // here, so that constant hoisting will ignore this constant.
5734 if (BitSize == 0)
5735 return TTI::TCC_Free;
5736
5737 switch (IID) {
5738 default:
5739 return TTI::TCC_Free;
5740 case Intrinsic::sadd_with_overflow:
5741 case Intrinsic::uadd_with_overflow:
5742 case Intrinsic::ssub_with_overflow:
5743 case Intrinsic::usub_with_overflow:
5744 case Intrinsic::smul_with_overflow:
5745 case Intrinsic::umul_with_overflow:
5746 if ((Idx == 1) && Imm.getBitWidth() <= 64 && Imm.isSignedIntN(32))
5747 return TTI::TCC_Free;
5748 break;
5749 case Intrinsic::experimental_stackmap:
5750 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
5751 return TTI::TCC_Free;
5752 break;
5753 case Intrinsic::experimental_patchpoint_void:
5754 case Intrinsic::experimental_patchpoint:
5755 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
5756 return TTI::TCC_Free;
5757 break;
5758 }
5759 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
5760}
5761
5762 InstructionCost X86TTIImpl::getCFInstrCost(unsigned Opcode,
5763 TTI::TargetCostKind CostKind,
5764 const Instruction *I) {
5765 if (CostKind != TTI::TCK_RecipThroughput)
5766 return Opcode == Instruction::PHI ? 0 : 1;
5767 // Branches are assumed to be predicted.
5768 return 0;
5769}
5770
5771int X86TTIImpl::getGatherOverhead() const {
5772 // Some CPUs have more overhead for gather. The specified overhead is relative
5773 // to the Load operation. "2" is the number provided by Intel architects. This
5774 // parameter is used for cost estimation of Gather Op and comparison with
5775 // other alternatives.
5776 // TODO: Remove the explicit hasAVX512()? That would mean we would only
5777 // enable gather with a -march.
5778 if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather()))
5779 return 2;
5780
5781 return 1024;
5782}
5783
5784int X86TTIImpl::getScatterOverhead() const {
5785 if (ST->hasAVX512())
5786 return 2;
5787
5788 return 1024;
5789}
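// The 1024 sentinel effectively means "never profitable": on subtargets
// without fast gather/scatter it makes the gather/scatter estimate lose any
// comparison against scalarized loads/stores.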
5790
5791 // Return an average cost of Gather / Scatter instruction, maybe improved later.
5792 InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode,
5793 TTI::TargetCostKind CostKind,
5794 Type *SrcVTy, const Value *Ptr,
5795 Align Alignment,
5796 unsigned AddressSpace) {
5797
5798 assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
5799 unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
5800
5801 // Try to reduce index size from 64 bit (default for GEP)
5802 // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
5803 // operation will use 16 x 64 indices which do not fit in a zmm and needs
5804 // to split. Also check that the base pointer is the same for all lanes,
5805 // and that there's at most one variable index.
5806 auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) {
5807 unsigned IndexSize = DL.getPointerSizeInBits();
5808 const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
5809 if (IndexSize < 64 || !GEP)
5810 return IndexSize;
5811
5812 unsigned NumOfVarIndices = 0;
5813 const Value *Ptrs = GEP->getPointerOperand();
5814 if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
5815 return IndexSize;
5816 for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I) {
5817 if (isa<Constant>(GEP->getOperand(I)))
5818 continue;
5819 Type *IndxTy = GEP->getOperand(I)->getType();
5820 if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy))
5821 IndxTy = IndexVTy->getElementType();
5822 if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
5823 !isa<SExtInst>(GEP->getOperand(I))) ||
5824 ++NumOfVarIndices > 1)
5825 return IndexSize; // 64
5826 }
5827 return (unsigned)32;
5828 };
5829
5830 // Trying to reduce IndexSize to 32 bits for vector 16.
5831 // By default the IndexSize is equal to pointer size.
5832 unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
5833 ? getIndexSizeInBits(Ptr, DL)
5834 : DL.getPointerSizeInBits();
5835
5836 auto *IndexVTy = FixedVectorType::get(
5837 IntegerType::get(SrcVTy->getContext(), IndexSize), VF);
5838 std::pair<InstructionCost, MVT> IdxsLT = getTypeLegalizationCost(IndexVTy);
5839 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(SrcVTy);
5840 InstructionCost::CostType SplitFactor =
5841 *std::max(IdxsLT.first, SrcLT.first).getValue();
5842 if (SplitFactor > 1) {
5843 // Handle splitting of vector of pointers
5844 auto *SplitSrcTy =
5845 FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
5846 return SplitFactor * getGSVectorCost(Opcode, CostKind, SplitSrcTy, Ptr,
5847 Alignment, AddressSpace);
5848 }
5849
5850 // If we didn't split, this will be a single gather/scatter instruction.
5851 if (CostKind == TTI::TCK_CodeSize)
5852 return 1;
5853
5854 // The gather / scatter cost is given by Intel architects. It is a rough
5855 // number since we are looking at one instruction at a time.
5856 const int GSOverhead = (Opcode == Instruction::Load) ? getGatherOverhead()
5857 : getScatterOverhead();
5858 return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
5859 MaybeAlign(Alignment), AddressSpace,
5860 CostKind);
5861}
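// Ballpark example (assuming no splitting and a scalar load cost of 1): a
// gather of <4 x float> on an AVX2 fast-gather CPU costs
// getGatherOverhead() (2) + 4 * 1 = 6, while on a slow-gather CPU the 1024
// overhead makes scalarization win instead.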
5862
5863 /// Calculate the cost of Gather / Scatter operation
5864 InstructionCost X86TTIImpl::getGatherScatterOpCost(
5865 unsigned Opcode, Type *SrcVTy, const Value *Ptr, bool VariableMask,
5866 Align Alignment, TTI::TargetCostKind CostKind,
5867 const Instruction *I = nullptr) {
5868 if (((Opcode == Instruction::Load &&
5869 (!isLegalMaskedGather(SrcVTy, Align(Alignment)) ||
5870 forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
5871 Align(Alignment)))) ||
5872 (Opcode == Instruction::Store &&
5873 (!isLegalMaskedScatter(SrcVTy, Align(Alignment)) ||
5874 forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
5875 Align(Alignment))))))
5876 return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask,
5877 Alignment, CostKind, I);
5878
5879 assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
5880 PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
5881 if (!PtrTy && Ptr->getType()->isVectorTy())
5882 PtrTy = dyn_cast<PointerType>(
5883 cast<VectorType>(Ptr->getType())->getElementType());
5884 assert(PtrTy && "Unexpected type for Ptr argument");
5885 unsigned AddressSpace = PtrTy->getAddressSpace();
5886 return getGSVectorCost(Opcode, CostKind, SrcVTy, Ptr, Alignment,
5887 AddressSpace);
5888}
5889
5890 bool X86TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
5891 const TargetTransformInfo::LSRCost &C2) {
5892 // X86 specific here are "instruction number 1st priority".
5893 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
5894 C1.NumIVMuls, C1.NumBaseAdds,
5895 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
5896 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
5897 C2.NumIVMuls, C2.NumBaseAdds,
5898 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
5899}
5900
5901 bool X86TTIImpl::canMacroFuseCmp() {
5902 return ST->hasMacroFusion() || ST->hasBranchFusion();
5903}
5904
5905bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
5906 if (!ST->hasAVX())
5907 return false;
5908
5909 // The backend can't handle a single element vector.
5910 if (isa<VectorType>(DataTy) &&
5911 cast<FixedVectorType>(DataTy)->getNumElements() == 1)
5912 return false;
5913 Type *ScalarTy = DataTy->getScalarType();
5914
5915 if (ScalarTy->isPointerTy())
5916 return true;
5917
5918 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
5919 return true;
5920
5921 if (ScalarTy->isHalfTy() && ST->hasBWI())
5922 return true;
5923
5924 if (ScalarTy->isBFloatTy() && ST->hasBF16())
5925 return true;
5926
5927 if (!ScalarTy->isIntegerTy())
5928 return false;
5929
5930 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
5931 return IntWidth == 32 || IntWidth == 64 ||
5932 ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
5933}
5934
5935bool X86TTIImpl::isLegalMaskedStore(Type *DataType, Align Alignment) {
5936 return isLegalMaskedLoad(DataType, Alignment);
5937}
5938
5939bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) {
5940 unsigned DataSize = DL.getTypeStoreSize(DataType);
5941 // The only supported nontemporal loads are for aligned vectors of 16 or 32
5942 // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2
5943 // (the equivalent stores only require AVX).
5944 if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
5945 return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2();
5946
5947 return false;
5948}
5949
5950bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) {
5951 unsigned DataSize = DL.getTypeStoreSize(DataType);
5952
5953 // SSE4A supports nontemporal stores of float and double at arbitrary
5954 // alignment.
5955 if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
5956 return true;
5957
5958 // Besides the SSE4A subtarget exception above, only aligned stores are
5959 // available nontemporaly on any other subtarget. And only stores with a size
5960 // of 4..32 bytes (powers of 2, only) are permitted.
5961 if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
5962 !isPowerOf2_32(DataSize))
5963 return false;
5964
5965 // 32-byte vector nontemporal stores are supported by AVX (the equivalent
5966 // loads require AVX2).
5967 if (DataSize == 32)
5968 return ST->hasAVX();
5969 if (DataSize == 16)
5970 return ST->hasSSE1();
5971 return true;
5972}
5973
5974 bool X86TTIImpl::isLegalBroadcastLoad(Type *ElementTy,
5975 ElementCount NumElements) const {
5976 // movddup
5977 return ST->hasSSE3() && !NumElements.isScalable() &&
5978 NumElements.getFixedValue() == 2 &&
5979 ElementTy == Type::getDoubleTy(ElementTy->getContext());
5980}
5981
5982 bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy, Align Alignment) {
5983 if (!isa<VectorType>(DataTy))
5984 return false;
5985
5986 if (!ST->hasAVX512())
5987 return false;
5988
5989 // The backend can't handle a single element vector.
5990 if (cast<FixedVectorType>(DataTy)->getNumElements() == 1)
5991 return false;
5992
5993 Type *ScalarTy = cast<VectorType>(DataTy)->getElementType();
5994
5995 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
5996 return true;
5997
5998 if (!ScalarTy->isIntegerTy())
5999 return false;
6000
6001 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6002 return IntWidth == 32 || IntWidth == 64 ||
6003 ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
6004}
6005
6006 bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy, Align Alignment) {
6007 return isLegalMaskedExpandLoad(DataTy, Alignment);
6008}
6009
6010bool X86TTIImpl::supportsGather() const {
6011 // Some CPUs have better gather performance than others.
6012 // TODO: Remove the explicit ST->hasAVX512()? That would mean we would only
6013 // enable gather with a -march.
6014 return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2());
6015}
6016
6017 bool X86TTIImpl::forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) {
6018 // Gather / Scatter for vector 2 is not profitable on KNL / SKX
6019 // Vector-4 of gather/scatter instruction does not exist on KNL. We can extend
6020 // it to 8 elements, but zeroing upper bits of the mask vector will add more
6021 // instructions. Right now we give the scalar cost of vector-4 for KNL. TODO:
6022 // Check, maybe the gather/scatter instruction is better in the VariableMask
6023 // case.
6024 unsigned NumElts = cast<FixedVectorType>(VTy)->getNumElements();
6025 return NumElts == 1 ||
6026 (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX())));
6027}
6028
6029 bool X86TTIImpl::isLegalMaskedGatherScatter(Type *DataTy, Align Alignment) {
6030 Type *ScalarTy = DataTy->getScalarType();
6031 if (ScalarTy->isPointerTy())
6032 return true;
6033
6034 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6035 return true;
6036
6037 if (!ScalarTy->isIntegerTy())
6038 return false;
6039
6040 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6041 return IntWidth == 32 || IntWidth == 64;
6042}
6043
6045 if (!supportsGather() || !ST->preferGather())
6046 return false;
6047 return isLegalMaskedGatherScatter(DataTy, Alignment);
6048}
6049
6050bool X86TTIImpl::isLegalAltInstr(VectorType *VecTy, unsigned Opcode0,
6051 unsigned Opcode1,
6052 const SmallBitVector &OpcodeMask) const {
6053 // ADDSUBPS 4xf32 SSE3
6054 // VADDSUBPS 4xf32 AVX
6055 // VADDSUBPS 8xf32 AVX2
6056 // ADDSUBPD 2xf64 SSE3
6057 // VADDSUBPD 2xf64 AVX
6058 // VADDSUBPD 4xf64 AVX2
6059
6060 unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
6061 assert(OpcodeMask.size() == NumElements && "Mask and VecTy are incompatible");
6062 if (!isPowerOf2_32(NumElements))
6063 return false;
6064 // Check the opcode pattern. We apply the mask on the opcode arguments and
6065 // then check if it is what we expect.
6066 for (int Lane : seq<int>(0, NumElements)) {
6067 unsigned Opc = OpcodeMask.test(Lane) ? Opcode1 : Opcode0;
6068 // We expect FSub for even lanes and FAdd for odd lanes.
6069 if (Lane % 2 == 0 && Opc != Instruction::FSub)
6070 return false;
6071 if (Lane % 2 == 1 && Opc != Instruction::FAdd)
6072 return false;
6073 }
6074 // Now check that the pattern is supported by the target ISA.
6075 Type *ElemTy = cast<VectorType>(VecTy)->getElementType();
6076 if (ElemTy->isFloatTy())
6077 return ST->hasSSE3() && NumElements % 4 == 0;
6078 if (ElemTy->isDoubleTy())
6079 return ST->hasSSE3() && NumElements % 2 == 0;
6080 return false;
6081}
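// E.g. the SLP vectorizer may query Opcode0 = FSub, Opcode1 = FAdd with
// OpcodeMask 0b1010 over <4 x float>: fsub in the even lanes and fadd in the
// odd lanes matches the (V)ADDSUBPS pattern above, so this returns true on
// SSE3 and later.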
6082
6083bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) {
6084 // AVX2 doesn't support scatter
6085 if (!ST->hasAVX512() || !ST->preferScatter())
6086 return false;
6087 return isLegalMaskedGatherScatter(DataType, Alignment);
6088}
6089
6090bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
6091 EVT VT = TLI->getValueType(DL, DataType);
6092 return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
6093}
6094
6095 bool X86TTIImpl::isExpensiveToSpeculativelyExecute(const Instruction *I) {
6096 // FDIV is always expensive, even if it has a very low uop count.
6097 // TODO: Still necessary for recent CPUs with low latency/throughput fdiv?
6098 if (I->getOpcode() == Instruction::FDiv)
6099 return true;
6100
6101 return BaseT::isExpensiveToSpeculativelyExecute(I);
6102 }
6103
6104 bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
6105 return false;
6106}
6107
6108 bool X86TTIImpl::areInlineCompatible(const Function *Caller,
6109 const Function *Callee) const {
6110 const TargetMachine &TM = getTLI()->getTargetMachine();
6111
6112 // Work this as a subsetting of subtarget features.
6113 const FeatureBitset &CallerBits =
6114 TM.getSubtargetImpl(*Caller)->getFeatureBits();
6115 const FeatureBitset &CalleeBits =
6116 TM.getSubtargetImpl(*Callee)->getFeatureBits();
6117
6118 // Check whether features are the same (apart from the ignore list).
6119 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
6120 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
6121 if (RealCallerBits == RealCalleeBits)
6122 return true;
6123
6124 // If the features are a subset, we need to additionally check for calls
6125 // that may become ABI-incompatible as a result of inlining.
6126 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
6127 return false;
6128
6129 for (const Instruction &I : instructions(Callee)) {
6130 if (const auto *CB = dyn_cast<CallBase>(&I)) {
6131 // Having more target features is fine for inline ASM.
6132 if (CB->isInlineAsm())
6133 continue;
6134
6135 SmallVector<Type *, 8> Types;
6136 for (Value *Arg : CB->args())
6137 Types.push_back(Arg->getType());
6138 if (!CB->getType()->isVoidTy())
6139 Types.push_back(CB->getType());
6140
6141 // Simple types are always ABI compatible.
6142 auto IsSimpleTy = [](Type *Ty) {
6143 return !Ty->isVectorTy() && !Ty->isAggregateType();
6144 };
6145 if (all_of(Types, IsSimpleTy))
6146 continue;
6147
6148 if (Function *NestedCallee = CB->getCalledFunction()) {
6149 // Assume that intrinsics are always ABI compatible.
6150 if (NestedCallee->isIntrinsic())
6151 continue;
6152
6153 // Do a precise compatibility check.
6154 if (!areTypesABICompatible(Caller, NestedCallee, Types))
6155 return false;
6156 } else {
6157 // We don't know the target features of the callee,
6158 // assume it is incompatible.
6159 return false;
6160 }
6161 }
6162 }
6163 return true;
6164}
6165
6166 bool X86TTIImpl::areTypesABICompatible(const Function *Caller,
6167 const Function *Callee,
6168 const ArrayRef<Type *> &Types) const {
6169 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
6170 return false;
6171
6172 // If we get here, we know the target features match. If one function
6173 // considers 512-bit vectors legal and the other does not, consider them
6174 // incompatible.
6175 const TargetMachine &TM = getTLI()->getTargetMachine();
6176
6177 if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
6178 TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs())
6179 return true;
6180
6181 // Consider the arguments compatible if they aren't vectors or aggregates.
6182 // FIXME: Look at the size of vectors.
6183 // FIXME: Look at the element types of aggregates to see if there are vectors.
6184 return llvm::none_of(Types,
6185 [](Type *T) { return T->isVectorTy() || T->isAggregateType(); });
6186}
6187
6188 X86TTIImpl::TTI::MemCmpExpansionOptions
6189 X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
6190 TTI::MemCmpExpansionOptions Options;
6191 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
6192 Options.NumLoadsPerBlock = 2;
6193 // All GPR and vector loads can be unaligned.
6194 Options.AllowOverlappingLoads = true;
6195 if (IsZeroCmp) {
6196 // Only enable vector loads for equality comparison. Right now the vector
6197 // version is not as fast for three way compare (see #33329).
6198 const unsigned PreferredWidth = ST->getPreferVectorWidth();
6199 if (PreferredWidth >= 512 && ST->hasAVX512() && ST->hasEVEX512())
6200 Options.LoadSizes.push_back(64);
6201 if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32);
6202 if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
6203 }
6204 if (ST->is64Bit()) {
6205 Options.LoadSizes.push_back(8);
6206 }
6207 Options.LoadSizes.push_back(4);
6208 Options.LoadSizes.push_back(2);
6209 Options.LoadSizes.push_back(1);
6210 return Options;
6211}
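// E.g. on a 64-bit AVX2 subtarget with a 256-bit preferred vector width and
// IsZeroCmp, LoadSizes becomes {32, 16, 8, 4, 2, 1}; with overlapping loads
// allowed, an equality memcmp of 24 bytes can expand to two (partially
// overlapping) 16-byte compares instead of a libcall.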
6212
6213 bool X86TTIImpl::prefersVectorizedAddressing() const {
6214 return supportsGather();
6215}
6216
6217 bool X86TTIImpl::supportsEfficientVectorElementLoadStore() const {
6218 return false;
6219}
6220
6221 bool X86TTIImpl::enableInterleavedAccessVectorization() {
6222 // TODO: We expect this to be beneficial regardless of arch,
6223 // but there are currently some unexplained performance artifacts on Atom.
6224 // As a temporary solution, disable on Atom.
6225 return !(ST->isAtom());
6226}
6227
6228 // Get estimation for interleaved load/store operations and strided load.
6229 // \p Indices contains indices for strided load.
6230 // \p Factor - the factor of interleaving.
6231 // AVX-512 provides 3-src shuffles that significantly reduce the cost.
6232 InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
6233 unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
6234 ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
6235 TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
6236 // VecTy for interleave memop is <VF*Factor x Elt>.
6237 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6238 // VecTy = <12 x i32>.
6239
6240 // Calculate the number of memory operations (NumOfMemOps), required
6241 // for load/store the VecTy.
6242 MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6243 unsigned VecTySize = DL.getTypeStoreSize(VecTy);
6244 unsigned LegalVTSize = LegalVT.getStoreSize();
6245 unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
6246
6247 // Get the cost of one memory operation.
6248 auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
6249 LegalVT.getVectorNumElements());
6250 InstructionCost MemOpCost;
6251 bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps;
6252 if (UseMaskedMemOp)
6253 MemOpCost = getMaskedMemoryOpCost(Opcode, SingleMemOpTy, Alignment,
6254 AddressSpace, CostKind);
6255 else
6256 MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, MaybeAlign(Alignment),
6257 AddressSpace, CostKind);
6258
6259 unsigned VF = VecTy->getNumElements() / Factor;
6260 MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);
6261
6262 InstructionCost MaskCost;
6263 if (UseMaskedMemOp) {
6264 APInt DemandedLoadStoreElts = APInt::getZero(VecTy->getNumElements());
6265 for (unsigned Index : Indices) {
6266 assert(Index < Factor && "Invalid index for interleaved memory op");
6267 for (unsigned Elm = 0; Elm < VF; Elm++)
6268 DemandedLoadStoreElts.setBit(Index + Elm * Factor);
6269 }
6270
6271 Type *I1Type = Type::getInt1Ty(VecTy->getContext());
6272
6273 MaskCost = getReplicationShuffleCost(
6274 I1Type, Factor, VF,
6275 UseMaskForGaps ? DemandedLoadStoreElts
6276 : APInt::getAllOnes(VecTy->getNumElements()),
6277 CostKind);
6278
6279 // The Gaps mask is invariant and created outside the loop, therefore the
6280 // cost of creating it is not accounted for here. However if we have both
6281 // a MaskForGaps and some other mask that guards the execution of the
6282 // memory access, we need to account for the cost of And-ing the two masks
6283 // inside the loop.
6284 if (UseMaskForGaps) {
6285 auto *MaskVT = FixedVectorType::get(I1Type, VecTy->getNumElements());
6286 MaskCost += getArithmeticInstrCost(BinaryOperator::And, MaskVT, CostKind);
6287 }
6288 }
6289
6290 if (Opcode == Instruction::Load) {
6291 // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
6292 // contain the cost of the optimized shuffle sequence that the
6293 // X86InterleavedAccess pass will generate.
6294 // The cost of loads and stores are computed separately from the table.
6295
6296 // X86InterleavedAccess support only the following interleaved-access group.
6297 static const CostTblEntry AVX512InterleavedLoadTbl[] = {
6298 {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8
6299 {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8
6300 {3, MVT::v64i8, 22}, //(load 96i8 and) deinterleave into 3 x 32i8
6301 };
6302
6303 if (const auto *Entry =
6304 CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
6305 return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6306 // If an entry does not exist, fall back to the default implementation.
6307
6308    // The kind of shuffle depends on the number of loaded values.
6309 // If we load the entire data in one register, we can use a 1-src shuffle.
6310 // Otherwise, we'll merge 2 sources in each operation.
6311 TTI::ShuffleKind ShuffleKind =
6312 (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
6313
6314 InstructionCost ShuffleCost = getShuffleCost(
6315 ShuffleKind, SingleMemOpTy, std::nullopt, CostKind, 0, nullptr);
6316
6317 unsigned NumOfLoadsInInterleaveGrp =
6318 Indices.size() ? Indices.size() : Factor;
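    // E.g., a strided load of members {0, 2} requests Indices.size() = 2
    // results; an empty Indices list means all Factor members are demanded.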
6319 auto *ResultTy = FixedVectorType::get(VecTy->getElementType(),
6320 VecTy->getNumElements() / Factor);
6321 InstructionCost NumOfResults =
6322 getTypeLegalizationCost(ResultTy).first * NumOfLoadsInInterleaveGrp;
6323
6324    // About half of the loads may be folded into shuffles when we have only
6325 // one result. If we have more than one result, or the loads are masked,
6326 // we do not fold loads at all.
6327 unsigned NumOfUnfoldedLoads =
6328 UseMaskedMemOp || NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
6329
6330    // Get the number of shuffle operations per result.
6331 unsigned NumOfShufflesPerResult =
6332 std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
6333
6334    // The SK_PermuteTwoSrc shuffle clobbers one of its source operands.
6335 // When we have more than one destination, we need additional instructions
6336 // to keep sources.
6337 InstructionCost NumOfMoves = 0;
6338 if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
6339 NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
6340
6341 InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
6342 MaskCost + NumOfUnfoldedLoads * MemOpCost +
6343 NumOfMoves;
6344
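  // Worked example (illustrative): a Factor = 2, VF = 16, i32 load on AVX-512
  // has VecTy = <32 x i32>, LegalVT = v16i32, NumOfMemOps = 2, NumOfResults = 2
  // and NumOfShufflesPerResult = 1, so without masking the total is
  // 2 * ShuffleCost + 2 * MemOpCost + 1 move.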
6345 return Cost;
6346 }
6347
6348 // Store.
6349 assert(Opcode == Instruction::Store &&
6350 "Expected Store Instruction at this point");
6351  // X86InterleavedAccess supports only the following interleaved-access groups.
6352 static const CostTblEntry AVX512InterleavedStoreTbl[] = {
6353 {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
6354 {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
6355      {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 192i8 (and store)
6356
6357 {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store)
6358 {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
6359 {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
6360      {4, MVT::v64i8, 24}  // interleave 4 x 64i8 into 256i8 (and store)
6361 };
6362
6363 if (const auto *Entry =
6364 CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
6365 return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6366  // If an entry does not exist, fall back to the default implementation.
6367
6368  // There are no strided stores at the moment, and a store can't be folded
6369  // into a shuffle.
6370 unsigned NumOfSources = Factor; // The number of values to be merged.
6371 InstructionCost ShuffleCost = getShuffleCost(
6372 TTI::SK_PermuteTwoSrc, SingleMemOpTy, std::nullopt, CostKind, 0, nullptr);
6373 unsigned NumOfShufflesPerStore = NumOfSources - 1;
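  // E.g., merging Factor = 3 sources takes NumOfShufflesPerStore = 2
  // two-source shuffles for each register that gets stored.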
6374
6375  // The SK_PermuteTwoSrc shuffle clobbers one of its source operands.
6376  // We need additional instructions to keep sources.
6377  unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
6378  InstructionCost Cost =
6379      MaskCost +
6380 NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
6381 NumOfMoves;
6382 return Cost;
6383}
6384
6385InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
6386    unsigned Opcode, Type *BaseTy, unsigned Factor, ArrayRef<unsigned> Indices,
6387 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
6388 bool UseMaskForCond, bool UseMaskForGaps) {
6389 auto *VecTy = cast<FixedVectorType>(BaseTy);
6390
6391 auto isSupportedOnAVX512 = [&](Type *VecTy) {
6392 Type *EltTy = cast<VectorType>(VecTy)->getElementType();
6393 if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
6394 EltTy->isIntegerTy(32) || EltTy->isPointerTy())
6395 return true;
6396 if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy())
6397 return ST->hasBWI();
6398 if (EltTy->isBFloatTy())
6399 return ST->hasBF16();
6400 return false;
6401 };
6402  if (ST->hasAVX512() && isSupportedOnAVX512(VecTy))
6403    return getInterleavedMemoryOpCostAVX512(
6404        Opcode, VecTy, Factor, Indices, Alignment,
6405 AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
6406
6407 if (UseMaskForCond || UseMaskForGaps)
6408 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6409 Alignment, AddressSpace, CostKind,
6410 UseMaskForCond, UseMaskForGaps);
6411
6412  // Get estimation for interleaved load/store operations for SSE-AVX2.
6413  // As opposed to AVX-512, SSE-AVX2 targets do not have generic shuffles
6414  // that would allow computing the cost with a single formula as a function
6415  // of shuffle costs. We therefore use a lookup table instead, filled
6416  // according to the instruction sequences that codegen currently generates.
6417
6418 // VecTy for interleave memop is <VF*Factor x Elt>.
6419 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6420 // VecTy = <12 x i32>.
6421 MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6422
6423 // This function can be called with VecTy=<6xi128>, Factor=3, in which case
6424  // VF=2, while v2i128 is an unsupported MVT vector type
6425 // (see MachineValueType.h::getVectorVT()).
6426 if (!LegalVT.isVector())
6427 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6428 Alignment, AddressSpace, CostKind);
6429
6430 unsigned VF = VecTy->getNumElements() / Factor;
6431 Type *ScalarTy = VecTy->getElementType();
6432  // To deduplicate table entries, model floats/pointers as appropriately-sized integers.
6433 if (!ScalarTy->isIntegerTy())
6434 ScalarTy =
6435 Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy));
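  // E.g., <8 x float> is costed as <8 x i32>, and a vector of pointers as a
  // vector of i64 on a 64-bit target.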
6436
6437 // Get the cost of all the memory operations.
6438 // FIXME: discount dead loads.
6439 InstructionCost MemOpCosts = getMemoryOpCost(
6440 Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, CostKind);
6441
6442 auto *VT = FixedVectorType::get(ScalarTy, VF);
6443 EVT ETy = TLI->getValueType(DL, VT);
6444 if (!ETy.isSimple())
6445 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6446 Alignment, AddressSpace, CostKind);
6447
6448 // TODO: Complete for other data-types and strides.
6449  // Each combination of Stride, element bit width and VF results in a different
6450  // sequence; the cost tables are therefore accessed with:
6451  // Factor (stride) and VectorType=VFxiN.
6452  // The Cost accounts only for the shuffle sequence;
6453  // the cost of the loads/stores is accounted for separately.
6454 //
6455 static const CostTblEntry AVX2InterleavedLoadTbl[] = {
6456 {2, MVT::v2i8, 2}, // (load 4i8 and) deinterleave into 2 x 2i8
6457 {2, MVT::v4i8, 2}, // (load 8i8 and) deinterleave into 2 x 4i8
6458 {2, MVT::v8i8, 2}, // (load 16i8 and) deinterleave into 2 x 8i8
6459 {2, MVT::v16i8, 4}, // (load 32i8 and) deinterleave into 2 x 16i8
6460 {2, MVT::v32i8, 6}, // (load 64i8 and) deinterleave into 2 x 32i8
6461
6462 {2, MVT::v8i16, 6}, // (load 16i16 and) deinterleave into 2 x 8i16
6463 {2, MVT::v16i16, 9}, // (load 32i16 and) deinterleave into 2 x 16i16
6464 {2, MVT::v32i16, 18}, // (load 64i16 and) deinterleave into 2 x 32i16
6465
6466 {2, MVT::v8i32, 4}, // (load 16i32 and) deinterleave into 2 x 8i32
6467 {2, MVT::v16i32, 8}, // (load 32i32 and) deinterleave into 2 x 16i32
6468 {2, MVT::v32i32, 16}, // (load 64i32 and) deinterleave into 2 x 32i32
6469
6470 {2, MVT::v4i64, 4}, // (load 8i64 and) deinterleave into 2 x 4i64
6471 {2, MVT::v8i64, 8}, // (load 16i64 and) deinterleave into 2 x 8i64
6472 {2, MVT::v16i64, 16}, // (load 32i64 and) deinterleave into 2 x 16i64
6473 {2, MVT::v32i64, 32}, // (load 64i64 and) deinterleave into 2 x 32i64
6474
6475 {3, MVT::v2i8, 3}, // (load 6i8 and) deinterleave into 3 x 2i8
6476 {3, MVT::v4i8, 3}, // (load 12i8 and) deinterleave into 3 x 4i8
6477 {3, MVT::v8i8, 6}, // (load 24i8 and) deinterleave into 3 x 8i8
6478 {3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8
6479 {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8
6480
6481 {3, MVT::v2i16, 5}, // (load 6i16 and) deinterleave into 3 x 2i16
6482 {3, MVT::v4i16, 7}, // (load 12i16 and) deinterleave into 3 x 4i16
6483 {3, MVT::v8i16, 9}, // (load 24i16 and) deinterleave into 3 x 8i16
6484 {3, MVT::v16i16, 28}, // (load 48i16 and) deinterleave into 3 x 16i16
6485 {3, MVT::v32i16, 56}, // (load 96i16 and) deinterleave into 3 x 32i16
6486
6487 {3, MVT::v2i32, 3}, // (load 6i32 and) deinterleave into 3 x 2i32
6488 {3, MVT::v4i32, 3}, // (load 12i32 and) deinterleave into 3 x 4i32
6489 {3, MVT::v8i32, 7}, // (load 24i32 and) deinterleave into 3 x 8i32
6490 {3, MVT::v16i32, 14}, // (load 48i32 and) deinterleave into 3 x 16i32
6491 {3, MVT::v32i32, 32}, // (load 96i32 and) deinterleave into 3 x 32i32
6492
6493 {3, MVT::v2i64, 1}, // (load 6i64 and) deinterleave into 3 x 2i64
6494 {3, MVT::v4i64, 5}, // (load 12i64 and) deinterleave into 3 x 4i64
6495 {3, MVT::v8i64, 10}, // (load 24i64 and) deinterleave into 3 x 8i64
6496 {3, MVT::v16i64, 20}, // (load 48i64 and) deinterleave into 3 x 16i64
6497
6498 {4, MVT::v2i8, 4}, // (load 8i8 and) deinterleave into 4 x 2i8
6499 {4, MVT::v4i8, 4}, // (load 16i8 and) deinterleave into 4 x 4i8
6500 {4, MVT::v8i8, 12}, // (load 32i8 and) deinterleave into 4 x 8i8
6501 {4, MVT::v16i8, 24}, // (load 64i8 and) deinterleave into 4 x 16i8
6502 {4, MVT::v32i8, 56}, // (load 128i8 and) deinterleave into 4 x 32i8
6503
6504 {4, MVT::v2i16, 6}, // (load 8i16 and) deinterleave into 4 x 2i16
6505 {4, MVT::v4i16, 17}, // (load 16i16 and) deinterleave into 4 x 4i16
6506 {4, MVT::v8i16, 33}, // (load 32i16 and) deinterleave into 4 x 8i16
6507 {4, MVT::v16i16, 75}, // (load 64i16 and) deinterleave into 4 x 16i16
6508 {4, MVT::v32i16, 150}, // (load 128i16 and) deinterleave into 4 x 32i16
6509
6510 {4, MVT::v2i32, 4}, // (load 8i32 and) deinterleave into 4 x 2i32
6511 {4, MVT::v4i32, 8}, // (load 16i32 and) deinterleave into 4 x 4i32
6512 {4, MVT::v8i32, 16}, // (load 32i32 and) deinterleave into 4 x 8i32
6513 {4, MVT::v16i32, 32}, // (load 64i32 and) deinterleave into 4 x 16i32
6514 {4, MVT::v32i32, 68}, // (load 128i32 and) deinterleave into 4 x 32i32
6515
6516 {4, MVT::v2i64, 6}, // (load 8i64 and) deinterleave into 4 x 2i64
6517 {4, MVT::v4i64, 8}, // (load 16i64 and) deinterleave into 4 x 4i64
6518 {4, MVT::v8i64, 20}, // (load 32i64 and) deinterleave into 4 x 8i64
6519 {4, MVT::v16i64, 40}, // (load 64i64 and) deinterleave into 4 x 16i64
6520
6521 {6, MVT::v2i8, 6}, // (load 12i8 and) deinterleave into 6 x 2i8
6522 {6, MVT::v4i8, 14}, // (load 24i8 and) deinterleave into 6 x 4i8
6523 {6, MVT::v8i8, 18}, // (load 48i8 and) deinterleave into 6 x 8i8
6524 {6, MVT::v16i8, 43}, // (load 96i8 and) deinterleave into 6 x 16i8
6525 {6, MVT::v32i8, 82}, // (load 192i8 and) deinterleave into 6 x 32i8
6526
6527 {6, MVT::v2i16, 13}, // (load 12i16 and) deinterleave into 6 x 2i16
6528 {6, MVT::v4i16, 9}, // (load 24i16 and) deinterleave into 6 x 4i16
6529 {6, MVT::v8i16, 39}, // (load 48i16 and) deinterleave into 6 x 8i16
6530 {6, MVT::v16i16, 106}, // (load 96i16 and) deinterleave into 6 x 16i16
6531 {6, MVT::v32i16, 212}, // (load 192i16 and) deinterleave into 6 x 32i16
6532
6533 {6, MVT::v2i32, 6}, // (load 12i32 and) deinterleave into 6 x 2i32
6534 {6, MVT::v4i32, 15}, // (load 24i32 and) deinterleave into 6 x 4i32
6535 {6, MVT::v8i32, 31}, // (load 48i32 and) deinterleave into 6 x 8i32
6536 {6, MVT::v16i32, 64}, // (load 96i32 and) deinterleave into 6 x 16i32
6537
6538 {6, MVT::v2i64, 6}, // (load 12i64 and) deinterleave into 6 x 2i64
6539 {6, MVT::v4i64, 18}, // (load 24i64 and) deinterleave into 6 x 4i64
6540 {6, MVT::v8i64, 36}, // (load 48i64 and) deinterleave into 6 x 8i64
6541
6542 {8, MVT::v8i32, 40} // (load 64i32 and) deinterleave into 8 x 8i32
6543 };
6544
6545 static const CostTblEntry SSSE3InterleavedLoadTbl[] = {
6546 {2, MVT::v4i16, 2}, // (load 8i16 and) deinterleave into 2 x 4i16
6547 };
6548
6549 static const CostTblEntry SSE2InterleavedLoadTbl[] = {
6550 {2, MVT::v2i16, 2}, // (load 4i16 and) deinterleave into 2 x 2i16
6551 {2, MVT::v4i16, 7}, // (load 8i16 and) deinterleave into 2 x 4i16
6552
6553 {2, MVT::v2i32, 2}, // (load 4i32 and) deinterleave into 2 x 2i32
6554 {2, MVT::v4i32, 2}, // (load 8i32 and) deinterleave into 2 x 4i32
6555
6556 {2, MVT::v2i64, 2}, // (load 4i64 and) deinterleave into 2 x 2i64
6557 };
6558
6559 static const CostTblEntry AVX2InterleavedStoreTbl[] = {
6560 {2, MVT::v16i8, 3}, // interleave 2 x 16i8 into 32i8 (and store)
6561 {2, MVT::v32i8, 4}, // interleave 2 x 32i8 into 64i8 (and store)
6562
6563 {2, MVT::v8i16, 3}, // interleave 2 x 8i16 into 16i16 (and store)
6564 {2, MVT::v16i16, 4}, // interleave 2 x 16i16 into 32i16 (and store)
6565 {2, MVT::v32i16, 8}, // interleave 2 x 32i16 into 64i16 (and store)
6566
6567 {2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32 (and store)
6568 {2, MVT::v8i32, 4}, // interleave 2 x 8i32 into 16i32 (and store)
6569 {2, MVT::v16i32, 8}, // interleave 2 x 16i32 into 32i32 (and store)
6570 {2, MVT::v32i32, 16}, // interleave 2 x 32i32 into 64i32 (and store)
6571
6572 {2, MVT::v2i64, 2}, // interleave 2 x 2i64 into 4i64 (and store)
6573 {2, MVT::v4i64, 4}, // interleave 2 x 4i64 into 8i64 (and store)
6574 {2, MVT::v8i64, 8}, // interleave 2 x 8i64 into 16i64 (and store)
6575 {2, MVT::v16i64, 16}, // interleave 2 x 16i64 into 32i64 (and store)
6576 {2, MVT::v32i64, 32}, // interleave 2 x 32i64 into 64i64 (and store)
6577
6578 {3, MVT::v2i8, 4}, // interleave 3 x 2i8 into 6i8 (and store)
6579 {3, MVT::v4i8, 4}, // interleave 3 x 4i8 into 12i8 (and store)
6580 {3, MVT::v8i8, 6}, // interleave 3 x 8i8 into 24i8 (and store)
6581 {3, MVT::v16i8, 11}, // interleave 3 x 16i8 into 48i8 (and store)
6582 {3, MVT::v32i8, 13}, // interleave 3 x 32i8 into 96i8 (and store)
6583
6584 {3, MVT::v2i16, 4}, // interleave 3 x 2i16 into 6i16 (and store)
6585 {3, MVT::v4i16, 6}, // interleave 3 x 4i16 into 12i16 (and store)
6586 {3, MVT::v8i16, 12}, // interleave 3 x 8i16 into 24i16 (and store)
6587 {3, MVT::v16i16, 27}, // interleave 3 x 16i16 into 48i16 (and store)
6588 {3, MVT::v32i16, 54}, // interleave 3 x 32i16 into 96i16 (and store)
6589
6590 {3, MVT::v2i32, 4}, // interleave 3 x 2i32 into 6i32 (and store)
6591 {3, MVT::v4i32, 5}, // interleave 3 x 4i32 into 12i32 (and store)
6592 {3, MVT::v8i32, 11}, // interleave 3 x 8i32 into 24i32 (and store)
6593 {3, MVT::v16i32, 22}, // interleave 3 x 16i32 into 48i32 (and store)
6594 {3, MVT::v32i32, 48}, // interleave 3 x 32i32 into 96i32 (and store)
6595
6596 {3, MVT::v2i64, 4}, // interleave 3 x 2i64 into 6i64 (and store)
6597 {3, MVT::v4i64, 6}, // interleave 3 x 4i64 into 12i64 (and store)
6598 {3, MVT::v8i64, 12}, // interleave 3 x 8i64 into 24i64 (and store)
6599 {3, MVT::v16i64, 24}, // interleave 3 x 16i64 into 48i64 (and store)
6600
6601 {4, MVT::v2i8, 4}, // interleave 4 x 2i8 into 8i8 (and store)
6602 {4, MVT::v4i8, 4}, // interleave 4 x 4i8 into 16i8 (and store)
6603 {4, MVT::v8i8, 4}, // interleave 4 x 8i8 into 32i8 (and store)
6604 {4, MVT::v16i8, 8}, // interleave 4 x 16i8 into 64i8 (and store)
6605 {4, MVT::v32i8, 12}, // interleave 4 x 32i8 into 128i8 (and store)
6606
6607 {4, MVT::v2i16, 2}, // interleave 4 x 2i16 into 8i16 (and store)
6608 {4, MVT::v4i16, 6}, // interleave 4 x 4i16 into 16i16 (and store)
6609 {4, MVT::v8i16, 10}, // interleave 4 x 8i16 into 32i16 (and store)
6610 {4, MVT::v16i16, 32}, // interleave 4 x 16i16 into 64i16 (and store)
6611 {4, MVT::v32i16, 64}, // interleave 4 x 32i16 into 128i16 (and store)
6612
6613 {4, MVT::v2i32, 5}, // interleave 4 x 2i32 into 8i32 (and store)
6614 {4, MVT::v4i32, 6}, // interleave 4 x 4i32 into 16i32 (and store)
6615 {4, MVT::v8i32, 16}, // interleave 4 x 8i32 into 32i32 (and store)
6616 {4, MVT::v16i32, 32}, // interleave 4 x 16i32 into 64i32 (and store)
6617 {4, MVT::v32i32, 64}, // interleave 4 x 32i32 into 128i32 (and store)
6618
6619 {4, MVT::v2i64, 6}, // interleave 4 x 2i64 into 8i64 (and store)
6620 {4, MVT::v4i64, 8}, // interleave 4 x 4i64 into 16i64 (and store)
6621 {4, MVT::v8i64, 20}, // interleave 4 x 8i64 into 32i64 (and store)
6622 {4, MVT::v16i64, 40}, // interleave 4 x 16i64 into 64i64 (and store)
6623
6624 {6, MVT::v2i8, 7}, // interleave 6 x 2i8 into 12i8 (and store)
6625 {6, MVT::v4i8, 9}, // interleave 6 x 4i8 into 24i8 (and store)
6626 {6, MVT::v8i8, 16}, // interleave 6 x 8i8 into 48i8 (and store)
6627 {6, MVT::v16i8, 27}, // interleave 6 x 16i8 into 96i8 (and store)
6628 {6, MVT::v32i8, 90}, // interleave 6 x 32i8 into 192i8 (and store)
6629
6630 {6, MVT::v2i16, 10}, // interleave 6 x 2i16 into 12i16 (and store)
6631 {6, MVT::v4i16, 15}, // interleave 6 x 4i16 into 24i16 (and store)
6632 {6, MVT::v8i16, 21}, // interleave 6 x 8i16 into 48i16 (and store)
6633 {6, MVT::v16i16, 58}, // interleave 6 x 16i16 into 96i16 (and store)
6634 {6, MVT::v32i16, 90}, // interleave 6 x 32i16 into 192i16 (and store)
6635
6636 {6, MVT::v2i32, 9}, // interleave 6 x 2i32 into 12i32 (and store)
6637 {6, MVT::v4i32, 12}, // interleave 6 x 4i32 into 24i32 (and store)
6638 {6, MVT::v8i32, 33}, // interleave 6 x 8i32 into 48i32 (and store)
6639 {6, MVT::v16i32, 66}, // interleave 6 x 16i32 into 96i32 (and store)
6640
6641 {6, MVT::v2i64, 8}, // interleave 6 x 2i64 into 12i64 (and store)
6642 {6, MVT::v4i64, 15}, // interleave 6 x 4i64 into 24i64 (and store)
6643 {6, MVT::v8i64, 30}, // interleave 6 x 8i64 into 48i64 (and store)
6644 };
6645
6646 static const CostTblEntry SSE2InterleavedStoreTbl[] = {
6647 {2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8 (and store)
6648 {2, MVT::v4i8, 1}, // interleave 2 x 4i8 into 8i8 (and store)
6649 {2, MVT::v8i8, 1}, // interleave 2 x 8i8 into 16i8 (and store)
6650
6651 {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16 (and store)
6652 {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16 (and store)
6653
6654 {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32 (and store)
6655 };
6656
6657 if (Opcode == Instruction::Load) {
6658 auto GetDiscountedCost = [Factor, NumMembers = Indices.size(),
6659 MemOpCosts](const CostTblEntry *Entry) {
6660 // NOTE: this is just an approximation!
6661      // It can over/under-estimate the cost!
6662 return MemOpCosts + divideCeil(NumMembers * Entry->Cost, Factor);
6663 };
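    // E.g., an AVX2 Factor = 3 load of v8i32 with 2 live members hits the
    // {3, MVT::v8i32, 7} entry above: MemOpCosts + divideCeil(2 * 7, 3) =
    // MemOpCosts + 5.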
6664
6665 if (ST->hasAVX2())
6666 if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor,
6667 ETy.getSimpleVT()))
6668 return GetDiscountedCost(Entry);
6669
6670 if (ST->hasSSSE3())
6671 if (const auto *Entry = CostTableLookup(SSSE3InterleavedLoadTbl, Factor,
6672 ETy.getSimpleVT()))
6673 return GetDiscountedCost(Entry);
6674
6675 if (ST->hasSSE2())
6676 if (const auto *Entry = CostTableLookup(SSE2InterleavedLoadTbl, Factor,
6677 ETy.getSimpleVT()))
6678 return GetDiscountedCost(Entry);
6679 } else {
6680 assert(Opcode == Instruction::Store &&
6681 "Expected Store Instruction at this point");
6682 assert((!Indices.size() || Indices.size() == Factor) &&
6683 "Interleaved store only supports fully-interleaved groups.");
6684 if (ST->hasAVX2())
6685 if (const auto *Entry = CostTableLookup(AVX2InterleavedStoreTbl, Factor,
6686 ETy.getSimpleVT()))
6687 return MemOpCosts + Entry->Cost;
6688
6689 if (ST->hasSSE2())
6690 if (const auto *Entry = CostTableLookup(SSE2InterleavedStoreTbl, Factor,
6691 ETy.getSimpleVT()))
6692 return MemOpCosts + Entry->Cost;
6693 }
6694
6695 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6696 Alignment, AddressSpace, CostKind,
6697 UseMaskForCond, UseMaskForGaps);
6698}
6699
6700InstructionCost X86TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
6701                                                 StackOffset BaseOffset,
6702 bool HasBaseReg, int64_t Scale,
6703 unsigned AddrSpace) const {
6704 // Scaling factors are not free at all.
6705 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
6706 // will take 2 allocations in the out of order engine instead of 1
6707 // for plain addressing mode, i.e. inst (reg1).
6708 // E.g.,
6709 // vaddps (%rsi,%rdx), %ymm0, %ymm1
6710 // Requires two allocations (one for the load, one for the computation)
6711 // whereas:
6712 // vaddps (%rsi), %ymm0, %ymm1
6713 // Requires just 1 allocation, i.e., freeing allocations for other operations
6714  // and having fewer micro operations to execute.
6715 //
6716 // For some X86 architectures, this is even worse because for instance for
6717 // stores, the complex addressing mode forces the instruction to use the
6718 // "load" ports instead of the dedicated "store" port.
6719 // E.g., on Haswell:
6720 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
6721 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
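  // E.g., a mode with an index register such as (%rsi,%rdx,4) has a nonzero
  // Scale and is charged cost 1 below; plain (%rsi) costs 0; an address mode
  // that is not legal for the target returns -1.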
6722  TargetLoweringBase::AddrMode AM;
6723  AM.BaseGV = BaseGV;
6724 AM.BaseOffs = BaseOffset.getFixed();
6725 AM.HasBaseReg = HasBaseReg;
6726 AM.Scale = Scale;
6727 AM.ScalableOffset = BaseOffset.getScalable();
6728 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
6729 // Scale represents reg2 * scale, thus account for 1
6730 // as soon as we use a second register.
6731 return AM.Scale != 0;
6732 return -1;
6733}