LLVM  14.0.0git
X86TargetTransformInfo.cpp
Go to the documentation of this file.
1 //===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements a TargetTransformInfo analysis pass specific to the
10 /// X86 target machine. It uses the target's detailed information to provide
11 /// more precise answers to certain TTI queries, while letting the target
12 /// independent and default TTI implementations handle the rest.
13 ///
14 //===----------------------------------------------------------------------===//
15 /// About Cost Model numbers used below it's necessary to say the following:
16 /// the numbers correspond to some "generic" X86 CPU instead of usage of
17 /// concrete CPU model. Usually the numbers correspond to CPU where the feature
18 /// appeared for the first time. For example, if we do Subtarget.hasSSE42() in
19 /// the lookups below the cost is based on Nehalem as that was the first CPU
20 /// to support that feature level and thus has most likely the worst case cost.
21 /// Some examples of other technologies/CPUs:
22 /// SSE 3 - Pentium4 / Athlon64
23 /// SSE 4.1 - Penryn
24 /// SSE 4.2 - Nehalem
25 /// AVX - Sandy Bridge
26 /// AVX2 - Haswell
27 /// AVX-512 - Xeon Phi / Skylake
28 /// And some examples of instruction target dependent costs (latency)
29 /// divss sqrtss rsqrtss
30 /// AMD K7 11-16 19 3
31 /// Piledriver 9-24 13-15 5
32 /// Jaguar 14 16 2
33 /// Pentium II,III 18 30 2
34 /// Nehalem 7-14 7-18 3
35 /// Haswell 10-13 11 5
36 /// TODO: Develop and implement the target dependent cost model and
37 /// specialize cost numbers for different Cost Model Targets such as throughput,
38 /// code size, latency and uop count.
39 //===----------------------------------------------------------------------===//
40 
41 #include "X86TargetTransformInfo.h"
44 #include "llvm/CodeGen/CostTable.h"
46 #include "llvm/IR/IntrinsicInst.h"
47 #include "llvm/Support/Debug.h"
48 
49 using namespace llvm;
50 
51 #define DEBUG_TYPE "x86tti"
52 
53 //===----------------------------------------------------------------------===//
54 //
55 // X86 cost model.
56 //
57 //===----------------------------------------------------------------------===//
58 
60 X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
61  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
62  // TODO: Currently the __builtin_popcount() implementation using SSE3
63  // instructions is inefficient. Once the problem is fixed, we should
64  // call ST->hasSSE3() instead of ST->hasPOPCNT().
66 }
67 
70  switch (Level) {
72  // - Penryn
73  // - Nehalem
74  // - Westmere
75  // - Sandy Bridge
76  // - Ivy Bridge
77  // - Haswell
78  // - Broadwell
79  // - Skylake
80  // - Kabylake
81  return 32 * 1024; // 32 KByte
83  // - Penryn
84  // - Nehalem
85  // - Westmere
86  // - Sandy Bridge
87  // - Ivy Bridge
88  // - Haswell
89  // - Broadwell
90  // - Skylake
91  // - Kabylake
92  return 256 * 1024; // 256 KByte
93  }
94 
95  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
96 }
97 
100  // - Penryn
101  // - Nehalem
102  // - Westmere
103  // - Sandy Bridge
104  // - Ivy Bridge
105  // - Haswell
106  // - Broadwell
107  // - Skylake
108  // - Kabylake
109  switch (Level) {
113  return 8;
114  }
115 
116  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
117 }
118 
119 unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
120  bool Vector = (ClassID == 1);
121  if (Vector && !ST->hasSSE1())
122  return 0;
123 
124  if (ST->is64Bit()) {
125  if (Vector && ST->hasAVX512())
126  return 32;
127  return 16;
128  }
129  return 8;
130 }
131 
132 TypeSize
134  unsigned PreferVectorWidth = ST->getPreferVectorWidth();
135  switch (K) {
137  return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
139  if (ST->hasAVX512() && PreferVectorWidth >= 512)
140  return TypeSize::getFixed(512);
141  if (ST->hasAVX() && PreferVectorWidth >= 256)
142  return TypeSize::getFixed(256);
143  if (ST->hasSSE1() && PreferVectorWidth >= 128)
144  return TypeSize::getFixed(128);
145  return TypeSize::getFixed(0);
147  return TypeSize::getScalable(0);
148  }
149 
150  llvm_unreachable("Unsupported register kind");
151 }
152 
153 unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
155  .getFixedSize();
156 }
157 
158 unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
159  // If the loop will not be vectorized, don't interleave the loop.
160  // Let regular unroll to unroll the loop, which saves the overflow
161  // check and memory check cost.
162  if (VF == 1)
163  return 1;
164 
165  if (ST->isAtom())
166  return 1;
167 
168  // Sandybridge and Haswell have multiple execution ports and pipelined
169  // vector units.
170  if (ST->hasAVX())
171  return 4;
172 
173  return 2;
174 }
175 
177  unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
179  TTI::OperandValueProperties Opd1PropInfo,
181  const Instruction *CxtI) {
182  // TODO: Handle more cost kinds.
184  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
185  Op2Info, Opd1PropInfo,
186  Opd2PropInfo, Args, CxtI);
187 
188  // vXi8 multiplications are always promoted to vXi16.
189  if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
190  Ty->getScalarSizeInBits() == 8) {
191  Type *WideVecTy =
192  VectorType::getExtendedElementVectorType(cast<VectorType>(Ty));
193  return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
195  CostKind) +
196  getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
198  CostKind) +
199  getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info,
200  Opd1PropInfo, Opd2PropInfo);
201  }
202 
203  // Legalize the type.
204  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
205 
206  int ISD = TLI->InstructionOpcodeToISD(Opcode);
207  assert(ISD && "Invalid opcode");
208 
209  if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() &&
210  LT.second.getScalarType() == MVT::i32) {
211  // Check if the operands can be represented as a smaller datatype.
212  bool Op1Signed = false, Op2Signed = false;
213  unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
214  unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
215  unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
216 
217  // If both are representable as i15 and at least one is constant,
218  // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we
219  // can treat this as PMADDWD which has the same costs as a vXi16 multiply.
220  if (OpMinSize <= 15 && !ST->isPMADDWDSlow()) {
221  bool Op1Constant =
222  isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]);
223  bool Op2Constant =
224  isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]);
225  bool Op1Sext = isa<SExtInst>(Args[0]) &&
226  (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41()));
227  bool Op2Sext = isa<SExtInst>(Args[1]) &&
228  (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41()));
229 
230  bool IsZeroExtended = !Op1Signed || !Op2Signed;
231  bool IsConstant = Op1Constant || Op2Constant;
232  bool IsSext = Op1Sext || Op2Sext;
233  if (IsConstant || IsZeroExtended || IsSext)
234  LT.second =
235  MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements());
236  }
237  }
238 
239  if ((ISD == ISD::MUL || ISD == ISD::SDIV || ISD == ISD::SREM ||
240  ISD == ISD::UDIV || ISD == ISD::UREM) &&
243  Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
244  // Vector multiply by pow2 will be simplified to shifts.
245  if (ISD == ISD::MUL) {
247  Instruction::Shl, Ty, CostKind, Op1Info, Op2Info,
249  return Cost;
250  }
251 
252  if (ISD == ISD::SDIV || ISD == ISD::SREM) {
253  // On X86, vector signed division by constants power-of-two are
254  // normally expanded to the sequence SRA + SRL + ADD + SRA.
255  // The OperandValue properties may not be the same as that of the previous
256  // operation; conservatively assume OP_None.
257  InstructionCost Cost =
258  2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, Op1Info,
261  Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info,
264  Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind, Op1Info,
267 
268  if (ISD == ISD::SREM) {
269  // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
270  Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info,
271  Op2Info);
272  Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info,
273  Op2Info);
274  }
275 
276  return Cost;
277  }
278 
279  // Vector unsigned division/remainder will be simplified to shifts/masks.
280  if (ISD == ISD::UDIV)
281  return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info,
284  // UREM
285  return getArithmeticInstrCost(Instruction::And, Ty, CostKind, Op1Info,
288  }
289 
290  static const CostTblEntry GLMCostTable[] = {
291  { ISD::FDIV, MVT::f32, 18 }, // divss
292  { ISD::FDIV, MVT::v4f32, 35 }, // divps
293  { ISD::FDIV, MVT::f64, 33 }, // divsd
294  { ISD::FDIV, MVT::v2f64, 65 }, // divpd
295  };
296 
297  if (ST->useGLMDivSqrtCosts())
298  if (const auto *Entry = CostTableLookup(GLMCostTable, ISD,
299  LT.second))
300  return LT.first * Entry->Cost;
301 
302  static const CostTblEntry SLMCostTable[] = {
303  { ISD::MUL, MVT::v4i32, 11 }, // pmulld
304  { ISD::MUL, MVT::v8i16, 2 }, // pmullw
305  { ISD::FMUL, MVT::f64, 2 }, // mulsd
306  { ISD::FMUL, MVT::v2f64, 4 }, // mulpd
307  { ISD::FMUL, MVT::v4f32, 2 }, // mulps
308  { ISD::FDIV, MVT::f32, 17 }, // divss
309  { ISD::FDIV, MVT::v4f32, 39 }, // divps
310  { ISD::FDIV, MVT::f64, 32 }, // divsd
311  { ISD::FDIV, MVT::v2f64, 69 }, // divpd
312  { ISD::FADD, MVT::v2f64, 2 }, // addpd
313  { ISD::FSUB, MVT::v2f64, 2 }, // subpd
314  // v2i64/v4i64 mul is custom lowered as a series of long:
315  // multiplies(3), shifts(3) and adds(2)
316  // slm muldq version throughput is 2 and addq throughput 4
317  // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
318  // 3X4 (addq throughput) = 17
319  { ISD::MUL, MVT::v2i64, 17 },
320  // slm addq\subq throughput is 4
321  { ISD::ADD, MVT::v2i64, 4 },
322  { ISD::SUB, MVT::v2i64, 4 },
323  };
324 
325  if (ST->useSLMArithCosts()) {
326  if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) {
327  // Check if the operands can be shrinked into a smaller datatype.
328  // TODO: Merge this into generiic vXi32 MUL patterns above.
329  bool Op1Signed = false;
330  unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
331  bool Op2Signed = false;
332  unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
333 
334  bool SignedMode = Op1Signed || Op2Signed;
335  unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
336 
337  if (OpMinSize <= 7)
338  return LT.first * 3; // pmullw/sext
339  if (!SignedMode && OpMinSize <= 8)
340  return LT.first * 3; // pmullw/zext
341  if (OpMinSize <= 15)
342  return LT.first * 5; // pmullw/pmulhw/pshuf
343  if (!SignedMode && OpMinSize <= 16)
344  return LT.first * 5; // pmullw/pmulhw/pshuf
345  }
346 
347  if (const auto *Entry = CostTableLookup(SLMCostTable, ISD,
348  LT.second)) {
349  return LT.first * Entry->Cost;
350  }
351  }
352 
353  static const CostTblEntry AVX512BWUniformConstCostTable[] = {
354  { ISD::SHL, MVT::v64i8, 2 }, // psllw + pand.
355  { ISD::SRL, MVT::v64i8, 2 }, // psrlw + pand.
356  { ISD::SRA, MVT::v64i8, 4 }, // psrlw, pand, pxor, psubb.
357  };
358 
360  ST->hasBWI()) {
361  if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD,
362  LT.second))
363  return LT.first * Entry->Cost;
364  }
365 
366  static const CostTblEntry AVX512UniformConstCostTable[] = {
367  { ISD::SRA, MVT::v2i64, 1 },
368  { ISD::SRA, MVT::v4i64, 1 },
369  { ISD::SRA, MVT::v8i64, 1 },
370 
371  { ISD::SHL, MVT::v64i8, 4 }, // psllw + pand.
372  { ISD::SRL, MVT::v64i8, 4 }, // psrlw + pand.
373  { ISD::SRA, MVT::v64i8, 8 }, // psrlw, pand, pxor, psubb.
374 
375  { ISD::SDIV, MVT::v16i32, 6 }, // pmuludq sequence
376  { ISD::SREM, MVT::v16i32, 8 }, // pmuludq+mul+sub sequence
377  { ISD::UDIV, MVT::v16i32, 5 }, // pmuludq sequence
378  { ISD::UREM, MVT::v16i32, 7 }, // pmuludq+mul+sub sequence
379  };
380 
382  ST->hasAVX512()) {
383  if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD,
384  LT.second))
385  return LT.first * Entry->Cost;
386  }
387 
388  static const CostTblEntry AVX2UniformConstCostTable[] = {
389  { ISD::SHL, MVT::v32i8, 2 }, // psllw + pand.
390  { ISD::SRL, MVT::v32i8, 2 }, // psrlw + pand.
391  { ISD::SRA, MVT::v32i8, 4 }, // psrlw, pand, pxor, psubb.
392 
393  { ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle.
394 
395  { ISD::SDIV, MVT::v8i32, 6 }, // pmuludq sequence
396  { ISD::SREM, MVT::v8i32, 8 }, // pmuludq+mul+sub sequence
397  { ISD::UDIV, MVT::v8i32, 5 }, // pmuludq sequence
398  { ISD::UREM, MVT::v8i32, 7 }, // pmuludq+mul+sub sequence
399  };
400 
402  ST->hasAVX2()) {
403  if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD,
404  LT.second))
405  return LT.first * Entry->Cost;
406  }
407 
408  static const CostTblEntry SSE2UniformConstCostTable[] = {
409  { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand.
410  { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand.
411  { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb.
412 
413  { ISD::SHL, MVT::v32i8, 4+2 }, // 2*(psllw + pand) + split.
414  { ISD::SRL, MVT::v32i8, 4+2 }, // 2*(psrlw + pand) + split.
415  { ISD::SRA, MVT::v32i8, 8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.
416 
417  { ISD::SDIV, MVT::v8i32, 12+2 }, // 2*pmuludq sequence + split.
418  { ISD::SREM, MVT::v8i32, 16+2 }, // 2*pmuludq+mul+sub sequence + split.
419  { ISD::SDIV, MVT::v4i32, 6 }, // pmuludq sequence
420  { ISD::SREM, MVT::v4i32, 8 }, // pmuludq+mul+sub sequence
421  { ISD::UDIV, MVT::v8i32, 10+2 }, // 2*pmuludq sequence + split.
422  { ISD::UREM, MVT::v8i32, 14+2 }, // 2*pmuludq+mul+sub sequence + split.
423  { ISD::UDIV, MVT::v4i32, 5 }, // pmuludq sequence
424  { ISD::UREM, MVT::v4i32, 7 }, // pmuludq+mul+sub sequence
425  };
426 
427  // XOP has faster vXi8 shifts.
429  ST->hasSSE2() && !ST->hasXOP()) {
430  if (const auto *Entry =
431  CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
432  return LT.first * Entry->Cost;
433  }
434 
435  static const CostTblEntry AVX512BWConstCostTable[] = {
436  { ISD::SDIV, MVT::v64i8, 14 }, // 2*ext+2*pmulhw sequence
437  { ISD::SREM, MVT::v64i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
438  { ISD::UDIV, MVT::v64i8, 14 }, // 2*ext+2*pmulhw sequence
439  { ISD::UREM, MVT::v64i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
440  { ISD::SDIV, MVT::v32i16, 6 }, // vpmulhw sequence
441  { ISD::SREM, MVT::v32i16, 8 }, // vpmulhw+mul+sub sequence
442  { ISD::UDIV, MVT::v32i16, 6 }, // vpmulhuw sequence
443  { ISD::UREM, MVT::v32i16, 8 }, // vpmulhuw+mul+sub sequence
444  };
445 
448  ST->hasBWI()) {
449  if (const auto *Entry =
450  CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
451  return LT.first * Entry->Cost;
452  }
453 
454  static const CostTblEntry AVX512ConstCostTable[] = {
455  { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
456  { ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence
457  { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
458  { ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence
459  { ISD::SDIV, MVT::v64i8, 28 }, // 4*ext+4*pmulhw sequence
460  { ISD::SREM, MVT::v64i8, 32 }, // 4*ext+4*pmulhw+mul+sub sequence
461  { ISD::UDIV, MVT::v64i8, 28 }, // 4*ext+4*pmulhw sequence
462  { ISD::UREM, MVT::v64i8, 32 }, // 4*ext+4*pmulhw+mul+sub sequence
463  { ISD::SDIV, MVT::v32i16, 12 }, // 2*vpmulhw sequence
464  { ISD::SREM, MVT::v32i16, 16 }, // 2*vpmulhw+mul+sub sequence
465  { ISD::UDIV, MVT::v32i16, 12 }, // 2*vpmulhuw sequence
466  { ISD::UREM, MVT::v32i16, 16 }, // 2*vpmulhuw+mul+sub sequence
467  };
468 
471  ST->hasAVX512()) {
472  if (const auto *Entry =
473  CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
474  return LT.first * Entry->Cost;
475  }
476 
477  static const CostTblEntry AVX2ConstCostTable[] = {
478  { ISD::SDIV, MVT::v32i8, 14 }, // 2*ext+2*pmulhw sequence
479  { ISD::SREM, MVT::v32i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
480  { ISD::UDIV, MVT::v32i8, 14 }, // 2*ext+2*pmulhw sequence
481  { ISD::UREM, MVT::v32i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
482  { ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence
483  { ISD::SREM, MVT::v16i16, 8 }, // vpmulhw+mul+sub sequence
484  { ISD::UDIV, MVT::v16i16, 6 }, // vpmulhuw sequence
485  { ISD::UREM, MVT::v16i16, 8 }, // vpmulhuw+mul+sub sequence
486  { ISD::SDIV, MVT::v8i32, 15 }, // vpmuldq sequence
487  { ISD::SREM, MVT::v8i32, 19 }, // vpmuldq+mul+sub sequence
488  { ISD::UDIV, MVT::v8i32, 15 }, // vpmuludq sequence
489  { ISD::UREM, MVT::v8i32, 19 }, // vpmuludq+mul+sub sequence
490  };
491 
494  ST->hasAVX2()) {
495  if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
496  return LT.first * Entry->Cost;
497  }
498 
499  static const CostTblEntry SSE2ConstCostTable[] = {
500  { ISD::SDIV, MVT::v32i8, 28+2 }, // 4*ext+4*pmulhw sequence + split.
501  { ISD::SREM, MVT::v32i8, 32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
502  { ISD::SDIV, MVT::v16i8, 14 }, // 2*ext+2*pmulhw sequence
503  { ISD::SREM, MVT::v16i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
504  { ISD::UDIV, MVT::v32i8, 28+2 }, // 4*ext+4*pmulhw sequence + split.
505  { ISD::UREM, MVT::v32i8, 32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
506  { ISD::UDIV, MVT::v16i8, 14 }, // 2*ext+2*pmulhw sequence
507  { ISD::UREM, MVT::v16i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
508  { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
509  { ISD::SREM, MVT::v16i16, 16+2 }, // 2*pmulhw+mul+sub sequence + split.
510  { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence
511  { ISD::SREM, MVT::v8i16, 8 }, // pmulhw+mul+sub sequence
512  { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split.
513  { ISD::UREM, MVT::v16i16, 16+2 }, // 2*pmulhuw+mul+sub sequence + split.
514  { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence
515  { ISD::UREM, MVT::v8i16, 8 }, // pmulhuw+mul+sub sequence
516  { ISD::SDIV, MVT::v8i32, 38+2 }, // 2*pmuludq sequence + split.
517  { ISD::SREM, MVT::v8i32, 48+2 }, // 2*pmuludq+mul+sub sequence + split.
518  { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence
519  { ISD::SREM, MVT::v4i32, 24 }, // pmuludq+mul+sub sequence
520  { ISD::UDIV, MVT::v8i32, 30+2 }, // 2*pmuludq sequence + split.
521  { ISD::UREM, MVT::v8i32, 40+2 }, // 2*pmuludq+mul+sub sequence + split.
522  { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence
523  { ISD::UREM, MVT::v4i32, 20 }, // pmuludq+mul+sub sequence
524  };
525 
528  ST->hasSSE2()) {
529  // pmuldq sequence.
530  if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
531  return LT.first * 32;
532  if (ISD == ISD::SREM && LT.second == MVT::v8i32 && ST->hasAVX())
533  return LT.first * 38;
534  if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
535  return LT.first * 15;
536  if (ISD == ISD::SREM && LT.second == MVT::v4i32 && ST->hasSSE41())
537  return LT.first * 20;
538 
539  if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
540  return LT.first * Entry->Cost;
541  }
542 
543  static const CostTblEntry AVX512BWShiftCostTable[] = {
544  { ISD::SHL, MVT::v16i8, 4 }, // extend/vpsllvw/pack sequence.
545  { ISD::SRL, MVT::v16i8, 4 }, // extend/vpsrlvw/pack sequence.
546  { ISD::SRA, MVT::v16i8, 4 }, // extend/vpsravw/pack sequence.
547  { ISD::SHL, MVT::v32i8, 4 }, // extend/vpsllvw/pack sequence.
548  { ISD::SRL, MVT::v32i8, 4 }, // extend/vpsrlvw/pack sequence.
549  { ISD::SRA, MVT::v32i8, 6 }, // extend/vpsravw/pack sequence.
550  { ISD::SHL, MVT::v64i8, 6 }, // extend/vpsllvw/pack sequence.
551  { ISD::SRL, MVT::v64i8, 7 }, // extend/vpsrlvw/pack sequence.
552  { ISD::SRA, MVT::v64i8, 15 }, // extend/vpsravw/pack sequence.
553 
554  { ISD::SHL, MVT::v8i16, 1 }, // vpsllvw
555  { ISD::SRL, MVT::v8i16, 1 }, // vpsrlvw
556  { ISD::SRA, MVT::v8i16, 1 }, // vpsravw
557  { ISD::SHL, MVT::v16i16, 1 }, // vpsllvw
558  { ISD::SRL, MVT::v16i16, 1 }, // vpsrlvw
559  { ISD::SRA, MVT::v16i16, 1 }, // vpsravw
560  { ISD::SHL, MVT::v32i16, 1 }, // vpsllvw
561  { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw
562  { ISD::SRA, MVT::v32i16, 1 }, // vpsravw
563  };
564 
565  if (ST->hasBWI())
566  if (const auto *Entry = CostTableLookup(AVX512BWShiftCostTable, ISD, LT.second))
567  return LT.first * Entry->Cost;
568 
569  static const CostTblEntry AVX2UniformCostTable[] = {
570  // Uniform splats are cheaper for the following instructions.
571  { ISD::SHL, MVT::v16i16, 1 }, // psllw.
572  { ISD::SRL, MVT::v16i16, 1 }, // psrlw.
573  { ISD::SRA, MVT::v16i16, 1 }, // psraw.
574  { ISD::SHL, MVT::v32i16, 2 }, // 2*psllw.
575  { ISD::SRL, MVT::v32i16, 2 }, // 2*psrlw.
576  { ISD::SRA, MVT::v32i16, 2 }, // 2*psraw.
577 
578  { ISD::SHL, MVT::v8i32, 1 }, // pslld
579  { ISD::SRL, MVT::v8i32, 1 }, // psrld
580  { ISD::SRA, MVT::v8i32, 1 }, // psrad
581  { ISD::SHL, MVT::v4i64, 1 }, // psllq
582  { ISD::SRL, MVT::v4i64, 1 }, // psrlq
583  };
584 
585  if (ST->hasAVX2() &&
587  (Op2Info == TargetTransformInfo::OK_UniformValue))) {
588  if (const auto *Entry =
589  CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
590  return LT.first * Entry->Cost;
591  }
592 
593  static const CostTblEntry SSE2UniformCostTable[] = {
594  // Uniform splats are cheaper for the following instructions.
595  { ISD::SHL, MVT::v8i16, 1 }, // psllw.
596  { ISD::SHL, MVT::v4i32, 1 }, // pslld
597  { ISD::SHL, MVT::v2i64, 1 }, // psllq.
598 
599  { ISD::SRL, MVT::v8i16, 1 }, // psrlw.
600  { ISD::SRL, MVT::v4i32, 1 }, // psrld.
601  { ISD::SRL, MVT::v2i64, 1 }, // psrlq.
602 
603  { ISD::SRA, MVT::v8i16, 1 }, // psraw.
604  { ISD::SRA, MVT::v4i32, 1 }, // psrad.
605  };
606 
607  if (ST->hasSSE2() &&
609  (Op2Info == TargetTransformInfo::OK_UniformValue))) {
610  if (const auto *Entry =
611  CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
612  return LT.first * Entry->Cost;
613  }
614 
615  static const CostTblEntry AVX512DQCostTable[] = {
616  { ISD::MUL, MVT::v2i64, 2 }, // pmullq
617  { ISD::MUL, MVT::v4i64, 2 }, // pmullq
618  { ISD::MUL, MVT::v8i64, 2 } // pmullq
619  };
620 
621  // Look for AVX512DQ lowering tricks for custom cases.
622  if (ST->hasDQI())
623  if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
624  return LT.first * Entry->Cost;
625 
626  static const CostTblEntry AVX512BWCostTable[] = {
627  { ISD::SHL, MVT::v64i8, 11 }, // vpblendvb sequence.
628  { ISD::SRL, MVT::v64i8, 11 }, // vpblendvb sequence.
629  { ISD::SRA, MVT::v64i8, 24 }, // vpblendvb sequence.
630  };
631 
632  // Look for AVX512BW lowering tricks for custom cases.
633  if (ST->hasBWI())
634  if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
635  return LT.first * Entry->Cost;
636 
637  static const CostTblEntry AVX512CostTable[] = {
638  { ISD::SHL, MVT::v4i32, 1 },
639  { ISD::SRL, MVT::v4i32, 1 },
640  { ISD::SRA, MVT::v4i32, 1 },
641  { ISD::SHL, MVT::v8i32, 1 },
642  { ISD::SRL, MVT::v8i32, 1 },
643  { ISD::SRA, MVT::v8i32, 1 },
644  { ISD::SHL, MVT::v16i32, 1 },
645  { ISD::SRL, MVT::v16i32, 1 },
646  { ISD::SRA, MVT::v16i32, 1 },
647 
648  { ISD::SHL, MVT::v2i64, 1 },
649  { ISD::SRL, MVT::v2i64, 1 },
650  { ISD::SHL, MVT::v4i64, 1 },
651  { ISD::SRL, MVT::v4i64, 1 },
652  { ISD::SHL, MVT::v8i64, 1 },
653  { ISD::SRL, MVT::v8i64, 1 },
654 
655  { ISD::SRA, MVT::v2i64, 1 },
656  { ISD::SRA, MVT::v4i64, 1 },
657  { ISD::SRA, MVT::v8i64, 1 },
658 
659  { ISD::MUL, MVT::v16i32, 1 }, // pmulld (Skylake from agner.org)
660  { ISD::MUL, MVT::v8i32, 1 }, // pmulld (Skylake from agner.org)
661  { ISD::MUL, MVT::v4i32, 1 }, // pmulld (Skylake from agner.org)
662  { ISD::MUL, MVT::v8i64, 6 }, // 3*pmuludq/3*shift/2*add
663 
664  { ISD::FNEG, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
665  { ISD::FADD, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
666  { ISD::FSUB, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
667  { ISD::FMUL, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
668  { ISD::FDIV, MVT::f64, 4 }, // Skylake from http://www.agner.org/
669  { ISD::FDIV, MVT::v2f64, 4 }, // Skylake from http://www.agner.org/
670  { ISD::FDIV, MVT::v4f64, 8 }, // Skylake from http://www.agner.org/
671  { ISD::FDIV, MVT::v8f64, 16 }, // Skylake from http://www.agner.org/
672 
673  { ISD::FNEG, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
674  { ISD::FADD, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
675  { ISD::FSUB, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
676  { ISD::FMUL, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
677  { ISD::FDIV, MVT::f32, 3 }, // Skylake from http://www.agner.org/
678  { ISD::FDIV, MVT::v4f32, 3 }, // Skylake from http://www.agner.org/
679  { ISD::FDIV, MVT::v8f32, 5 }, // Skylake from http://www.agner.org/
680  { ISD::FDIV, MVT::v16f32, 10 }, // Skylake from http://www.agner.org/
681  };
682 
683  if (ST->hasAVX512())
684  if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
685  return LT.first * Entry->Cost;
686 
687  static const CostTblEntry AVX2ShiftCostTable[] = {
688  // Shifts on vXi64/vXi32 on AVX2 is legal even though we declare to
689  // customize them to detect the cases where shift amount is a scalar one.
690  { ISD::SHL, MVT::v4i32, 2 }, // vpsllvd (Haswell from agner.org)
691  { ISD::SRL, MVT::v4i32, 2 }, // vpsrlvd (Haswell from agner.org)
692  { ISD::SRA, MVT::v4i32, 2 }, // vpsravd (Haswell from agner.org)
693  { ISD::SHL, MVT::v8i32, 2 }, // vpsllvd (Haswell from agner.org)
694  { ISD::SRL, MVT::v8i32, 2 }, // vpsrlvd (Haswell from agner.org)
695  { ISD::SRA, MVT::v8i32, 2 }, // vpsravd (Haswell from agner.org)
696  { ISD::SHL, MVT::v2i64, 1 }, // vpsllvq (Haswell from agner.org)
697  { ISD::SRL, MVT::v2i64, 1 }, // vpsrlvq (Haswell from agner.org)
698  { ISD::SHL, MVT::v4i64, 1 }, // vpsllvq (Haswell from agner.org)
699  { ISD::SRL, MVT::v4i64, 1 }, // vpsrlvq (Haswell from agner.org)
700  };
701 
702  if (ST->hasAVX512()) {
703  if (ISD == ISD::SHL && LT.second == MVT::v32i16 &&
706  // On AVX512, a packed v32i16 shift left by a constant build_vector
707  // is lowered into a vector multiply (vpmullw).
708  return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
709  Op1Info, Op2Info,
712  }
713 
714  // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
715  if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
716  if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
719  // On AVX2, a packed v16i16 shift left by a constant build_vector
720  // is lowered into a vector multiply (vpmullw).
721  return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
722  Op1Info, Op2Info,
725 
726  if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
727  return LT.first * Entry->Cost;
728  }
729 
730  static const CostTblEntry XOPShiftCostTable[] = {
731  // 128bit shifts take 1cy, but right shifts require negation beforehand.
732  { ISD::SHL, MVT::v16i8, 1 },
733  { ISD::SRL, MVT::v16i8, 2 },
734  { ISD::SRA, MVT::v16i8, 2 },
735  { ISD::SHL, MVT::v8i16, 1 },
736  { ISD::SRL, MVT::v8i16, 2 },
737  { ISD::SRA, MVT::v8i16, 2 },
738  { ISD::SHL, MVT::v4i32, 1 },
739  { ISD::SRL, MVT::v4i32, 2 },
740  { ISD::SRA, MVT::v4i32, 2 },
741  { ISD::SHL, MVT::v2i64, 1 },
742  { ISD::SRL, MVT::v2i64, 2 },
743  { ISD::SRA, MVT::v2i64, 2 },
744  // 256bit shifts require splitting if AVX2 didn't catch them above.
745  { ISD::SHL, MVT::v32i8, 2+2 },
746  { ISD::SRL, MVT::v32i8, 4+2 },
747  { ISD::SRA, MVT::v32i8, 4+2 },
748  { ISD::SHL, MVT::v16i16, 2+2 },
749  { ISD::SRL, MVT::v16i16, 4+2 },
750  { ISD::SRA, MVT::v16i16, 4+2 },
751  { ISD::SHL, MVT::v8i32, 2+2 },
752  { ISD::SRL, MVT::v8i32, 4+2 },
753  { ISD::SRA, MVT::v8i32, 4+2 },
754  { ISD::SHL, MVT::v4i64, 2+2 },
755  { ISD::SRL, MVT::v4i64, 4+2 },
756  { ISD::SRA, MVT::v4i64, 4+2 },
757  };
758 
759  // Look for XOP lowering tricks.
760  if (ST->hasXOP()) {
761  // If the right shift is constant then we'll fold the negation so
762  // it's as cheap as a left shift.
763  int ShiftISD = ISD;
764  if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) &&
767  ShiftISD = ISD::SHL;
768  if (const auto *Entry =
769  CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
770  return LT.first * Entry->Cost;
771  }
772 
773  static const CostTblEntry SSE2UniformShiftCostTable[] = {
774  // Uniform splats are cheaper for the following instructions.
775  { ISD::SHL, MVT::v16i16, 2+2 }, // 2*psllw + split.
776  { ISD::SHL, MVT::v8i32, 2+2 }, // 2*pslld + split.
777  { ISD::SHL, MVT::v4i64, 2+2 }, // 2*psllq + split.
778 
779  { ISD::SRL, MVT::v16i16, 2+2 }, // 2*psrlw + split.
780  { ISD::SRL, MVT::v8i32, 2+2 }, // 2*psrld + split.
781  { ISD::SRL, MVT::v4i64, 2+2 }, // 2*psrlq + split.
782 
783  { ISD::SRA, MVT::v16i16, 2+2 }, // 2*psraw + split.
784  { ISD::SRA, MVT::v8i32, 2+2 }, // 2*psrad + split.
785  { ISD::SRA, MVT::v2i64, 4 }, // 2*psrad + shuffle.
786  { ISD::SRA, MVT::v4i64, 8+2 }, // 2*(2*psrad + shuffle) + split.
787  };
788 
789  if (ST->hasSSE2() &&
791  (Op2Info == TargetTransformInfo::OK_UniformValue))) {
792 
793  // Handle AVX2 uniform v4i64 ISD::SRA, it's not worth a table.
794  if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2())
795  return LT.first * 4; // 2*psrad + shuffle.
796 
797  if (const auto *Entry =
798  CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second))
799  return LT.first * Entry->Cost;
800  }
801 
802  if (ISD == ISD::SHL &&
804  MVT VT = LT.second;
805  // Vector shift left by non uniform constant can be lowered
806  // into vector multiply.
807  if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
808  ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
809  ISD = ISD::MUL;
810  }
811 
812  static const CostTblEntry AVX2CostTable[] = {
813  { ISD::SHL, MVT::v16i8, 6 }, // vpblendvb sequence.
814  { ISD::SHL, MVT::v32i8, 6 }, // vpblendvb sequence.
815  { ISD::SHL, MVT::v64i8, 12 }, // 2*vpblendvb sequence.
816  { ISD::SHL, MVT::v8i16, 5 }, // extend/vpsrlvd/pack sequence.
817  { ISD::SHL, MVT::v16i16, 7 }, // extend/vpsrlvd/pack sequence.
818  { ISD::SHL, MVT::v32i16, 14 }, // 2*extend/vpsrlvd/pack sequence.
819 
820  { ISD::SRL, MVT::v16i8, 6 }, // vpblendvb sequence.
821  { ISD::SRL, MVT::v32i8, 6 }, // vpblendvb sequence.
822  { ISD::SRL, MVT::v64i8, 12 }, // 2*vpblendvb sequence.
823  { ISD::SRL, MVT::v8i16, 5 }, // extend/vpsrlvd/pack sequence.
824  { ISD::SRL, MVT::v16i16, 7 }, // extend/vpsrlvd/pack sequence.
825  { ISD::SRL, MVT::v32i16, 14 }, // 2*extend/vpsrlvd/pack sequence.
826 
827  { ISD::SRA, MVT::v16i8, 17 }, // vpblendvb sequence.
828  { ISD::SRA, MVT::v32i8, 17 }, // vpblendvb sequence.
829  { ISD::SRA, MVT::v64i8, 34 }, // 2*vpblendvb sequence.
830  { ISD::SRA, MVT::v8i16, 5 }, // extend/vpsravd/pack sequence.
831  { ISD::SRA, MVT::v16i16, 7 }, // extend/vpsravd/pack sequence.
832  { ISD::SRA, MVT::v32i16, 14 }, // 2*extend/vpsravd/pack sequence.
833  { ISD::SRA, MVT::v2i64, 2 }, // srl/xor/sub sequence.
834  { ISD::SRA, MVT::v4i64, 2 }, // srl/xor/sub sequence.
835 
836  { ISD::SUB, MVT::v32i8, 1 }, // psubb
837  { ISD::ADD, MVT::v32i8, 1 }, // paddb
838  { ISD::SUB, MVT::v16i16, 1 }, // psubw
839  { ISD::ADD, MVT::v16i16, 1 }, // paddw
840  { ISD::SUB, MVT::v8i32, 1 }, // psubd
841  { ISD::ADD, MVT::v8i32, 1 }, // paddd
842  { ISD::SUB, MVT::v4i64, 1 }, // psubq
843  { ISD::ADD, MVT::v4i64, 1 }, // paddq
844 
845  { ISD::MUL, MVT::v16i16, 1 }, // pmullw
846  { ISD::MUL, MVT::v8i32, 2 }, // pmulld (Haswell from agner.org)
847  { ISD::MUL, MVT::v4i64, 6 }, // 3*pmuludq/3*shift/2*add
848 
849  { ISD::FNEG, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
850  { ISD::FNEG, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
851  { ISD::FADD, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
852  { ISD::FADD, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
853  { ISD::FSUB, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
854  { ISD::FSUB, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
855  { ISD::FMUL, MVT::f64, 1 }, // Haswell from http://www.agner.org/
856  { ISD::FMUL, MVT::v2f64, 1 }, // Haswell from http://www.agner.org/
857  { ISD::FMUL, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
858  { ISD::FMUL, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
859 
860  { ISD::FDIV, MVT::f32, 7 }, // Haswell from http://www.agner.org/
861  { ISD::FDIV, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
862  { ISD::FDIV, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
863  { ISD::FDIV, MVT::f64, 14 }, // Haswell from http://www.agner.org/
864  { ISD::FDIV, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
865  { ISD::FDIV, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
866  };
867 
868  // Look for AVX2 lowering tricks for custom cases.
869  if (ST->hasAVX2())
870  if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
871  return LT.first * Entry->Cost;
872 
873  static const CostTblEntry AVX1CostTable[] = {
874  // We don't have to scalarize unsupported ops. We can issue two half-sized
875  // operations and we only need to extract the upper YMM half.
876  // Two ops + 1 extract + 1 insert = 4.
877  { ISD::MUL, MVT::v16i16, 4 },
878  { ISD::MUL, MVT::v8i32, 5 }, // BTVER2 from http://www.agner.org/
879  { ISD::MUL, MVT::v4i64, 12 },
880 
881  { ISD::SUB, MVT::v32i8, 4 },
882  { ISD::ADD, MVT::v32i8, 4 },
883  { ISD::SUB, MVT::v16i16, 4 },
884  { ISD::ADD, MVT::v16i16, 4 },
885  { ISD::SUB, MVT::v8i32, 4 },
886  { ISD::ADD, MVT::v8i32, 4 },
887  { ISD::SUB, MVT::v4i64, 4 },
888  { ISD::ADD, MVT::v4i64, 4 },
889 
890  { ISD::SHL, MVT::v32i8, 22 }, // pblendvb sequence + split.
891  { ISD::SHL, MVT::v8i16, 6 }, // pblendvb sequence.
892  { ISD::SHL, MVT::v16i16, 13 }, // pblendvb sequence + split.
893  { ISD::SHL, MVT::v4i32, 3 }, // pslld/paddd/cvttps2dq/pmulld
894  { ISD::SHL, MVT::v8i32, 9 }, // pslld/paddd/cvttps2dq/pmulld + split
895  { ISD::SHL, MVT::v2i64, 2 }, // Shift each lane + blend.
896  { ISD::SHL, MVT::v4i64, 6 }, // Shift each lane + blend + split.
897 
898  { ISD::SRL, MVT::v32i8, 23 }, // pblendvb sequence + split.
899  { ISD::SRL, MVT::v16i16, 28 }, // pblendvb sequence + split.
900  { ISD::SRL, MVT::v4i32, 6 }, // Shift each lane + blend.
901  { ISD::SRL, MVT::v8i32, 14 }, // Shift each lane + blend + split.
902  { ISD::SRL, MVT::v2i64, 2 }, // Shift each lane + blend.
903  { ISD::SRL, MVT::v4i64, 6 }, // Shift each lane + blend + split.
904 
905  { ISD::SRA, MVT::v32i8, 44 }, // pblendvb sequence + split.
906  { ISD::SRA, MVT::v16i16, 28 }, // pblendvb sequence + split.
907  { ISD::SRA, MVT::v4i32, 6 }, // Shift each lane + blend.
908  { ISD::SRA, MVT::v8i32, 14 }, // Shift each lane + blend + split.
909  { ISD::SRA, MVT::v2i64, 5 }, // Shift each lane + blend.
910  { ISD::SRA, MVT::v4i64, 12 }, // Shift each lane + blend + split.
911 
912  { ISD::FNEG, MVT::v4f64, 2 }, // BTVER2 from http://www.agner.org/
913  { ISD::FNEG, MVT::v8f32, 2 }, // BTVER2 from http://www.agner.org/
914 
915  { ISD::FMUL, MVT::f64, 2 }, // BTVER2 from http://www.agner.org/
916  { ISD::FMUL, MVT::v2f64, 2 }, // BTVER2 from http://www.agner.org/
917  { ISD::FMUL, MVT::v4f64, 4 }, // BTVER2 from http://www.agner.org/
918 
919  { ISD::FDIV, MVT::f32, 14 }, // SNB from http://www.agner.org/
920  { ISD::FDIV, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
921  { ISD::FDIV, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
922  { ISD::FDIV, MVT::f64, 22 }, // SNB from http://www.agner.org/
923  { ISD::FDIV, MVT::v2f64, 22 }, // SNB from http://www.agner.org/
924  { ISD::FDIV, MVT::v4f64, 44 }, // SNB from http://www.agner.org/
925  };
926 
927  if (ST->hasAVX())
928  if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
929  return LT.first * Entry->Cost;
930 
931  static const CostTblEntry SSE42CostTable[] = {
932  { ISD::FADD, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
933  { ISD::FADD, MVT::f32, 1 }, // Nehalem from http://www.agner.org/
934  { ISD::FADD, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
935  { ISD::FADD, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
936 
937  { ISD::FSUB, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
938  { ISD::FSUB, MVT::f32 , 1 }, // Nehalem from http://www.agner.org/
939  { ISD::FSUB, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
940  { ISD::FSUB, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
941 
942  { ISD::FMUL, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
943  { ISD::FMUL, MVT::f32, 1 }, // Nehalem from http://www.agner.org/
944  { ISD::FMUL, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
945  { ISD::FMUL, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
946 
947  { ISD::FDIV, MVT::f32, 14 }, // Nehalem from http://www.agner.org/
948  { ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
949  { ISD::FDIV, MVT::f64, 22 }, // Nehalem from http://www.agner.org/
950  { ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/
951 
952  { ISD::MUL, MVT::v2i64, 6 } // 3*pmuludq/3*shift/2*add
953  };
954 
955  if (ST->hasSSE42())
956  if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
957  return LT.first * Entry->Cost;
958 
959  static const CostTblEntry SSE41CostTable[] = {
960  { ISD::SHL, MVT::v16i8, 10 }, // pblendvb sequence.
961  { ISD::SHL, MVT::v8i16, 11 }, // pblendvb sequence.
962  { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld
963 
964  { ISD::SRL, MVT::v16i8, 11 }, // pblendvb sequence.
965  { ISD::SRL, MVT::v8i16, 13 }, // pblendvb sequence.
966  { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend.
967 
968  { ISD::SRA, MVT::v16i8, 21 }, // pblendvb sequence.
969  { ISD::SRA, MVT::v8i16, 13 }, // pblendvb sequence.
970 
971  { ISD::MUL, MVT::v4i32, 2 } // pmulld (Nehalem from agner.org)
972  };
973 
974  if (ST->hasSSE41())
975  if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
976  return LT.first * Entry->Cost;
977 
978  static const CostTblEntry SSE2CostTable[] = {
979  // We don't correctly identify costs of casts because they are marked as
980  // custom.
981  { ISD::SHL, MVT::v16i8, 13 }, // cmpgtb sequence.
982  { ISD::SHL, MVT::v8i16, 25 }, // cmpgtw sequence.
983  { ISD::SHL, MVT::v4i32, 16 }, // pslld/paddd/cvttps2dq/pmuludq.
984  { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence.
985 
986  { ISD::SRL, MVT::v16i8, 14 }, // cmpgtb sequence.
987  { ISD::SRL, MVT::v8i16, 16 }, // cmpgtw sequence.
988  { ISD::SRL, MVT::v4i32, 12 }, // Shift each lane + blend.
989  { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence.
990 
991  { ISD::SRA, MVT::v16i8, 27 }, // unpacked cmpgtb sequence.
992  { ISD::SRA, MVT::v8i16, 16 }, // cmpgtw sequence.
993  { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend.
994  { ISD::SRA, MVT::v2i64, 8 }, // srl/xor/sub splat+shuffle sequence.
995 
996  { ISD::MUL, MVT::v8i16, 1 }, // pmullw
997  { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle
998  { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add
999 
1000  { ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/
1001  { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/
1002  { ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/
1003  { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/
1004 
1005  { ISD::FNEG, MVT::f32, 1 }, // Pentium IV from http://www.agner.org/
1006  { ISD::FNEG, MVT::f64, 1 }, // Pentium IV from http://www.agner.org/
1007  { ISD::FNEG, MVT::v4f32, 1 }, // Pentium IV from http://www.agner.org/
1008  { ISD::FNEG, MVT::v2f64, 1 }, // Pentium IV from http://www.agner.org/
1009 
1010  { ISD::FADD, MVT::f32, 2 }, // Pentium IV from http://www.agner.org/
1011  { ISD::FADD, MVT::f64, 2 }, // Pentium IV from http://www.agner.org/
1012 
1013  { ISD::FSUB, MVT::f32, 2 }, // Pentium IV from http://www.agner.org/
1014  { ISD::FSUB, MVT::f64, 2 }, // Pentium IV from http://www.agner.org/
1015  };
1016 
1017  if (ST->hasSSE2())
1018  if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
1019  return LT.first * Entry->Cost;
1020 
1021  static const CostTblEntry SSE1CostTable[] = {
1022  { ISD::FDIV, MVT::f32, 17 }, // Pentium III from http://www.agner.org/
1023  { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/
1024 
1025  { ISD::FNEG, MVT::f32, 2 }, // Pentium III from http://www.agner.org/
1026  { ISD::FNEG, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/
1027 
1028  { ISD::FADD, MVT::f32, 1 }, // Pentium III from http://www.agner.org/
1029  { ISD::FADD, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/
1030 
1031  { ISD::FSUB, MVT::f32, 1 }, // Pentium III from http://www.agner.org/
1032  { ISD::FSUB, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/
1033  };
1034 
1035  if (ST->hasSSE1())
1036  if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
1037  return LT.first * Entry->Cost;
1038 
1039  static const CostTblEntry X64CostTbl[] = { // 64-bit targets
1040  { ISD::ADD, MVT::i64, 1 }, // Core (Merom) from http://www.agner.org/
1041  { ISD::SUB, MVT::i64, 1 }, // Core (Merom) from http://www.agner.org/
1042  { ISD::MUL, MVT::i64, 2 }, // Nehalem from http://www.agner.org/
1043  };
1044 
1045  if (ST->is64Bit())
1046  if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
1047  return LT.first * Entry->Cost;
1048 
1049  static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
1050  { ISD::ADD, MVT::i8, 1 }, // Pentium III from http://www.agner.org/
1051  { ISD::ADD, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
1052  { ISD::ADD, MVT::i32, 1 }, // Pentium III from http://www.agner.org/
1053 
1054  { ISD::SUB, MVT::i8, 1 }, // Pentium III from http://www.agner.org/
1055  { ISD::SUB, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
1056  { ISD::SUB, MVT::i32, 1 }, // Pentium III from http://www.agner.org/
1057  };
1058 
1059  if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
1060  return LT.first * Entry->Cost;
1061 
1062  // It is not a good idea to vectorize division. We have to scalarize it and
1063  // in the process we will often end up having to spilling regular
1064  // registers. The overhead of division is going to dominate most kernels
1065  // anyways so try hard to prevent vectorization of division - it is
1066  // generally a bad idea. Assume somewhat arbitrarily that we have to be able
1067  // to hide "20 cycles" for each lane.
1068  if (LT.second.isVector() && (ISD == ISD::SDIV || ISD == ISD::SREM ||
1069  ISD == ISD::UDIV || ISD == ISD::UREM)) {
1071  Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info,
1073  return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
1074  }
1075 
1076  // Fallback to the default implementation.
1077  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info);
1078 }
1079 
1081  VectorType *BaseTp,
1082  ArrayRef<int> Mask, int Index,
1083  VectorType *SubTp) {
1084  // 64-bit packed float vectors (v2f32) are widened to type v4f32.
1085  // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
1086  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, BaseTp);
1087 
1089  // Treat Transpose as 2-op shuffles - there's no difference in lowering.
1090  if (Kind == TTI::SK_Transpose)
1092 
1093  // For Broadcasts we are splatting the first element from the first input
1094  // register, so only need to reference that input and all the output
1095  // registers are the same.
1096  if (Kind == TTI::SK_Broadcast)
1097  LT.first = 1;
1098 
1099  // Subvector extractions are free if they start at the beginning of a
1100  // vector and cheap if the subvectors are aligned.
1101  if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
1102  int NumElts = LT.second.getVectorNumElements();
1103  if ((Index % NumElts) == 0)
1104  return 0;
1105  std::pair<InstructionCost, MVT> SubLT =
1106  TLI->getTypeLegalizationCost(DL, SubTp);
1107  if (SubLT.second.isVector()) {
1108  int NumSubElts = SubLT.second.getVectorNumElements();
1109  if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1110  return SubLT.first;
1111  // Handle some cases for widening legalization. For now we only handle
1112  // cases where the original subvector was naturally aligned and evenly
1113  // fit in its legalized subvector type.
1114  // FIXME: Remove some of the alignment restrictions.
1115  // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
1116  // vectors.
1117  int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
1118  if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
1119  (NumSubElts % OrigSubElts) == 0 &&
1120  LT.second.getVectorElementType() ==
1121  SubLT.second.getVectorElementType() &&
1122  LT.second.getVectorElementType().getSizeInBits() ==
1123  BaseTp->getElementType()->getPrimitiveSizeInBits()) {
1124  assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
1125  "Unexpected number of elements!");
1126  auto *VecTy = FixedVectorType::get(BaseTp->getElementType(),
1127  LT.second.getVectorNumElements());
1128  auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
1129  SubLT.second.getVectorNumElements());
1130  int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
1131  InstructionCost ExtractCost = getShuffleCost(
1132  TTI::SK_ExtractSubvector, VecTy, None, ExtractIndex, SubTy);
1133 
1134  // If the original size is 32-bits or more, we can use pshufd. Otherwise
1135  // if we have SSSE3 we can use pshufb.
1136  if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
1137  return ExtractCost + 1; // pshufd or pshufb
1138 
1139  assert(SubTp->getPrimitiveSizeInBits() == 16 &&
1140  "Unexpected vector size");
1141 
1142  return ExtractCost + 2; // worst case pshufhw + pshufd
1143  }
1144  }
1145  }
1146 
1147  // Subvector insertions are cheap if the subvectors are aligned.
1148  // Note that in general, the insertion starting at the beginning of a vector
1149  // isn't free, because we need to preserve the rest of the wide vector.
1150  if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
1151  int NumElts = LT.second.getVectorNumElements();
1152  std::pair<InstructionCost, MVT> SubLT =
1153  TLI->getTypeLegalizationCost(DL, SubTp);
1154  if (SubLT.second.isVector()) {
1155  int NumSubElts = SubLT.second.getVectorNumElements();
1156  if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1157  return SubLT.first;
1158  }
1159 
1160  // If the insertion isn't aligned, treat it like a 2-op shuffle.
1162  }
1163 
1164  // Handle some common (illegal) sub-vector types as they are often very cheap
1165  // to shuffle even on targets without PSHUFB.
1166  EVT VT = TLI->getValueType(DL, BaseTp);
1167  if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
1168  !ST->hasSSSE3()) {
1169  static const CostTblEntry SSE2SubVectorShuffleTbl[] = {
1170  {TTI::SK_Broadcast, MVT::v4i16, 1}, // pshuflw
1171  {TTI::SK_Broadcast, MVT::v2i16, 1}, // pshuflw
1172  {TTI::SK_Broadcast, MVT::v8i8, 2}, // punpck/pshuflw
1173  {TTI::SK_Broadcast, MVT::v4i8, 2}, // punpck/pshuflw
1174  {TTI::SK_Broadcast, MVT::v2i8, 1}, // punpck
1175 
1176  {TTI::SK_Reverse, MVT::v4i16, 1}, // pshuflw
1177  {TTI::SK_Reverse, MVT::v2i16, 1}, // pshuflw
1178  {TTI::SK_Reverse, MVT::v4i8, 3}, // punpck/pshuflw/packus
1179  {TTI::SK_Reverse, MVT::v2i8, 1}, // punpck
1180 
1181  {TTI::SK_PermuteTwoSrc, MVT::v4i16, 2}, // punpck/pshuflw
1182  {TTI::SK_PermuteTwoSrc, MVT::v2i16, 2}, // punpck/pshuflw
1183  {TTI::SK_PermuteTwoSrc, MVT::v8i8, 7}, // punpck/pshuflw
1184  {TTI::SK_PermuteTwoSrc, MVT::v4i8, 4}, // punpck/pshuflw
1185  {TTI::SK_PermuteTwoSrc, MVT::v2i8, 2}, // punpck
1186 
1187  {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw
1188  {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw
1189  {TTI::SK_PermuteSingleSrc, MVT::v8i8, 5}, // punpck/pshuflw
1190  {TTI::SK_PermuteSingleSrc, MVT::v4i8, 3}, // punpck/pshuflw
1191  {TTI::SK_PermuteSingleSrc, MVT::v2i8, 1}, // punpck
1192  };
1193 
1194  if (ST->hasSSE2())
1195  if (const auto *Entry =
1196  CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
1197  return Entry->Cost;
1198  }
1199 
1200  // We are going to permute multiple sources and the result will be in multiple
1201  // destinations. Providing an accurate cost only for splits where the element
1202  // type remains the same.
1203  if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
1204  MVT LegalVT = LT.second;
1205  if (LegalVT.isVector() &&
1206  LegalVT.getVectorElementType().getSizeInBits() ==
1207  BaseTp->getElementType()->getPrimitiveSizeInBits() &&
1208  LegalVT.getVectorNumElements() <
1209  cast<FixedVectorType>(BaseTp)->getNumElements()) {
1210 
1211  unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
1212  unsigned LegalVTSize = LegalVT.getStoreSize();
1213  // Number of source vectors after legalization:
1214  unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
1215  // Number of destination vectors after legalization:
1216  InstructionCost NumOfDests = LT.first;
1217 
1218  auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
1219  LegalVT.getVectorNumElements());
1220 
1221  InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
1222  return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
1223  None, 0, nullptr);
1224  }
1225 
1226  return BaseT::getShuffleCost(Kind, BaseTp, Mask, Index, SubTp);
1227  }
1228 
1229  // For 2-input shuffles, we must account for splitting the 2 inputs into many.
1230  if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
1231  // We assume that source and destination have the same vector type.
1232  InstructionCost NumOfDests = LT.first;
1233  InstructionCost NumOfShufflesPerDest = LT.first * 2 - 1;
1234  LT.first = NumOfDests * NumOfShufflesPerDest;
1235  }
1236 
1237  static const CostTblEntry AVX512FP16ShuffleTbl[] = {
1238  {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw
1239  {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw
1240  {TTI::SK_Broadcast, MVT::v8f16, 1}, // vpbroadcastw
1241 
1242  {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw
1243  {TTI::SK_Reverse, MVT::v16f16, 2}, // vpermw
1244  {TTI::SK_Reverse, MVT::v8f16, 1}, // vpshufb
1245 
1246  {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw
1247  {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw
1248  {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // vpshufb
1249 
1250  {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w
1251  {TTI::SK_PermuteTwoSrc, MVT::v16f16, 2}, // vpermt2w
1252  {TTI::SK_PermuteTwoSrc, MVT::v8f16, 2} // vpermt2w
1253  };
1254 
1255  if (!ST->useSoftFloat() && ST->hasFP16())
1256  if (const auto *Entry =
1257  CostTableLookup(AVX512FP16ShuffleTbl, Kind, LT.second))
1258  return LT.first * Entry->Cost;
1259 
1260  static const CostTblEntry AVX512VBMIShuffleTbl[] = {
1261  {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
1262  {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb
1263 
1264  {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
1265  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb
1266 
1267  {TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b
1268  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b
1269  {TTI::SK_PermuteTwoSrc, MVT::v16i8, 2} // vpermt2b
1270  };
1271 
1272  if (ST->hasVBMI())
1273  if (const auto *Entry =
1274  CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
1275  return LT.first * Entry->Cost;
1276 
1277  static const CostTblEntry AVX512BWShuffleTbl[] = {
1278  {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
1279  {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
1280 
1281  {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw
1282  {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw
1283  {TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2
1284 
1285  {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw
1286  {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw
1287  {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16
1288 
1289  {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w
1290  {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w
1291  {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w
1292  {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
1293 
1294  {TTI::SK_Select, MVT::v32i16, 1}, // vblendmw
1295  {TTI::SK_Select, MVT::v64i8, 1}, // vblendmb
1296  };
1297 
1298  if (ST->hasBWI())
1299  if (const auto *Entry =
1300  CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
1301  return LT.first * Entry->Cost;
1302 
1303  static const CostTblEntry AVX512ShuffleTbl[] = {
1304  {TTI::SK_Broadcast, MVT::v8f64, 1}, // vbroadcastpd
1305  {TTI::SK_Broadcast, MVT::v16f32, 1}, // vbroadcastps
1306  {TTI::SK_Broadcast, MVT::v8i64, 1}, // vpbroadcastq
1307  {TTI::SK_Broadcast, MVT::v16i32, 1}, // vpbroadcastd
1308  {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
1309  {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
1310 
1311  {TTI::SK_Reverse, MVT::v8f64, 1}, // vpermpd
1312  {TTI::SK_Reverse, MVT::v16f32, 1}, // vpermps
1313  {TTI::SK_Reverse, MVT::v8i64, 1}, // vpermq
1314  {TTI::SK_Reverse, MVT::v16i32, 1}, // vpermd
1315  {TTI::SK_Reverse, MVT::v32i16, 7}, // per mca
1316  {TTI::SK_Reverse, MVT::v64i8, 7}, // per mca
1317 
1318  {TTI::SK_PermuteSingleSrc, MVT::v8f64, 1}, // vpermpd
1319  {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
1320  {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // vpermpd
1321  {TTI::SK_PermuteSingleSrc, MVT::v16f32, 1}, // vpermps
1322  {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
1323  {TTI::SK_PermuteSingleSrc, MVT::v4f32, 1}, // vpermps
1324  {TTI::SK_PermuteSingleSrc, MVT::v8i64, 1}, // vpermq
1325  {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
1326  {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // vpermq
1327  {TTI::SK_PermuteSingleSrc, MVT::v16i32, 1}, // vpermd
1328  {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
1329  {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // vpermd
1330  {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
1331 
1332  {TTI::SK_PermuteTwoSrc, MVT::v8f64, 1}, // vpermt2pd
1333  {TTI::SK_PermuteTwoSrc, MVT::v16f32, 1}, // vpermt2ps
1334  {TTI::SK_PermuteTwoSrc, MVT::v8i64, 1}, // vpermt2q
1335  {TTI::SK_PermuteTwoSrc, MVT::v16i32, 1}, // vpermt2d
1336  {TTI::SK_PermuteTwoSrc, MVT::v4f64, 1}, // vpermt2pd
1337  {TTI::SK_PermuteTwoSrc, MVT::v8f32, 1}, // vpermt2ps
1338  {TTI::SK_PermuteTwoSrc, MVT::v4i64, 1}, // vpermt2q
1339  {TTI::SK_PermuteTwoSrc, MVT::v8i32, 1}, // vpermt2d
1340  {TTI::SK_PermuteTwoSrc, MVT::v2f64, 1}, // vpermt2pd
1341  {TTI::SK_PermuteTwoSrc, MVT::v4f32, 1}, // vpermt2ps
1342  {TTI::SK_PermuteTwoSrc, MVT::v2i64, 1}, // vpermt2q
1343  {TTI::SK_PermuteTwoSrc, MVT::v4i32, 1}, // vpermt2d
1344 
1345  // FIXME: This just applies the type legalization cost rules above
1346  // assuming these completely split.
1351 
1352  {TTI::SK_Select, MVT::v32i16, 1}, // vpternlogq
1353  {TTI::SK_Select, MVT::v64i8, 1}, // vpternlogq
1354  {TTI::SK_Select, MVT::v8f64, 1}, // vblendmpd
1355  {TTI::SK_Select, MVT::v16f32, 1}, // vblendmps
1356  {TTI::SK_Select, MVT::v8i64, 1}, // vblendmq
1357  {TTI::SK_Select, MVT::v16i32, 1}, // vblendmd
1358  };
1359 
1360  if (ST->hasAVX512())
1361  if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
1362  return LT.first * Entry->Cost;
1363 
1364  static const CostTblEntry AVX2ShuffleTbl[] = {
1365  {TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastpd
1366  {TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastps
1367  {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq
1368  {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd
1369  {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
1370  {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb
1371 
1372  {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd
1373  {TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps
1374  {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq
1375  {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd
1376  {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
1377  {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb
1378 
1379  {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
1380  {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb
1381 
1382  {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
1383  {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
1384  {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
1385  {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
1386  {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
1387  // + vpblendvb
1388  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb
1389  // + vpblendvb
1390 
1391  {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd
1392  {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3}, // 2*vpermps + vblendps
1393  {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd
1394  {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd
1395  {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
1396  // + vpblendvb
1397  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb
1398  // + vpblendvb
1399  };
1400 
1401  if (ST->hasAVX2())
1402  if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
1403  return LT.first * Entry->Cost;
1404 
1405  static const CostTblEntry XOPShuffleTbl[] = {
1406  {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd
1407  {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps
1408  {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd
1409  {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps
1410  {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
1411  // + vinsertf128
1412  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm
1413  // + vinsertf128
1414 
1415  {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
1416  // + vinsertf128
1417  {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm
1418  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm
1419  // + vinsertf128
1420  {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm
1421  };
1422 
1423  if (ST->hasXOP())
1424  if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
1425  return LT.first * Entry->Cost;
1426 
1427  static const CostTblEntry AVX1ShuffleTbl[] = {
1428  {TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1429  {TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1430  {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1431  {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1432  {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
1433  {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128
1434 
1435  {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1436  {TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1437  {TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1438  {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1439  {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
1440  // + vinsertf128
1441  {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb
1442  // + vinsertf128
1443 
1444  {TTI::SK_Select, MVT::v4i64, 1}, // vblendpd
1445  {TTI::SK_Select, MVT::v4f64, 1}, // vblendpd
1446  {TTI::SK_Select, MVT::v8i32, 1}, // vblendps
1447  {TTI::SK_Select, MVT::v8f32, 1}, // vblendps
1448  {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
1449  {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor
1450 
1451  {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd
1452  {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd
1453  {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1454  {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1455  {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
1456  // + 2*por + vinsertf128
1457  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb
1458  // + 2*por + vinsertf128
1459 
1460  {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd
1461  {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd
1462  {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1463  {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1464  {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
1465  // + 4*por + vinsertf128
1466  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb
1467  // + 4*por + vinsertf128
1468  };
1469 
1470  if (ST->hasAVX())
1471  if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
1472  return LT.first * Entry->Cost;
1473 
1474  static const CostTblEntry SSE41ShuffleTbl[] = {
1475  {TTI::SK_Select, MVT::v2i64, 1}, // pblendw
1476  {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1477  {TTI::SK_Select, MVT::v4i32, 1}, // pblendw
1478  {TTI::SK_Select, MVT::v4f32, 1}, // blendps
1479  {TTI::SK_Select, MVT::v8i16, 1}, // pblendw
1480  {TTI::SK_Select, MVT::v16i8, 1} // pblendvb
1481  };
1482 
1483  if (ST->hasSSE41())
1484  if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
1485  return LT.first * Entry->Cost;
1486 
1487  static const CostTblEntry SSSE3ShuffleTbl[] = {
1488  {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
1489  {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb
1490 
1491  {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
1492  {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb
1493 
1494  {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
1495  {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por
1496 
1497  {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
1498  {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
1499 
1500  {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
1501  {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
1502  };
1503 
1504  if (ST->hasSSSE3())
1505  if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
1506  return LT.first * Entry->Cost;
1507 
1508  static const CostTblEntry SSE2ShuffleTbl[] = {
1509  {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd
1510  {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
1511  {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
1512  {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
1513  {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd
1514 
1515  {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
1516  {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
1517  {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
1518  {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
1519  {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
1520  // + 2*pshufd + 2*unpck + packus
1521 
1522  {TTI::SK_Select, MVT::v2i64, 1}, // movsd
1523  {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1524  {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
1525  {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
1526  {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por
1527 
1528  {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
1529  {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd
1530  {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
1531  {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
1532  // + pshufd/unpck
1533  { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
1534  // + 2*pshufd + 2*unpck + 2*packus
1535 
1536  { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd
1537  { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd
1538  { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd}
1539  { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute
1540  { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
1541  };
1542 
1543  if (ST->hasSSE2())
1544  if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
1545  return LT.first * Entry->Cost;
1546 
1547  static const CostTblEntry SSE1ShuffleTbl[] = {
1548  { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
1549  { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
1550  { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps
1551  { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
1552  { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
1553  };
1554 
1555  if (ST->hasSSE1())
1556  if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
1557  return LT.first * Entry->Cost;
1558 
1559  return BaseT::getShuffleCost(Kind, BaseTp, Mask, Index, SubTp);
1560 }
1561 
1563  Type *Src,
1566  const Instruction *I) {
1567  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1568  assert(ISD && "Invalid opcode");
1569 
1570  // TODO: Allow non-throughput costs that aren't binary.
1571  auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
1573  return Cost == 0 ? 0 : 1;
1574  return Cost;
1575  };
1576 
1577  // The cost tables include both specific, custom (non-legal) src/dst type
1578  // conversions and generic, legalized types. We test for customs first, before
1579  // falling back to legalization.
1580  // FIXME: Need a better design of the cost table to handle non-simple types of
1581  // potential massive combinations (elem_num x src_type x dst_type).
1582  static const TypeConversionCostTblEntry AVX512BWConversionTbl[] {
1585 
1586  // Mask sign extend has an instruction.
1604 
1605  // Mask zero extend is a sext + shift.
1623 
1641 
1643  { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // widen to zmm
1644  { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // vpmovwb
1645  { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // vpmovwb
1646  { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // vpmovwb
1647  };
1648 
1649  static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
1650  // Mask sign extend has an instruction.
1659 
1660  // Mask zero extend is a sext + shift.
1669 
1678 
1681 
1684 
1687 
1690  };
1691 
1692  // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
1693  // 256-bit wide vectors.
1694 
1695  static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
1699 
1700  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
1701  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
1702  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
1703  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 3 }, // sext+vpslld+vptestmd
1704  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
1705  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
1706  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
1707  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 3 }, // sext+vpslld+vptestmd
1708  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // zmm vpslld+vptestmd
1709  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // zmm vpslld+vptestmd
1710  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // zmm vpslld+vptestmd
1711  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 }, // vpslld+vptestmd
1712  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // zmm vpsllq+vptestmq
1713  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // zmm vpsllq+vptestmq
1714  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, // vpsllq+vptestmq
1715  { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 2 }, // vpmovdb
1716  { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 2 }, // vpmovdb
1717  { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 2 }, // vpmovdb
1718  { ISD::TRUNCATE, MVT::v32i8, MVT::v16i32, 2 }, // vpmovdb
1719  { ISD::TRUNCATE, MVT::v64i8, MVT::v16i32, 2 }, // vpmovdb
1720  { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2 }, // vpmovdw
1721  { ISD::TRUNCATE, MVT::v32i16, MVT::v16i32, 2 }, // vpmovdw
1722  { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 2 }, // vpmovqb
1723  { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1 }, // vpshufb
1724  { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 2 }, // vpmovqb
1725  { ISD::TRUNCATE, MVT::v16i8, MVT::v8i64, 2 }, // vpmovqb
1726  { ISD::TRUNCATE, MVT::v32i8, MVT::v8i64, 2 }, // vpmovqb
1727  { ISD::TRUNCATE, MVT::v64i8, MVT::v8i64, 2 }, // vpmovqb
1728  { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 2 }, // vpmovqw
1729  { ISD::TRUNCATE, MVT::v16i16, MVT::v8i64, 2 }, // vpmovqw
1730  { ISD::TRUNCATE, MVT::v32i16, MVT::v8i64, 2 }, // vpmovqw
1731  { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 }, // vpmovqd
1732  { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // zmm vpmovqd
1733  { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 5 },// 2*vpmovqd+concat+vpmovdb
1734 
1735  { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, // extend to v16i32
1738 
1739  // Sign extend is zmm vpternlogd+vptruncdb.
1740  // Zero extend is zmm broadcast load+vptruncdw.
1749 
1750  // Sign extend is zmm vpternlogd+vptruncdw.
1751  // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
1760 
1761  { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // zmm vpternlogd
1762  { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // zmm vpternlogd+psrld
1763  { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // zmm vpternlogd
1764  { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // zmm vpternlogd+psrld
1765  { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // zmm vpternlogd
1766  { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // zmm vpternlogd+psrld
1767  { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // zmm vpternlogq
1768  { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // zmm vpternlogq+psrlq
1769  { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // zmm vpternlogq
1770  { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // zmm vpternlogq+psrlq
1771 
1772  { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, // vpternlogd
1773  { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, // vpternlogd+psrld
1774  { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 }, // vpternlogq
1775  { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 }, // vpternlogq+psrlq
1776 
1787 
1788  { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
1789  { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
1790 
1799 
1810 
1822 
1829  };
1830 
1831  static const TypeConversionCostTblEntry AVX512BWVLConversionTbl[] {
1832  // Mask sign extend has an instruction.
1850 
1851  // Mask zero extend is a sext + shift.
1869 
1887 
1889  };
1890 
1891  static const TypeConversionCostTblEntry AVX512DQVLConversionTbl[] = {
1892  // Mask sign extend has an instruction.
1901 
1902  // Mask zero extend is a sext + shift.
1911 
1920 
1925 
1930 
1935 
1940  };
1941 
1942  static const TypeConversionCostTblEntry AVX512VLConversionTbl[] = {
1943  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
1944  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
1945  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
1946  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 8 }, // split+2*v8i8
1947  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
1948  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
1949  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
1950  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 8 }, // split+2*v8i16
1951  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // vpslld+vptestmd
1952  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // vpslld+vptestmd
1953  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // vpslld+vptestmd
1954  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // vpsllq+vptestmq
1955  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // vpsllq+vptestmq
1956  { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // vpmovqd
1957  { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 }, // vpmovqb
1958  { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 }, // vpmovqw
1959  { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 }, // vpmovwb
1960 
1961  // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
1962  // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
1971 
1972  // sign extend is vpcmpeq+maskedmove+vpmovdw
1973  // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
1982 
1983  { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // vpternlogd
1984  { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // vpternlogd+psrld
1985  { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // vpternlogd
1986  { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // vpternlogd+psrld
1987  { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // vpternlogd
1988  { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // vpternlogd+psrld
1989  { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // vpternlogq
1990  { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // vpternlogq+psrlq
1991  { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // vpternlogq
1992  { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // vpternlogq+psrlq
1993 
2006 
2011 
2025 
2029 
2037  };
2038 
2039  static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
2046 
2061 
2063 
2076 
2079 
2084 
2093 
2101 
2112  };
2113 
2114  static const TypeConversionCostTblEntry AVXConversionTbl[] = {
2121 
2134 
2140 
2143  { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // and+extract+packuswb
2147  { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 3 }, // and+extract+2*packusdw
2149 
2162 
2180 
2192 
2206 
2209  };
2210 
2211  static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
2224 
2225  // These truncates end up widening elements.
2226  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 1 }, // PMOVXZBQ
2227  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 1 }, // PMOVXZWQ
2228  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 1 }, // PMOVXZBD
2229 
2233 
2245 
2260 
2271 
2282  };
2283 
2284  static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
2285  // These are somewhat magic numbers justified by comparing the
2286  // output of llvm-mca for our various supported scheduler models
2287  // and basing it off the worst case scenario.
2300 
2314 
2325 
2329  { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 15 },
2336 
2349 
2350  // These truncates are really widening elements.
2351  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 1 }, // PSHUFD
2352  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // PUNPCKLWD+DQ
2353  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // PUNPCKLBW+WD+PSHUFD
2354  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 1 }, // PUNPCKLWD
2355  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // PUNPCKLBW+WD
2356  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 1 }, // PUNPCKLBW
2357 
2358  { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 2 }, // PAND+PACKUSWB
2360  { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 3 }, // PAND+2*PACKUSWB
2366  { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 4 }, // PAND+3*PACKUSWB
2367  { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 2 }, // PSHUFD+PSHUFLW
2368  { ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, 1 }, // PSHUFD
2369  };
2370 
2371  // Attempt to map directly to (simple) MVT types to let us match custom entries.
2372  EVT SrcTy = TLI->getValueType(DL, Src);
2373  EVT DstTy = TLI->getValueType(DL, Dst);
2374 
2375  // The function getSimpleVT only handles simple value types.
2376  if (SrcTy.isSimple() && DstTy.isSimple()) {
2377  MVT SimpleSrcTy = SrcTy.getSimpleVT();
2378  MVT SimpleDstTy = DstTy.getSimpleVT();
2379 
2380  if (ST->useAVX512Regs()) {
2381  if (ST->hasBWI())
2382  if (const auto *Entry = ConvertCostTableLookup(
2383  AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2384  return AdjustCost(Entry->Cost);
2385 
2386  if (ST->hasDQI())
2387  if (const auto *Entry = ConvertCostTableLookup(
2388  AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2389  return AdjustCost(Entry->Cost);
2390 
2391  if (ST->hasAVX512())
2392  if (const auto *Entry = ConvertCostTableLookup(
2393  AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2394  return AdjustCost(Entry->Cost);
2395  }
2396 
2397  if (ST->hasBWI())
2398  if (const auto *Entry = ConvertCostTableLookup(
2399  AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2400  return AdjustCost(Entry->Cost);
2401 
2402  if (ST->hasDQI())
2403  if (const auto *Entry = ConvertCostTableLookup(
2404  AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2405  return AdjustCost(Entry->Cost);
2406 
2407  if (ST->hasAVX512())
2408  if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
2409  SimpleDstTy, SimpleSrcTy))
2410  return AdjustCost(Entry->Cost);
2411 
2412  if (ST->hasAVX2()) {
2413  if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
2414  SimpleDstTy, SimpleSrcTy))
2415  return AdjustCost(Entry->Cost);
2416  }
2417 
2418  if (ST->hasAVX()) {
2419  if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
2420  SimpleDstTy, SimpleSrcTy))
2421  return AdjustCost(Entry->Cost);
2422  }
2423 
2424  if (ST->hasSSE41()) {
2425  if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
2426  SimpleDstTy, SimpleSrcTy))
2427  return AdjustCost(Entry->Cost);
2428  }
2429 
2430  if (ST->hasSSE2()) {
2431  if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
2432  SimpleDstTy, SimpleSrcTy))
2433  return AdjustCost(Entry->Cost);
2434  }
2435  }
2436 
2437  // Fall back to legalized types.
2438  std::pair<InstructionCost, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
2439  std::pair<InstructionCost, MVT> LTDest =
2440  TLI->getTypeLegalizationCost(DL, Dst);
2441 
2442  if (ST->useAVX512Regs()) {
2443  if (ST->hasBWI())
2444  if (const auto *Entry = ConvertCostTableLookup(
2445  AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second))
2446  return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2447 
2448  if (ST->hasDQI())
2449  if (const auto *Entry = ConvertCostTableLookup(
2450  AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second))
2451  return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2452 
2453  if (ST->hasAVX512())
2454  if (const auto *Entry = ConvertCostTableLookup(
2455  AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second))
2456  return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2457  }
2458 
2459  if (ST->hasBWI())
2460  if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
2461  LTDest.second, LTSrc.second))
2462  return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2463 
2464  if (ST->hasDQI())
2465  if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
2466  LTDest.second, LTSrc.second))
2467  return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2468 
2469  if (ST->hasAVX512())
2470  if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
2471  LTDest.second, LTSrc.second))
2472  return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2473 
2474  if (ST->hasAVX2())
2475  if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
2476  LTDest.second, LTSrc.second))
2477  return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2478 
2479  if (ST->hasAVX())
2480  if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
2481  LTDest.second, LTSrc.second))
2482  return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2483 
2484  if (ST->hasSSE41())
2485  if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
2486  LTDest.second, LTSrc.second))
2487  return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2488 
2489  if (ST->hasSSE2())
2490  if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
2491  LTDest.second, LTSrc.second))
2492  return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2493 
2494  // Fallback, for i8/i16 sitofp/uitofp cases we need to extend to i32 for
2495  // sitofp.
2496  if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
2497  1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) {
2498  Type *ExtSrc = Src->getWithNewBitWidth(32);
2499  unsigned ExtOpc =
2500  (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt;
2501 
2502  // For scalar loads the extend would be free.
2503  InstructionCost ExtCost = 0;
2504  if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0))))
2505  ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind);
2506 
2507  return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc,
2509  }
2510 
2511  // Fallback for fptosi/fptoui i8/i16 cases we need to truncate from fptosi
2512  // i32.
2513  if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) &&
2514  1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) {
2515  Type *TruncDst = Dst->getWithNewBitWidth(32);
2516  return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) +
2517  getCastInstrCost(Instruction::Trunc, Dst, TruncDst,
2519  }
2520 
2521  return AdjustCost(
2522  BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
2523 }
2524 
2526  Type *CondTy,
2527  CmpInst::Predicate VecPred,
2529  const Instruction *I) {
2530  // TODO: Handle other cost kinds.
2532  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2533  I);
2534 
2535  // Legalize the type.
2536  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
2537 
2538  MVT MTy = LT.second;
2539 
2540  int ISD = TLI->InstructionOpcodeToISD(Opcode);
2541  assert(ISD && "Invalid opcode");
2542 
2543  unsigned ExtraCost = 0;
2544  if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
2545  // Some vector comparison predicates cost extra instructions.
2546  // TODO: Should we invert this and assume worst case cmp costs
2547  // and reduce for particular predicates?
2548  if (MTy.isVector() &&
2549  !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
2550  (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
2551  ST->hasBWI())) {
2552  // Fallback to I if a specific predicate wasn't specified.
2553  CmpInst::Predicate Pred = VecPred;
2554  if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE ||
2555  Pred == CmpInst::BAD_FCMP_PREDICATE))
2556  Pred = cast<CmpInst>(I)->getPredicate();
2557 
2558  switch (Pred) {
2559  case CmpInst::Predicate::ICMP_NE:
2560  // xor(cmpeq(x,y),-1)
2561  ExtraCost = 1;
2562  break;
2563  case CmpInst::Predicate::ICMP_SGE:
2564  case CmpInst::Predicate::ICMP_SLE:
2565  // xor(cmpgt(x,y),-1)
2566  ExtraCost = 1;
2567  break;
2568  case CmpInst::Predicate::ICMP_ULT:
2569  case CmpInst::Predicate::ICMP_UGT:
2570  // cmpgt(xor(x,signbit),xor(y,signbit))
2571  // xor(cmpeq(pmaxu(x,y),x),-1)
2572  ExtraCost = 2;
2573  break;
2574  case CmpInst::Predicate::ICMP_ULE:
2575  case CmpInst::Predicate::ICMP_UGE:
2576  if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
2577  (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
2578  // cmpeq(psubus(x,y),0)
2579  // cmpeq(pminu(x,y),x)
2580  ExtraCost = 1;
2581  } else {
2582  // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
2583  ExtraCost = 3;
2584  }
2585  break;
2586  case CmpInst::Predicate::BAD_ICMP_PREDICATE:
2587  case CmpInst::Predicate::BAD_FCMP_PREDICATE:
2588  // Assume worst case scenario and add the maximum extra cost.
2589  ExtraCost = 3;
2590  break;
2591  default:
2592  break;
2593  }
2594  }
2595  }
2596 
2597  static const CostTblEntry SLMCostTbl[] = {
2598  // slm pcmpeq/pcmpgt throughput is 2
2599  { ISD::SETCC, MVT::v2i64, 2 },
2600  };
2601 
2602  static const CostTblEntry AVX512BWCostTbl[] = {
2603  { ISD::SETCC, MVT::v32i16, 1 },
2604  { ISD::SETCC, MVT::v64i8, 1 },
2605 
2606  { ISD::SELECT, MVT::v32i16, 1 },
2607  { ISD::SELECT, MVT::v64i8, 1 },
2608  };
2609 
2610  static const CostTblEntry AVX512CostTbl[] = {
2611  { ISD::SETCC, MVT::v8i64, 1 },
2612  { ISD::SETCC, MVT::v16i32, 1 },
2613  { ISD::SETCC, MVT::v8f64, 1 },
2614  { ISD::SETCC, MVT::v16f32, 1 },
2615 
2616  { ISD::SELECT, MVT::v8i64, 1 },
2617  { ISD::SELECT, MVT::v16i32, 1 },
2618  { ISD::SELECT, MVT::v8f64, 1 },
2619  { ISD::SELECT, MVT::v16f32, 1 },
2620 
2621  { ISD::SETCC, MVT::v32i16, 2 }, // FIXME: should probably be 4
2622  { ISD::SETCC, MVT::v64i8, 2 }, // FIXME: should probably be 4
2623 
2624  { ISD::SELECT, MVT::v32i16, 2 }, // FIXME: should be 3
2625  { ISD::SELECT, MVT::v64i8, 2 }, // FIXME: should be 3
2626  };
2627 
2628  static const CostTblEntry AVX2CostTbl[] = {
2629  { ISD::SETCC, MVT::v4i64, 1 },
2630  { ISD::SETCC, MVT::v8i32, 1 },
2631  { ISD::SETCC, MVT::v16i16, 1 },
2632  { ISD::SETCC, MVT::v32i8, 1 },
2633 
2634  { ISD::SELECT, MVT::v4i64, 1 }, // pblendvb
2635  { ISD::SELECT, MVT::v8i32, 1 }, // pblendvb
2636  { ISD::SELECT, MVT::v16i16, 1 }, // pblendvb
2637  { ISD::SELECT, MVT::v32i8, 1 }, // pblendvb
2638  };
2639 
2640  static const CostTblEntry AVX1CostTbl[] = {
2641  { ISD::SETCC, MVT::v4f64, 1 },
2642  { ISD::SETCC, MVT::v8f32, 1 },
2643  // AVX1 does not support 8-wide integer compare.
2644  { ISD::SETCC, MVT::v4i64, 4 },
2645  { ISD::SETCC, MVT::v8i32, 4 },
2646  { ISD::SETCC, MVT::v16i16, 4 },
2647  { ISD::SETCC, MVT::v32i8, 4 },
2648 
2649  { ISD::SELECT, MVT::v4f64, 1 }, // vblendvpd
2650  { ISD::SELECT, MVT::v8f32, 1 }, // vblendvps
2651  { ISD::SELECT, MVT::v4i64, 1 }, // vblendvpd
2652  { ISD::SELECT, MVT::v8i32, 1 }, // vblendvps
2653  { ISD::SELECT, MVT::v16i16, 3 }, // vandps + vandnps + vorps
2654  { ISD::SELECT, MVT::v32i8, 3 }, // vandps + vandnps + vorps
2655  };
2656 
2657  static const CostTblEntry SSE42CostTbl[] = {
2658  { ISD::SETCC, MVT::v2f64, 1 },
2659  { ISD::SETCC, MVT::v4f32, 1 },
2660  { ISD::SETCC, MVT::v2i64, 1 },
2661  };
2662 
2663  static const CostTblEntry SSE41CostTbl[] = {
2664  { ISD::SELECT, MVT::v2f64, 1 }, // blendvpd
2665  { ISD::SELECT, MVT::v4f32, 1 }, // blendvps
2666  { ISD::SELECT, MVT::v2i64, 1 }, // pblendvb
2667  { ISD::SELECT, MVT::v4i32, 1 }, // pblendvb
2668  { ISD::SELECT, MVT::v8i16, 1 }, // pblendvb
2669  { ISD::SELECT, MVT::v16i8, 1 }, // pblendvb
2670  };
2671 
2672  static const CostTblEntry SSE2CostTbl[] = {
2673  { ISD::SETCC, MVT::v2f64, 2 },
2674  { ISD::SETCC, MVT::f64, 1 },
2675  { ISD::SETCC, MVT::v2i64, 8 },
2676  { ISD::SETCC, MVT::v4i32, 1 },
2677  { ISD::SETCC, MVT::v8i16, 1 },
2678  { ISD::SETCC, MVT::v16i8, 1 },
2679 
2680  { ISD::SELECT, MVT::v2f64, 3 }, // andpd + andnpd + orpd
2681  { ISD::SELECT, MVT::v2i64, 3 }, // pand + pandn + por
2682  { ISD::SELECT, MVT::v4i32, 3 }, // pand + pandn + por
2683  { ISD::SELECT, MVT::v8i16, 3 }, // pand + pandn + por
2684  { ISD::SELECT, MVT::v16i8, 3 }, // pand + pandn + por
2685  };
2686 
2687  static const CostTblEntry SSE1CostTbl[] = {
2688  { ISD::SETCC, MVT::v4f32, 2 },
2689  { ISD::SETCC, MVT::f32, 1 },
2690 
2691  { ISD::SELECT, MVT::v4f32, 3 }, // andps + andnps + orps
2692  };
2693 
2694  if (ST->useSLMArithCosts())
2695  if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
2696  return LT.first * (ExtraCost + Entry->Cost);
2697 
2698  if (ST->hasBWI())
2699  if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
2700  return LT.first * (ExtraCost + Entry->Cost);
2701 
2702  if (ST->hasAVX512())
2703  if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
2704  return LT.first * (ExtraCost + Entry->Cost);
2705 
2706  if (ST->hasAVX2())
2707  if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
2708  return LT.first * (ExtraCost + Entry->Cost);
2709 
2710  if (ST->hasAVX())
2711  if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
2712  return LT.first * (ExtraCost + Entry->Cost);
2713 
2714  if (ST->hasSSE42())
2715  if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
2716  return LT.first * (ExtraCost + Entry->Cost);
2717 
2718  if (ST->hasSSE41())
2719  if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
2720  return LT.first * (ExtraCost + Entry->Cost);
2721 
2722  if (ST->hasSSE2())
2723  if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
2724  return LT.first * (ExtraCost + Entry->Cost);
2725 
2726  if (ST->hasSSE1())
2727  if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
2728  return LT.first * (ExtraCost + Entry->Cost);
2729 
2730  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
2731 }
2732 
2734 
2738 
2739  // Costs should match the codegen from:
2740  // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
2741  // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
2742  // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
2743  // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
2744  // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
2745 
2746  // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
2747  // specialized in these tables yet.
2748  static const CostTblEntry AVX512BITALGCostTbl[] = {
2749  { ISD::CTPOP, MVT::v32i16, 1 },
2750  { ISD::CTPOP, MVT::v64i8, 1 },
2751  { ISD::CTPOP, MVT::v16i16, 1 },
2752  { ISD::CTPOP, MVT::v32i8, 1 },
2753  { ISD::CTPOP, MVT::v8i16, 1 },
2754  { ISD::CTPOP, MVT::v16i8, 1 },
2755  };
2756  static const CostTblEntry AVX512VPOPCNTDQCostTbl[] = {
2757  { ISD::CTPOP, MVT::v8i64, 1 },
2758  { ISD::CTPOP, MVT::v16i32, 1 },
2759  { ISD::CTPOP, MVT::v4i64, 1 },
2760  { ISD::CTPOP, MVT::v8i32, 1 },
2761  { ISD::CTPOP, MVT::v2i64, 1 },
2762  { ISD::CTPOP, MVT::v4i32, 1 },
2763  };
2764  static const CostTblEntry AVX512CDCostTbl[] = {
2765  { ISD::CTLZ, MVT::v8i64, 1 },
2766  { ISD::CTLZ, MVT::v16i32, 1 },
2767  { ISD::CTLZ, MVT::v32i16, 8 },
2768  { ISD::CTLZ, MVT::v64i8, 20 },
2769  { ISD::CTLZ, MVT::v4i64, 1 },
2770  { ISD::CTLZ, MVT::v8i32, 1 },
2771  { ISD::CTLZ, MVT::v16i16, 4 },
2772  { ISD::CTLZ, MVT::v32i8, 10 },
2773  { ISD::CTLZ, MVT::v2i64, 1 },
2774  { ISD::CTLZ, MVT::v4i32, 1 },
2775  { ISD::CTLZ, MVT::v8i16, 4 },
2776  { ISD::CTLZ, MVT::v16i8, 4 },
2777  };
2778  static const CostTblEntry AVX512BWCostTbl[] = {
2779  { ISD::ABS, MVT::v32i16, 1 },
2780  { ISD::ABS, MVT::v64i8, 1 },
2781  { ISD::BITREVERSE, MVT::v8i64, 3 },
2782  { ISD::BITREVERSE, MVT::v16i32, 3 },
2783  { ISD::BITREVERSE, MVT::v32i16, 3 },
2784  { ISD::BITREVERSE, MVT::v64i8, 2 },
2785  { ISD::BSWAP, MVT::v8i64, 1 },
2786  { ISD::BSWAP, MVT::v16i32, 1 },
2787  { ISD::BSWAP, MVT::v32i16, 1 },
2788  { ISD::CTLZ, MVT::v8i64, 23 },
2789  { ISD::CTLZ, MVT::v16i32, 22 },
2790  { ISD::CTLZ, MVT::v32i16, 18 },
2791  { ISD::CTLZ, MVT::v64i8, 17 },
2792  { ISD::CTPOP, MVT::v8i64, 7 },
2793  { ISD::CTPOP, MVT::v16i32, 11 },
2794  { ISD::CTPOP, MVT::v32i16, 9 },
2795  { ISD::CTPOP, MVT::v64i8, 6 },
2796  { ISD::CTTZ, MVT::v8i64, 10 },
2797  { ISD::CTTZ, MVT::v16i32, 14 },
2798  { ISD::CTTZ, MVT::v32i16, 12 },
2799  { ISD::CTTZ, MVT::v64i8, 9 },
2800  { ISD::SADDSAT, MVT::v32i16, 1 },
2801  { ISD::SADDSAT, MVT::v64i8, 1 },
2802  { ISD::SMAX, MVT::v32i16, 1 },
2803  { ISD::SMAX, MVT::v64i8, 1 },
2804  { ISD::SMIN, MVT::v32i16, 1 },
2805  { ISD::SMIN, MVT::v64i8, 1 },
2806  { ISD::SSUBSAT, MVT::v32i16, 1 },
2807  { ISD::SSUBSAT, MVT::v64i8, 1 },
2808  { ISD::UADDSAT, MVT::v32i16, 1 },
2809  { ISD::UADDSAT, MVT::v64i8, 1 },
2810  { ISD::UMAX, MVT::v32i16, 1 },
2811  { ISD::UMAX, MVT::v64i8, 1 },
2812  { ISD::UMIN, MVT::v32i16, 1 },
2813  { ISD::UMIN, MVT::v64i8, 1 },
2814  { ISD::USUBSAT, MVT::v32i16, 1 },
2815  { ISD::USUBSAT, MVT::v64i8, 1 },
2816  };
2817  static const CostTblEntry AVX512CostTbl[] = {
2818  { ISD::ABS, MVT::v8i64, 1 },
2819  { ISD::ABS, MVT::v16i32, 1 },
2820  { ISD::ABS, MVT::v32i16, 2 },
2821  { ISD::ABS, MVT::v64i8, 2 },
2822  { ISD::ABS, MVT::v4i64, 1 },
2823  { ISD::ABS, MVT::v2i64, 1 },
2824  { ISD::BITREVERSE, MVT::v8i64, 36 },
2825  { ISD::BITREVERSE, MVT::v16i32, 24 },
2826  { ISD::BITREVERSE, MVT::v32i16, 10 },
2827  { ISD::BITREVERSE, MVT::v64i8, 10 },
2828  { ISD::BSWAP, MVT::v8i64, 4 },
2829  { ISD::BSWAP, MVT::v16i32, 4 },
2830  { ISD::BSWAP, MVT::v32i16, 4 },
2831  { ISD::CTLZ, MVT::v8i64, 29 },
2832  { ISD::CTLZ, MVT::v16i32, 35 },
2833  { ISD::CTLZ, MVT::v32i16, 28 },
2834  { ISD::CTLZ, MVT::v64i8, 18 },
2835  { ISD::CTPOP, MVT::v8i64, 16 },
2836  { ISD::CTPOP, MVT::v16i32, 24 },
2837  { ISD::CTPOP, MVT::v32i16, 18 },
2838  { ISD::CTPOP, MVT::v64i8, 12 },
2839  { ISD::CTTZ, MVT::v8i64, 20 },
2840  { ISD::CTTZ, MVT::v16i32, 28 },
2841  { ISD::CTTZ, MVT::v32i16, 24 },
2842  { ISD::CTTZ, MVT::v64i8, 18 },
2843  { ISD::SMAX, MVT::v8i64, 1 },
2844  { ISD::SMAX, MVT::v16i32, 1 },
2845  { ISD::SMAX, MVT::v32i16, 2 },
2846  { ISD::SMAX, MVT::v64i8, 2 },
2847  { ISD::SMAX, MVT::v4i64, 1 },
2848  { ISD::SMAX, MVT::v2i64, 1 },
2849  { ISD::SMIN, MVT::v8i64, 1 },
2850  { ISD::SMIN, MVT::v16i32, 1 },
2851  { ISD::SMIN, MVT::v32i16, 2 },
2852  { ISD::SMIN, MVT::v64i8, 2 },
2853  { ISD::SMIN, MVT::v4i64, 1 },
2854  { ISD::SMIN, MVT::v2i64, 1 },
2855  { ISD::UMAX, MVT::v8i64, 1 },
2856  { ISD::UMAX, MVT::v16i32, 1 },
2857  { ISD::UMAX, MVT::v32i16, 2 },
2858  { ISD::UMAX, MVT::v64i8, 2 },
2859  { ISD::UMAX, MVT::v4i64, 1 },
2860  { ISD::UMAX, MVT::v2i64, 1 },
2861  { ISD::UMIN, MVT::v8i64, 1 },
2862  { ISD::UMIN, MVT::v16i32, 1 },
2863  { ISD::UMIN, MVT::v32i16, 2 },
2864  { ISD::UMIN, MVT::v64i8, 2 },
2865  { ISD::UMIN, MVT::v4i64, 1 },
2866  { ISD::UMIN, MVT::v2i64, 1 },
2867  { ISD::USUBSAT, MVT::v16i32, 2 }, // pmaxud + psubd
2868  { ISD::USUBSAT, MVT::v2i64, 2 }, // pmaxuq + psubq
2869  { ISD::USUBSAT, MVT::v4i64, 2 }, // pmaxuq + psubq
2870  { ISD::USUBSAT, MVT::v8i64, 2 }, // pmaxuq + psubq
2871  { ISD::UADDSAT, MVT::v16i32, 3 }, // not + pminud + paddd
2872  { ISD::UADDSAT, MVT::v2i64, 3 }, // not + pminuq + paddq
2873  { ISD::UADDSAT, MVT::v4i64, 3 }, // not + pminuq + paddq
2874  { ISD::UADDSAT, MVT::v8i64, 3 }, // not + pminuq + paddq
2875  { ISD::SADDSAT, MVT::v32i16, 2 },
2876  { ISD::SADDSAT, MVT::v64i8, 2 },
2877  { ISD::SSUBSAT, MVT::v32i16, 2 },
2878  { ISD::SSUBSAT, MVT::v64i8, 2 },
2879  { ISD::UADDSAT, MVT::v32i16, 2 },
2880  { ISD::UADDSAT, MVT::v64i8, 2 },
2881  { ISD::USUBSAT, MVT::v32i16, 2 },
2882  { ISD::USUBSAT, MVT::v64i8, 2 },
2883  { ISD::FMAXNUM, MVT::f32, 2 },
2884  { ISD::FMAXNUM, MVT::v4f32, 2 },
2885  { ISD::FMAXNUM, MVT::v8f32, 2 },
2886  { ISD::FMAXNUM, MVT::v16f32, 2 },
2887  { ISD::FMAXNUM, MVT::f64, 2 },
2888  { ISD::FMAXNUM, MVT::v2f64, 2 },
2889  { ISD::FMAXNUM, MVT::v4f64, 2 },
2890  { ISD::FMAXNUM, MVT::v8f64, 2 },
2891  };
2892  static const CostTblEntry XOPCostTbl[] = {
2893  { ISD::BITREVERSE, MVT::v4i64, 4 },
2894  { ISD::BITREVERSE, MVT::v8i32, 4 },
2895  { ISD::BITREVERSE, MVT::v16i16, 4 },
2896  { ISD::BITREVERSE, MVT::v32i8, 4 },
2897  { ISD::BITREVERSE, MVT::v2i64, 1 },
2898  { ISD::BITREVERSE, MVT::v4i32, 1 },
2899  { ISD::BITREVERSE, MVT::v8i16, 1 },
2900  { ISD::BITREVERSE, MVT::v16i8, 1 },
2901  { ISD::BITREVERSE, MVT::i64, 3 },
2902  { ISD::BITREVERSE, MVT::i32, 3 },
2903  { ISD::BITREVERSE, MVT::i16, 3 },
2904  { ISD::BITREVERSE, MVT::i8, 3 }
2905  };
2906  static const CostTblEntry AVX2CostTbl[] = {
2907  { ISD::ABS, MVT::v4i64, 2 }, // VBLENDVPD(X,VPSUBQ(0,X),X)
2908  { ISD::ABS, MVT::v8i32, 1 },
2909  { ISD::ABS, MVT::v16i16, 1 },
2910  { ISD::ABS, MVT::v32i8, 1 },
2911  { ISD::BITREVERSE, MVT::v2i64, 3 },
2912  { ISD::BITREVERSE, MVT::v4i64, 3 },
2913  { ISD::BITREVERSE, MVT::v4i32, 3 },
2914  { ISD::BITREVERSE, MVT::v8i32, 3 },
2915  { ISD::BITREVERSE, MVT::v8i16, 3 },
2916  { ISD::BITREVERSE, MVT::v16i16, 3 },
2917  { ISD::BITREVERSE, MVT::v16i8, 3 },
2918  { ISD::BITREVERSE, MVT::v32i8, 3 },
2919  { ISD::BSWAP, MVT::v4i64, 1 },
2920  { ISD::BSWAP, MVT::v8i32, 1 },
2921  { ISD::BSWAP, MVT::v16i16, 1 },
2922  { ISD::CTLZ, MVT::v2i64, 7 },
2923  { ISD::CTLZ, MVT::v4i64, 7 },
2924  { ISD::CTLZ, MVT::v4i32, 5 },
2925  { ISD::CTLZ, MVT::v8i32, 5 },
2926  { ISD::CTLZ, MVT::v8i16, 4 },
2927  { ISD::CTLZ, MVT::v16i16, 4 },
2928  { ISD::CTLZ, MVT::v16i8, 3 },
2929  { ISD::CTLZ, MVT::v32i8, 3 },
2930  { ISD::CTPOP, MVT::v2i64, 3 },
2931  { ISD::CTPOP, MVT::v4i64, 3 },
2932  { ISD::CTPOP, MVT::v4i32, 7 },
2933  { ISD::CTPOP, MVT::v8i32, 7 },
2934  { ISD::CTPOP, MVT::v8i16, 3 },
2935  { ISD::CTPOP, MVT::v16i16, 3 },
2936  { ISD::CTPOP, MVT::v16i8, 2 },
2937  { ISD::CTPOP, MVT::v32i8, 2 },
2938  { ISD::CTTZ, MVT::v2i64, 4 },
2939  { ISD::CTTZ, MVT::v4i64, 4 },
2940  { ISD::CTTZ, MVT::v4i32, 7 },
2941  { ISD::CTTZ, MVT::v8i32, 7 },
2942  { ISD::CTTZ, MVT::v8i16, 4 },
2943  { ISD::CTTZ, MVT::v16i16, 4 },
2944  { ISD::CTTZ, MVT::v16i8, 3 },
2945  { ISD::CTTZ, MVT::v32i8, 3 },
2946  { ISD::SADDSAT, MVT::v16i16, 1 },
2947  { ISD::SADDSAT, MVT::v32i8, 1 },
2948  { ISD::SMAX, MVT::v8i32, 1 },
2949  { ISD::SMAX, MVT::v16i16, 1 },
2950  { ISD::SMAX, MVT::v32i8, 1 },
2951  { ISD::SMIN, MVT::v8i32, 1 },
2952  { ISD::SMIN, MVT::v16i16, 1 },
2953  { ISD::SMIN, MVT::v32i8, 1 },
2954  { ISD::SSUBSAT, MVT::v16i16, 1 },
2955  { ISD::SSUBSAT, MVT::v32i8, 1 },
2956  { ISD::UADDSAT, MVT::v16i16, 1 },
2957  { ISD::UADDSAT, MVT::v32i8, 1 },
2958  { ISD::UADDSAT, MVT::v8i32, 3 }, // not + pminud + paddd
2959  { ISD::UMAX, MVT::v8i32, 1 },
2960  { ISD::UMAX, MVT::v16i16, 1 },
2961  { ISD::UMAX, MVT::v32i8, 1 },
2962  { ISD::UMIN, MVT::v8i32, 1 },
2963  { ISD::UMIN, MVT::v16i16, 1 },
2964  { ISD::UMIN, MVT::v32i8, 1 },
2965  { ISD::USUBSAT, MVT::v16i16, 1 },
2966  { ISD::USUBSAT, MVT::v32i8, 1 },
2967  { ISD::USUBSAT, MVT::v8i32, 2 }, // pmaxud + psubd
2968  { ISD::FMAXNUM, MVT::v8f32, 3 }, // MAXPS + CMPUNORDPS + BLENDVPS
2969  { ISD::FMAXNUM, MVT::v4f64, 3 }, // MAXPD + CMPUNORDPD + BLENDVPD
2970  { ISD::FSQRT, MVT::f32, 7 }, // Haswell from http://www.agner.org/
2971  { ISD::FSQRT, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
2972  { ISD::FSQRT, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
2973  { ISD::FSQRT, MVT::f64, 14 }, // Haswell from http://www.agner.org/
2974  { ISD::FSQRT, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
2975  { ISD::FSQRT, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
2976  };
2977  static const CostTblEntry AVX1CostTbl[] = {
2978  { ISD::ABS, MVT::v4i64, 5 }, // VBLENDVPD(X,VPSUBQ(0,X),X)
2979  { ISD::ABS, MVT::v8i32, 3 },
2980  { ISD::ABS, MVT::v16i16, 3 },
2981  { ISD::ABS, MVT::v32i8, 3 },
2982  { ISD::BITREVERSE, MVT::v4i64, 12 }, // 2 x 128-bit Op + extract/insert
2983  { ISD::BITREVERSE, MVT::v8i32, 12 }, // 2 x 128-bit Op + extract/insert
2984  { ISD::BITREVERSE, MVT::v16i16, 12 }, // 2 x 128-bit Op + extract/insert
2985  { ISD::BITREVERSE, MVT::v32i8, 12 }, // 2 x 128-bit Op + extract/insert
2986  { ISD::BSWAP, MVT::v4i64, 4 },
2987  { ISD::BSWAP, MVT::v8i32, 4 },
2988  { ISD::BSWAP, MVT::v16i16, 4 },
2989  { ISD::CTLZ, MVT::v4i64, 48 }, // 2 x 128-bit Op + extract/insert
2990  { ISD::CTLZ, MVT::v8i32, 38 }, // 2 x 128-bit Op + extract/insert
2991  { ISD::CTLZ, MVT::v16i16, 30 }, // 2 x 128-bit Op + extract/insert
2992  { ISD::CTLZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
2993  { ISD::CTPOP, MVT::v4i64, 16 }, // 2 x 128-bit Op + extract/insert
2994  { ISD::CTPOP, MVT::v8i32, 24 }, // 2 x 128-bit Op + extract/insert
2995  { ISD::CTPOP, MVT::v16i16, 20 }, // 2 x 128-bit Op + extract/insert
2996  { ISD::CTPOP, MVT::v32i8, 14 }, // 2 x 128-bit Op + extract/insert
2997  { ISD::CTTZ, MVT::v4i64, 22 }, // 2 x 128-bit Op + extract/insert
2998  { ISD::CTTZ, MVT::v8i32, 30 }, // 2 x 128-bit Op + extract/insert
2999  { ISD::CTTZ, MVT::v16i16, 26 }, // 2 x 128-bit Op + extract/insert
3000  { ISD::CTTZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
3001  { ISD::SADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
3002  { ISD::SADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
3003  { ISD::SMAX, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
3004  { ISD::SMAX, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
3005  { ISD::SMAX, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
3006  { ISD::SMIN, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
3007  { ISD::SMIN, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
3008  { ISD::SMIN, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
3009  { ISD::SSUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
3010  { ISD::SSUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
3011  { ISD::UADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
3012  { ISD::UADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
3013  { ISD::UADDSAT, MVT::v8i32, 8 }, // 2 x 128-bit Op + extract/insert
3014  { ISD::UMAX, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
3015  { ISD::UMAX, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
3016  { ISD::UMAX, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
3017  { ISD::UMIN, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
3018  { ISD::UMIN, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
3019  { ISD::UMIN, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
3020  { ISD::USUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
3021  { ISD::USUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
3022  { ISD::USUBSAT, MVT::v8i32, 6 }, // 2 x 128-bit Op + extract/insert
3023  { ISD::FMAXNUM, MVT::f32, 3 }, // MAXSS + CMPUNORDSS + BLENDVPS
3024  { ISD::FMAXNUM, MVT::v4f32, 3 }, // MAXPS + CMPUNORDPS + BLENDVPS
3025  { ISD::FMAXNUM, MVT::v8f32, 5 }, // MAXPS + CMPUNORDPS + BLENDVPS + ?
3026  { ISD::FMAXNUM, MVT::f64, 3 }, // MAXSD + CMPUNORDSD + BLENDVPD
3027  { ISD::FMAXNUM, MVT::v2f64, 3 }, // MAXPD + CMPUNORDPD + BLENDVPD
3028  { ISD::FMAXNUM, MVT::v4f64, 5 }, // MAXPD + CMPUNORDPD + BLENDVPD + ?
3029  { ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/
3030  { ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
3031  { ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
3032  { ISD::FSQRT, MVT::f64, 21 }, // SNB from http://www.agner.org/
3033  { ISD::FSQRT, MVT::v2f64, 21 }, // SNB from http://www.agner.org/
3034  { ISD::FSQRT, MVT::v4f64, 43 }, // SNB from http://www.agner.org/
3035  };
3036  static const CostTblEntry GLMCostTbl[] = {
3037  { ISD::FSQRT, MVT::f32, 19 }, // sqrtss
3038  { ISD::FSQRT, MVT::v4f32, 37 }, // sqrtps
3039  { ISD::FSQRT, MVT::f64, 34 }, // sqrtsd
3040  { ISD::FSQRT, MVT::v2f64, 67 }, // sqrtpd
3041  };
3042  static const CostTblEntry SLMCostTbl[] = {
3043  { ISD::FSQRT, MVT::f32, 20 }, // sqrtss
3044  { ISD::FSQRT, MVT::v4f32, 40 }, // sqrtps
3045  { ISD::FSQRT, MVT::f64, 35 }, // sqrtsd
3046  { ISD::FSQRT, MVT::v2f64, 70 }, // sqrtpd
3047  };
3048  static const CostTblEntry SSE42CostTbl[] = {
3049  { ISD::USUBSAT, MVT::v4i32, 2 }, // pmaxud + psubd
3050  { ISD::UADDSAT, MVT::v4i32, 3 }, // not + pminud + paddd
3051  { ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/
3052  { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
3053  };
3054  static const CostTblEntry SSE41CostTbl[] = {
3055  { ISD::ABS, MVT::v2i64, 2 }, // BLENDVPD(X,PSUBQ(0,X),X)
3056  { ISD::SMAX, MVT::v4i32, 1 },
3057  { ISD::SMAX, MVT::v16i8, 1 },
3058  { ISD::SMIN, MVT::v4i32, 1 },
3059  { ISD::SMIN, MVT::v16i8, 1 },
3060  { ISD::UMAX, MVT::v4i32, 1 },
3061  { ISD::UMAX, MVT::v8i16, 1 },
3062  { ISD::UMIN, MVT::v4i32, 1 },
3063  { ISD::UMIN, MVT::v8i16, 1 },
3064  };
3065  static const CostTblEntry SSSE3CostTbl[] = {
3066  { ISD::ABS, MVT::v4i32, 1 },
3067  { ISD::ABS, MVT::v8i16, 1 },
3068  { ISD::ABS, MVT::v16i8, 1 },
3069  { ISD::BITREVERSE, MVT::v2i64, 5 },
3070  { ISD::BITREVERSE, MVT::v4i32, 5 },
3071  { ISD::BITREVERSE, MVT::v8i16, 5 },
3072  { ISD::BITREVERSE, MVT::v16i8, 5 },
3073  { ISD::BSWAP, MVT::v2i64, 1 },
3074  { ISD::BSWAP, MVT::v4i32, 1 },
3075  { ISD::BSWAP, MVT::v8i16, 1 },
3076  { ISD::CTLZ, MVT::v2i64, 23 },
3077  { ISD::CTLZ, MVT::v4i32, 18 },
3078  { ISD::CTLZ, MVT::v8i16, 14 },
3079  { ISD::CTLZ, MVT::v16i8, 9 },
3080  { ISD::CTPOP, MVT::v2i64, 7 },
3081  { ISD::CTPOP, MVT::v4i32, 11 },
3082  { ISD::CTPOP, MVT::v8i16, 9 },
3083  { ISD::CTPOP, MVT::v16i8, 6 },
3084  { ISD::CTTZ, MVT::v2i64, 10 },
3085  { ISD::CTTZ, MVT::v4i32, 14 },
3086  { ISD::CTTZ, MVT::v8i16, 12 },
3087  { ISD::CTTZ, MVT::v16i8, 9 }
3088  };
3089  static const CostTblEntry SSE2CostTbl[] = {
3090  { ISD::ABS, MVT::v2i64, 4 },
3091  { ISD::ABS, MVT::v4i32, 3 },
3092  { ISD::ABS, MVT::v8i16, 2 },
3093  { ISD::ABS, MVT::v16i8, 2 },
3094  { ISD::BITREVERSE, MVT::v2i64, 29 },
3095  { ISD::BITREVERSE, MVT::v4i32, 27 },
3096  { ISD::BITREVERSE, MVT::v8i16, 27 },
3097  { ISD::BITREVERSE, MVT::v16i8, 20 },
3098  { ISD::BSWAP, MVT::v2i64, 7 },
3099  { ISD::BSWAP, MVT::v4i32, 7 },
3100  { ISD::BSWAP, MVT::v8i16, 7 },
3101  { ISD::CTLZ, MVT::v2i64, 25 },
3102  { ISD::CTLZ, MVT::v4i32, 26 },
3103  { ISD::CTLZ, MVT::v8i16, 20 },
3104  { ISD::CTLZ, MVT::v16i8, 17 },
3105  { ISD::CTPOP, MVT::v2i64, 12 },
3106  { ISD::CTPOP, MVT::v4i32, 15 },
3107  { ISD::CTPOP, MVT::v8i16, 13 },
3108  { ISD::CTPOP, MVT::v16i8, 10 },
3109  { ISD::CTTZ, MVT::v2i64, 14 },
3110  { ISD::CTTZ, MVT::v4i32, 18 },
3111  { ISD::CTTZ, MVT::v8i16, 16 },
3112  { ISD::CTTZ, MVT::v16i8, 13 },
3113  { ISD::SADDSAT, MVT::v8i16, 1 },
3114  { ISD::SADDSAT, MVT::v16i8, 1 },
3115  { ISD::SMAX, MVT::v8i16, 1 },
3116  { ISD::SMIN, MVT::v8i16, 1 },
3117  { ISD::SSUBSAT, MVT::v8i16, 1 },
3118  { ISD::SSUBSAT, MVT::v16i8, 1 },
3119  { ISD::UADDSAT, MVT::v8i16, 1 },
3120  { ISD::UADDSAT, MVT::v16i8, 1 },
3121  { ISD::UMAX, MVT::v8i16, 2 },
3122  { ISD::UMAX, MVT::v16i8, 1 },
3123  { ISD::UMIN, MVT::v8i16, 2 },
3124  { ISD::UMIN, MVT::v16i8, 1 },
3125  { ISD::USUBSAT, MVT::v8i16, 1 },
3126  { ISD::USUBSAT, MVT::v16i8, 1 },
3127  { ISD::FMAXNUM, MVT::f64, 4 },
3128  { ISD::FMAXNUM, MVT::v2f64, 4 },
3129  { ISD::FSQRT, MVT::f64, 32 }, // Nehalem from http://www.agner.org/
3130  { ISD::FSQRT, MVT::v2f64, 32 }, // Nehalem from http://www.agner.org/
3131  };
3132  static const CostTblEntry SSE1CostTbl[] = {
3133  { ISD::FMAXNUM, MVT::f32, 4 },
3134  { ISD::FMAXNUM, MVT::v4f32, 4 },
3135  { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/
3136  { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
3137  };
3138  static const CostTblEntry BMI64CostTbl[] = { // 64-bit targets
3139  { ISD::CTTZ, MVT::i64, 1 },
3140  };
3141  static const CostTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
3142  { ISD::CTTZ, MVT::i32, 1 },
3143  { ISD::CTTZ, MVT::i16, 1 },
3144  { ISD::CTTZ, MVT::i8, 1 },
3145  };
3146  static const CostTblEntry LZCNT64CostTbl[] = { // 64-bit targets
3147  { ISD::CTLZ, MVT::i64, 1 },
3148  };
3149  static const CostTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
3150  { ISD::CTLZ, MVT::i32, 1 },
3151  { ISD::CTLZ, MVT::i16, 1 },
3152  { ISD::CTLZ, MVT::i8, 1 },
3153  };
3154  static const CostTblEntry POPCNT64CostTbl[] = { // 64-bit targets
3155  { ISD::CTPOP, MVT::i64, 1 },
3156  };
3157  static const CostTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
3158  { ISD::CTPOP, MVT::i32, 1 },
3159  { ISD::CTPOP, MVT::i16, 1 },
3160  { ISD::CTPOP, MVT::i8, 1 },
3161  };
3162  static const CostTblEntry X64CostTbl[] = { // 64-bit targets
3163  { ISD::ABS, MVT::i64, 2 }, // SUB+CMOV
3164  { ISD::BITREVERSE, MVT::i64, 14 },
3165  { ISD::BSWAP, MVT::i64, 1 },
3166  { ISD::CTLZ, MVT::i64, 4 }, // BSR+XOR or BSR+XOR+CMOV
3167  { ISD::CTTZ, MVT::i64, 3 }, // TEST+BSF+CMOV/BRANCH
3168  { ISD::CTPOP, MVT::i64, 10 },
3169  { ISD::SADDO, MVT::i64, 1 },
3170  { ISD::UADDO, MVT::i64, 1 },
3171  { ISD::UMULO, MVT::i64, 2 }, // mulq + seto
3172  };
3173  static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
3174  { ISD::ABS, MVT::i32, 2 }, // SUB+CMOV
3175  { ISD::ABS, MVT::i16, 2 }, // SUB+CMOV
3176  { ISD::BITREVERSE, MVT::i32, 14 },
3177  { ISD::BITREVERSE, MVT::i16, 14 },
3178  { ISD::BITREVERSE, MVT::i8, 11 },
3179  { ISD::BSWAP, MVT::i32, 1 },
3180  { ISD::BSWAP, MVT::i16, 1 }, // ROL
3181  { ISD::CTLZ, MVT::i32, 4 }, // BSR+XOR or BSR+XOR+CMOV
3182  { ISD::CTLZ, MVT::i16, 4 }, // BSR+XOR or BSR+XOR+CMOV
3183  { ISD::CTLZ, MVT::i8, 4 }, // BSR+XOR or BSR+XOR+CMOV
3184  { ISD::CTTZ, MVT::i32, 3 }, // TEST+BSF+CMOV/BRANCH
3185  { ISD::CTTZ, MVT::i16, 3 }, // TEST+BSF+CMOV/BRANCH
3186  { ISD::CTTZ, MVT::i8, 3 }, // TEST+BSF+CMOV/BRANCH
3187  { ISD::CTPOP, MVT::i32, 8 },
3188  { ISD::CTPOP, MVT::i16, 9 },
3189  { ISD::CTPOP, MVT::i8, 7 },
3190  { ISD::SADDO, MVT::i32, 1 },
3191  { ISD::SADDO, MVT::i16, 1 },
3192  { ISD::SADDO, MVT::i8, 1 },
3193  { ISD::UADDO, MVT::i32, 1 },
3194  { ISD::UADDO, MVT::i16, 1 },
3195  { ISD::UADDO, MVT::i8, 1 },
3196  { ISD::UMULO, MVT::i32, 2 }, // mul + seto
3197  { ISD::UMULO, MVT::i16, 2 },
3198  { ISD::UMULO, MVT::i8, 2 },
3199  };
3200 
3201  Type *RetTy = ICA.getReturnType();
3202  Type *OpTy = RetTy;
3203  Intrinsic::ID IID = ICA.getID();
3204  unsigned ISD = ISD::DELETED_NODE;
3205  switch (IID) {
3206  default:
3207  break;
3208  case Intrinsic::abs:
3209  ISD = ISD::ABS;
3210  break;
3211  case Intrinsic::bitreverse:
3212  ISD = ISD::BITREVERSE;
3213  break;
3214  case Intrinsic::bswap:
3215  ISD = ISD::BSWAP;
3216  break;
3217  case Intrinsic::ctlz:
3218  ISD = ISD::CTLZ;
3219  break;
3220  case Intrinsic::ctpop:
3221  ISD = ISD::CTPOP;
3222  break;
3223  case Intrinsic::cttz:
3224  ISD = ISD::CTTZ;
3225  break;
3226  case Intrinsic::maxnum:
3227