//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//
/// About the cost model numbers used below it's necessary to say the
/// following: the numbers correspond to some "generic" X86 CPU rather than
/// to a concrete CPU model. Usually the numbers correspond to the CPU where
/// the feature first appeared. For example, if we do Subtarget.hasSSE42() in
/// the lookups below the cost is based on Nehalem as that was the first CPU
/// to support that feature level and thus has most likely the worst case cost.
/// Some examples of other technologies/CPUs:
///   SSE 3   - Pentium4 / Athlon64
///   SSE 4.1 - Penryn
///   SSE 4.2 - Nehalem
///   AVX     - Sandy Bridge
///   AVX2    - Haswell
///   AVX-512 - Xeon Phi / Skylake
/// And some examples of instruction target dependent costs (latency):
///                   divss     sqrtss     rsqrtss
///   AMD K7          11-16     19         3
///   Piledriver      9-24      13-15      5
///   Jaguar          14        16         2
///   Pentium II,III  18        30         2
///   Nehalem         7-14      7-18       3
///   Haswell         10-13     11         5
/// TODO: Develop and implement the target dependent cost model and
/// specialize cost numbers for different Cost Model Targets such as throughput,
/// code size, latency and uop count.
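///
/// As a rough sketch of how the tables below are used: a cost query first
/// legalizes the IR type (recording how many legal-width pieces it splits
/// into), then looks up the legalized MVT in the most specific table the
/// subtarget supports and scales the entry by the split count. For example,
/// a v8i32 multiply on an AVX2 target hits the AVX2CostTable entry for
/// pmulld below and yields a cost of 2.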
//===----------------------------------------------------------------------===//

#include "X86TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"

using namespace llvm;

#define DEBUG_TYPE "x86tti"

//===----------------------------------------------------------------------===//
//
// X86 cost model.
//
//===----------------------------------------------------------------------===//

TargetTransformInfo::PopcntSupportKind
X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  // TODO: Currently the __builtin_popcount() implementation using SSE3
  //   instructions is inefficient. Once the problem is fixed, we should
  //   call ST->hasSSE3() instead of ST->hasPOPCNT().
  return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
}

llvm::Optional<unsigned> X86TTIImpl::getCacheSize(
    TargetTransformInfo::CacheLevel Level) const {
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 32 * 1024; // 32 KByte
  case TargetTransformInfo::CacheLevel::L2D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 256 * 1024; // 256 KByte
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

llvm::Optional<unsigned> X86TTIImpl::getCacheAssociativity(
    TargetTransformInfo::CacheLevel Level) const {
  //   - Penryn
  //   - Nehalem
  //   - Westmere
  //   - Sandy Bridge
  //   - Ivy Bridge
  //   - Haswell
  //   - Broadwell
  //   - Skylake
  //   - Kabylake
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    LLVM_FALLTHROUGH;
  case TargetTransformInfo::CacheLevel::L2D:
    return 8;
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
  bool Vector = (ClassID == 1);
  if (Vector && !ST->hasSSE1())
    return 0;
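
  // A sketch of the architectural limits assumed here: x86-64 exposes 16
  // GPRs and 16 vector registers (32 with AVX-512), while 32-bit mode
  // exposes only 8 of each.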
  if (ST->is64Bit()) {
    if (Vector && ST->hasAVX512())
      return 32;
    return 16;
  }
  return 8;
}

TypeSize
X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  unsigned PreferVectorWidth = ST->getPreferVectorWidth();
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
  case TargetTransformInfo::RGK_FixedWidthVector:
    if (ST->hasAVX512() && PreferVectorWidth >= 512)
      return TypeSize::getFixed(512);
    if (ST->hasAVX() && PreferVectorWidth >= 256)
      return TypeSize::getFixed(256);
    if (ST->hasSSE1() && PreferVectorWidth >= 128)
      return TypeSize::getFixed(128);
    return TypeSize::getFixed(0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }

  llvm_unreachable("Unsupported register kind");
}

unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
  return getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
      .getFixedSize();
}

unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // If the loop will not be vectorized, don't interleave the loop.
  // Let the regular unroller handle it instead, which saves the overflow
  // check and memory check cost.
  if (VF == 1)
    return 1;

  if (ST->isAtom())
    return 1;

  // Sandybridge and Haswell have multiple execution ports and pipelined
  // vector units.
  if (ST->hasAVX())
    return 4;

  return 2;
}

InstructionCost X86TTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
    TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
    const Instruction *CxtI) {
  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info, Opd1PropInfo,
                                         Opd2PropInfo, Args, CxtI);

  // vXi8 multiplications are always promoted to vXi16.
  if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
      Ty->getScalarSizeInBits() == 8) {
    Type *WideVecTy =
        VectorType::getExtendedElementVectorType(cast<VectorType>(Ty));
    return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
                            TargetTransformInfo::CastContextHint::None,
                            CostKind) +
           getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
                            TargetTransformInfo::CastContextHint::None,
                            CostKind) +
           getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info,
                                  Opd1PropInfo, Opd2PropInfo);
  }

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
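  // LT.first is (roughly) how many legal-width pieces the operation splits
  // into after legalization, and LT.second is the legal MVT; the per-piece
  // costs from the tables below are scaled by LT.first.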

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() &&
      LT.second.getScalarType() == MVT::i32) {
    // Check if the operands can be represented as a smaller datatype.
    bool Op1Signed = false, Op2Signed = false;
    unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
    unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
    unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);

    // If both are representable as i15 and at least one is zero-extended,
    // then we can treat this as PMADDWD, which has the same cost as a
    // vXi16 multiply.
    if (OpMinSize <= 15 && (!Op1Signed || !Op2Signed) && !ST->isPMADDWDSlow())
      LT.second =
          MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements());
  }

  if ((ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
       ISD == ISD::UREM) &&
      (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
    if (ISD == ISD::SDIV || ISD == ISD::SREM) {
      // On X86, vector signed division by a power-of-two constant is
      // normally expanded to the sequence SRA + SRL + ADD + SRA.
      // The OperandValue properties may not be the same as that of the previous
      // operation; conservatively assume OP_None.
      InstructionCost Cost =
          2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, Op1Info,
                                     Op2Info, TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);
      Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info,
                                     Op2Info, TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);
      Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind, Op1Info,
                                     Op2Info, TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);

      if (ISD == ISD::SREM) {
        // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
        Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info,
                                       Op2Info);
        Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info,
                                       Op2Info);
      }

      return Cost;
    }

    // Vector unsigned division/remainder will be simplified to shifts/masks.
    if (ISD == ISD::UDIV)
      return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info,
                                    Op2Info, TargetTransformInfo::OP_None,
                                    TargetTransformInfo::OP_None);

    else // UREM
      return getArithmeticInstrCost(Instruction::And, Ty, CostKind, Op1Info,
                                    Op2Info, TargetTransformInfo::OP_None,
                                    TargetTransformInfo::OP_None);
  }

  static const CostTblEntry GLMCostTable[] = {
    { ISD::FDIV, MVT::f32,   18 }, // divss
    { ISD::FDIV, MVT::v4f32, 35 }, // divps
    { ISD::FDIV, MVT::f64,   33 }, // divsd
    { ISD::FDIV, MVT::v2f64, 65 }, // divpd
  };

  if (ST->useGLMDivSqrtCosts())
    if (const auto *Entry = CostTableLookup(GLMCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  static const CostTblEntry SLMCostTable[] = {
    { ISD::MUL,  MVT::v4i32, 11 }, // pmulld
    { ISD::MUL,  MVT::v8i16,  2 }, // pmullw
    { ISD::FMUL, MVT::f64,    2 }, // mulsd
    { ISD::FMUL, MVT::v2f64,  4 }, // mulpd
    { ISD::FMUL, MVT::v4f32,  2 }, // mulps
    { ISD::FDIV, MVT::f32,   17 }, // divss
    { ISD::FDIV, MVT::v4f32, 39 }, // divps
    { ISD::FDIV, MVT::f64,   32 }, // divsd
    { ISD::FDIV, MVT::v2f64, 69 }, // divpd
    { ISD::FADD, MVT::v2f64,  2 }, // addpd
    { ISD::FSUB, MVT::v2f64,  2 }, // subpd
    // v2i64/v4i64 mul is custom lowered as a series of long
    // multiplies(3), shifts(3) and adds(2).
    // slm muldq version throughput is 2 and addq throughput 4
    // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
    // 3X4 (addq throughput) = 17
    { ISD::MUL,  MVT::v2i64, 17 },
    // slm addq\subq throughput is 4
    { ISD::ADD,  MVT::v2i64,  4 },
    { ISD::SUB,  MVT::v2i64,  4 },
  };

  if (ST->isSLM()) {
    if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) {
      // Check if the operands can be shrunk into a smaller datatype.
      // TODO: Merge this into generic vXi32 MUL patterns above.
      bool Op1Signed = false;
      unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
      bool Op2Signed = false;
      unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);

      bool SignedMode = Op1Signed || Op2Signed;
      unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);

      if (OpMinSize <= 7)
        return LT.first * 3; // pmullw/sext
      if (!SignedMode && OpMinSize <= 8)
        return LT.first * 3; // pmullw/zext
      if (OpMinSize <= 15)
        return LT.first * 5; // pmullw/pmulhw/pshuf
      if (!SignedMode && OpMinSize <= 16)
        return LT.first * 5; // pmullw/pmulhw/pshuf
    }

    if (const auto *Entry = CostTableLookup(SLMCostTable, ISD,
                                            LT.second)) {
      return LT.first * Entry->Cost;
    }
  }

  static const CostTblEntry AVX512BWUniformConstCostTable[] = {
    { ISD::SHL, MVT::v64i8, 2 }, // psllw + pand.
    { ISD::SRL, MVT::v64i8, 2 }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8, 4 }, // psrlw, pand, pxor, psubb.
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasBWI()) {
    if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512UniformConstCostTable[] = {
    { ISD::SRA,  MVT::v2i64,  1 },
    { ISD::SRA,  MVT::v4i64,  1 },
    { ISD::SRA,  MVT::v8i64,  1 },

    { ISD::SHL,  MVT::v64i8,  4 }, // psllw + pand.
    { ISD::SRL,  MVT::v64i8,  4 }, // psrlw + pand.
    { ISD::SRA,  MVT::v64i8,  8 }, // psrlw, pand, pxor, psubb.

    { ISD::SDIV, MVT::v16i32, 6 }, // pmuludq sequence
    { ISD::SREM, MVT::v16i32, 8 }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, 5 }, // pmuludq sequence
    { ISD::UREM, MVT::v16i32, 7 }, // pmuludq+mul+sub sequence
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasAVX512()) {
    if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX2UniformConstCostTable[] = {
    { ISD::SHL,  MVT::v32i8, 2 }, // psllw + pand.
    { ISD::SRL,  MVT::v32i8, 2 }, // psrlw + pand.
    { ISD::SRA,  MVT::v32i8, 4 }, // psrlw, pand, pxor, psubb.

    { ISD::SRA,  MVT::v4i64, 4 }, // 2 x psrad + shuffle.

    { ISD::SDIV, MVT::v8i32, 6 }, // pmuludq sequence
    { ISD::SREM, MVT::v8i32, 8 }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32, 5 }, // pmuludq sequence
    { ISD::UREM, MVT::v8i32, 7 }, // pmuludq+mul+sub sequence
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasAVX2()) {
    if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2UniformConstCostTable[] = {
    { ISD::SHL,  MVT::v16i8,    2 }, // psllw + pand.
    { ISD::SRL,  MVT::v16i8,    2 }, // psrlw + pand.
    { ISD::SRA,  MVT::v16i8,    4 }, // psrlw, pand, pxor, psubb.

    { ISD::SHL,  MVT::v32i8,  4+2 }, // 2*(psllw + pand) + split.
    { ISD::SRL,  MVT::v32i8,  4+2 }, // 2*(psrlw + pand) + split.
    { ISD::SRA,  MVT::v32i8,  8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.

    { ISD::SDIV, MVT::v8i32, 12+2 }, // 2*pmuludq sequence + split.
    { ISD::SREM, MVT::v8i32, 16+2 }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::SDIV, MVT::v4i32,    6 }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32,    8 }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32, 10+2 }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32, 14+2 }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::UDIV, MVT::v4i32,    5 }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32,    7 }, // pmuludq+mul+sub sequence
  };

  // XOP has faster vXi8 shifts.
  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasSSE2() && !ST->hasXOP()) {
    if (const auto *Entry =
            CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512BWConstCostTable[] = {
    { ISD::SDIV, MVT::v64i8,  14 }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v64i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8,  14 }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v64i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::SDIV, MVT::v32i16,  6 }, // vpmulhw sequence
    { ISD::SREM, MVT::v32i16,  8 }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16,  6 }, // vpmulhuw sequence
    { ISD::UREM, MVT::v32i16,  8 }, // vpmulhuw+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasBWI()) {
    if (const auto *Entry =
            CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512ConstCostTable[] = {
    { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
    { ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
    { ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence
    { ISD::SDIV, MVT::v64i8,  28 }, // 4*ext+4*pmulhw sequence
    { ISD::SREM, MVT::v64i8,  32 }, // 4*ext+4*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8,  28 }, // 4*ext+4*pmulhw sequence
    { ISD::UREM, MVT::v64i8,  32 }, // 4*ext+4*pmulhw+mul+sub sequence
    { ISD::SDIV, MVT::v32i16, 12 }, // 2*vpmulhw sequence
    { ISD::SREM, MVT::v32i16, 16 }, // 2*vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16, 12 }, // 2*vpmulhuw sequence
    { ISD::UREM, MVT::v32i16, 16 }, // 2*vpmulhuw+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasAVX512()) {
    if (const auto *Entry =
            CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX2ConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8,  14 }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v32i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i8,  14 }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v32i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::SDIV, MVT::v16i16,  6 }, // vpmulhw sequence
    { ISD::SREM, MVT::v16i16,  8 }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i16,  6 }, // vpmulhuw sequence
    { ISD::UREM, MVT::v16i16,  8 }, // vpmulhuw+mul+sub sequence
    { ISD::SDIV, MVT::v8i32,  15 }, // vpmuldq sequence
    { ISD::SREM, MVT::v8i32,  19 }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,  15 }, // vpmuludq sequence
    { ISD::UREM, MVT::v8i32,  19 }, // vpmuludq+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasAVX2()) {
    if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2ConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8,  28+2 }, // 4*ext+4*pmulhw sequence + split.
    { ISD::SREM, MVT::v32i8,  32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
    { ISD::SDIV, MVT::v16i8,    14 }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v16i8,    16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i8,  28+2 }, // 4*ext+4*pmulhw sequence + split.
    { ISD::UREM, MVT::v32i8,  32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v16i8,    14 }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v16i8,    16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
    { ISD::SREM, MVT::v16i16, 16+2 }, // 2*pmulhw+mul+sub sequence + split.
    { ISD::SDIV, MVT::v8i16,     6 }, // pmulhw sequence
    { ISD::SREM, MVT::v8i16,     8 }, // pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split.
    { ISD::UREM, MVT::v16i16, 16+2 }, // 2*pmulhuw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v8i16,     6 }, // pmulhuw sequence
    { ISD::UREM, MVT::v8i16,     8 }, // pmulhuw+mul+sub sequence
    { ISD::SDIV, MVT::v8i32,  38+2 }, // 2*pmuludq sequence + split.
    { ISD::SREM, MVT::v8i32,  48+2 }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::SDIV, MVT::v4i32,    19 }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32,    24 }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,  30+2 }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32,  40+2 }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::UDIV, MVT::v4i32,    15 }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32,    20 }, // pmuludq+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasSSE2()) {
    // pmuldq sequence.
    if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
      return LT.first * 32;
    if (ISD == ISD::SREM && LT.second == MVT::v8i32 && ST->hasAVX())
      return LT.first * 38;
    if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
      return LT.first * 15;
    if (ISD == ISD::SREM && LT.second == MVT::v4i32 && ST->hasSSE41())
      return LT.first * 20;

    if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512BWShiftCostTable[] = {
    { ISD::SHL, MVT::v16i8,   4 }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v16i8,   4 }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v16i8,   4 }, // extend/vpsravw/pack sequence.
    { ISD::SHL, MVT::v32i8,   4 }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v32i8,   4 }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v32i8,   6 }, // extend/vpsravw/pack sequence.
    { ISD::SHL, MVT::v64i8,   6 }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v64i8,   7 }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v64i8,  15 }, // extend/vpsravw/pack sequence.

    { ISD::SHL, MVT::v8i16,   1 }, // vpsllvw
    { ISD::SRL, MVT::v8i16,   1 }, // vpsrlvw
    { ISD::SRA, MVT::v8i16,   1 }, // vpsravw
    { ISD::SHL, MVT::v16i16,  1 }, // vpsllvw
    { ISD::SRL, MVT::v16i16,  1 }, // vpsrlvw
    { ISD::SRA, MVT::v16i16,  1 }, // vpsravw
    { ISD::SHL, MVT::v32i16,  1 }, // vpsllvw
    { ISD::SRL, MVT::v32i16,  1 }, // vpsrlvw
    { ISD::SRA, MVT::v32i16,  1 }, // vpsravw
  };

  if (ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWShiftCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i16, 1 }, // psllw.
    { ISD::SRL, MVT::v16i16, 1 }, // psrlw.
    { ISD::SRA, MVT::v16i16, 1 }, // psraw.
    { ISD::SHL, MVT::v32i16, 2 }, // 2*psllw.
    { ISD::SRL, MVT::v32i16, 2 }, // 2*psrlw.
    { ISD::SRA, MVT::v32i16, 2 }, // 2*psraw.

    { ISD::SHL, MVT::v8i32,  1 }, // pslld
    { ISD::SRL, MVT::v8i32,  1 }, // psrld
    { ISD::SRA, MVT::v8i32,  1 }, // psrad
    { ISD::SHL, MVT::v4i64,  1 }, // psllq
    { ISD::SRL, MVT::v4i64,  1 }, // psrlq
  };

  if (ST->hasAVX2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {
    if (const auto *Entry =
            CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v8i16, 1 }, // psllw.
    { ISD::SHL, MVT::v4i32, 1 }, // pslld
    { ISD::SHL, MVT::v2i64, 1 }, // psllq.

    { ISD::SRL, MVT::v8i16, 1 }, // psrlw.
    { ISD::SRL, MVT::v4i32, 1 }, // psrld.
    { ISD::SRL, MVT::v2i64, 1 }, // psrlq.

    { ISD::SRA, MVT::v8i16, 1 }, // psraw.
    { ISD::SRA, MVT::v4i32, 1 }, // psrad.
  };

  if (ST->hasSSE2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {
    if (const auto *Entry =
            CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512DQCostTable[] = {
    { ISD::MUL, MVT::v2i64, 2 }, // pmullq
    { ISD::MUL, MVT::v4i64, 2 }, // pmullq
    { ISD::MUL, MVT::v8i64, 2 }  // pmullq
  };

  // Look for AVX512DQ lowering tricks for custom cases.
  if (ST->hasDQI())
    if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512BWCostTable[] = {
    { ISD::SHL, MVT::v64i8, 11 }, // vpblendvb sequence.
    { ISD::SRL, MVT::v64i8, 11 }, // vpblendvb sequence.
    { ISD::SRA, MVT::v64i8, 24 }, // vpblendvb sequence.
  };

  // Look for AVX512BW lowering tricks for custom cases.
  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512CostTable[] = {
    { ISD::SHL,  MVT::v4i32,   1 },
    { ISD::SRL,  MVT::v4i32,   1 },
    { ISD::SRA,  MVT::v4i32,   1 },
    { ISD::SHL,  MVT::v8i32,   1 },
    { ISD::SRL,  MVT::v8i32,   1 },
    { ISD::SRA,  MVT::v8i32,   1 },
    { ISD::SHL,  MVT::v16i32,  1 },
    { ISD::SRL,  MVT::v16i32,  1 },
    { ISD::SRA,  MVT::v16i32,  1 },

    { ISD::SHL,  MVT::v2i64,   1 },
    { ISD::SRL,  MVT::v2i64,   1 },
    { ISD::SHL,  MVT::v4i64,   1 },
    { ISD::SRL,  MVT::v4i64,   1 },
    { ISD::SHL,  MVT::v8i64,   1 },
    { ISD::SRL,  MVT::v8i64,   1 },

    { ISD::SRA,  MVT::v2i64,   1 },
    { ISD::SRA,  MVT::v4i64,   1 },
    { ISD::SRA,  MVT::v8i64,   1 },

    { ISD::MUL,  MVT::v16i32,  1 }, // pmulld (Skylake from agner.org)
    { ISD::MUL,  MVT::v8i32,   1 }, // pmulld (Skylake from agner.org)
    { ISD::MUL,  MVT::v4i32,   1 }, // pmulld (Skylake from agner.org)
    { ISD::MUL,  MVT::v8i64,   6 }, // 3*pmuludq/3*shift/2*add

    { ISD::FNEG, MVT::v8f64,   1 }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v8f64,   1 }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v8f64,   1 }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v8f64,   1 }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::f64,     4 }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,   4 }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64,   8 }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v8f64,  16 }, // Skylake from http://www.agner.org/

    { ISD::FNEG, MVT::v16f32,  1 }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v16f32,  1 }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v16f32,  1 }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v16f32,  1 }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::f32,     3 }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,   3 }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32,   5 }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v16f32, 10 }, // Skylake from http://www.agner.org/
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX2ShiftCostTable[] = {
    // Shifts on vXi64/vXi32 are legal on AVX2, even though we mark them as
    // "custom" so that we can detect the cases where the shift amount is a
    // scalar.
    { ISD::SHL, MVT::v4i32, 2 }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL, MVT::v4i32, 2 }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA, MVT::v4i32, 2 }, // vpsravd (Haswell from agner.org)
    { ISD::SHL, MVT::v8i32, 2 }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL, MVT::v8i32, 2 }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA, MVT::v8i32, 2 }, // vpsravd (Haswell from agner.org)
    { ISD::SHL, MVT::v2i64, 1 }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL, MVT::v2i64, 1 }, // vpsrlvq (Haswell from agner.org)
    { ISD::SHL, MVT::v4i64, 1 }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL, MVT::v4i64, 1 }, // vpsrlvq (Haswell from agner.org)
  };

  if (ST->hasAVX512()) {
    if (ISD == ISD::SHL && LT.second == MVT::v32i16 &&
        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
      // On AVX512, a packed v32i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info, Op2Info,
                                    TargetTransformInfo::OP_None,
                                    TargetTransformInfo::OP_None);
  }

  // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
  if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
    if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
      // On AVX2, a packed v16i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info, Op2Info,
                                    TargetTransformInfo::OP_None,
                                    TargetTransformInfo::OP_None);

    if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry XOPShiftCostTable[] = {
    // 128bit shifts take 1cy, but right shifts require negation beforehand.
    { ISD::SHL, MVT::v16i8,    1 },
    { ISD::SRL, MVT::v16i8,    2 },
    { ISD::SRA, MVT::v16i8,    2 },
    { ISD::SHL, MVT::v8i16,    1 },
    { ISD::SRL, MVT::v8i16,    2 },
    { ISD::SRA, MVT::v8i16,    2 },
    { ISD::SHL, MVT::v4i32,    1 },
    { ISD::SRL, MVT::v4i32,    2 },
    { ISD::SRA, MVT::v4i32,    2 },
    { ISD::SHL, MVT::v2i64,    1 },
    { ISD::SRL, MVT::v2i64,    2 },
    { ISD::SRA, MVT::v2i64,    2 },
    // 256bit shifts require splitting if AVX2 didn't catch them above.
    { ISD::SHL, MVT::v32i8,  2+2 },
    { ISD::SRL, MVT::v32i8,  4+2 },
    { ISD::SRA, MVT::v32i8,  4+2 },
    { ISD::SHL, MVT::v16i16, 2+2 },
    { ISD::SRL, MVT::v16i16, 4+2 },
    { ISD::SRA, MVT::v16i16, 4+2 },
    { ISD::SHL, MVT::v8i32,  2+2 },
    { ISD::SRL, MVT::v8i32,  4+2 },
    { ISD::SRA, MVT::v8i32,  4+2 },
    { ISD::SHL, MVT::v4i64,  2+2 },
    { ISD::SRL, MVT::v4i64,  4+2 },
    { ISD::SRA, MVT::v4i64,  4+2 },
  };

  // Look for XOP lowering tricks.
  if (ST->hasXOP()) {
    // If the right shift is constant then we'll fold the negation so
    // it's as cheap as a left shift.
    int ShiftISD = ISD;
    if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) &&
        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
      ShiftISD = ISD::SHL;
    if (const auto *Entry =
            CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2UniformShiftCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i16, 2+2 }, // 2*psllw + split.
    { ISD::SHL, MVT::v8i32,  2+2 }, // 2*pslld + split.
    { ISD::SHL, MVT::v4i64,  2+2 }, // 2*psllq + split.

    { ISD::SRL, MVT::v16i16, 2+2 }, // 2*psrlw + split.
    { ISD::SRL, MVT::v8i32,  2+2 }, // 2*psrld + split.
    { ISD::SRL, MVT::v4i64,  2+2 }, // 2*psrlq + split.

    { ISD::SRA, MVT::v16i16, 2+2 }, // 2*psraw + split.
    { ISD::SRA, MVT::v8i32,  2+2 }, // 2*psrad + split.
    { ISD::SRA, MVT::v2i64,    4 }, // 2*psrad + shuffle.
    { ISD::SRA, MVT::v4i64,  8+2 }, // 2*(2*psrad + shuffle) + split.
  };

  if (ST->hasSSE2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {

    // Handle AVX2 uniform v4i64 ISD::SRA, it's not worth a table.
    if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2())
      return LT.first * 4; // 2*psrad + shuffle.

    if (const auto *Entry =
            CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  if (ISD == ISD::SHL &&
      Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
    MVT VT = LT.second;
    // Vector shift left by non uniform constant can be lowered
    // into vector multiply.
    if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
        ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
      ISD = ISD::MUL;
  }

  static const CostTblEntry AVX2CostTable[] = {
    { ISD::SHL,  MVT::v16i8,   6 }, // vpblendvb sequence.
    { ISD::SHL,  MVT::v32i8,   6 }, // vpblendvb sequence.
    { ISD::SHL,  MVT::v64i8,  12 }, // 2*vpblendvb sequence.
    { ISD::SHL,  MVT::v8i16,   5 }, // extend/vpsrlvd/pack sequence.
    { ISD::SHL,  MVT::v16i16,  7 }, // extend/vpsrlvd/pack sequence.
    { ISD::SHL,  MVT::v32i16, 14 }, // 2*extend/vpsrlvd/pack sequence.

    { ISD::SRL,  MVT::v16i8,   6 }, // vpblendvb sequence.
    { ISD::SRL,  MVT::v32i8,   6 }, // vpblendvb sequence.
    { ISD::SRL,  MVT::v64i8,  12 }, // 2*vpblendvb sequence.
    { ISD::SRL,  MVT::v8i16,   5 }, // extend/vpsrlvd/pack sequence.
    { ISD::SRL,  MVT::v16i16,  7 }, // extend/vpsrlvd/pack sequence.
    { ISD::SRL,  MVT::v32i16, 14 }, // 2*extend/vpsrlvd/pack sequence.

    { ISD::SRA,  MVT::v16i8,  17 }, // vpblendvb sequence.
    { ISD::SRA,  MVT::v32i8,  17 }, // vpblendvb sequence.
    { ISD::SRA,  MVT::v64i8,  34 }, // 2*vpblendvb sequence.
    { ISD::SRA,  MVT::v8i16,   5 }, // extend/vpsravd/pack sequence.
    { ISD::SRA,  MVT::v16i16,  7 }, // extend/vpsravd/pack sequence.
    { ISD::SRA,  MVT::v32i16, 14 }, // 2*extend/vpsravd/pack sequence.
    { ISD::SRA,  MVT::v2i64,   2 }, // srl/xor/sub sequence.
    { ISD::SRA,  MVT::v4i64,   2 }, // srl/xor/sub sequence.

    { ISD::SUB,  MVT::v32i8,   1 }, // psubb
    { ISD::ADD,  MVT::v32i8,   1 }, // paddb
    { ISD::SUB,  MVT::v16i16,  1 }, // psubw
    { ISD::ADD,  MVT::v16i16,  1 }, // paddw
    { ISD::SUB,  MVT::v8i32,   1 }, // psubd
    { ISD::ADD,  MVT::v8i32,   1 }, // paddd
    { ISD::SUB,  MVT::v4i64,   1 }, // psubq
    { ISD::ADD,  MVT::v4i64,   1 }, // paddq

    { ISD::MUL,  MVT::v16i16,  1 }, // pmullw
    { ISD::MUL,  MVT::v8i32,   2 }, // pmulld (Haswell from agner.org)
    { ISD::MUL,  MVT::v4i64,   6 }, // 3*pmuludq/3*shift/2*add

    { ISD::FNEG, MVT::v4f64,   1 }, // Haswell from http://www.agner.org/
    { ISD::FNEG, MVT::v8f32,   1 }, // Haswell from http://www.agner.org/
    { ISD::FADD, MVT::v4f64,   1 }, // Haswell from http://www.agner.org/
    { ISD::FADD, MVT::v8f32,   1 }, // Haswell from http://www.agner.org/
    { ISD::FSUB, MVT::v4f64,   1 }, // Haswell from http://www.agner.org/
    { ISD::FSUB, MVT::v8f32,   1 }, // Haswell from http://www.agner.org/
    { ISD::FMUL, MVT::f64,     1 }, // Haswell from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64,   1 }, // Haswell from http://www.agner.org/
    { ISD::FMUL, MVT::v4f64,   1 }, // Haswell from http://www.agner.org/
    { ISD::FMUL, MVT::v8f32,   1 }, // Haswell from http://www.agner.org/

    { ISD::FDIV, MVT::f32,     7 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,   7 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32,  14 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::f64,    14 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,  14 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64,  28 }, // Haswell from http://www.agner.org/
  };

  // Look for AVX2 lowering tricks for custom cases.
  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX1CostTable[] = {
    // We don't have to scalarize unsupported ops. We can issue two half-sized
    // operations and we only need to extract the upper YMM half.
    // Two ops + 1 extract + 1 insert = 4.
    { ISD::MUL,  MVT::v16i16,  4 },
    { ISD::MUL,  MVT::v8i32,   5 }, // BTVER2 from http://www.agner.org/
    { ISD::MUL,  MVT::v4i64,  12 },

    { ISD::SUB,  MVT::v32i8,   4 },
    { ISD::ADD,  MVT::v32i8,   4 },
    { ISD::SUB,  MVT::v16i16,  4 },
    { ISD::ADD,  MVT::v16i16,  4 },
    { ISD::SUB,  MVT::v8i32,   4 },
    { ISD::ADD,  MVT::v8i32,   4 },
    { ISD::SUB,  MVT::v4i64,   4 },
    { ISD::ADD,  MVT::v4i64,   4 },

    { ISD::SHL,  MVT::v32i8,  22 }, // pblendvb sequence + split.
    { ISD::SHL,  MVT::v8i16,   6 }, // pblendvb sequence.
    { ISD::SHL,  MVT::v16i16, 13 }, // pblendvb sequence + split.
    { ISD::SHL,  MVT::v4i32,   3 }, // pslld/paddd/cvttps2dq/pmulld
    { ISD::SHL,  MVT::v8i32,   9 }, // pslld/paddd/cvttps2dq/pmulld + split
    { ISD::SHL,  MVT::v2i64,   2 }, // Shift each lane + blend.
    { ISD::SHL,  MVT::v4i64,   6 }, // Shift each lane + blend + split.

    { ISD::SRL,  MVT::v32i8,  23 }, // pblendvb sequence + split.
    { ISD::SRL,  MVT::v16i16, 28 }, // pblendvb sequence + split.
    { ISD::SRL,  MVT::v4i32,   6 }, // Shift each lane + blend.
    { ISD::SRL,  MVT::v8i32,  14 }, // Shift each lane + blend + split.
    { ISD::SRL,  MVT::v2i64,   2 }, // Shift each lane + blend.
    { ISD::SRL,  MVT::v4i64,   6 }, // Shift each lane + blend + split.

    { ISD::SRA,  MVT::v32i8,  44 }, // pblendvb sequence + split.
    { ISD::SRA,  MVT::v16i16, 28 }, // pblendvb sequence + split.
    { ISD::SRA,  MVT::v4i32,   6 }, // Shift each lane + blend.
    { ISD::SRA,  MVT::v8i32,  14 }, // Shift each lane + blend + split.
    { ISD::SRA,  MVT::v2i64,   5 }, // Shift each lane + blend.
    { ISD::SRA,  MVT::v4i64,  12 }, // Shift each lane + blend + split.

    { ISD::FNEG, MVT::v4f64,   2 }, // BTVER2 from http://www.agner.org/
    { ISD::FNEG, MVT::v8f32,   2 }, // BTVER2 from http://www.agner.org/

    { ISD::FMUL, MVT::f64,     2 }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64,   2 }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v4f64,   4 }, // BTVER2 from http://www.agner.org/

    { ISD::FDIV, MVT::f32,    14 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,  14 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32,  28 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::f64,    22 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,  22 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64,  44 }, // SNB from http://www.agner.org/
  };

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE42CostTable[] = {
    { ISD::FADD, MVT::f64,    1 }, // Nehalem from http://www.agner.org/
    { ISD::FADD, MVT::f32,    1 }, // Nehalem from http://www.agner.org/
    { ISD::FADD, MVT::v2f64,  1 }, // Nehalem from http://www.agner.org/
    { ISD::FADD, MVT::v4f32,  1 }, // Nehalem from http://www.agner.org/

    { ISD::FSUB, MVT::f64,    1 }, // Nehalem from http://www.agner.org/
    { ISD::FSUB, MVT::f32,    1 }, // Nehalem from http://www.agner.org/
    { ISD::FSUB, MVT::v2f64,  1 }, // Nehalem from http://www.agner.org/
    { ISD::FSUB, MVT::v4f32,  1 }, // Nehalem from http://www.agner.org/

    { ISD::FMUL, MVT::f64,    1 }, // Nehalem from http://www.agner.org/
    { ISD::FMUL, MVT::f32,    1 }, // Nehalem from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64,  1 }, // Nehalem from http://www.agner.org/
    { ISD::FMUL, MVT::v4f32,  1 }, // Nehalem from http://www.agner.org/

    { ISD::FDIV, MVT::f32,   14 }, // Nehalem from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
    { ISD::FDIV, MVT::f64,   22 }, // Nehalem from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/

    { ISD::MUL,  MVT::v2i64,  6 }  // 3*pmuludq/3*shift/2*add
  };

  if (ST->hasSSE42())
    if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE41CostTable[] = {
    { ISD::SHL, MVT::v16i8, 10 }, // pblendvb sequence.
    { ISD::SHL, MVT::v8i16, 11 }, // pblendvb sequence.
    { ISD::SHL, MVT::v4i32,  4 }, // pslld/paddd/cvttps2dq/pmulld

    { ISD::SRL, MVT::v16i8, 11 }, // pblendvb sequence.
    { ISD::SRL, MVT::v8i16, 13 }, // pblendvb sequence.
    { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend.

    { ISD::SRA, MVT::v16i8, 21 }, // pblendvb sequence.
    { ISD::SRA, MVT::v8i16, 13 }, // pblendvb sequence.

    { ISD::MUL, MVT::v4i32,  2 }  // pmulld (Nehalem from agner.org)
  };

  if (ST->hasSSE41())
    if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE2CostTable[] = {
    // We don't correctly identify costs of casts because they are marked as
    // custom.
    { ISD::SHL,  MVT::v16i8, 13 }, // cmpgtb sequence.
    { ISD::SHL,  MVT::v8i16, 25 }, // cmpgtw sequence.
    { ISD::SHL,  MVT::v4i32, 16 }, // pslld/paddd/cvttps2dq/pmuludq.
    { ISD::SHL,  MVT::v2i64,  4 }, // splat+shuffle sequence.

    { ISD::SRL,  MVT::v16i8, 14 }, // cmpgtb sequence.
    { ISD::SRL,  MVT::v8i16, 16 }, // cmpgtw sequence.
    { ISD::SRL,  MVT::v4i32, 12 }, // Shift each lane + blend.
    { ISD::SRL,  MVT::v2i64,  4 }, // splat+shuffle sequence.

    { ISD::SRA,  MVT::v16i8, 27 }, // unpacked cmpgtb sequence.
    { ISD::SRA,  MVT::v8i16, 16 }, // cmpgtw sequence.
    { ISD::SRA,  MVT::v4i32, 12 }, // Shift each lane + blend.
    { ISD::SRA,  MVT::v2i64,  8 }, // srl/xor/sub splat+shuffle sequence.

    { ISD::MUL,  MVT::v8i16,  1 }, // pmullw
    { ISD::MUL,  MVT::v4i32,  6 }, // 3*pmuludq/4*shuffle
    { ISD::MUL,  MVT::v2i64,  8 }, // 3*pmuludq/3*shift/2*add

    { ISD::FDIV, MVT::f32,   23 }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::f64,   38 }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/

    { ISD::FNEG, MVT::f32,    1 }, // Pentium IV from http://www.agner.org/
    { ISD::FNEG, MVT::f64,    1 }, // Pentium IV from http://www.agner.org/
    { ISD::FNEG, MVT::v4f32,  1 }, // Pentium IV from http://www.agner.org/
    { ISD::FNEG, MVT::v2f64,  1 }, // Pentium IV from http://www.agner.org/

    { ISD::FADD, MVT::f32,    2 }, // Pentium IV from http://www.agner.org/
    { ISD::FADD, MVT::f64,    2 }, // Pentium IV from http://www.agner.org/

    { ISD::FSUB, MVT::f32,    2 }, // Pentium IV from http://www.agner.org/
    { ISD::FSUB, MVT::f64,    2 }, // Pentium IV from http://www.agner.org/
  };

  if (ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE1CostTable[] = {
    { ISD::FDIV, MVT::f32,   17 }, // Pentium III from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/

    { ISD::FNEG, MVT::f32,    2 }, // Pentium III from http://www.agner.org/
    { ISD::FNEG, MVT::v4f32,  2 }, // Pentium III from http://www.agner.org/

    { ISD::FADD, MVT::f32,    1 }, // Pentium III from http://www.agner.org/
    { ISD::FADD, MVT::v4f32,  2 }, // Pentium III from http://www.agner.org/

    { ISD::FSUB, MVT::f32,    1 }, // Pentium III from http://www.agner.org/
    { ISD::FSUB, MVT::v4f32,  2 }, // Pentium III from http://www.agner.org/
  };

  if (ST->hasSSE1())
    if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry X64CostTbl[] = { // 64-bit targets
    { ISD::ADD, MVT::i64, 1 }, // Core (Merom) from http://www.agner.org/
    { ISD::SUB, MVT::i64, 1 }, // Core (Merom) from http://www.agner.org/
  };

  if (ST->is64Bit())
    if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
    { ISD::ADD, MVT::i8,  1 }, // Pentium III from http://www.agner.org/
    { ISD::ADD, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
    { ISD::ADD, MVT::i32, 1 }, // Pentium III from http://www.agner.org/

    { ISD::SUB, MVT::i8,  1 }, // Pentium III from http://www.agner.org/
    { ISD::SUB, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
    { ISD::SUB, MVT::i32, 1 }, // Pentium III from http://www.agner.org/
  };

  if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
    return LT.first * Entry->Cost;

  // It is not a good idea to vectorize division. We have to scalarize it and
  // in the process we will often end up having to spill regular
  // registers. The overhead of division is going to dominate most kernels
  // anyway, so try hard to prevent vectorization of division - it is
  // generally a bad idea. Assume somewhat arbitrarily that we have to be able
  // to hide "20 cycles" for each lane.
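  // For example, scalarizing an sdiv on <4 x i32> charges 20 * 4 lanes times
  // the scalar sdiv cost (further scaled by the split count for illegal
  // types).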
  if (LT.second.isVector() && (ISD == ISD::SDIV || ISD == ISD::SREM ||
                               ISD == ISD::UDIV || ISD == ISD::UREM)) {
    InstructionCost ScalarCost = getArithmeticInstrCost(
        Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info,
        TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
    return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
  }

  // Fallback to the default implementation.
  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info);
}

InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                           VectorType *BaseTp,
                                           ArrayRef<int> Mask, int Index,
                                           VectorType *SubTp) {
  // 64-bit packed float vectors (v2f32) are widened to type v4f32.
  // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, BaseTp);

  Kind = improveShuffleKindFromMask(Kind, Mask);
  // Treat Transpose as 2-op shuffles - there's no difference in lowering.
  if (Kind == TTI::SK_Transpose)
    Kind = TTI::SK_PermuteTwoSrc;

  // For Broadcasts we are splatting the first element from the first input
  // register, so we only need to reference that input and all the output
  // registers are the same.
  if (Kind == TTI::SK_Broadcast)
    LT.first = 1;

  // Subvector extractions are free if they start at the beginning of a
  // vector and cheap if the subvectors are aligned.
  if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
    int NumElts = LT.second.getVectorNumElements();
    if ((Index % NumElts) == 0)
      return 0;
    std::pair<InstructionCost, MVT> SubLT =
        TLI->getTypeLegalizationCost(DL, SubTp);
    if (SubLT.second.isVector()) {
      int NumSubElts = SubLT.second.getVectorNumElements();
      if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
        return SubLT.first;
      // Handle some cases for widening legalization. For now we only handle
      // cases where the original subvector was naturally aligned and evenly
      // fit in its legalized subvector type.
      // FIXME: Remove some of the alignment restrictions.
      // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
      // vectors.
      int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
      if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
          (NumSubElts % OrigSubElts) == 0 &&
          LT.second.getVectorElementType() ==
              SubLT.second.getVectorElementType() &&
          LT.second.getVectorElementType().getSizeInBits() ==
              BaseTp->getElementType()->getPrimitiveSizeInBits()) {
        assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
               "Unexpected number of elements!");
        auto *VecTy = FixedVectorType::get(BaseTp->getElementType(),
                                           LT.second.getVectorNumElements());
        auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
                                           SubLT.second.getVectorNumElements());
        int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
        InstructionCost ExtractCost = getShuffleCost(
            TTI::SK_ExtractSubvector, VecTy, None, ExtractIndex, SubTy);

        // If the original size is 32-bits or more, we can use pshufd. Otherwise
        // if we have SSSE3 we can use pshufb.
        if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
          return ExtractCost + 1; // pshufd or pshufb

        assert(SubTp->getPrimitiveSizeInBits() == 16 &&
               "Unexpected vector size");

        return ExtractCost + 2; // worst case pshufhw + pshufd
      }
    }
  }

  // Subvector insertions are cheap if the subvectors are aligned.
  // Note that in general, the insertion starting at the beginning of a vector
  // isn't free, because we need to preserve the rest of the wide vector.
  if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
    int NumElts = LT.second.getVectorNumElements();
    std::pair<InstructionCost, MVT> SubLT =
        TLI->getTypeLegalizationCost(DL, SubTp);
    if (SubLT.second.isVector()) {
      int NumSubElts = SubLT.second.getVectorNumElements();
      if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
        return SubLT.first;
    }

    // If the insertion isn't aligned, treat it like a 2-op shuffle.
    Kind = TTI::SK_PermuteTwoSrc;
  }

  // Handle some common (illegal) sub-vector types as they are often very cheap
  // to shuffle even on targets without PSHUFB.
  EVT VT = TLI->getValueType(DL, BaseTp);
  if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
      !ST->hasSSSE3()) {
    static const CostTblEntry SSE2SubVectorShuffleTbl[] = {
        {TTI::SK_Broadcast, MVT::v4i16, 1}, // pshuflw
        {TTI::SK_Broadcast, MVT::v2i16, 1}, // pshuflw
        {TTI::SK_Broadcast, MVT::v8i8, 2},  // punpck/pshuflw
        {TTI::SK_Broadcast, MVT::v4i8, 2},  // punpck/pshuflw
        {TTI::SK_Broadcast, MVT::v2i8, 1},  // punpck

        {TTI::SK_Reverse, MVT::v4i16, 1}, // pshuflw
        {TTI::SK_Reverse, MVT::v2i16, 1}, // pshuflw
        {TTI::SK_Reverse, MVT::v4i8, 3},  // punpck/pshuflw/packus
        {TTI::SK_Reverse, MVT::v2i8, 1},  // punpck

        {TTI::SK_PermuteTwoSrc, MVT::v4i16, 2}, // punpck/pshuflw
        {TTI::SK_PermuteTwoSrc, MVT::v2i16, 2}, // punpck/pshuflw
        {TTI::SK_PermuteTwoSrc, MVT::v8i8, 7},  // punpck/pshuflw
        {TTI::SK_PermuteTwoSrc, MVT::v4i8, 4},  // punpck/pshuflw
        {TTI::SK_PermuteTwoSrc, MVT::v2i8, 2},  // punpck

        {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw
        {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw
        {TTI::SK_PermuteSingleSrc, MVT::v8i8, 5},  // punpck/pshuflw
        {TTI::SK_PermuteSingleSrc, MVT::v4i8, 3},  // punpck/pshuflw
        {TTI::SK_PermuteSingleSrc, MVT::v2i8, 1},  // punpck
    };

    if (ST->hasSSE2())
      if (const auto *Entry =
              CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
        return Entry->Cost;
  }

  // We are going to permute multiple sources and the result will be in multiple
  // destinations. We provide an accurate cost only for splits where the
  // element type remains the same.
  if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
    MVT LegalVT = LT.second;
    if (LegalVT.isVector() &&
        LegalVT.getVectorElementType().getSizeInBits() ==
            BaseTp->getElementType()->getPrimitiveSizeInBits() &&
        LegalVT.getVectorNumElements() <
            cast<FixedVectorType>(BaseTp)->getNumElements()) {

      unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
      unsigned LegalVTSize = LegalVT.getStoreSize();
      // Number of source vectors after legalization:
      unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
      // Number of destination vectors after legalization:
      InstructionCost NumOfDests = LT.first;

      auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
                                              LegalVT.getVectorNumElements());

      InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
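      // For example, shuffling a v32i8 with SSE2-legal v16i8 registers gives
      // NumOfSrcs == 2 and NumOfDests == 2, i.e. 2 two-input shuffles at the
      // legal width.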
      return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
                                            None, 0, nullptr);
    }

    return BaseT::getShuffleCost(Kind, BaseTp, Mask, Index, SubTp);
  }

  // For 2-input shuffles, we must account for splitting the 2 inputs into many.
  if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
    // We assume that source and destination have the same vector type.
    InstructionCost NumOfDests = LT.first;
    InstructionCost NumOfShufflesPerDest = LT.first * 2 - 1;
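    // For example, if legalization splits the type into two registers
    // (LT.first == 2), each destination combines 2 * 2 - 1 = 3 legal-width
    // shuffles, for a total of 6.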
    LT.first = NumOfDests * NumOfShufflesPerDest;
  }

  static const CostTblEntry AVX512FP16ShuffleTbl[] = {
      {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw
      {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw
      {TTI::SK_Broadcast, MVT::v8f16, 1},  // vpbroadcastw

      {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw
      {TTI::SK_Reverse, MVT::v16f16, 2}, // vpermw
      {TTI::SK_Reverse, MVT::v8f16, 1},  // vpshufb

      {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw
      {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw
      {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1},  // vpshufb

      {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w
      {TTI::SK_PermuteTwoSrc, MVT::v16f16, 2}, // vpermt2w
      {TTI::SK_PermuteTwoSrc, MVT::v8f16, 2}   // vpermt2w
  };

  if (!ST->useSoftFloat() && ST->hasFP16())
    if (const auto *Entry =
            CostTableLookup(AVX512FP16ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512VBMIShuffleTbl[] = {
      {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
      {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb

      {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
      {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb

      {TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b
      {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b
      {TTI::SK_PermuteTwoSrc, MVT::v16i8, 2}  // vpermt2b
  };

  if (ST->hasVBMI())
    if (const auto *Entry =
            CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512BWShuffleTbl[] = {
      {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
      {TTI::SK_Broadcast, MVT::v64i8, 1},  // vpbroadcastb

      {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw
      {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw
      {TTI::SK_Reverse, MVT::v64i8, 2},  // pshufb + vshufi64x2

      {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw
      {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw
      {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8},  // extend to v32i16

      {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w
      {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w
      {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2},  // vpermt2w
      {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1

      {TTI::SK_Select, MVT::v32i16, 1}, // vblendmw
      {TTI::SK_Select, MVT::v64i8, 1},  // vblendmb
  };

  if (ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512ShuffleTbl[] = {
      {TTI::SK_Broadcast, MVT::v8f64, 1},  // vbroadcastpd
      {TTI::SK_Broadcast, MVT::v16f32, 1}, // vbroadcastps
      {TTI::SK_Broadcast, MVT::v8i64, 1},  // vpbroadcastq
      {TTI::SK_Broadcast, MVT::v16i32, 1}, // vpbroadcastd
      {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
      {TTI::SK_Broadcast, MVT::v64i8, 1},  // vpbroadcastb

      {TTI::SK_Reverse, MVT::v8f64, 1},  // vpermpd
      {TTI::SK_Reverse, MVT::v16f32, 1}, // vpermps
      {TTI::SK_Reverse, MVT::v8i64, 1},  // vpermq
      {TTI::SK_Reverse, MVT::v16i32, 1}, // vpermd
      {TTI::SK_Reverse, MVT::v32i16, 7}, // per mca
      {TTI::SK_Reverse, MVT::v64i8, 7},  // per mca

      {TTI::SK_PermuteSingleSrc, MVT::v8f64, 1},  // vpermpd
      {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1},  // vpermpd
      {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1},  // vpermpd
      {TTI::SK_PermuteSingleSrc, MVT::v16f32, 1}, // vpermps
      {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1},  // vpermps
      {TTI::SK_PermuteSingleSrc, MVT::v4f32, 1},  // vpermps
      {TTI::SK_PermuteSingleSrc, MVT::v8i64, 1},  // vpermq
      {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1},  // vpermq
      {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1},  // vpermq
      {TTI::SK_PermuteSingleSrc, MVT::v16i32, 1}, // vpermd
      {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1},  // vpermd
      {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1},  // vpermd
      {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1},  // pshufb

      {TTI::SK_PermuteTwoSrc, MVT::v8f64, 1},  // vpermt2pd
      {TTI::SK_PermuteTwoSrc, MVT::v16f32, 1}, // vpermt2ps
      {TTI::SK_PermuteTwoSrc, MVT::v8i64, 1},  // vpermt2q
      {TTI::SK_PermuteTwoSrc, MVT::v16i32, 1}, // vpermt2d
      {TTI::SK_PermuteTwoSrc, MVT::v4f64, 1},  // vpermt2pd
      {TTI::SK_PermuteTwoSrc, MVT::v8f32, 1},  // vpermt2ps
      {TTI::SK_PermuteTwoSrc, MVT::v4i64, 1},  // vpermt2q
      {TTI::SK_PermuteTwoSrc, MVT::v8i32, 1},  // vpermt2d
      {TTI::SK_PermuteTwoSrc, MVT::v2f64, 1},  // vpermt2pd
      {TTI::SK_PermuteTwoSrc, MVT::v4f32, 1},  // vpermt2ps
      {TTI::SK_PermuteTwoSrc, MVT::v2i64, 1},  // vpermt2q
      {TTI::SK_PermuteTwoSrc, MVT::v4i32, 1},  // vpermt2d

      // FIXME: This just applies the type legalization cost rules above
      // assuming these completely split.
      {TTI::SK_PermuteSingleSrc, MVT::v32i16, 14},
      {TTI::SK_PermuteSingleSrc, MVT::v64i8, 14},
      {TTI::SK_PermuteTwoSrc, MVT::v32i16, 42},
      {TTI::SK_PermuteTwoSrc, MVT::v64i8, 42},

      {TTI::SK_Select, MVT::v32i16, 1}, // vpternlogq
      {TTI::SK_Select, MVT::v64i8, 1},  // vpternlogq
      {TTI::SK_Select, MVT::v8f64, 1},  // vblendmpd
      {TTI::SK_Select, MVT::v16f32, 1}, // vblendmps
      {TTI::SK_Select, MVT::v8i64, 1},  // vblendmq
      {TTI::SK_Select, MVT::v16i32, 1}, // vblendmd
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX2ShuffleTbl[] = {
      {TTI::SK_Broadcast, MVT::v4f64, 1},  // vbroadcastpd
      {TTI::SK_Broadcast, MVT::v8f32, 1},  // vbroadcastps
      {TTI::SK_Broadcast, MVT::v4i64, 1},  // vpbroadcastq
      {TTI::SK_Broadcast, MVT::v8i32, 1},  // vpbroadcastd
      {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
      {TTI::SK_Broadcast, MVT::v32i8, 1},  // vpbroadcastb

      {TTI::SK_Reverse, MVT::v4f64, 1},  // vpermpd
      {TTI::SK_Reverse, MVT::v8f32, 1},  // vpermps
      {TTI::SK_Reverse, MVT::v4i64, 1},  // vpermq
      {TTI::SK_Reverse, MVT::v8i32, 1},  // vpermd
      {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
      {TTI::SK_Reverse, MVT::v32i8, 2},  // vperm2i128 + pshufb

      {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
      {TTI::SK_Select, MVT::v32i8, 1},  // vpblendvb

      {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1},  // vpermpd
      {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1},  // vpermps
      {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1},  // vpermq
      {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1},  // vpermd
      {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
                                                  // + vpblendvb
      {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4},  // vperm2i128 + 2*vpshufb
                                                  // + vpblendvb

      {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3},  // 2*vpermpd + vblendpd
      {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3},  // 2*vpermps + vblendps
      {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3},  // 2*vpermq + vpblendd
      {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3},  // 2*vpermd + vpblendd
      {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
                                               // + vpblendvb
      {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7},  // 2*vperm2i128 + 4*vpshufb
                                               // + vpblendvb
  };

  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;
1382 
1383  static const CostTblEntry XOPShuffleTbl[] = {
1384  {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd
1385  {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps
1386  {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd
1387  {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps
1388  {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
1389  // + vinsertf128
1390  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm
1391  // + vinsertf128
1392 
1393  {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
1394  // + vinsertf128
1395  {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm
1396  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm
1397  // + vinsertf128
1398  {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm
1399  };
1400 
1401  if (ST->hasXOP())
1402  if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
1403  return LT.first * Entry->Cost;
1404 
1405  static const CostTblEntry AVX1ShuffleTbl[] = {
1406  {TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1407  {TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1408  {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1409  {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1410  {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
1411  {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128
1412 
1413  {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1414  {TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1415  {TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1416  {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1417  {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
1418  // + vinsertf128
1419  {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb
1420  // + vinsertf128
1421 
1422  {TTI::SK_Select, MVT::v4i64, 1}, // vblendpd
1423  {TTI::SK_Select, MVT::v4f64, 1}, // vblendpd
1424  {TTI::SK_Select, MVT::v8i32, 1}, // vblendps
1425  {TTI::SK_Select, MVT::v8f32, 1}, // vblendps
1426  {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
1427  {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor
1428 
1429  {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd
1430  {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd
1431  {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1432  {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1433  {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
1434  // + 2*por + vinsertf128
1435  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb
1436  // + 2*por + vinsertf128
1437 
1438  {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd
1439  {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd
1440  {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1441  {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1442  {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
1443  // + 4*por + vinsertf128
1444  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb
1445  // + 4*por + vinsertf128
1446  };
1447 
1448  if (ST->hasAVX())
1449  if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
1450  return LT.first * Entry->Cost;
1451 
1452  static const CostTblEntry SSE41ShuffleTbl[] = {
1453  {TTI::SK_Select, MVT::v2i64, 1}, // pblendw
1454  {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1455  {TTI::SK_Select, MVT::v4i32, 1}, // pblendw
1456  {TTI::SK_Select, MVT::v4f32, 1}, // blendps
1457  {TTI::SK_Select, MVT::v8i16, 1}, // pblendw
1458  {TTI::SK_Select, MVT::v16i8, 1} // pblendvb
1459  };
1460 
1461  if (ST->hasSSE41())
1462  if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
1463  return LT.first * Entry->Cost;
1464 
1465  static const CostTblEntry SSSE3ShuffleTbl[] = {
1466  {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
1467  {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb
1468 
1469  {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
1470  {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb
1471 
1472  {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
1473  {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por
1474 
1475  {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
1476  {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
1477 
1478  {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
1479  {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
1480  };
1481 
1482  if (ST->hasSSSE3())
1483  if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
1484  return LT.first * Entry->Cost;
1485 
1486  static const CostTblEntry SSE2ShuffleTbl[] = {
1487  {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd
1488  {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
1489  {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
1490  {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
1491  {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd
1492 
1493  {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
1494  {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
1495  {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
1496  {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
1497  {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
1498  // + 2*pshufd + 2*unpck + packus
1499 
1500  {TTI::SK_Select, MVT::v2i64, 1}, // movsd
1501  {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1502  {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
1503  {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
1504  {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por
1505 
1506  {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
1507  {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd
1508  {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
1509  {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
1510  // + pshufd/unpck
1511  { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
1512  // + 2*pshufd + 2*unpck + 2*packus
1513 
1514  { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd
1515  { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd
1516  { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd}
1517  { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute
1518  { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
1519  };
1520 
1521  if (ST->hasSSE2())
1522  if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
1523  return LT.first * Entry->Cost;
1524 
1525  static const CostTblEntry SSE1ShuffleTbl[] = {
1526  { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
1527  { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
1528  { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps
1529  { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
1530  { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
1531  };
1532 
1533  if (ST->hasSSE1())
1534  if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
1535  return LT.first * Entry->Cost;
1536 
1537  return BaseT::getShuffleCost(Kind, BaseTp, Mask, Index, SubTp);
1538 }
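// Hedged usage sketch (VecTy/TTI/Ctx are assumed names, not original source):
//   auto *VecTy = FixedVectorType::get(Type::getFloatTy(Ctx), 8);
//   InstructionCost C = TTI.getShuffleCost(TTI::SK_Reverse, VecTy);
// On an SSE1-only target, v8f32 legalizes to two v4f32 halves, so this
// resolves against the shufps entry above and returns 2 * 1 = 2.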
1539 
1540 InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
1541  Type *Src,
1542  TTI::CastContextHint CCH,
1543  TTI::TargetCostKind CostKind,
1544  const Instruction *I) {
1545  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1546  assert(ISD && "Invalid opcode");
1547 
1548  // TODO: Allow non-throughput costs that aren't binary.
1549  auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
1550  if (CostKind != TTI::TCK_RecipThroughput)
1551  return Cost == 0 ? 0 : 1;
1552  return Cost;
1553  };
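// Annotation (sketch): AdjustCost collapses non-throughput queries to a 0/1
// cost. E.g. for TCK_CodeSize a table cost of 3 is reported as 1, a free
// conversion (cost 0) stays 0, and only TCK_RecipThroughput sees the raw
// table value.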
1554 
1555  // The cost tables include both specific, custom (non-legal) src/dst type
1556  // conversions and generic, legalized types. We test for customs first, before
1557  // falling back to legalization.
1558  // FIXME: Need a better design of the cost table to handle non-simple types of
1559  // potential massive combinations (elem_num x src_type x dst_type).
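// Annotation (sketch): "custom" entries below are keyed on the original,
// possibly non-legal types (e.g. TRUNCATE v16i8 <- v16i16) and are matched
// first; only when no simple-type entry hits are both types legalized and
// the same tables retried further down.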
1560  static const TypeConversionCostTblEntry AVX512BWConversionTbl[] {
1563 
1564  // Mask sign extend has an instruction.
1576 
1577  // Mask zero extend is a sext + shift.
1589 
1591  { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // widen to zmm
1592  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, // widen to zmm
1593  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // widen to zmm
1594  { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // vpmovwb
1595  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // widen to zmm
1596  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, // widen to zmm
1597  { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // vpmovwb
1598  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, // widen to zmm
1599  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, // widen to zmm
1600  { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // vpmovwb
1601  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, // widen to zmm
1602  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, // widen to zmm
1603  { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, // widen to zmm
1606  };
1607 
1608  static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
1611 
1614 
1617 
1620  };
1621 
1622  // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
1623  // 256-bit wide vectors.
1624 
1625  static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
1629 
1630  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
1631  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
1632  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
1633  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 3 }, // sext+vpslld+vptestmd
1634  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
1635  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
1636  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
1637  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 3 }, // sext+vpslld+vptestmd
1638  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // zmm vpslld+vptestmd
1639  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // zmm vpslld+vptestmd
1640  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // zmm vpslld+vptestmd
1641  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 }, // vpslld+vptestmd
1642  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // zmm vpsllq+vptestmq
1643  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // zmm vpsllq+vptestmq
1644  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, // vpsllq+vptestmq
1645  { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 2 }, // vpmovdb
1646  { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 2 }, // vpmovdb
1647  { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 2 }, // vpmovdb
1648  { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2 }, // vpmovdb
1649  { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 2 }, // vpmovqb
1650  { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1 }, // vpshufb
1651  { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 2 }, // vpmovqb
1652  { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 2 }, // vpmovqw
1653  { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 }, // vpmovqd
1654  { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // zmm vpmovqd
1655  { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 5 },// 2*vpmovqd+concat+vpmovdb
1656 
1657  { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, // extend to v16i32
1659 
1660  // Sign extend is zmm vpternlogd+vptruncdb.
1661  // Zero extend is zmm broadcast load+vptruncdw.
1670 
1671  // Sign extend is zmm vpternlogd+vptruncdw.
1672  // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
1681 
1682  { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // zmm vpternlogd
1683  { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // zmm vpternlogd+psrld
1684  { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // zmm vpternlogd
1685  { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // zmm vpternlogd+psrld
1686  { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // zmm vpternlogd
1687  { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // zmm vpternlogd+psrld
1688  { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // zmm vpternlogq
1689  { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // zmm vpternlogq+psrlq
1690  { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // zmm vpternlogq
1691  { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // zmm vpternlogq+psrlq
1692 
1693  { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, // vpternlogd
1694  { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, // vpternlogd+psrld
1695  { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 }, // vpternlogq
1696  { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 }, // vpternlogq+psrlq
1697 
1708 
1709  { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
1710  { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
1711 
1720 
1731 
1743 
1750  };
1751 
1752  static const TypeConversionCostTblEntry AVX512BWVLConversionTbl[] {
1753  // Mask sign extend has an instruction.
1763 
1764  // Mask zero extend is a sext + shift.
1774 
1776  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, // vpsllw+vptestmb
1777  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // vpsllw+vptestmw
1778  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // vpsllw+vptestmb
1779  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, // vpsllw+vptestmw
1780  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, // vpsllw+vptestmb
1781  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, // vpsllw+vptestmw
1782  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, // vpsllw+vptestmb
1783  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, // vpsllw+vptestmw
1784  { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, // vpsllw+vptestmb
1785  };
1786 
1787  static const TypeConversionCostTblEntry AVX512DQVLConversionTbl[] = {
1792 
1797 
1802 
1807  };
1808 
1809  static const TypeConversionCostTblEntry AVX512VLConversionTbl[] = {
1810  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
1811  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
1812  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
1813  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 8 }, // split+2*v8i8
1814  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
1815  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
1816  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
1817  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 8 }, // split+2*v8i16
1818  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // vpslld+vptestmd
1819  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // vpslld+vptestmd
1820  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // vpslld+vptestmd
1821  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // vpsllq+vptestmq
1822  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // vpsllq+vptestmq
1823  { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // vpmovqd
1824  { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 }, // vpmovqb
1825  { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 }, // vpmovqw
1826  { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 }, // vpmovwb
1827 
1828  // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
1829  // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
1838 
1839  // sign extend is vpcmpeq+maskedmove+vpmovdw
1840  // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
1849 
1850  { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // vpternlogd
1851  { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // vpternlogd+psrld
1852  { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // vpternlogd
1853  { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // vpternlogd+psrld
1854  { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // vpternlogd
1855  { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // vpternlogd+psrld
1856  { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // vpternlogq
1857  { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // vpternlogq+psrlq
1858  { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // vpternlogq
1859  { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // vpternlogq+psrlq
1860 
1873 
1878 
1892 
1896 
1904  };
1905 
1906  static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
1913 
1928 
1930 
1941 
1944 
1949 
1958 
1966 
1977  };
1978 
1979  static const TypeConversionCostTblEntry AVXConversionTbl[] = {
1986 
1999 
2005 
2006  { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // and+extract+packuswb
2010  { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 3 }, // and+extract+2*packusdw
2012 
2025 
2043 
2055 
2069 
2072  };
2073 
2074  static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
2087 
2088  // These truncates end up widening elements.
2089  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 1 }, // PMOVXZBQ
2090  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 1 }, // PMOVXZWQ
2091  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 1 }, // PMOVXZBD
2092 
2096 
2108 
2123 
2134 
2145  };
2146 
2147  static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
2148  // These are somewhat magic numbers, justified by comparing the
2149  // output of llvm-mca across our supported scheduler models and
2150  // taking the worst-case scenario.
2163 
2177 
2188 
2192  { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 15 },
2199 
2212 
2213  // These truncates are really widening elements.
2214  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 1 }, // PSHUFD
2215  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // PUNPCKLWD+DQ
2216  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // PUNPCKLBW+WD+PSHUFD
2217  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 1 }, // PUNPCKLWD
2218  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // PUNPCKLBW+WD
2219  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 1 }, // PUNPCKLBW
2220 
2221  { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 2 }, // PAND+PACKUSWB
2223  { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 3 }, // PAND+2*PACKUSWB
2229  { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 4 }, // PAND+3*PACKUSWB
2230  { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 2 }, // PSHUFD+PSHUFLW
2231  { ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, 1 }, // PSHUFD
2232  };
2233 
2234  // Attempt to map directly to (simple) MVT types to let us match custom entries.
2235  EVT SrcTy = TLI->getValueType(DL, Src);
2236  EVT DstTy = TLI->getValueType(DL, Dst);
2237 
2238  // The function getSimpleVT only handles simple value types.
2239  if (SrcTy.isSimple() && DstTy.isSimple()) {
2240  MVT SimpleSrcTy = SrcTy.getSimpleVT();
2241  MVT SimpleDstTy = DstTy.getSimpleVT();
2242 
2243  if (ST->useAVX512Regs()) {
2244  if (ST->hasBWI())
2245  if (const auto *Entry = ConvertCostTableLookup(
2246  AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2247  return AdjustCost(Entry->Cost);
2248 
2249  if (ST->hasDQI())
2250  if (const auto *Entry = ConvertCostTableLookup(
2251  AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2252  return AdjustCost(Entry->Cost);
2253 
2254  if (ST->hasAVX512())
2255  if (const auto *Entry = ConvertCostTableLookup(
2256  AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2257  return AdjustCost(Entry->Cost);
2258  }
2259 
2260  if (ST->hasBWI())
2261  if (const auto *Entry = ConvertCostTableLookup(
2262  AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2263  return AdjustCost(Entry->Cost);
2264 
2265  if (ST->hasDQI())
2266  if (const auto *Entry = ConvertCostTableLookup(
2267  AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2268  return AdjustCost(Entry->Cost);
2269 
2270  if (ST->hasAVX512())
2271  if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
2272  SimpleDstTy, SimpleSrcTy))
2273  return AdjustCost(Entry->Cost);
2274 
2275  if (ST->hasAVX2()) {
2276  if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
2277  SimpleDstTy, SimpleSrcTy))
2278  return AdjustCost(Entry->Cost);
2279  }
2280 
2281  if (ST->hasAVX()) {
2282  if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
2283  SimpleDstTy, SimpleSrcTy))
2284  return AdjustCost(Entry->Cost);
2285  }
2286 
2287  if (ST->hasSSE41()) {
2288  if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
2289  SimpleDstTy, SimpleSrcTy))
2290  return AdjustCost(Entry->Cost);
2291  }
2292 
2293  if (ST->hasSSE2()) {
2294  if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
2295  SimpleDstTy, SimpleSrcTy))
2296  return AdjustCost(Entry->Cost);
2297  }
2298  }
2299 
2300  // Fall back to legalized types.
2301  std::pair<InstructionCost, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
2302  std::pair<InstructionCost, MVT> LTDest =
2303  TLI->getTypeLegalizationCost(DL, Dst);
2304 
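// Annotation (sketch): after legalization the same tables are retried on the
// legal MVTs, and std::max(LTSrc.first, LTDest.first) scales a matching
// entry by the more-split side. E.g. if the source needs two registers while
// the destination fits in one, an entry of cost 2 is reported as
// max(2, 1) * 2 = 4.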
2305  if (ST->useAVX512Regs()) {
2306  if (ST->hasBWI())
2307  if (const auto *Entry = ConvertCostTableLookup(
2308  AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second))
2309  return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2310 
2311  if (ST->hasDQI())
2312  if (const auto *Entry = ConvertCostTableLookup(
2313  AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second))
2314  return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2315 
2316  if (ST->hasAVX512())
2317  if (const auto *Entry = ConvertCostTableLookup(
2318  AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second))
2319  return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2320  }
2321 
2322  if (ST->hasBWI())
2323  if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
2324  LTDest.second, LTSrc.second))
2325  return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2326 
2327  if (ST->hasDQI())
2328  if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
2329  LTDest.second, LTSrc.second))
2330  return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2331 
2332  if (ST->hasAVX512())
2333  if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
2334  LTDest.second, LTSrc.second))
2335  return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2336 
2337  if (ST->hasAVX2())
2338  if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
2339  LTDest.second, LTSrc.second))
2340  return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2341 
2342  if (ST->hasAVX())
2343  if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
2344  LTDest.second, LTSrc.second))
2345  return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2346 
2347  if (ST->hasSSE41())
2348  if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
2349  LTDest.second, LTSrc.second))
2350  return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2351 
2352  if (ST->hasSSE2())
2353  if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
2354  LTDest.second, LTSrc.second))
2355  return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2356 
2357  // Fallback: for the i8/i16 sitofp/uitofp cases we need to extend the
2358  // source to i32 first and then use sitofp.
2359  if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
2360  1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) {
2361  Type *ExtSrc = Src->getWithNewBitWidth(32);
2362  unsigned ExtOpc =
2363  (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt;
2364 
2365  // For scalar loads the extend would be free.
2366  InstructionCost ExtCost = 0;
2367  if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0))))
2368  ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind);
2369 
2370  return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc,
2371  TTI::CastContextHint::None, CostKind);
2372  }
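// Worked sketch (assumes no direct table entry matched): e.g. a uitofp from
// <4 x i16> to <4 x float> is priced as a ZExt to <4 x i32> plus the
// SIToFP <4 x i32> -> <4 x float> cost; for a scalar integer source fed by
// a load, the extend is considered free.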
2373 
2374  // Fallback: for the fptosi/fptoui i8/i16 cases, compute as an fptosi
2375  // to i32 followed by a truncate.
2376  if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) &&
2377  1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) {
2378  Type *TruncDst = Dst->getWithNewBitWidth(32);
2379  return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) +
2380  getCastInstrCost(Instruction::Trunc, Dst, TruncDst,
2381  TTI::CastContextHint::None, CostKind);
2382  }
2383 
2384  return AdjustCost(
2385  BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
2386 }
2387 
2388 InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
2389  Type *CondTy,
2390  CmpInst::Predicate VecPred,
2391  TTI::TargetCostKind CostKind,
2392  const Instruction *I) {
2393  // TODO: Handle other cost kinds.
2394  if (CostKind != TTI::TCK_RecipThroughput)
2395  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2396  I);
2397 
2398  // Legalize the type.
2399  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
2400 
2401  MVT MTy = LT.second;
2402 
2403  int ISD = TLI->InstructionOpcodeToISD(Opcode);
2404  assert(ISD && "Invalid opcode");
2405 
2406  unsigned ExtraCost = 0;
2407  if (I && (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp)) {
2408  // Some vector comparison predicates cost extra instructions.
2409  if (MTy.isVector() &&
2410  !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
2411  (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
2412  ST->hasBWI())) {
2413  switch (cast<CmpInst>(I)->getPredicate()) {
2414  case CmpInst::Predicate::ICMP_NE:
2415  // xor(cmpeq(x,y),-1)
2416  ExtraCost = 1;
2417  break;
2418  case CmpInst::Predicate::ICMP_SGE:
2419  case CmpInst::Predicate::ICMP_SLE:
2420  // xor(cmpgt(x,y),-1)
2421  ExtraCost = 1;
2422  break;
2423  case CmpInst::Predicate::ICMP_ULT:
2424  case CmpInst::Predicate::ICMP_UGT:
2425  // cmpgt(xor(x,signbit),xor(y,signbit))
2426  // xor(cmpeq(pmaxu(x,y),x),-1)
2427  ExtraCost = 2;
2428  break;
2429  case CmpInst::Predicate::ICMP_ULE:
2430  case CmpInst::Predicate::ICMP_UGE:
2431  if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
2432  (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
2433  // cmpeq(psubus(x,y),0)
2434  // cmpeq(pminu(x,y),x)
2435  ExtraCost = 1;
2436  } else {
2437  // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
2438  ExtraCost = 3;
2439  }
2440  break;
2441  default:
2442  break;
2443  }
2444  }
2445  }
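// Worked sketch: e.g. a <4 x i32> ICMP_UGT on plain SSE2 has no unsigned
// compare, so it is modeled as cmpgt(xor(x,signbit),xor(y,signbit)) --
// ExtraCost 2 on top of the base SETCC entry of 1 from the tables below.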
2446 
2447  static const CostTblEntry SLMCostTbl[] = {
2448  // slm pcmpeq/pcmpgt throughput is 2
2449  { ISD::SETCC, MVT::v2i64, 2 },
2450  };
2451 
2452  static const CostTblEntry AVX512BWCostTbl[] = {
2453  { ISD::SETCC, MVT::v32i16, 1 },
2454  { ISD::SETCC, MVT::v64i8, 1 },
2455 
2456  { ISD::SELECT, MVT::v32i16, 1 },
2457  { ISD::SELECT, MVT::v64i8, 1 },
2458  };
2459 
2460  static const CostTblEntry AVX512CostTbl[] = {
2461  { ISD::SETCC, MVT::v8i64, 1 },
2462  { ISD::SETCC, MVT::v16i32, 1 },
2463  { ISD::SETCC, MVT::v8f64, 1 },
2464  { ISD::SETCC, MVT::v16f32, 1 },
2465 
2466  { ISD::SELECT, MVT::v8i64, 1 },
2467  { ISD::SELECT, MVT::v16i32, 1 },
2468  { ISD::SELECT, MVT::v8f64, 1 },
2469  { ISD::SELECT, MVT::v16f32, 1 },
2470 
2471  { ISD::SETCC, MVT::v32i16, 2 }, // FIXME: should probably be 4
2472  { ISD::SETCC, MVT::v64i8, 2 }, // FIXME: should probably be 4
2473 
2474  { ISD::SELECT, MVT::v32i16, 2 }, // FIXME: should be 3
2475  { ISD::SELECT, MVT::v64i8, 2 }, // FIXME: should be 3
2476  };
2477 
2478  static const CostTblEntry AVX2CostTbl[] = {
2479  { ISD::SETCC, MVT::v4i64, 1 },
2480  { ISD::SETCC, MVT::v8i32, 1 },
2481  { ISD::SETCC, MVT::v16i16, 1 },
2482  { ISD::SETCC, MVT::v32i8, 1 },
2483 
2484  { ISD::SELECT, MVT::v4i64, 1 }, // pblendvb
2485  { ISD::SELECT, MVT::v8i32, 1 }, // pblendvb
2486  { ISD::SELECT, MVT::v16i16, 1 }, // pblendvb
2487  { ISD::SELECT, MVT::v32i8, 1 }, // pblendvb
2488  };
2489 
2490  static const CostTblEntry AVX1CostTbl[] = {
2491  { ISD::SETCC, MVT::v4f64, 1 },
2492  { ISD::SETCC, MVT::v8f32, 1 },
2493  // AVX1 does not support 8-wide integer compare.
2494  { ISD::SETCC, MVT::v4i64, 4 },
2495  { ISD::SETCC, MVT::v8i32, 4 },
2496  { ISD::SETCC, MVT::v16i16, 4 },
2497  { ISD::SETCC, MVT::v32i8, 4 },
2498 
2499  { ISD::SELECT, MVT::v4f64, 1 }, // vblendvpd
2500  { ISD::SELECT, MVT::v8f32, 1 }, // vblendvps
2501  { ISD::SELECT, MVT::v4i64, 1 }, // vblendvpd
2502  { ISD::SELECT, MVT::v8i32, 1 }, // vblendvps
2503  { ISD::SELECT, MVT::v16i16, 3 }, // vandps + vandnps + vorps
2504  { ISD::SELECT, MVT::v32i8, 3 }, // vandps + vandnps + vorps
2505  };
2506 
2507  static const CostTblEntry SSE42CostTbl[] = {
2508  { ISD::SETCC, MVT::v2f64, 1 },
2509  { ISD::SETCC, MVT::v4f32, 1 },
2510  { ISD::SETCC, MVT::v2i64, 1 },
2511  };
2512 
2513  static const CostTblEntry SSE41CostTbl[] = {
2514  { ISD::SELECT, MVT::v2f64, 1 }, // blendvpd
2515  { ISD::SELECT, MVT::v4f32, 1 }, // blendvps
2516  { ISD::SELECT, MVT::v2i64, 1 }, // pblendvb
2517  { ISD::SELECT, MVT::v4i32, 1 }, // pblendvb
2518  { ISD::SELECT, MVT::v8i16, 1 }, // pblendvb
2519  { ISD::SELECT, MVT::v16i8, 1 }, // pblendvb
2520  };
2521 
2522  static const CostTblEntry SSE2CostTbl[] = {
2523  { ISD::SETCC, MVT::v2f64, 2 },
2524  { ISD::SETCC, MVT::f64, 1 },
2525  { ISD::SETCC, MVT::v2i64, 8 },
2526  { ISD::SETCC, MVT::v4i32, 1 },
2527  { ISD::SETCC, MVT::v8i16, 1 },
2528  { ISD::SETCC, MVT::v16i8, 1 },
2529 
2530  { ISD::SELECT, MVT::v2f64, 3 }, // andpd + andnpd + orpd
2531  { ISD::SELECT, MVT::v2i64, 3 }, // pand + pandn + por
2532  { ISD::SELECT, MVT::v4i32, 3 }, // pand + pandn + por
2533  { ISD::SELECT, MVT::v8i16, 3 }, // pand + pandn + por
2534  { ISD::SELECT, MVT::v16i8, 3 }, // pand + pandn + por
2535  };
2536 
2537  static const CostTblEntry SSE1CostTbl[] = {
2538  { ISD::SETCC, MVT::v4f32, 2 },
2539  { ISD::SETCC, MVT::f32, 1 },
2540 
2541  { ISD::SELECT, MVT::v4f32, 3 }, // andps + andnps + orps
2542  };
2543 
2544  if (ST->isSLM())
2545  if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
2546  return LT.first * (ExtraCost + Entry->Cost);
2547 
2548  if (ST->hasBWI())
2549  if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
2550  return LT.first * (ExtraCost + Entry->Cost);
2551 
2552  if (ST->hasAVX512())
2553  if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
2554  return LT.first * (ExtraCost + Entry->Cost);
2555 
2556  if (ST->hasAVX2())
2557  if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
2558  return LT.first * (ExtraCost + Entry->Cost);
2559 
2560  if (ST->hasAVX())
2561  if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
2562  return LT.first * (ExtraCost + Entry->Cost);
2563 
2564  if (ST->hasSSE42())
2565  if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
2566  return LT.first * (ExtraCost + Entry->Cost);
2567 
2568  if (ST->hasSSE41())
2569  if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
2570  return LT.first * (ExtraCost + Entry->Cost);
2571 
2572  if (ST->hasSSE2())
2573  if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
2574  return LT.first * (ExtraCost + Entry->Cost);
2575 
2576  if (ST->hasSSE1())
2577  if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
2578  return LT.first * (ExtraCost + Entry->Cost);
2579 
2580  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
2581 }
2582 
2583 InstructionCost
2584 X86TTIImpl::getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
2585  TTI::TargetCostKind CostKind) {
2588 
2589  // Costs should match the codegen from:
2590  // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
2591  // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
2592  // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
2593  // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
2594  // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
2595 
2596  // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
2597  // specialized in these tables yet.
2598  static const CostTblEntry AVX512BITALGCostTbl[] = {
2599  { ISD::CTPOP, MVT::v32i16, 1 },
2600  { ISD::CTPOP, MVT::v64i8, 1 },
2601  { ISD::CTPOP, MVT::v16i16, 1 },
2602  { ISD::CTPOP, MVT::v32i8, 1 },
2603  { ISD::CTPOP, MVT::v8i16, 1 },
2604  { ISD::CTPOP, MVT::v16i8, 1 },
2605  };
2606  static const CostTblEntry AVX512VPOPCNTDQCostTbl[] = {
2607  { ISD::CTPOP, MVT::v8i64, 1 },
2608  { ISD::CTPOP, MVT::v16i32, 1 },
2609  { ISD::CTPOP, MVT::v4i64, 1 },
2610  { ISD::CTPOP, MVT::v8i32, 1 },
2611  { ISD::CTPOP, MVT::v2i64, 1 },
2612  { ISD::CTPOP, MVT::v4i32, 1 },
2613  };
2614  static const CostTblEntry AVX512CDCostTbl[] = {
2615  { ISD::CTLZ, MVT::v8i64, 1 },
2616  { ISD::CTLZ, MVT::v16i32, 1 },
2617  { ISD::CTLZ, MVT::v32i16, 8 },
2618  { ISD::CTLZ, MVT::v64i8, 20 },
2619  { ISD::CTLZ, MVT::v4i64, 1 },
2620  { ISD::CTLZ, MVT::v8i32, 1 },
2621  { ISD::CTLZ, MVT::v16i16, 4 },
2622  { ISD::CTLZ, MVT::v32i8, 10 },
2623  { ISD::CTLZ, MVT::v2i64, 1 },
2624  { ISD::CTLZ, MVT::v4i32, 1 },
2625  { ISD::CTLZ, MVT::v8i16, 4 },
2626  { ISD::CTLZ, MVT::v16i8, 4 },
2627  };
2628  static const CostTblEntry AVX512BWCostTbl[] = {
2629  { ISD::ABS, MVT::v32i16, 1 },
2630  { ISD::ABS, MVT::v64i8, 1 },
2631  { ISD::BITREVERSE, MVT::v8i64, 3 },
2632  { ISD::BITREVERSE, MVT::v16i32, 3 },
2633  { ISD::BITREVERSE, MVT::v32i16, 3 },
2634  { ISD::BITREVERSE, MVT::v64i8, 2 },
2635  { ISD::BSWAP, MVT::v8i64, 1 },
2636  { ISD::BSWAP, MVT::v16i32, 1 },
2637  { ISD::BSWAP, MVT::v32i16, 1 },
2638  { ISD::CTLZ, MVT::v8i64, 23 },
2639  { ISD::CTLZ, MVT::v16i32, 22 },
2640  { ISD::CTLZ, MVT::v32i16, 18 },
2641  { ISD::CTLZ, MVT::v64i8, 17 },
2642  { ISD::CTPOP, MVT::v8i64, 7 },
2643  { ISD::CTPOP, MVT::v16i32, 11 },
2644  { ISD::CTPOP, MVT::v32i16, 9 },
2645  { ISD::CTPOP, MVT::v64i8, 6 },
2646  { ISD::CTTZ, MVT::v8i64, 10 },
2647  { ISD::CTTZ, MVT::v16i32, 14 },
2648  { ISD::CTTZ, MVT::v32i16, 12 },
2649  { ISD::CTTZ, MVT::v64i8, 9 },
2650  { ISD::SADDSAT, MVT::v32i16, 1 },
2651  { ISD::SADDSAT, MVT::v64i8, 1 },
2652  { ISD::SMAX, MVT::v32i16, 1 },
2653  { ISD::SMAX, MVT::v64i8, 1 },
2654  { ISD::SMIN, MVT::v32i16, 1 },
2655  { ISD::SMIN, MVT::v64i8, 1 },
2656  { ISD::SSUBSAT, MVT::v32i16, 1 },
2657  { ISD::SSUBSAT, MVT::v64i8, 1 },
2658  { ISD::UADDSAT, MVT::v32i16, 1 },
2659  { ISD::UADDSAT, MVT::v64i8, 1 },
2660  { ISD::UMAX, MVT::v32i16, 1 },
2661  { ISD::UMAX, MVT::v64i8, 1 },
2662  { ISD::UMIN, MVT::v32i16, 1 },
2663  { ISD::UMIN, MVT::v64i8, 1 },
2664  { ISD::USUBSAT, MVT::v32i16, 1 },
2665  { ISD::USUBSAT, MVT::v64i8, 1 },
2666  };
2667  static const CostTblEntry AVX512CostTbl[] = {
2668  { ISD::ABS, MVT::v8i64, 1 },
2669  { ISD::ABS, MVT::v16i32, 1 },
2670  { ISD::ABS, MVT::v32i16, 2 }, // FIXME: include split
2671  { ISD::ABS, MVT::v64i8, 2 }, // FIXME: include split
2672  { ISD::ABS, MVT::v4i64, 1 },
2673  { ISD::ABS, MVT::v2i64, 1 },
2674  { ISD::BITREVERSE, MVT::v8i64, 36 },
2675  { ISD::BITREVERSE, MVT::v16i32, 24 },
2676  { ISD::BITREVERSE, MVT::v32i16, 10 },
2677  { ISD::BITREVERSE, MVT::v64i8, 10 },
2678  { ISD::BSWAP, MVT::v8i64, 4 },
2679  { ISD::BSWAP, MVT::v16i32, 4 },
2680  { ISD::BSWAP, MVT::v32i16, 4 },
2681  { ISD::CTLZ, MVT::v8i64, 29 },
2682  { ISD::CTLZ, MVT::v16i32, 35 },
2683  { ISD::CTLZ, MVT::v32i16, 28 },
2684  { ISD::CTLZ, MVT::v64i8, 18 },
2685  { ISD::CTPOP, MVT::v8i64, 16 },
2686  { ISD::CTPOP, MVT::v16i32, 24 },
2687  { ISD::CTPOP, MVT::v32i16, 18 },
2688  { ISD::CTPOP, MVT::v64i8, 12 },
2689  { ISD::CTTZ, MVT::v8i64, 20 },
2690  { ISD::CTTZ, MVT::v16i32, 28 },
2691  { ISD::CTTZ, MVT::v32i16, 24 },
2692  { ISD::CTTZ, MVT::v64i8, 18 },
2693  { ISD::SMAX, MVT::v8i64, 1 },
2694  { ISD::SMAX, MVT::v16i32, 1 },
2695  { ISD::SMAX, MVT::v32i16, 2 }, // FIXME: include split
2696  { ISD::SMAX, MVT::v64i8, 2 }, // FIXME: include split
2697  { ISD::SMAX, MVT::v4i64, 1 },
2698  { ISD::SMAX, MVT::v2i64, 1 },
2699  { ISD::SMIN, MVT::v8i64, 1 },
2700  { ISD::SMIN, MVT::v16i32, 1 },
2701  { ISD::SMIN, MVT::v32i16, 2 }, // FIXME: include split
2702  { ISD::SMIN, MVT::v64i8, 2 }, // FIXME: include split
2703  { ISD::SMIN, MVT::v4i64, 1 },
2704  { ISD::SMIN, MVT::v2i64, 1 },
2705  { ISD::UMAX, MVT::v8i64, 1 },
2706  { ISD::UMAX, MVT::v16i32, 1 },
2707  { ISD::UMAX, MVT::v32i16, 2 }, // FIXME: include split
2708  { ISD::UMAX, MVT::v64i8, 2 }, // FIXME: include split
2709  { ISD::UMAX, MVT::v4i64, 1 },
2710  { ISD::UMAX, MVT::v2i64, 1 },
2711  { ISD::UMIN, MVT::v8i64, 1 },
2712  { ISD::UMIN, MVT::v16i32, 1 },
2713  { ISD::UMIN, MVT::v32i16, 2 }, // FIXME: include split
2714  { ISD::UMIN, MVT::v64i8, 2 }, // FIXME: include split
2715  { ISD::UMIN, MVT::v4i64, 1 },
2716  { ISD::UMIN, MVT::v2i64, 1 },
2717  { ISD::USUBSAT, MVT::v16i32, 2 }, // pmaxud + psubd
2718  { ISD::USUBSAT, MVT::v2i64, 2 }, // pmaxuq + psubq
2719  { ISD::USUBSAT, MVT::v4i64, 2 }, // pmaxuq + psubq
2720  { ISD::USUBSAT, MVT::v8i64, 2 }, // pmaxuq + psubq
2721  { ISD::UADDSAT, MVT::v16i32, 3 }, // not + pminud + paddd
2722  { ISD::UADDSAT, MVT::v2i64, 3 }, // not + pminuq + paddq
2723  { ISD::UADDSAT, MVT::v4i64, 3 }, // not + pminuq + paddq
2724  { ISD::UADDSAT, MVT::v8i64, 3 }, // not + pminuq + paddq
2725  { ISD::SADDSAT, MVT::v32i16, 2 }, // FIXME: include split
2726  { ISD::SADDSAT, MVT::v64i8, 2 }, // FIXME: include split
2727  { ISD::SSUBSAT, MVT::v32i16, 2 }, // FIXME: include split
2728  { ISD::SSUBSAT, MVT::v64i8, 2 }, // FIXME: include split
2729  { ISD::UADDSAT, MVT::v32i16, 2 }, // FIXME: include split
2730  { ISD::UADDSAT, MVT::v64i8, 2 }, // FIXME: include split
2731  { ISD::USUBSAT, MVT::v32i16, 2 }, // FIXME: include split
2732  { ISD::USUBSAT, MVT::v64i8, 2 }, // FIXME: include split
2733  { ISD::FMAXNUM, MVT::f32, 2 },
2734  { ISD::FMAXNUM, MVT::v4f32, 2 },
2735  { ISD::FMAXNUM, MVT::v8f32, 2 },
2736  { ISD::FMAXNUM, MVT::v16f32, 2 },
2737  { ISD::FMAXNUM, MVT::f64, 2 },
2738  { ISD::FMAXNUM, MVT::v2f64, 2 },
2739  { ISD::FMAXNUM, MVT::v4f64, 2 },
2740  { ISD::FMAXNUM, MVT::v8f64, 2 },
2741  };
2742  static const CostTblEntry XOPCostTbl[] = {
2743  { ISD::BITREVERSE, MVT::v4i64, 4 },
2744  { ISD::BITREVERSE, MVT::v8i32, 4 },
2745  { ISD::BITREVERSE, MVT::v16i16, 4 },
2746  { ISD::BITREVERSE, MVT::v32i8, 4 },
2747  { ISD::BITREVERSE, MVT::v2i64, 1 },
2748  { ISD::BITREVERSE, MVT::v4i32, 1 },
2749  { ISD::BITREVERSE, MVT::v8i16, 1 },
2750  { ISD::BITREVERSE, MVT::v16i8, 1 },
2751  { ISD::BITREVERSE, MVT::i64, 3 },
2752  { ISD::BITREVERSE, MVT::i32, 3 },
2753  { ISD::BITREVERSE, MVT::i16, 3 },
2754  { ISD::BITREVERSE, MVT::i8, 3 }
2755  };
2756  static const CostTblEntry AVX2CostTbl[] = {
2757  { ISD::ABS, MVT::v4i64, 2 }, // VBLENDVPD(X,VPSUBQ(0,X),X)
2758  { ISD::ABS, MVT::v8i32, 1 },
2759  { ISD::ABS, MVT::v16i16, 1 },
2760  { ISD::ABS, MVT::v32i8, 1 },
2761  { ISD::BITREVERSE, MVT::v2i64, 3 },
2762  { ISD::BITREVERSE, MVT::v4i64, 3 },
2763  { ISD::BITREVERSE, MVT::v4i32, 3 },
2764  { ISD::BITREVERSE, MVT::v8i32, 3 },
2765  { ISD::BITREVERSE, MVT::v8i16, 3 },
2766  { ISD::BITREVERSE, MVT::v16i16, 3 },
2767  { ISD::BITREVERSE, MVT::v16i8, 3 },
2768  { ISD::BITREVERSE, MVT::v32i8, 3 },
2769  { ISD::BSWAP, MVT::v4i64, 1 },
2770  { ISD::BSWAP, MVT::v8i32, 1 },
2771  { ISD::BSWAP, MVT::v16i16, 1 },
2772  { ISD::CTLZ, MVT::v2i64, 7 },
2773  { ISD::CTLZ, MVT::v4i64, 7 },
2774  { ISD::CTLZ, MVT::v4i32, 5 },
2775  { ISD::CTLZ, MVT::v8i32, 5 },
2776  { ISD::CTLZ, MVT::v8i16, 4 },
2777  { ISD::CTLZ, MVT::v16i16, 4 },
2778  { ISD::CTLZ, MVT::v16i8, 3 },
2779  { ISD::CTLZ, MVT::v32i8, 3 },
2780  { ISD::CTPOP, MVT::v2i64, 3 },
2781  { ISD::CTPOP, MVT::v4i64, 3 },
2782  { ISD::CTPOP, MVT::v4i32, 7 },
2783  { ISD::CTPOP, MVT::v8i32, 7 },
2784  { ISD::CTPOP, MVT::v8i16, 3 },
2785  { ISD::CTPOP, MVT::v16i16, 3 },
2786  { ISD::CTPOP, MVT::v16i8, 2 },
2787  { ISD::CTPOP, MVT::v32i8, 2 },
2788  { ISD::CTTZ, MVT::v2i64, 4 },
2789  { ISD::CTTZ, MVT::v4i64, 4 },
2790  { ISD::CTTZ, MVT::v4i32, 7 },
2791  { ISD::CTTZ, MVT::v8i32, 7 },
2792  { ISD::CTTZ, MVT::v8i16, 4 },
2793  { ISD::CTTZ, MVT::v16i16, 4 },
2794  { ISD::CTTZ, MVT::v16i8, 3 },
2795  { ISD::CTTZ, MVT::v32i8, 3 },
2796  { ISD::SADDSAT, MVT::v16i16, 1 },
2797  { ISD::SADDSAT, MVT::v32i8, 1 },
2798  { ISD::SMAX, MVT::v8i32, 1 },
2799  { ISD::SMAX, MVT::v16i16, 1 },
2800  { ISD::SMAX, MVT::v32i8, 1 },
2801  { ISD::SMIN, MVT::v8i32, 1 },
2802  { ISD::SMIN, MVT::v16i16, 1 },
2803  { ISD::SMIN, MVT::v32i8, 1 },
2804  { ISD::SSUBSAT, MVT::v16i16, 1 },
2805  { ISD::SSUBSAT, MVT::v32i8, 1 },
2806  { ISD::UADDSAT, MVT::v16i16, 1 },
2807  { ISD::UADDSAT, MVT::v32i8, 1 },
2808  { ISD::UADDSAT, MVT::v8i32, 3 }, // not + pminud + paddd
2809  { ISD::UMAX, MVT::v8i32, 1 },
2810  { ISD::UMAX, MVT::v16i16, 1 },
2811  { ISD::UMAX, MVT::v32i8, 1 },
2812  { ISD::UMIN, MVT::v8i32, 1 },
2813  { ISD::UMIN, MVT::v16i16, 1 },
2814  { ISD::UMIN, MVT::v32i8, 1 },
2815  { ISD::USUBSAT, MVT::v16i16, 1 },
2816  { ISD::USUBSAT, MVT::v32i8, 1 },
2817  { ISD::USUBSAT, MVT::v8i32, 2 }, // pmaxud + psubd
2818  { ISD::FMAXNUM, MVT::v8f32, 3 }, // MAXPS + CMPUNORDPS + BLENDVPS
2819  { ISD::FMAXNUM, MVT::v4f64, 3 }, // MAXPD + CMPUNORDPD + BLENDVPD
2820  { ISD::FSQRT, MVT::f32, 7 }, // Haswell from http://www.agner.org/
2821  { ISD::FSQRT, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
2822  { ISD::FSQRT, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
2823  { ISD::FSQRT, MVT::f64, 14 }, // Haswell from http://www.agner.org/
2824  { ISD::FSQRT, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
2825  { ISD::FSQRT, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
2826  };
2827  static const CostTblEntry AVX1CostTbl[] = {
2828  { ISD::ABS, MVT::v4i64, 5 }, // VBLENDVPD(X,VPSUBQ(0,X),X)
2829  { ISD::ABS, MVT::v8i32, 3 },
2830  { ISD::ABS, MVT::v16i16, 3 },
2831  { ISD::ABS, MVT::v32i8, 3 },
2832  { ISD::BITREVERSE, MVT::v4i64, 12 }, // 2 x 128-bit Op + extract/insert
2833  { ISD::BITREVERSE, MVT::v8i32, 12 }, // 2 x 128-bit Op + extract/insert
2834  { ISD::BITREVERSE, MVT::v16i16, 12 }, // 2 x 128-bit Op + extract/insert
2835  { ISD::BITREVERSE, MVT::v32i8, 12 }, // 2 x 128-bit Op + extract/insert
2836  { ISD::BSWAP, MVT::v4i64, 4 },
2837  { ISD::BSWAP, MVT::v8i32, 4 },
2838  { ISD::BSWAP, MVT::v16i16, 4 },
2839  { ISD::CTLZ, MVT::v4i64, 48 }, // 2 x 128-bit Op + extract/insert
2840  { ISD::CTLZ, MVT::v8i32, 38 }, // 2 x 128-bit Op + extract/insert
2841  { ISD::CTLZ, MVT::v16i16, 30 }, // 2 x 128-bit Op + extract/insert
2842  { ISD::CTLZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
2843  { ISD::CTPOP, MVT::v4i64, 16 }, // 2 x 128-bit Op + extract/insert
2844  { ISD::CTPOP, MVT::v8i32, 24 }, // 2 x 128-bit Op + extract/insert
2845  { ISD::CTPOP, MVT::v16i16, 20 }, // 2 x 128-bit Op + extract/insert
2846  { ISD::CTPOP, MVT::v32i8, 14 }, // 2 x 128-bit Op + extract/insert
2847  { ISD::CTTZ, MVT::v4i64, 22 }, // 2 x 128-bit Op + extract/insert
2848  { ISD::CTTZ, MVT::v8i32, 30 }, // 2 x 128-bit Op + extract/insert
2849  { ISD::CTTZ, MVT::v16i16, 26 }, // 2 x 128-bit Op + extract/insert
2850  { ISD::CTTZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
2851  { ISD::SADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2852  { ISD::SADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2853  { ISD::SMAX, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
2854  { ISD::SMAX, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2855  { ISD::SMAX, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2856  { ISD::SMIN, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
2857  { ISD::SMIN, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2858  { ISD::SMIN, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2859  { ISD::SSUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2860  { ISD::SSUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2861  { ISD::UADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2862  { ISD::UADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2863  { ISD::UADDSAT, MVT::v8i32, 8 }, // 2 x 128-bit Op + extract/insert
2864  { ISD::UMAX, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
2865  { ISD::UMAX, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2866  { ISD::UMAX, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2867  { ISD::UMIN, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
2868  { ISD::UMIN, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2869  { ISD::UMIN, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2870  { ISD::USUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2871  { ISD::USUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2872  { ISD::USUBSAT, MVT::v8i32, 6 }, // 2 x 128-bit Op + extract/insert
2873  { ISD::FMAXNUM, MVT::f32, 3 }, // MAXSS + CMPUNORDSS + BLENDVPS
2874  { ISD::FMAXNUM, MVT::v4f32, 3 }, // MAXPS + CMPUNORDPS + BLENDVPS
2875  { ISD::FMAXNUM, MVT::v8f32, 5 }, // MAXPS + CMPUNORDPS + BLENDVPS + ?
2876  { ISD::FMAXNUM, MVT::f64, 3 }, // MAXSD + CMPUNORDSD + BLENDVPD
2877  { ISD::FMAXNUM, MVT::v2f64, 3 }, // MAXPD + CMPUNORDPD + BLENDVPD
2878  { ISD::FMAXNUM, MVT::v4f64, 5 }, // MAXPD + CMPUNORDPD + BLENDVPD + ?
2879  { ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/
2880  { ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
2881  { ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
2882  { ISD::FSQRT, MVT::f64, 21 }, // SNB from http://www.agner.org/
2883  { ISD::FSQRT, MVT::v2f64, 21 }, // SNB from http://www.agner.org/
2884  { ISD::FSQRT, MVT::v4f64, 43 }, // SNB from http://www.agner.org/
2885  };
2886  static const CostTblEntry GLMCostTbl[] = {
2887  { ISD::FSQRT, MVT::f32, 19 }, // sqrtss
2888  { ISD::FSQRT, MVT::v4f32, 37 }, // sqrtps
2889  { ISD::FSQRT, MVT::f64, 34 }, // sqrtsd
2890  { ISD::FSQRT, MVT::v2f64, 67 }, // sqrtpd
2891  };
2892  static const CostTblEntry SLMCostTbl[] = {
2893  { ISD::FSQRT, MVT::f32, 20 }, // sqrtss
2894  { ISD::FSQRT, MVT::v4f32, 40 }, // sqrtps
2895  { ISD::FSQRT, MVT::f64, 35 }, // sqrtsd
2896  { ISD::FSQRT, MVT::v2f64, 70 }, // sqrtpd
2897  };
2898  static const CostTblEntry SSE42CostTbl[] = {
2899  { ISD::USUBSAT, MVT::v4i32, 2 }, // pmaxud + psubd
2900  { ISD::UADDSAT, MVT::v4i32, 3 }, // not + pminud + paddd
2901  { ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/
2902  { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
2903  };
2904  static const CostTblEntry SSE41CostTbl[] = {
2905  { ISD::ABS, MVT::v2i64, 2 }, // BLENDVPD(X,PSUBQ(0,X),X)
2906  { ISD::SMAX, MVT::v4i32, 1 },
2907  { ISD::SMAX, MVT::v16i8, 1 },
2908  { ISD::SMIN, MVT::v4i32, 1 },
2909  { ISD::SMIN, MVT::v16i8, 1 },
2910  { ISD::UMAX, MVT::v4i32, 1 },
2911  { ISD::UMAX, MVT::v8i16, 1 },
2912  { ISD::UMIN, MVT::v4i32, 1 },
2913  { ISD::UMIN, MVT::v8i16, 1 },
2914  };
2915  static const CostTblEntry SSSE3CostTbl[] = {
2916  { ISD::ABS, MVT::v4i32, 1 },
2917  { ISD::ABS, MVT::v8i16, 1 },
2918  { ISD::ABS, MVT::v16i8, 1 },
2919  { ISD::BITREVERSE, MVT::v2i64, 5 },
2920  { ISD::BITREVERSE, MVT::v4i32, 5 },
2921  { ISD::BITREVERSE, MVT::v8i16, 5 },
2922  { ISD::BITREVERSE, MVT::v16i8, 5 },
2923  { ISD::BSWAP, MVT::v2i64, 1 },
2924  { ISD::BSWAP, MVT::v4i32, 1 },
2925  { ISD::BSWAP, MVT::v8i16, 1 },
2926  { ISD::CTLZ, MVT::v2i64, 23 },
2927  { ISD::CTLZ, MVT::v4i32, 18 },
2928  { ISD::CTLZ, MVT::v8i16, 14 },
2929  { ISD::CTLZ, MVT::v16i8, 9 },
2930  { ISD::CTPOP, MVT::v2i64, 7 },
2931  { ISD::CTPOP, MVT::v4i32, 11 },
2932  { ISD::CTPOP, MVT::v8i16, 9 },
2933  { ISD::CTPOP, MVT::v16i8, 6 },
2934  { ISD::CTTZ, MVT::v2i64, 10 },
2935  { ISD::CTTZ, MVT::v4i32, 14 },
2936  { ISD::CTTZ, MVT::v8i16, 12 },
2937  { ISD::CTTZ, MVT::v16i8, 9 }
2938  };
2939  static const CostTblEntry SSE2CostTbl[] = {
2940  { ISD::ABS, MVT::v2i64, 4 },
2941  { ISD::ABS, MVT::v4i32, 3 },
2942  { ISD::ABS, MVT::v8i16, 2 },
2943  { ISD::ABS, MVT::v16i8, 2 },
2944  { ISD::BITREVERSE, MVT::v2i64, 29 },
2945  { ISD::BITREVERSE, MVT::v4i32, 27 },
2946  { ISD::BITREVERSE, MVT::v8i16, 27 },
2947  { ISD::BITREVERSE, MVT::v16i8, 20 },
2948  { ISD::BSWAP, MVT::v2i64, 7 },
2949  { ISD::BSWAP, MVT::v4i32, 7 },
2950  { ISD::BSWAP, MVT::v8i16, 7 },
2951  { ISD::CTLZ, MVT::v2i64, 25 },
2952  { ISD::CTLZ, MVT::v4i32, 26 },
2953  { ISD::CTLZ, MVT::v8i16, 20 },
2954  { ISD::CTLZ, MVT::v16i8, 17 },
2955  { ISD::CTPOP, MVT::v2i64, 12 },
2956  { ISD::CTPOP, MVT::v4i32, 15 },
2957  { ISD::CTPOP, MVT::v8i16, 13 },
2958  { ISD::CTPOP, MVT::v16i8, 10 },
2959  { ISD::CTTZ, MVT::v2i64, 14 },
2960  { ISD::CTTZ, MVT::v4i32, 18 },
2961  { ISD::CTTZ, MVT::v8i16, 16 },
2962  { ISD::CTTZ, MVT::v16i8, 13 },
2963  { ISD::SADDSAT, MVT::v8i16, 1 },
2964  { ISD::SADDSAT, MVT::v16i8, 1 },
2965  { ISD::SMAX, MVT::v8i16, 1 },
2966  { ISD::SMIN, MVT::v8i16, 1 },
2967  { ISD::SSUBSAT, MVT::v8i16, 1 },
2968  { ISD::SSUBSAT, MVT::v16i8, 1 },
2969  { ISD::UADDSAT, MVT::v8i16, 1 },
2970  { ISD::UADDSAT, MVT::v16i8, 1 },
2971  { ISD::UMAX, MVT::v8i16, 2 },
2972  { ISD::UMAX, MVT::v16i8, 1 },
2973  { ISD::UMIN, MVT::v8i16, 2 },
2974  { ISD::UMIN, MVT::v16i8, 1 },
2975  { ISD::USUBSAT, MVT::v8i16, 1 },
2976  { ISD::USUBSAT, MVT::v16i8, 1 },
2977  { ISD::FMAXNUM, MVT::f64, 4 },
2978  { ISD::FMAXNUM, MVT::v2f64, 4 },
2979  { ISD::FSQRT, MVT::f64, 32 }, // Nehalem from http://www.agner.org/
2980  { ISD::FSQRT, MVT::v2f64, 32 }, // Nehalem from http://www.agner.org/
2981  };
2982  static const CostTblEntry SSE1CostTbl[] = {
2983  { ISD::FMAXNUM, MVT::f32, 4 },
2984  { ISD::FMAXNUM, MVT::v4f32, 4 },
2985  { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/
2986  { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
2987  };
2988  static const CostTblEntry BMI64CostTbl[] = { // 64-bit targets
2989  { ISD::CTTZ, MVT::i64, 1 },
2990  };
2991  static const CostTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
2992  { ISD::CTTZ, MVT::i32, 1 },
2993  { ISD::CTTZ, MVT::i16, 1 },
2994  { ISD::CTTZ, MVT::i8, 1 },
2995  };
2996  static const CostTblEntry LZCNT64CostTbl[] = { // 64-bit targets
2997  { ISD::CTLZ, MVT::i64, 1 },
2998  };
2999  static const CostTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
3000  { ISD::CTLZ, MVT::i32, 1 },
3001  { ISD::CTLZ, MVT::i16, 1 },
3002  { ISD::CTLZ, MVT::i8, 1 },
3003  };
3004  static const CostTblEntry POPCNT64CostTbl[] = { // 64-bit targets
3005  { ISD::CTPOP, MVT::i64, 1 },
3006  };
3007  static const CostTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
3008  { ISD::CTPOP, MVT::i32, 1 },
3009  { ISD::CTPOP, MVT::i16, 1 },
3010  { ISD::CTPOP, MVT::i8, 1 },
3011  };
3012  static const CostTblEntry X64CostTbl[] = { // 64-bit targets
3013  { ISD::ABS, MVT::i64, 2 }, // SUB+CMOV
3014  { ISD::BITREVERSE, MVT::i64, 14 },
3015  { ISD::BSWAP, MVT::i64, 1 },
3016  { ISD::CTLZ, MVT::i64, 4 }, // BSR+XOR or BSR+XOR+CMOV
3017  { ISD::CTTZ, MVT::i64, 3 }, // TEST+BSF+CMOV/BRANCH
3018  { ISD::CTPOP, MVT::i64, 10 },
3019  { ISD::SADDO, MVT::i64, 1 },
3020  { ISD::UADDO, MVT::i64, 1 },
3021  { ISD::UMULO, MVT::i64, 2 }, // mulq + seto
3022  };
3023  static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
3024  { ISD::ABS, MVT::i32, 2 }, // SUB+CMOV
3025  { ISD::ABS, MVT::i16, 2 }, // SUB+CMOV
3026  { ISD::BITREVERSE, MVT::i32, 14 },
3027  { ISD::BITREVERSE, MVT::i16, 14 },
3028  { ISD::BITREVERSE, MVT::i8, 11 },
3029  { ISD::BSWAP, MVT::i32, 1 },
3030  { ISD::BSWAP, MVT::i16, 1 }, // ROL
3031  { ISD::CTLZ, MVT::i32, 4 }, // BSR+XOR or BSR+XOR+CMOV
3032  { ISD::CTLZ, MVT::i16, 4 }, // BSR+XOR or BSR+XOR+CMOV
3033  { ISD::CTLZ, MVT::i8, 4 }, // BSR+XOR or BSR+XOR+CMOV
3034  { ISD::CTTZ, MVT::i32, 3 }, // TEST+BSF+CMOV/BRANCH
3035  { ISD::CTTZ, MVT::i16, 3 }, // TEST+BSF+CMOV/BRANCH
3036  { ISD::CTTZ, MVT::i8, 3 }, // TEST+BSF+CMOV/BRANCH
3037  { ISD::CTPOP, MVT::i32, 8 },
3038  { ISD::CTPOP, MVT::i16, 9 },
3039  { ISD::CTPOP, MVT::i8, 7 },
3040  { ISD::SADDO, MVT::i32, 1 },
3041  { ISD::SADDO, MVT::i16, 1 },
3042  { ISD::SADDO, MVT::i8, 1 },
3043  { ISD::UADDO, MVT::i32, 1 },
3044  { ISD::UADDO, MVT::i16, 1 },
3045  { ISD::UADDO, MVT::i8, 1 },
3046  { ISD::UMULO, MVT::i32, 2 }, // mul + seto
3047  { ISD::UMULO, MVT::i16, 2 },
3048  { ISD::UMULO, MVT::i8, 2 },
3049  };
3050 
3051  Type *RetTy = ICA.getReturnType();
3052  Type *OpTy = RetTy;
3053  Intrinsic::ID IID = ICA.getID();
3054  unsigned ISD = ISD::DELETED_NODE;
3055  switch (IID) {
3056  default:
3057  break;
3058  case Intrinsic::abs:
3059  ISD = ISD::ABS;
3060  break;
3061  case Intrinsic::bitreverse:
3062  ISD = ISD::BITREVERSE;
3063  break;
3064  case Intrinsic::bswap:
3065  ISD = ISD::BSWAP;
3066  break;
3067  case Intrinsic::ctlz:
3068  ISD = ISD::CTLZ;
3069  break;
3070  case Intrinsic::ctpop:
3071  ISD = ISD::CTPOP;
3072  break;
3073  case Intrinsic::cttz:
3074  ISD = ISD::CTTZ;
3075  break;
3076  case Intrinsic::maxnum:
3077  case Intrinsic::minnum:
3078  // FMINNUM has the same costs, so don't duplicate.
3079  ISD = ISD::FMAXNUM;
3080  break;
3081  case Intrinsic::sadd_sat:
3082  ISD = ISD::SADDSAT;
3083  break;
3084  case Intrinsic::smax:
3085  ISD = ISD::SMAX;
3086  break;
3087  case Intrinsic::smin:
3088  ISD = ISD::SMIN;
3089  break;
3090  case Intrinsic::ssub_sat:
3091  ISD = ISD::SSUBSAT;
3092  break;
3093  case Intrinsic::uadd_sat:
3094  ISD = ISD::UADDSAT;
3095  break;
3096  case Intrinsic::umax:
3097  ISD = ISD::UMAX;
3098  break;
3099  case Intrinsic::umin:
3100  ISD = ISD::UMIN;
3101  break;
3102  case Intrinsic::usub_sat:
3103  ISD = ISD::USUBSAT;
3104  break;
3105  case Intrinsic::sqrt:
3106  ISD = ISD::FSQRT;
3107  break;
3108  case Intrinsic::sadd_with_overflow:
3109  case Intrinsic::ssub_with_overflow:
3110  // SSUBO has the same costs, so don't duplicate.
3111  ISD = ISD::SADDO;
3112  OpTy = RetTy->getContainedType(0);
3113  break;
3114  case Intrinsic::uadd_with_overflow:
3115  case Intrinsic::usub_with_overflow:
3116  // USUBO has the same costs, so don't duplicate.
3117  ISD = ISD::UADDO;
3118  OpTy = RetTy->getContainedType(0);
3119  break;
3120  case Intrinsic::umul_with_overflow:
3121  case Intrinsic::smul_with_overflow:
3122  // SMULO has the same costs, so don't duplicate.
3123  ISD = ISD::UMULO;
3124  OpTy = RetTy->getContainedType(0);
3125  break;
3126  }
3127 
3128  if (ISD != ISD::DELETED_NODE) {
3129  // Legalize the type.
3130  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, OpTy);
3131  MVT MTy = LT.second;
3132 
3133  // Attempt to lookup cost.
3134  if (ISD == ISD::BITREVERSE && ST->hasGFNI() && ST->hasSSSE3() &&
3135  MTy.isVector()) {
3136  // With PSHUFB the code is very similar for all types. If we have integer
3137  // byte operations, we just need a GF2P8AFFINEQB for vXi8. For other types
3138  // we also need a PSHUFB.
3139  unsigned Cost = MTy.getVectorElementType() == MVT::i8 ? 1 : 2;
3140 
3141  // Without byte operations, we need twice as many GF2P8AFFINEQB and PSHUFB
3142  // instructions. We also need an extract and an insert.
3143  if (!(MTy.is128BitVector() || (ST->hasAVX2() && MTy.is256BitVector()) ||
3144  (ST->hasBWI() && MTy.is512BitVector())))
3145  Cost = Cost * 2 + 2;
3146 
3147  return LT.first * Cost;
3148  }
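// Worked sketch: with GFNI, v16i8 bitreverse is one GF2P8AFFINEQB (Cost 1);
// v4i32 adds a PSHUFB byte swap (Cost 2); a 512-bit type such as v16i32
// without AVX512BW takes the doubled path plus extract/insert, i.e.
// 2 * 2 + 2 = 6, all scaled by LT.first.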
3149 
3150  auto adjustTableCost = [](const CostTblEntry &Entry,
3151  InstructionCost LegalizationCost,
3152  FastMathFlags FMF) {
3153  // If there are no NaNs to deal with, then these are reduced to a
3154  // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we
3155  // assume is used in the non-fast case.
3156  if (Entry.ISD == ISD::FMAXNUM || Entry.ISD == ISD::FMINNUM) {
3157  if (FMF.noNaNs())
3158  return LegalizationCost * 1;
3159  }
3160  return LegalizationCost * (int)Entry.Cost;
3161  };
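// Annotation (sketch): e.g. llvm.maxnum on <4 x float> with the nnan flag
// collapses to LegalizationCost (a bare MAXPS per legal op) instead of the
// 3-instruction MAXPS + CMPUNORDPS + BLENDVPS estimate in the AVX1 table
// above.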
3162 
3163  if (ST->useGLMDivSqrtCosts())
3164  if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
3165  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3166 
3167  if (ST->isSLM())
3168  if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
3169  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3170 
3171  if (ST->hasBITALG())
3172  if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy))
3173  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3174 
3175  if (ST->hasVPOPCNTDQ())
3176  if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy))
3177  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3178 
3179  if (ST->hasCDI())
3180  if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
3181  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3182 
3183  if (ST->hasBWI())
3184  if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
3185  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3186 
3187  if (ST->hasAVX512())
3188  if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
3189  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3190 
3191  if (ST->hasXOP())
3192  if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
3193  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3194 
3195  if (ST->hasAVX2())
3196  if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
3197  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3198 
3199  if (ST->hasAVX())
3200  if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
3201  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3202 
3203  if (ST->hasSSE42())
3204  if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
3205  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3206 
3207  if (ST->hasSSE41())
3208  if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
3209  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3210 
3211  if (ST->hasSSSE3())
3212  if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
3213  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3214 
3215  if (ST->hasSSE2())
3216  if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
3217  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3218 
3219  if (ST->hasSSE1())
3220  if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
3221  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3222 
3223  if (ST->hasBMI()) {
3224  if (ST->is64Bit())
3225  if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
3226  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3227 
3228  if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
3229  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3230  }
3231 
3232  if (ST->hasLZCNT()) {
3233  if (ST->is64Bit())
3234  if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
3235  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3236 
3237  if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
3238  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3239  }
3240 
3241  if (ST->hasPOPCNT()) {
3242  if (ST->is64Bit())
3243  if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
3244  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3245 
3246  if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
3247  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3248  }
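 // Note: the BMI/LZCNT/POPCNT lookups above are split by bitness because
 // the 64-bit instruction forms are only encodable in 64-bit mode; 32-bit
 // targets consult only the 32-bit tables.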
3249 
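 // MOVBE performs the byte swap as part of its load/store, so on targets
 // where MOVBE is fast a bswap that feeds a single store, or consumes a
 // single-use load, folds away entirely. Illustrative IR:
 //   %b = call i32 @llvm.bswap.i32(i32 %x)
 //   store i32 %b, i32* %p   ; lowers to one MOVBE, so the bswap is TCC_Free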
3250  if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) {
3251  if (const Instruction *II = ICA.getInst()) {
3252  if (II->hasOneUse() && isa<StoreInst>(II->user_back()))
3253  return TTI::TCC_Free;
3254  if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) {
3255  if (LI->hasOneUse())
3256  return TTI::TCC_Free;
3257  }
3258  }
3259  }
3260 
3261  // TODO - add BMI (TZCNT) scalar handling
3262 
3263  if (ST->is64Bit())
3264  if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
3265  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3266 
3267  if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
3268  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3269  }
3270 
3271  return BaseT::getTypeBasedIntrinsicInstrCost(ICA, CostKind);
3272 }
3273 
3274 InstructionCost
3275 X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
3276  TTI::TargetCostKind CostKind) {
3277  if (ICA.isTypeBasedOnly())
3278  return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
3279 
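 // AVX-512 provides per-element variable rotates (VPROLVD/VPROLVQ and
 // VPRORVD/VPRORVQ), so 32/64-bit element rotates are a single
 // instruction; the 128/256-bit rows below assume the AVX512VL forms.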
3280  static const CostTblEntry AVX512CostTbl[] = {
3281  { ISD::ROTL, MVT::v8i64, 1 },
3282  { ISD::ROTL, MVT::v4i64, 1 },
3283  { ISD::ROTL, MVT::v2i64, 1 },
3284  { ISD::ROTL, MVT::v16i32, 1 },
3285  { ISD::ROTL, MVT::v8i32, 1 },
3286  { ISD::ROTL, MVT::v4i32, 1 },
3287  { ISD::ROTR, MVT::v8i64, 1 },
3288  { ISD::ROTR, MVT::v4i64, 1 },
3289  { ISD::ROTR, MVT::v2i64, 1 },
3290  { ISD::ROTR, MVT::v16i32, 1 },
3291  { ISD::ROTR, MVT::v8i32, 1 },
3292  { ISD::ROTR, MVT::v4i32, 1 }
3293  };
3294  // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
3295  static const CostTblEntry XOPCostTbl[] = {
3296  { ISD::ROTL, MVT::v4i64, 4 },
3297  { ISD::ROTL, MVT::v8i32, 4 },
3298  { ISD::ROTL, MVT::v16i16, 4 },
3299  { ISD::ROTL, MVT::v32i8, 4 },
3300  { ISD::ROTL, MVT::v2i64, 1 },
3301  { ISD::ROTL, MVT::v4i32, 1 },
3302  { ISD::ROTL, MVT::v8i16, 1 },
3303  { ISD::ROTL, MVT::v16i8, 1 },
3304  { ISD::ROTR, MVT::v4i64, 6 },
3305  { ISD::ROTR, MVT::v8i32, 6 },
3306  { ISD::ROTR, MVT::v16i16, 6 },
3307  { ISD::ROTR, MVT::v32i8, 6 },
3308  { ISD::ROTR, MVT::v2i64, 2 },
3309  { ISD::ROTR, MVT::v4i32, 2 },
3310  { ISD::ROTR, MVT::v8i16, 2 },
3311  { ISD::ROTR, MVT::v16i8, 2 }
3312  };
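 // The XOP costs above reflect that VPROT only rotates left: 128-bit ROTR
 // pays an extra PSUB to negate the rotate amount, and the 256-bit rows
 // double up because VPROT has no 256-bit form, forcing two 128-bit halves
 // plus insert/extract.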
3313  static const CostTblEntry X64CostTbl[] = { // 64-bit targets
3314  { ISD::ROTL, MVT::i64, 1 },
3315  { ISD::ROTR, MVT::i64, 1 },
3316  { ISD::FSHL,