//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//
/// A note about the cost model numbers used below: they correspond to a
/// "generic" X86 CPU rather than a concrete CPU model. Usually the numbers
/// correspond to the CPU where the feature first appeared. For example, if we
/// check Subtarget.hasSSE42() in the lookups below, the cost is based on
/// Nehalem, as that was the first CPU to support that feature level and thus
/// most likely has the worst-case cost.
/// Some examples of other technologies/CPUs:
///   SSE 3   - Pentium4 / Athlon64
///   SSE 4.1 - Penryn
///   SSE 4.2 - Nehalem
///   AVX     - Sandy Bridge
///   AVX2    - Haswell
///   AVX-512 - Xeon Phi / Skylake
/// And some examples of instruction target-dependent costs (latency):
///                   divss   sqrtss   rsqrtss
///   AMD K7          11-16   19       3
///   Piledriver      9-24    13-15    5
///   Jaguar          14      16       2
///   Pentium II,III  18      30       2
///   Nehalem         7-14    7-18     3
///   Haswell         10-13   11       5
/// TODO: Develop and implement the target dependent cost model and
/// specialize cost numbers for different Cost Model Targets such as throughput,
/// code size, latency and uop count.
//===----------------------------------------------------------------------===//

#include "X86TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"

using namespace llvm;

#define DEBUG_TYPE "x86tti"

//===----------------------------------------------------------------------===//
//
// X86 cost model.
//
//===----------------------------------------------------------------------===//

TargetTransformInfo::PopcntSupportKind
X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  // TODO: Currently the __builtin_popcount() implementation using SSE3
  //   instructions is inefficient. Once the problem is fixed, we should
  //   call ST->hasSSE3() instead of ST->hasPOPCNT().
  return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
}

llvm::Optional<unsigned> X86TTIImpl::getCacheSize(
    TargetTransformInfo::CacheLevel Level) const {
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 32 * 1024; //  32 KByte
  case TargetTransformInfo::CacheLevel::L2D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 256 * 1024; // 256 KByte
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

llvm::Optional<unsigned> X86TTIImpl::getCacheAssociativity(
    TargetTransformInfo::CacheLevel Level) const {
  //   - Penryn
  //   - Nehalem
  //   - Westmere
  //   - Sandy Bridge
  //   - Ivy Bridge
  //   - Haswell
  //   - Broadwell
  //   - Skylake
  //   - Kabylake
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    LLVM_FALLTHROUGH;
  case TargetTransformInfo::CacheLevel::L2D:
    return 8;
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
  bool Vector = (ClassID == 1);
  if (Vector && !ST->hasSSE1())
    return 0;

  if (ST->is64Bit()) {
    if (Vector && ST->hasAVX512())
      return 32;
    return 16;
  }
  return 8;
}

TypeSize
X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  unsigned PreferVectorWidth = ST->getPreferVectorWidth();
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
  case TargetTransformInfo::RGK_FixedWidthVector:
    if (ST->hasAVX512() && PreferVectorWidth >= 512)
      return TypeSize::getFixed(512);
    if (ST->hasAVX() && PreferVectorWidth >= 256)
      return TypeSize::getFixed(256);
    if (ST->hasSSE1() && PreferVectorWidth >= 128)
      return TypeSize::getFixed(128);
    return TypeSize::getFixed(0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }

  llvm_unreachable("Unsupported register kind");
}
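
// Illustrative note (not from the original source): an AVX-512 subtarget
// built with -mprefer-vector-width=256 reports a 256-bit fixed-width vector
// register above, steering vectorization to 256-bit even though 512-bit
// registers exist.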

unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
  return getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
      .getFixedSize();
}

unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // If the loop will not be vectorized, don't interleave the loop.
  // Let the regular unroller unroll the loop, which saves the overflow
  // check and memory check cost.
  if (VF == 1)
    return 1;

  if (ST->isAtom())
    return 1;

  // Sandybridge and Haswell have multiple execution ports and pipelined
  // vector units.
  if (ST->hasAVX())
    return 4;

  return 2;
}
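
// Illustrative note: with AVX enabled, a vectorized loop may thus be
// interleaved up to 4x, e.g. four independent 8-wide accumulator chains, to
// keep the multiple vector ports busy and hide instruction latency.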

InstructionCost X86TTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
    TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
    const Instruction *CxtI) {
  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info, Opd1PropInfo,
                                         Opd2PropInfo, Args, CxtI);

  // vXi8 multiplications are always promoted to vXi16.
  if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
      Ty->getScalarSizeInBits() == 8) {
    Type *WideVecTy =
        VectorType::getExtendedElementVectorType(cast<VectorType>(Ty));
    return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
                            TargetTransformInfo::CastContextHint::None,
                            CostKind) +
           getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
                            TargetTransformInfo::CastContextHint::None,
                            CostKind) +
           getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info,
                                  Opd1PropInfo, Opd2PropInfo);
  }
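
  // Illustrative note: a v16i8 mul is therefore costed above as
  // zext(v16i8 -> v16i16) + v16i16 mul + trunc(v16i16 -> v16i8), mirroring
  // the promotion the backend actually performs.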

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");
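
  // Illustrative example: on an SSE2-only target a v8i32 operation legalizes
  // to 2 x v4i32, so LT = {2, MVT::v4i32}; the table lookups below return a
  // per-legalized-op cost that is scaled by LT.first.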

  static const CostTblEntry GLMCostTable[] = {
    { ISD::FDIV, MVT::f32,   18 }, // divss
    { ISD::FDIV, MVT::v4f32, 35 }, // divps
    { ISD::FDIV, MVT::f64,   33 }, // divsd
    { ISD::FDIV, MVT::v2f64, 65 }, // divpd
  };

  if (ST->useGLMDivSqrtCosts())
    if (const auto *Entry = CostTableLookup(GLMCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SLMCostTable[] = {
    { ISD::MUL,  MVT::v4i32, 11 }, // pmulld
    { ISD::MUL,  MVT::v8i16, 2  }, // pmullw
    { ISD::FMUL, MVT::f64,   2  }, // mulsd
    { ISD::FMUL, MVT::v2f64, 4  }, // mulpd
    { ISD::FMUL, MVT::v4f32, 2  }, // mulps
    { ISD::FDIV, MVT::f32,   17 }, // divss
    { ISD::FDIV, MVT::v4f32, 39 }, // divps
    { ISD::FDIV, MVT::f64,   32 }, // divsd
    { ISD::FDIV, MVT::v2f64, 69 }, // divpd
    { ISD::FADD, MVT::v2f64, 2  }, // addpd
    { ISD::FSUB, MVT::v2f64, 2  }, // subpd
    // v2i64/v4i64 mul is custom lowered as a series of long:
    // multiplies(3), shifts(3) and adds(2)
    // slm muldq version throughput is 2 and addq throughput 4
    // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
    // 2X4 (addq throughput) = 17
    { ISD::MUL,  MVT::v2i64, 17 },
    // slm addq\subq throughput is 4
    { ISD::ADD,  MVT::v2i64, 4  },
    { ISD::SUB,  MVT::v2i64, 4  },
  };

  if (ST->isSLM()) {
    if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) {
      // Check if the operands can be shrunk into a smaller datatype.
      bool Op1Signed = false;
      unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
      bool Op2Signed = false;
      unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);

      bool SignedMode = Op1Signed || Op2Signed;
      unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
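
      // Illustrative example: a v4i32 multiply whose operands provably fit in
      // 16 unsigned bits takes the OpMinSize <= 16 path below and is costed
      // as a pmullw/pmulhw/pshuf sequence instead of SLM's slow pmulld.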

      if (OpMinSize <= 7)
        return LT.first * 3; // pmullw/sext
      if (!SignedMode && OpMinSize <= 8)
        return LT.first * 3; // pmullw/zext
      if (OpMinSize <= 15)
        return LT.first * 5; // pmullw/pmulhw/pshuf
      if (!SignedMode && OpMinSize <= 16)
        return LT.first * 5; // pmullw/pmulhw/pshuf
    }

    if (const auto *Entry = CostTableLookup(SLMCostTable, ISD,
                                            LT.second)) {
      return LT.first * Entry->Cost;
    }
  }

  if ((ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
       ISD == ISD::UREM) &&
      (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
    if (ISD == ISD::SDIV || ISD == ISD::SREM) {
      // On X86, vector signed division by a constant power-of-two is
      // normally expanded to the sequence SRA + SRL + ADD + SRA.
      // The OperandValue properties may not be the same as that of the
      // previous operation; conservatively assume OP_None.
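      // For example (sketch), X sdiv 4 on v4i32 lowers to roughly:
      //   psrad $31, T   ; T = X >> 31 (sign mask)
      //   psrld $30, T   ; keep the low log2(4) bits of the mask
      //   paddd T, X     ; bias negative dividends
      //   psrad $2, X    ; final arithmetic shift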
      InstructionCost Cost =
          2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, Op1Info,
                                     Op2Info, TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);
      Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info,
                                     Op2Info,
                                     TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);
      Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind, Op1Info,
                                     Op2Info,
                                     TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);

      if (ISD == ISD::SREM) {
        // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
        Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info,
                                       Op2Info);
        Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info,
                                       Op2Info);
      }

      return Cost;
    }

    // Vector unsigned division/remainder will be simplified to shifts/masks.
    if (ISD == ISD::UDIV)
      return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
                                    Op1Info, Op2Info,
                                    TargetTransformInfo::OP_None,
                                    TargetTransformInfo::OP_None);

    else // UREM
      return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
                                    Op1Info, Op2Info,
                                    TargetTransformInfo::OP_None,
                                    TargetTransformInfo::OP_None);
  }

  static const CostTblEntry AVX512BWUniformConstCostTable[] = {
    { ISD::SHL,  MVT::v64i8,  2 }, // psllw + pand.
    { ISD::SRL,  MVT::v64i8,  2 }, // psrlw + pand.
    { ISD::SRA,  MVT::v64i8,  4 }, // psrlw, pand, pxor, psubb.
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasBWI()) {
    if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512UniformConstCostTable[] = {
    { ISD::SRA,  MVT::v2i64,  1 },
    { ISD::SRA,  MVT::v4i64,  1 },
    { ISD::SRA,  MVT::v8i64,  1 },

    { ISD::SHL,  MVT::v64i8,  4 }, // psllw + pand.
    { ISD::SRL,  MVT::v64i8,  4 }, // psrlw + pand.
    { ISD::SRA,  MVT::v64i8,  8 }, // psrlw, pand, pxor, psubb.

    { ISD::SDIV, MVT::v16i32, 6 }, // pmuludq sequence
    { ISD::SREM, MVT::v16i32, 8 }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, 5 }, // pmuludq sequence
    { ISD::UREM, MVT::v16i32, 7 }, // pmuludq+mul+sub sequence
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasAVX512()) {
    if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX2UniformConstCostTable[] = {
    { ISD::SHL,  MVT::v32i8, 2 }, // psllw + pand.
    { ISD::SRL,  MVT::v32i8, 2 }, // psrlw + pand.
    { ISD::SRA,  MVT::v32i8, 4 }, // psrlw, pand, pxor, psubb.

    { ISD::SRA,  MVT::v4i64, 4 }, // 2 x psrad + shuffle.

    { ISD::SDIV, MVT::v8i32, 6 }, // pmuludq sequence
    { ISD::SREM, MVT::v8i32, 8 }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32, 5 }, // pmuludq sequence
    { ISD::UREM, MVT::v8i32, 7 }, // pmuludq+mul+sub sequence
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasAVX2()) {
    if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2UniformConstCostTable[] = {
    { ISD::SHL,  MVT::v16i8,   2 }, // psllw + pand.
    { ISD::SRL,  MVT::v16i8,   2 }, // psrlw + pand.
    { ISD::SRA,  MVT::v16i8,   4 }, // psrlw, pand, pxor, psubb.

    { ISD::SHL,  MVT::v32i8,  4+2 }, // 2*(psllw + pand) + split.
    { ISD::SRL,  MVT::v32i8,  4+2 }, // 2*(psrlw + pand) + split.
    { ISD::SRA,  MVT::v32i8,  8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.

    { ISD::SDIV, MVT::v8i32, 12+2 }, // 2*pmuludq sequence + split.
    { ISD::SREM, MVT::v8i32, 16+2 }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::SDIV, MVT::v4i32,    6 }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32,    8 }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32, 10+2 }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32, 14+2 }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::UDIV, MVT::v4i32,    5 }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32,    7 }, // pmuludq+mul+sub sequence
  };

  // XOP has faster vXi8 shifts.
  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasSSE2() && !ST->hasXOP()) {
    if (const auto *Entry =
            CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512BWConstCostTable[] = {
    { ISD::SDIV, MVT::v64i8,  14 }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v64i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8,  14 }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v64i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::SDIV, MVT::v32i16,  6 }, // vpmulhw sequence
    { ISD::SREM, MVT::v32i16,  8 }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16,  6 }, // vpmulhuw sequence
    { ISD::UREM, MVT::v32i16,  8 }, // vpmulhuw+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasBWI()) {
    if (const auto *Entry =
            CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512ConstCostTable[] = {
    { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
    { ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
    { ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence
    { ISD::SDIV, MVT::v64i8,  28 }, // 4*ext+4*pmulhw sequence
    { ISD::SREM, MVT::v64i8,  32 }, // 4*ext+4*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8,  28 }, // 4*ext+4*pmulhw sequence
    { ISD::UREM, MVT::v64i8,  32 }, // 4*ext+4*pmulhw+mul+sub sequence
    { ISD::SDIV, MVT::v32i16, 12 }, // 2*vpmulhw sequence
    { ISD::SREM, MVT::v32i16, 16 }, // 2*vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16, 12 }, // 2*vpmulhuw sequence
    { ISD::UREM, MVT::v32i16, 16 }, // 2*vpmulhuw+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasAVX512()) {
    if (const auto *Entry =
            CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX2ConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8,  14 }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v32i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i8,  14 }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v32i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::SDIV, MVT::v16i16,  6 }, // vpmulhw sequence
    { ISD::SREM, MVT::v16i16,  8 }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i16,  6 }, // vpmulhuw sequence
    { ISD::UREM, MVT::v16i16,  8 }, // vpmulhuw+mul+sub sequence
    { ISD::SDIV, MVT::v8i32,  15 }, // vpmuldq sequence
    { ISD::SREM, MVT::v8i32,  19 }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,  15 }, // vpmuludq sequence
    { ISD::UREM, MVT::v8i32,  19 }, // vpmuludq+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasAVX2()) {
    if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2ConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8,  28+2 }, // 4*ext+4*pmulhw sequence + split.
    { ISD::SREM, MVT::v32i8,  32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
    { ISD::SDIV, MVT::v16i8,    14 }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v16i8,    16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i8,  28+2 }, // 4*ext+4*pmulhw sequence + split.
    { ISD::UREM, MVT::v32i8,  32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v16i8,    14 }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v16i8,    16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
    { ISD::SREM, MVT::v16i16, 16+2 }, // 2*pmulhw+mul+sub sequence + split.
    { ISD::SDIV, MVT::v8i16,     6 }, // pmulhw sequence
    { ISD::SREM, MVT::v8i16,     8 }, // pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split.
    { ISD::UREM, MVT::v16i16, 16+2 }, // 2*pmulhuw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v8i16,     6 }, // pmulhuw sequence
    { ISD::UREM, MVT::v8i16,     8 }, // pmulhuw+mul+sub sequence
    { ISD::SDIV, MVT::v8i32,  38+2 }, // 2*pmuludq sequence + split.
    { ISD::SREM, MVT::v8i32,  48+2 }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::SDIV, MVT::v4i32,    19 }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32,    24 }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,  30+2 }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32,  40+2 }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::UDIV, MVT::v4i32,    15 }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32,    20 }, // pmuludq+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasSSE2()) {
    // pmuldq sequence.
    if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
      return LT.first * 32;
    if (ISD == ISD::SREM && LT.second == MVT::v8i32 && ST->hasAVX())
      return LT.first * 38;
    if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
      return LT.first * 15;
    if (ISD == ISD::SREM && LT.second == MVT::v4i32 && ST->hasSSE41())
      return LT.first * 20;

    if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512BWShiftCostTable[] = {
    { ISD::SHL,  MVT::v16i8,   4 }, // extend/vpsllvw/pack sequence.
    { ISD::SRL,  MVT::v16i8,   4 }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA,  MVT::v16i8,   4 }, // extend/vpsravw/pack sequence.
    { ISD::SHL,  MVT::v32i8,   4 }, // extend/vpsllvw/pack sequence.
    { ISD::SRL,  MVT::v32i8,   4 }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA,  MVT::v32i8,   6 }, // extend/vpsravw/pack sequence.
    { ISD::SHL,  MVT::v64i8,   6 }, // extend/vpsllvw/pack sequence.
    { ISD::SRL,  MVT::v64i8,   7 }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA,  MVT::v64i8,  15 }, // extend/vpsravw/pack sequence.

    { ISD::SHL,  MVT::v8i16,   1 }, // vpsllvw
    { ISD::SRL,  MVT::v8i16,   1 }, // vpsrlvw
    { ISD::SRA,  MVT::v8i16,   1 }, // vpsravw
    { ISD::SHL,  MVT::v16i16,  1 }, // vpsllvw
    { ISD::SRL,  MVT::v16i16,  1 }, // vpsrlvw
    { ISD::SRA,  MVT::v16i16,  1 }, // vpsravw
    { ISD::SHL,  MVT::v32i16,  1 }, // vpsllvw
    { ISD::SRL,  MVT::v32i16,  1 }, // vpsrlvw
    { ISD::SRA,  MVT::v32i16,  1 }, // vpsravw
  };

  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWShiftCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i16, 1 }, // psllw.
    { ISD::SRL, MVT::v16i16, 1 }, // psrlw.
    { ISD::SRA, MVT::v16i16, 1 }, // psraw.
    { ISD::SHL, MVT::v32i16, 2 }, // 2*psllw.
    { ISD::SRL, MVT::v32i16, 2 }, // 2*psrlw.
    { ISD::SRA, MVT::v32i16, 2 }, // 2*psraw.

    { ISD::SHL, MVT::v8i32,  1 }, // pslld
    { ISD::SRL, MVT::v8i32,  1 }, // psrld
    { ISD::SRA, MVT::v8i32,  1 }, // psrad
    { ISD::SHL, MVT::v4i64,  1 }, // psllq
    { ISD::SRL, MVT::v4i64,  1 }, // psrlq
  };

  if (ST->hasAVX2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {
    if (const auto *Entry =
            CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v8i16, 1 }, // psllw.
    { ISD::SHL, MVT::v4i32, 1 }, // pslld
    { ISD::SHL, MVT::v2i64, 1 }, // psllq.

    { ISD::SRL, MVT::v8i16, 1 }, // psrlw.
    { ISD::SRL, MVT::v4i32, 1 }, // psrld.
    { ISD::SRL, MVT::v2i64, 1 }, // psrlq.

    { ISD::SRA, MVT::v8i16, 1 }, // psraw.
    { ISD::SRA, MVT::v4i32, 1 }, // psrad.
  };

  if (ST->hasSSE2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {
    if (const auto *Entry =
            CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512DQCostTable[] = {
    { ISD::MUL, MVT::v2i64, 2 }, // pmullq
    { ISD::MUL, MVT::v4i64, 2 }, // pmullq
    { ISD::MUL, MVT::v8i64, 2 }  // pmullq
  };

  // Look for AVX512DQ lowering tricks for custom cases.
  if (ST->hasDQI())
    if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512BWCostTable[] = {
    { ISD::SHL, MVT::v64i8, 11 }, // vpblendvb sequence.
    { ISD::SRL, MVT::v64i8, 11 }, // vpblendvb sequence.
    { ISD::SRA, MVT::v64i8, 24 }, // vpblendvb sequence.
  };

  // Look for AVX512BW lowering tricks for custom cases.
  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512CostTable[] = {
    { ISD::SHL,  MVT::v4i32,   1 },
    { ISD::SRL,  MVT::v4i32,   1 },
    { ISD::SRA,  MVT::v4i32,   1 },
    { ISD::SHL,  MVT::v8i32,   1 },
    { ISD::SRL,  MVT::v8i32,   1 },
    { ISD::SRA,  MVT::v8i32,   1 },
    { ISD::SHL,  MVT::v16i32,  1 },
    { ISD::SRL,  MVT::v16i32,  1 },
    { ISD::SRA,  MVT::v16i32,  1 },

    { ISD::SHL,  MVT::v2i64,   1 },
    { ISD::SRL,  MVT::v2i64,   1 },
    { ISD::SHL,  MVT::v4i64,   1 },
    { ISD::SRL,  MVT::v4i64,   1 },
    { ISD::SHL,  MVT::v8i64,   1 },
    { ISD::SRL,  MVT::v8i64,   1 },

    { ISD::SRA,  MVT::v2i64,   1 },
    { ISD::SRA,  MVT::v4i64,   1 },
    { ISD::SRA,  MVT::v8i64,   1 },

    { ISD::MUL,  MVT::v16i32,  1 }, // pmulld (Skylake from agner.org)
    { ISD::MUL,  MVT::v8i32,   1 }, // pmulld (Skylake from agner.org)
    { ISD::MUL,  MVT::v4i32,   1 }, // pmulld (Skylake from agner.org)
    { ISD::MUL,  MVT::v8i64,   6 }, // 3*pmuludq/3*shift/2*add

    { ISD::FNEG, MVT::v8f64,   1 }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v8f64,   1 }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v8f64,   1 }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v8f64,   1 }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::f64,     4 }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,   4 }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64,   8 }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v8f64,  16 }, // Skylake from http://www.agner.org/

    { ISD::FNEG, MVT::v16f32,  1 }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v16f32,  1 }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v16f32,  1 }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v16f32,  1 }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::f32,     3 }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,   3 }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32,   5 }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v16f32, 10 }, // Skylake from http://www.agner.org/
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX2ShiftCostTable[] = {
    // Shifts on vXi64/vXi32 are legal on AVX2 even though we mark them as
    // custom in order to detect the cases where the shift amount is a scalar.
    { ISD::SHL, MVT::v4i32, 2 }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL, MVT::v4i32, 2 }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA, MVT::v4i32, 2 }, // vpsravd (Haswell from agner.org)
    { ISD::SHL, MVT::v8i32, 2 }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL, MVT::v8i32, 2 }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA, MVT::v8i32, 2 }, // vpsravd (Haswell from agner.org)
    { ISD::SHL, MVT::v2i64, 1 }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL, MVT::v2i64, 1 }, // vpsrlvq (Haswell from agner.org)
    { ISD::SHL, MVT::v4i64, 1 }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL, MVT::v4i64, 1 }, // vpsrlvq (Haswell from agner.org)
  };

  if (ST->hasAVX512()) {
    if (ISD == ISD::SHL && LT.second == MVT::v32i16 &&
        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
      // On AVX512, a packed v32i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info, Op2Info,
                                    TargetTransformInfo::OP_None,
                                    TargetTransformInfo::OP_None);
  }

  // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
  if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
    if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
      // On AVX2, a packed v16i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info, Op2Info,
                                    TargetTransformInfo::OP_None,
                                    TargetTransformInfo::OP_None);

    if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry XOPShiftCostTable[] = {
    // 128bit shifts take 1cy, but right shifts require negation beforehand.
    { ISD::SHL, MVT::v16i8,    1 },
    { ISD::SRL, MVT::v16i8,    2 },
    { ISD::SRA, MVT::v16i8,    2 },
    { ISD::SHL, MVT::v8i16,    1 },
    { ISD::SRL, MVT::v8i16,    2 },
    { ISD::SRA, MVT::v8i16,    2 },
    { ISD::SHL, MVT::v4i32,    1 },
    { ISD::SRL, MVT::v4i32,    2 },
    { ISD::SRA, MVT::v4i32,    2 },
    { ISD::SHL, MVT::v2i64,    1 },
    { ISD::SRL, MVT::v2i64,    2 },
    { ISD::SRA, MVT::v2i64,    2 },
    // 256bit shifts require splitting if AVX2 didn't catch them above.
    { ISD::SHL, MVT::v32i8,  2+2 },
    { ISD::SRL, MVT::v32i8,  4+2 },
    { ISD::SRA, MVT::v32i8,  4+2 },
    { ISD::SHL, MVT::v16i16, 2+2 },
    { ISD::SRL, MVT::v16i16, 4+2 },
    { ISD::SRA, MVT::v16i16, 4+2 },
    { ISD::SHL, MVT::v8i32,  2+2 },
    { ISD::SRL, MVT::v8i32,  4+2 },
    { ISD::SRA, MVT::v8i32,  4+2 },
    { ISD::SHL, MVT::v4i64,  2+2 },
    { ISD::SRL, MVT::v4i64,  4+2 },
    { ISD::SRA, MVT::v4i64,  4+2 },
  };

  // Look for XOP lowering tricks.
  if (ST->hasXOP()) {
    // If the right shift is constant then we'll fold the negation so
    // it's as cheap as a left shift.
    int ShiftISD = ISD;
    if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) &&
        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
      ShiftISD = ISD::SHL;
    if (const auto *Entry =
            CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2UniformShiftCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i16, 2+2 }, // 2*psllw + split.
    { ISD::SHL, MVT::v8i32,  2+2 }, // 2*pslld + split.
    { ISD::SHL, MVT::v4i64,  2+2 }, // 2*psllq + split.

    { ISD::SRL, MVT::v16i16, 2+2 }, // 2*psrlw + split.
    { ISD::SRL, MVT::v8i32,  2+2 }, // 2*psrld + split.
    { ISD::SRL, MVT::v4i64,  2+2 }, // 2*psrlq + split.

    { ISD::SRA, MVT::v16i16, 2+2 }, // 2*psraw + split.
    { ISD::SRA, MVT::v8i32,  2+2 }, // 2*psrad + split.
    { ISD::SRA, MVT::v2i64,    4 }, // 2*psrad + shuffle.
    { ISD::SRA, MVT::v4i64,  8+2 }, // 2*(2*psrad + shuffle) + split.
  };

  if (ST->hasSSE2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {

    // Handle AVX2 uniform v4i64 ISD::SRA, it's not worth a table.
    if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2())
      return LT.first * 4; // 2*psrad + shuffle.

    if (const auto *Entry =
            CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  if (ISD == ISD::SHL &&
      Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
    MVT VT = LT.second;
    // A vector shift left by a non-uniform constant can be lowered into a
    // vector multiply.
    if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
        ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
      ISD = ISD::MUL;
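      // E.g. a v4i32 shift left by the constant vector <1,2,3,4> becomes a
      // pmulld by <2,4,8,16> (illustrative), so it is costed via the MUL
      // table entries below.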
  }

  static const CostTblEntry AVX2CostTable[] = {
    { ISD::SHL,  MVT::v16i8,   6 }, // vpblendvb sequence.
    { ISD::SHL,  MVT::v32i8,   6 }, // vpblendvb sequence.
    { ISD::SHL,  MVT::v64i8,  12 }, // 2*vpblendvb sequence.
    { ISD::SHL,  MVT::v8i16,   5 }, // extend/vpsrlvd/pack sequence.
    { ISD::SHL,  MVT::v16i16,  7 }, // extend/vpsrlvd/pack sequence.
    { ISD::SHL,  MVT::v32i16, 14 }, // 2*extend/vpsrlvd/pack sequence.

    { ISD::SRL,  MVT::v16i8,   6 }, // vpblendvb sequence.
    { ISD::SRL,  MVT::v32i8,   6 }, // vpblendvb sequence.
    { ISD::SRL,  MVT::v64i8,  12 }, // 2*vpblendvb sequence.
    { ISD::SRL,  MVT::v8i16,   5 }, // extend/vpsrlvd/pack sequence.
    { ISD::SRL,  MVT::v16i16,  7 }, // extend/vpsrlvd/pack sequence.
    { ISD::SRL,  MVT::v32i16, 14 }, // 2*extend/vpsrlvd/pack sequence.

    { ISD::SRA,  MVT::v16i8,  17 }, // vpblendvb sequence.
    { ISD::SRA,  MVT::v32i8,  17 }, // vpblendvb sequence.
    { ISD::SRA,  MVT::v64i8,  34 }, // 2*vpblendvb sequence.
    { ISD::SRA,  MVT::v8i16,   5 }, // extend/vpsravd/pack sequence.
    { ISD::SRA,  MVT::v16i16,  7 }, // extend/vpsravd/pack sequence.
    { ISD::SRA,  MVT::v32i16, 14 }, // 2*extend/vpsravd/pack sequence.
    { ISD::SRA,  MVT::v2i64,   2 }, // srl/xor/sub sequence.
    { ISD::SRA,  MVT::v4i64,   2 }, // srl/xor/sub sequence.

    { ISD::SUB,  MVT::v32i8,   1 }, // psubb
    { ISD::ADD,  MVT::v32i8,   1 }, // paddb
    { ISD::SUB,  MVT::v16i16,  1 }, // psubw
    { ISD::ADD,  MVT::v16i16,  1 }, // paddw
    { ISD::SUB,  MVT::v8i32,   1 }, // psubd
    { ISD::ADD,  MVT::v8i32,   1 }, // paddd
    { ISD::SUB,  MVT::v4i64,   1 }, // psubq
    { ISD::ADD,  MVT::v4i64,   1 }, // paddq

    { ISD::MUL,  MVT::v16i16,  1 }, // pmullw
    { ISD::MUL,  MVT::v8i32,   2 }, // pmulld (Haswell from agner.org)
    { ISD::MUL,  MVT::v4i64,   6 }, // 3*pmuludq/3*shift/2*add

    { ISD::FNEG, MVT::v4f64,   1 }, // Haswell from http://www.agner.org/
    { ISD::FNEG, MVT::v8f32,   1 }, // Haswell from http://www.agner.org/
    { ISD::FADD, MVT::v4f64,   1 }, // Haswell from http://www.agner.org/
    { ISD::FADD, MVT::v8f32,   1 }, // Haswell from http://www.agner.org/
    { ISD::FSUB, MVT::v4f64,   1 }, // Haswell from http://www.agner.org/
    { ISD::FSUB, MVT::v8f32,   1 }, // Haswell from http://www.agner.org/
    { ISD::FMUL, MVT::f64,     1 }, // Haswell from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64,   1 }, // Haswell from http://www.agner.org/
    { ISD::FMUL, MVT::v4f64,   1 }, // Haswell from http://www.agner.org/
    { ISD::FMUL, MVT::v8f32,   1 }, // Haswell from http://www.agner.org/

    { ISD::FDIV, MVT::f32,     7 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,   7 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32,  14 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::f64,    14 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,  14 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64,  28 }, // Haswell from http://www.agner.org/
  };

  // Look for AVX2 lowering tricks for custom cases.
  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX1CostTable[] = {
    // We don't have to scalarize unsupported ops. We can issue two half-sized
    // operations and we only need to extract the upper YMM half.
    // Two ops + 1 extract + 1 insert = 4.
    { ISD::MUL,  MVT::v16i16,  4 },
    { ISD::MUL,  MVT::v8i32,   5 }, // BTVER2 from http://www.agner.org/
    { ISD::MUL,  MVT::v4i64,  12 },

    { ISD::SUB,  MVT::v32i8,   4 },
    { ISD::ADD,  MVT::v32i8,   4 },
    { ISD::SUB,  MVT::v16i16,  4 },
    { ISD::ADD,  MVT::v16i16,  4 },
    { ISD::SUB,  MVT::v8i32,   4 },
    { ISD::ADD,  MVT::v8i32,   4 },
    { ISD::SUB,  MVT::v4i64,   4 },
    { ISD::ADD,  MVT::v4i64,   4 },

    { ISD::SHL,  MVT::v16i8,  10 }, // pblendvb sequence.
    { ISD::SHL,  MVT::v32i8,  22 }, // pblendvb sequence + split.
    { ISD::SHL,  MVT::v8i16,   6 }, // pblendvb sequence.
    { ISD::SHL,  MVT::v16i16, 13 }, // pblendvb sequence + split.
    { ISD::SHL,  MVT::v4i32,   3 }, // pslld/paddd/cvttps2dq/pmulld
    { ISD::SHL,  MVT::v8i32,   9 }, // pslld/paddd/cvttps2dq/pmulld + split
    { ISD::SHL,  MVT::v2i64,   2 }, // Shift each lane + blend.
    { ISD::SHL,  MVT::v4i64,   6 }, // Shift each lane + blend + split.

    { ISD::SRL,  MVT::v16i8,  11 }, // pblendvb sequence.
    { ISD::SRL,  MVT::v32i8,  23 }, // pblendvb sequence + split.
    { ISD::SRL,  MVT::v8i16,  13 }, // pblendvb sequence.
    { ISD::SRL,  MVT::v16i16, 28 }, // pblendvb sequence + split.
    { ISD::SRL,  MVT::v4i32,   6 }, // Shift each lane + blend.
    { ISD::SRL,  MVT::v8i32,  14 }, // Shift each lane + blend + split.
    { ISD::SRL,  MVT::v2i64,   2 }, // Shift each lane + blend.
    { ISD::SRL,  MVT::v4i64,   6 }, // Shift each lane + blend + split.

    { ISD::SRA,  MVT::v16i8,  21 }, // pblendvb sequence.
    { ISD::SRA,  MVT::v32i8,  44 }, // pblendvb sequence + split.
    { ISD::SRA,  MVT::v8i16,  13 }, // pblendvb sequence.
    { ISD::SRA,  MVT::v16i16, 28 }, // pblendvb sequence + split.
    { ISD::SRA,  MVT::v4i32,   6 }, // Shift each lane + blend.
    { ISD::SRA,  MVT::v8i32,  14 }, // Shift each lane + blend + split.
    { ISD::SRA,  MVT::v2i64,   5 }, // Shift each lane + blend.
    { ISD::SRA,  MVT::v4i64,  12 }, // Shift each lane + blend + split.

    { ISD::FNEG, MVT::v4f64,   2 }, // BTVER2 from http://www.agner.org/
    { ISD::FNEG, MVT::v8f32,   2 }, // BTVER2 from http://www.agner.org/

    { ISD::FMUL, MVT::f64,     2 }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64,   2 }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v4f64,   4 }, // BTVER2 from http://www.agner.org/

    { ISD::FDIV, MVT::f32,    14 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,  14 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32,  28 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::f64,    22 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,  22 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64,  44 }, // SNB from http://www.agner.org/
  };

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE42CostTable[] = {
    { ISD::FADD, MVT::f64,    1 }, // Nehalem from http://www.agner.org/
    { ISD::FADD, MVT::f32,    1 }, // Nehalem from http://www.agner.org/
    { ISD::FADD, MVT::v2f64,  1 }, // Nehalem from http://www.agner.org/
    { ISD::FADD, MVT::v4f32,  1 }, // Nehalem from http://www.agner.org/

    { ISD::FSUB, MVT::f64,    1 }, // Nehalem from http://www.agner.org/
    { ISD::FSUB, MVT::f32,    1 }, // Nehalem from http://www.agner.org/
    { ISD::FSUB, MVT::v2f64,  1 }, // Nehalem from http://www.agner.org/
    { ISD::FSUB, MVT::v4f32,  1 }, // Nehalem from http://www.agner.org/

    { ISD::FMUL, MVT::f64,    1 }, // Nehalem from http://www.agner.org/
    { ISD::FMUL, MVT::f32,    1 }, // Nehalem from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64,  1 }, // Nehalem from http://www.agner.org/
    { ISD::FMUL, MVT::v4f32,  1 }, // Nehalem from http://www.agner.org/

    { ISD::FDIV, MVT::f32,   14 }, // Nehalem from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
    { ISD::FDIV, MVT::f64,   22 }, // Nehalem from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/

    { ISD::MUL,  MVT::v2i64,  6 }  // 3*pmuludq/3*shift/2*add
  };

  if (ST->hasSSE42())
    if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE41CostTable[] = {
    { ISD::SHL, MVT::v16i8, 11 }, // pblendvb sequence.
    { ISD::SHL, MVT::v8i16, 14 }, // pblendvb sequence.
    { ISD::SHL, MVT::v4i32,  4 }, // pslld/paddd/cvttps2dq/pmulld

    { ISD::SRL, MVT::v16i8, 12 }, // pblendvb sequence.
    { ISD::SRL, MVT::v8i16, 14 }, // pblendvb sequence.
    { ISD::SRL, MVT::v4i32, 11 }, // Shift each lane + blend.

    { ISD::SRA, MVT::v16i8, 24 }, // pblendvb sequence.
    { ISD::SRA, MVT::v8i16, 14 }, // pblendvb sequence.
    { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend.

    { ISD::MUL, MVT::v4i32,  2 }  // pmulld (Nehalem from agner.org)
  };

  if (ST->hasSSE41())
    if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE2CostTable[] = {
    // We don't correctly identify costs of casts because they are marked as
    // custom.
    { ISD::SHL,  MVT::v16i8,  26 }, // cmpgtb sequence.
    { ISD::SHL,  MVT::v8i16,  32 }, // cmpgtb sequence.
    { ISD::SHL,  MVT::v4i32, 2*5 }, // We optimized this using mul.
    { ISD::SHL,  MVT::v2i64,   4 }, // splat+shuffle sequence.

    { ISD::SRL,  MVT::v16i8,  26 }, // cmpgtb sequence.
    { ISD::SRL,  MVT::v8i16,  32 }, // cmpgtb sequence.
    { ISD::SRL,  MVT::v4i32,  16 }, // Shift each lane + blend.
    { ISD::SRL,  MVT::v2i64,   4 }, // splat+shuffle sequence.

    { ISD::SRA,  MVT::v16i8,  54 }, // unpacked cmpgtb sequence.
    { ISD::SRA,  MVT::v8i16,  32 }, // cmpgtb sequence.
    { ISD::SRA,  MVT::v4i32,  16 }, // Shift each lane + blend.
    { ISD::SRA,  MVT::v2i64,  12 }, // srl/xor/sub sequence.

    { ISD::MUL,  MVT::v8i16,   1 }, // pmullw
    { ISD::MUL,  MVT::v4i32,   6 }, // 3*pmuludq/4*shuffle
    { ISD::MUL,  MVT::v2i64,   8 }, // 3*pmuludq/3*shift/2*add

    { ISD::FDIV, MVT::f32,    23 }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,  39 }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::f64,    38 }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,  69 }, // Pentium IV from http://www.agner.org/

    { ISD::FNEG, MVT::f32,     1 }, // Pentium IV from http://www.agner.org/
    { ISD::FNEG, MVT::f64,     1 }, // Pentium IV from http://www.agner.org/
    { ISD::FNEG, MVT::v4f32,   1 }, // Pentium IV from http://www.agner.org/
    { ISD::FNEG, MVT::v2f64,   1 }, // Pentium IV from http://www.agner.org/

    { ISD::FADD, MVT::f32,     2 }, // Pentium IV from http://www.agner.org/
    { ISD::FADD, MVT::f64,     2 }, // Pentium IV from http://www.agner.org/

    { ISD::FSUB, MVT::f32,     2 }, // Pentium IV from http://www.agner.org/
    { ISD::FSUB, MVT::f64,     2 }, // Pentium IV from http://www.agner.org/
  };

  if (ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE1CostTable[] = {
    { ISD::FDIV, MVT::f32,   17 }, // Pentium III from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/

    { ISD::FNEG, MVT::f32,    2 }, // Pentium III from http://www.agner.org/
    { ISD::FNEG, MVT::v4f32,  2 }, // Pentium III from http://www.agner.org/

    { ISD::FADD, MVT::f32,    1 }, // Pentium III from http://www.agner.org/
    { ISD::FADD, MVT::v4f32,  2 }, // Pentium III from http://www.agner.org/

    { ISD::FSUB, MVT::f32,    1 }, // Pentium III from http://www.agner.org/
    { ISD::FSUB, MVT::v4f32,  2 }, // Pentium III from http://www.agner.org/
  };

  if (ST->hasSSE1())
    if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry X64CostTbl[] = { // 64-bit targets
    { ISD::ADD, MVT::i64, 1 }, // Core (Merom) from http://www.agner.org/
    { ISD::SUB, MVT::i64, 1 }, // Core (Merom) from http://www.agner.org/
  };

  if (ST->is64Bit())
    if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
    { ISD::ADD, MVT::i8,  1 }, // Pentium III from http://www.agner.org/
    { ISD::ADD, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
    { ISD::ADD, MVT::i32, 1 }, // Pentium III from http://www.agner.org/

    { ISD::SUB, MVT::i8,  1 }, // Pentium III from http://www.agner.org/
    { ISD::SUB, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
    { ISD::SUB, MVT::i32, 1 }, // Pentium III from http://www.agner.org/
  };

  if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
    return LT.first * Entry->Cost;

  // It is not a good idea to vectorize division. We have to scalarize it and
  // in the process we will often end up having to spill regular registers.
  // The overhead of division is going to dominate most kernels anyway, so try
  // hard to prevent vectorization of division - it is generally a bad idea.
  // Assume somewhat arbitrarily that we have to be able to hide "20 cycles"
  // for each lane.
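  // Illustrative example: a v4i32 sdiv that legalizes 1:1 is costed below at
  // 20 * LT.first(1) * 4 lanes * ScalarCost.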
  if (LT.second.isVector() && (ISD == ISD::SDIV || ISD == ISD::SREM ||
                               ISD == ISD::UDIV || ISD == ISD::UREM)) {
    InstructionCost ScalarCost = getArithmeticInstrCost(
        Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info,
        TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
    return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
  }

  // Fallback to the default implementation.
  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info);
}

InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                           VectorType *BaseTp,
                                           ArrayRef<int> Mask, int Index,
                                           VectorType *SubTp) {
  // 64-bit packed float vectors (v2f32) are widened to type v4f32.
  // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, BaseTp);

  Kind = improveShuffleKindFromMask(Kind, Mask);
  // Treat Transpose as 2-op shuffles - there's no difference in lowering.
  if (Kind == TTI::SK_Transpose)
    Kind = TTI::SK_PermuteTwoSrc;

  // For Broadcasts we are splatting the first element from the first input
  // register, so we only need to reference that input; all the output
  // registers are the same.
  if (Kind == TTI::SK_Broadcast)
    LT.first = 1;

  // Subvector extractions are free if they start at the beginning of a
  // vector and cheap if the subvectors are aligned.
  if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
    int NumElts = LT.second.getVectorNumElements();
    if ((Index % NumElts) == 0)
      return 0;
    std::pair<InstructionCost, MVT> SubLT =
        TLI->getTypeLegalizationCost(DL, SubTp);
    if (SubLT.second.isVector()) {
      int NumSubElts = SubLT.second.getVectorNumElements();
      if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
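        // E.g. extracting the upper v4i32 half of a v8i32 (Index == 4) is
        // subvector-aligned and costs just SubLT.first.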
        return SubLT.first;
      // Handle some cases for widening legalization. For now we only handle
      // cases where the original subvector was naturally aligned and evenly
      // fit in its legalized subvector type.
      // FIXME: Remove some of the alignment restrictions.
      // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
      // vectors.
      int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
      if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
          (NumSubElts % OrigSubElts) == 0 &&
          LT.second.getVectorElementType() ==
              SubLT.second.getVectorElementType() &&
          LT.second.getVectorElementType().getSizeInBits() ==
              BaseTp->getElementType()->getPrimitiveSizeInBits()) {
        assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
               "Unexpected number of elements!");
        auto *VecTy = FixedVectorType::get(BaseTp->getElementType(),
                                           LT.second.getVectorNumElements());
        auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
                                           SubLT.second.getVectorNumElements());
        int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
        InstructionCost ExtractCost = getShuffleCost(
            TTI::SK_ExtractSubvector, VecTy, None, ExtractIndex, SubTy);

        // If the original size is 32-bits or more, we can use pshufd. Otherwise
        // if we have SSSE3 we can use pshufb.
        if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
          return ExtractCost + 1; // pshufd or pshufb

        assert(SubTp->getPrimitiveSizeInBits() == 16 &&
               "Unexpected vector size");

        return ExtractCost + 2; // worst case pshufhw + pshufd
      }
    }
  }

  // Subvector insertions are cheap if the subvectors are aligned.
  // Note that in general, the insertion starting at the beginning of a vector
  // isn't free, because we need to preserve the rest of the wide vector.
  if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
    int NumElts = LT.second.getVectorNumElements();
    std::pair<InstructionCost, MVT> SubLT =
        TLI->getTypeLegalizationCost(DL, SubTp);
    if (SubLT.second.isVector()) {
      int NumSubElts = SubLT.second.getVectorNumElements();
      if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
        return SubLT.first;
    }
  }

  // Handle some common (illegal) sub-vector types as they are often very cheap
  // to shuffle even on targets without PSHUFB.
  EVT VT = TLI->getValueType(DL, BaseTp);
  if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
      !ST->hasSSSE3()) {
    static const CostTblEntry SSE2SubVectorShuffleTbl[] = {
        {TTI::SK_Broadcast, MVT::v4i16, 1}, // pshuflw
        {TTI::SK_Broadcast, MVT::v2i16, 1}, // pshuflw
        {TTI::SK_Broadcast, MVT::v8i8,  2}, // punpck/pshuflw
        {TTI::SK_Broadcast, MVT::v4i8,  2}, // punpck/pshuflw
        {TTI::SK_Broadcast, MVT::v2i8,  1}, // punpck

        {TTI::SK_Reverse, MVT::v4i16, 1}, // pshuflw
        {TTI::SK_Reverse, MVT::v2i16, 1}, // pshuflw
        {TTI::SK_Reverse, MVT::v4i8,  3}, // punpck/pshuflw/packus
        {TTI::SK_Reverse, MVT::v2i8,  1}, // punpck

        {TTI::SK_PermuteTwoSrc, MVT::v4i16, 2}, // punpck/pshuflw
        {TTI::SK_PermuteTwoSrc, MVT::v2i16, 2}, // punpck/pshuflw
        {TTI::SK_PermuteTwoSrc, MVT::v8i8,  7}, // punpck/pshuflw
        {TTI::SK_PermuteTwoSrc, MVT::v4i8,  4}, // punpck/pshuflw
        {TTI::SK_PermuteTwoSrc, MVT::v2i8,  2}, // punpck

        {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw
        {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw
        {TTI::SK_PermuteSingleSrc, MVT::v8i8,  5}, // punpck/pshuflw
        {TTI::SK_PermuteSingleSrc, MVT::v4i8,  3}, // punpck/pshuflw
        {TTI::SK_PermuteSingleSrc, MVT::v2i8,  1}, // punpck
    };

    if (ST->hasSSE2())
      if (const auto *Entry =
              CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
        return Entry->Cost;
  }

  // We are going to permute multiple sources and the result will be in
  // multiple destinations. We provide an accurate cost only for splits where
  // the element type remains the same.
  if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
    MVT LegalVT = LT.second;
    if (LegalVT.isVector() &&
        LegalVT.getVectorElementType().getSizeInBits() ==
            BaseTp->getElementType()->getPrimitiveSizeInBits() &&
        LegalVT.getVectorNumElements() <
            cast<FixedVectorType>(BaseTp)->getNumElements()) {

      unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
      unsigned LegalVTSize = LegalVT.getStoreSize();
      // Number of source vectors after legalization:
      unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
      // Number of destination vectors after legalization:
      InstructionCost NumOfDests = LT.first;
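      // Illustrative example: a v16i32 single-source permute that legalizes
      // to LegalVT = v8i32 gives NumOfSrcs = NumOfDests = 2, so the cost
      // below is (2 - 1) * 2 = 2 two-input v8i32 shuffles.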

      auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
                                              LegalVT.getVectorNumElements());

      InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
      return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
                                            None, 0, nullptr);
    }

    return BaseT::getShuffleCost(Kind, BaseTp, Mask, Index, SubTp);
  }

  // For 2-input shuffles, we must account for splitting the 2 inputs into many.
  if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
    // We assume that source and destination have the same vector type.
    InstructionCost NumOfDests = LT.first;
    InstructionCost NumOfShufflesPerDest = LT.first * 2 - 1;
    LT.first = NumOfDests * NumOfShufflesPerDest;
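    // Illustrative example: if the original LT.first is 2, each of the 2
    // destination registers needs 2 * 2 - 1 == 3 two-input shuffles, so
    // LT.first becomes 6.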
  }

  static const CostTblEntry AVX512VBMIShuffleTbl[] = {
      {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
      {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb

      {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
      {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb

      {TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b
      {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b
      {TTI::SK_PermuteTwoSrc, MVT::v16i8, 2}  // vpermt2b
  };

  if (ST->hasVBMI())
    if (const auto *Entry =
            CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512BWShuffleTbl[] = {
      {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
      {TTI::SK_Broadcast, MVT::v64i8, 1},  // vpbroadcastb

      {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw
      {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw
      {TTI::SK_Reverse, MVT::v64i8, 2},  // pshufb + vshufi64x2

      {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw
      {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw
      {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8},  // extend to v32i16

      {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w
      {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w
      {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2},  // vpermt2w
      {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1

      {TTI::SK_Select, MVT::v32i16, 1}, // vblendmw
      {TTI::SK_Select, MVT::v64i8, 1},  // vblendmb
  };

  if (ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512ShuffleTbl[] = {
      {TTI::SK_Broadcast, MVT::v8f64, 1},  // vbroadcastpd
      {TTI::SK_Broadcast, MVT::v16f32, 1}, // vbroadcastps
      {TTI::SK_Broadcast, MVT::v8i64, 1},  // vpbroadcastq
      {TTI::SK_Broadcast, MVT::v16i32, 1}, // vpbroadcastd
      {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
      {TTI::SK_Broadcast, MVT::v64i8, 1},  // vpbroadcastb

      {TTI::SK_Reverse, MVT::v8f64, 1},  // vpermpd
      {TTI::SK_Reverse, MVT::v16f32, 1}, // vpermps
      {TTI::SK_Reverse, MVT::v8i64, 1},  // vpermq
      {TTI::SK_Reverse, MVT::v16i32, 1}, // vpermd
      {TTI::SK_Reverse, MVT::v32i16, 7}, // per mca
      {TTI::SK_Reverse, MVT::v64i8, 7},  // per mca

      {TTI::SK_PermuteSingleSrc, MVT::v8f64, 1},  // vpermpd
      {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1},  // vpermpd
      {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1},  // vpermpd
      {TTI::SK_PermuteSingleSrc, MVT::v16f32, 1}, // vpermps
      {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1},  // vpermps
      {TTI::SK_PermuteSingleSrc, MVT::v4f32, 1},  // vpermps
      {TTI::SK_PermuteSingleSrc, MVT::v8i64, 1},  // vpermq
      {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1},  // vpermq
      {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1},  // vpermq
      {TTI::SK_PermuteSingleSrc, MVT::v16i32, 1}, // vpermd
      {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1},  // vpermd
      {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1},  // vpermd
      {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1},  // pshufb

      {TTI::SK_PermuteTwoSrc, MVT::v8f64, 1},  // vpermt2pd
      {TTI::SK_PermuteTwoSrc, MVT::v16f32, 1}, // vpermt2ps
      {TTI::SK_PermuteTwoSrc, MVT::v8i64, 1},  // vpermt2q
      {TTI::SK_PermuteTwoSrc, MVT::v16i32, 1}, // vpermt2d
      {TTI::SK_PermuteTwoSrc, MVT::v4f64, 1},  // vpermt2pd
      {TTI::SK_PermuteTwoSrc, MVT::v8f32, 1},  // vpermt2ps
      {TTI::SK_PermuteTwoSrc, MVT::v4i64, 1},  // vpermt2q
      {TTI::SK_PermuteTwoSrc, MVT::v8i32, 1},  // vpermt2d
      {TTI::SK_PermuteTwoSrc, MVT::v2f64, 1},  // vpermt2pd
      {TTI::SK_PermuteTwoSrc, MVT::v4f32, 1},  // vpermt2ps
      {TTI::SK_PermuteTwoSrc, MVT::v2i64, 1},  // vpermt2q
      {TTI::SK_PermuteTwoSrc, MVT::v4i32, 1},  // vpermt2d
      // FIXME: This just applies the type legalization cost rules above
      // assuming these completely split.
      {TTI::SK_PermuteSingleSrc, MVT::v32i16, 14},
      {TTI::SK_PermuteSingleSrc, MVT::v64i8, 14},
      {TTI::SK_PermuteTwoSrc, MVT::v32i16, 42},
      {TTI::SK_PermuteTwoSrc, MVT::v64i8, 42},

      {TTI::SK_Select, MVT::v32i16, 1}, // vpternlogq
      {TTI::SK_Select, MVT::v64i8, 1},  // vpternlogq
      {TTI::SK_Select, MVT::v8f64, 1},  // vblendmpd
      {TTI::SK_Select, MVT::v16f32, 1}, // vblendmps
      {TTI::SK_Select, MVT::v8i64, 1},  // vblendmq
      {TTI::SK_Select, MVT::v16i32, 1}, // vblendmd
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX2ShuffleTbl[] = {
      {TTI::SK_Broadcast, MVT::v4f64, 1},  // vbroadcastpd
      {TTI::SK_Broadcast, MVT::v8f32, 1},  // vbroadcastps
      {TTI::SK_Broadcast, MVT::v4i64, 1},  // vpbroadcastq
      {TTI::SK_Broadcast, MVT::v8i32, 1},  // vpbroadcastd
      {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
      {TTI::SK_Broadcast, MVT::v32i8, 1},  // vpbroadcastb

      {TTI::SK_Reverse, MVT::v4f64, 1},  // vpermpd
      {TTI::SK_Reverse, MVT::v8f32, 1},  // vpermps
      {TTI::SK_Reverse, MVT::v4i64, 1},  // vpermq
      {TTI::SK_Reverse, MVT::v8i32, 1},  // vpermd
      {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
      {TTI::SK_Reverse, MVT::v32i8, 2},  // vperm2i128 + pshufb

      {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
      {TTI::SK_Select, MVT::v32i8, 1},  // vpblendvb

      {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1},  // vpermpd
      {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1},  // vpermps
      {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1},  // vpermq
      {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1},  // vpermd
      {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
                                                  // + vpblendvb
      {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4},  // vperm2i128 + 2*vpshufb
                                                  // + vpblendvb

      {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3},  // 2*vpermpd + vblendpd
      {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3},  // 2*vpermps + vblendps
      {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3},  // 2*vpermq + vpblendd
      {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3},  // 2*vpermd + vpblendd
      {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
                                               // + vpblendvb
      {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7},  // 2*vperm2i128 + 4*vpshufb
                                               // + vpblendvb
  };

  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry XOPShuffleTbl[] = {
      {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2},  // vperm2f128 + vpermil2pd
      {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2},  // vperm2f128 + vpermil2ps
      {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2},  // vperm2f128 + vpermil2pd
      {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2},  // vperm2f128 + vpermil2ps
      {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
                                                  // + vinsertf128
      {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4},  // vextractf128 + 2*vpperm
                                                  // + vinsertf128

      {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
                                               // + vinsertf128
      {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1},  // vpperm
      {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9},  // 2*vextractf128 + 6*vpperm
                                               // + vinsertf128
      {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1},  // vpperm
  };

  if (ST->hasXOP())
    if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX1ShuffleTbl[] = {
      {TTI::SK_Broadcast, MVT::v4f64, 2},  // vperm2f128 + vpermilpd
      {TTI::SK_Broadcast, MVT::v8f32, 2},  // vperm2f128 + vpermilps
      {TTI::SK_Broadcast, MVT::v4i64, 2},  // vperm2f128 + vpermilpd
      {TTI::SK_Broadcast, MVT::v8i32, 2},  // vperm2f128 + vpermilps
      {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
      {TTI::SK_Broadcast, MVT::v32i8, 2},  // vpshufb + vinsertf128

      {TTI::SK_Reverse, MVT::v4f64, 2},  // vperm2f128 + vpermilpd
      {TTI::SK_Reverse, MVT::v8f32, 2},  // vperm2f128 + vpermilps
      {TTI::SK_Reverse, MVT::v4i64, 2},  // vperm2f128 + vpermilpd
      {TTI::SK_Reverse, MVT::v8i32, 2},  // vperm2f128 + vpermilps
      {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
                                         // + vinsertf128
      {TTI::SK_Reverse, MVT::v32i8, 4},  // vextractf128 + 2*pshufb
                                         // + vinsertf128

      {TTI::SK_Select, MVT::v4i64, 1},  // vblendpd
      {TTI::SK_Select, MVT::v4f64, 1},  // vblendpd
      {TTI::SK_Select, MVT::v8i32, 1},  // vblendps
      {TTI::SK_Select, MVT::v8f32, 1},  // vblendps
      {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
      {TTI::SK_Select, MVT::v32i8, 3},  // vpand + vpandn + vpor

      {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2},  // vperm2f128 + vshufpd
      {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2},  // vperm2f128 + vshufpd
      {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4},  // 2*vperm2f128 + 2*vshufps
      {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4},  // 2*vperm2f128 + 2*vshufps
      {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
                                                  // + 2*por + vinsertf128
      {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8},  // vextractf128 + 4*pshufb
                                                  // + 2*por + vinsertf128

      {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3},   // 2*vperm2f128 + vshufpd
      {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3},   // 2*vperm2f128 + vshufpd
      {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4},   // 2*vperm2f128 + 2*vshufps
      {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4},   // 2*vperm2f128 + 2*vshufps
      {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
                                                // + 4*por + vinsertf128
      {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15},  // 2*vextractf128 + 8*pshufb
                                                // + 4*por + vinsertf128
  };

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

1419  static const CostTblEntry SSE41ShuffleTbl[] = {
1420  {TTI::SK_Select, MVT::v2i64, 1}, // pblendw
1421  {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1422  {TTI::SK_Select, MVT::v4i32, 1}, // pblendw
1423  {TTI::SK_Select, MVT::v4f32, 1}, // blendps
1424  {TTI::SK_Select, MVT::v8i16, 1}, // pblendw
1425  {TTI::SK_Select, MVT::v16i8, 1} // pblendvb
1426  };
1427 
1428  if (ST->hasSSE41())
1429  if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
1430  return LT.first * Entry->Cost;
1431 
1432  static const CostTblEntry SSSE3ShuffleTbl[] = {
1433  {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
1434  {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb
1435 
1436  {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
1437  {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb
1438 
1439  {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
1440  {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por
1441 
1442  {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
1443  {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
1444 
1445  {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
1446  {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
1447  };
1448 
1449  if (ST->hasSSSE3())
1450  if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
1451  return LT.first * Entry->Cost;
1452 
1453  static const CostTblEntry SSE2ShuffleTbl[] = {
1454  {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd
1455  {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
1456  {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
1457  {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
1458  {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd
1459 
1460  {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
1461  {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
1462  {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
1463  {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
1464  {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
1465  // + 2*pshufd + 2*unpck + packus
1466 
1467  {TTI::SK_Select, MVT::v2i64, 1}, // movsd
1468  {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1469  {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
1470  {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
1471  {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por
1472 
1473  {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
1474  {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd
1475  {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
1476  {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
1477  // + pshufd/unpck
1478  {TTI::SK_PermuteSingleSrc, MVT::v16i8, 10}, // 2*pshuflw + 2*pshufhw
1479  // + 2*pshufd + 2*unpck + 2*packus
1480 
1481  {TTI::SK_PermuteTwoSrc, MVT::v2f64, 1}, // shufpd
1482  {TTI::SK_PermuteTwoSrc, MVT::v2i64, 1}, // shufpd
1483  {TTI::SK_PermuteTwoSrc, MVT::v4i32, 2}, // 2*{unpck,movsd,pshufd}
1484  {TTI::SK_PermuteTwoSrc, MVT::v8i16, 8}, // blend+permute
1485  {TTI::SK_PermuteTwoSrc, MVT::v16i8, 13}, // blend+permute
1486  };
1487 
1488  if (ST->hasSSE2())
1489  if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
1490  return LT.first * Entry->Cost;
1491 
1492  static const CostTblEntry SSE1ShuffleTbl[] = {
1493  { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
1494  { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
1495  { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps
1496  { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
1497  { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
1498  };
1499 
1500  if (ST->hasSSE1())
1501  if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
1502  return LT.first * Entry->Cost;
1503 
1504  return BaseT::getShuffleCost(Kind, BaseTp, Mask, Index, SubTp);
1505 }
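// Worked example of the lookups above: an SK_Reverse of v8f32 on an AVX2
// target is already legal (LT.first == 1) and hits the AVX2 table entry
// priced at 1 (a single vpermps). A minimal sketch of the equivalent query
// through the public TTI interface (Ctx and TTI are assumed to be in scope;
// this sketch is not part of the original source):
//
//   auto *VTy = FixedVectorType::get(Type::getFloatTy(Ctx), 8);
//   InstructionCost C =
//       TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VTy);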
1506 
1507 InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
1508                                              Type *Src,
1509                                              TTI::CastContextHint CCH,
1510                                              TTI::TargetCostKind CostKind,
1511                                              const Instruction *I) {
1512  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1513  assert(ISD && "Invalid opcode");
1514 
1515  // TODO: Allow non-throughput costs that aren't binary.
1516  auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
1517    if (CostKind != TTI::TCK_RecipThroughput)
1518  return Cost == 0 ? 0 : 1;
1519  return Cost;
1520  };
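// For example, for any cost kind other than TCK_RecipThroughput this clamps
// the result to a binary free/not-free answer: AdjustCost(0) stays 0 (a free
// cast), while AdjustCost(5) becomes 1. Only throughput queries see the raw
// table values below.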
1521 
1522  // FIXME: Need a better design of the cost table to handle non-simple types
1523  // and the potentially massive number of combinations (elem_num x src_type x dst_type).
1524 
1525  static const TypeConversionCostTblEntry AVX512BWConversionTbl[] {
1528 
1529  // Mask sign extend has an instruction.
1541 
1542  // Mask zero extend is a sext + shift.
1554 
1556  { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // widen to zmm
1557  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, // widen to zmm
1558  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // widen to zmm
1559  { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // vpmovwb
1560  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // widen to zmm
1561  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, // widen to zmm
1562  { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // vpmovwb
1563  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, // widen to zmm
1564  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, // widen to zmm
1565  { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // vpmovwb
1566  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, // widen to zmm
1567  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, // widen to zmm
1568  { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, // widen to zmm
1571  };
1572 
1573  static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
1576 
1579 
1582 
1585  };
1586 
1587  // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
1588  // 256-bit wide vectors.
1589 
1590  static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
1594 
1595  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
1596  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
1597  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
1598  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 3 }, // sext+vpslld+vptestmd
1599  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
1600  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
1601  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
1602  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 3 }, // sext+vpslld+vptestmd
1603  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // zmm vpslld+vptestmd
1604  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // zmm vpslld+vptestmd
1605  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // zmm vpslld+vptestmd
1606  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 }, // vpslld+vptestmd
1607  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // zmm vpsllq+vptestmq
1608  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // zmm vpsllq+vptestmq
1609  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, // vpsllq+vptestmq
1610  { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 2 }, // vpmovdb
1611  { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 2 }, // vpmovdb
1612  { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 2 }, // vpmovdb
1613  { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2 }, // vpmovdb
1614  { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 2 }, // vpmovqb
1615  { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1 }, // vpshufb
1616  { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 2 }, // vpmovqb
1617  { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 2 }, // vpmovqw
1618  { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 }, // vpmovqd
1619  { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // zmm vpmovqd
1620  { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 5 },// 2*vpmovqd+concat+vpmovdb
1621 
1622  { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, // extend to v16i32
1624 
1625  // Sign extend is zmm vpternlogd+vptruncdb.
1626  // Zero extend is zmm broadcast load+vptruncdw.
1635 
1636  // Sign extend is zmm vpternlogd+vptruncdw.
1637  // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
1646 
1647  { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // zmm vpternlogd
1648  { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // zmm vpternlogd+psrld
1649  { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // zmm vpternlogd
1650  { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // zmm vpternlogd+psrld
1651  { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // zmm vpternlogd
1652  { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // zmm vpternlogd+psrld
1653  { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // zmm vpternlogq
1654  { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // zmm vpternlogq+psrlq
1655  { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // zmm vpternlogq
1656  { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // zmm vpternlogq+psrlq
1657 
1658  { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, // vpternlogd
1659  { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, // vpternlogd+psrld
1660  { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 }, // vpternlogq
1661  { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 }, // vpternlogq+psrlq
1662 
1673 
1674  { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
1675  { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
1676 
1685 
1696 
1701 
1708  };
1709 
1710  static const TypeConversionCostTblEntry AVX512BWVLConversionTbl[] {
1711  // Mask sign extend has an instruction.
1721 
1722  // Mask zero extend is a sext + shift.
1732 
1734  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, // vpsllw+vptestmb
1735  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // vpsllw+vptestmw
1736  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // vpsllw+vptestmb
1737  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, // vpsllw+vptestmw
1738  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, // vpsllw+vptestmb
1739  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, // vpsllw+vptestmw
1740  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, // vpsllw+vptestmb
1741  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, // vpsllw+vptestmw
1742  { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, // vpsllw+vptestmb
1743  };
1744 
1745  static const TypeConversionCostTblEntry AVX512DQVLConversionTbl[] = {
1750 
1755 
1760 
1765  };
1766 
1767  static const TypeConversionCostTblEntry AVX512VLConversionTbl[] = {
1768  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
1769  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
1770  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
1771  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 8 }, // split+2*v8i8
1772  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
1773  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
1774  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
1775  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 8 }, // split+2*v8i16
1776  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // vpslld+vptestmd
1777  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // vpslld+vptestmd
1778  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // vpslld+vptestmd
1779  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // vpsllq+vptestmq
1780  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // vpsllq+vptestmq
1781  { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // vpmovqd
1782  { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 }, // vpmovqb
1783  { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 }, // vpmovqw
1784  { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 }, // vpmovwb
1785 
1786  // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
1787  // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
1796 
1797  // sign extend is vpcmpeq+maskedmove+vpmovdw
1798  // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
1807 
1808  { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // vpternlogd
1809  { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // vpternlogd+psrld
1810  { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // vpternlogd
1811  { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // vpternlogd+psrld
1812  { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // vpternlogd
1813  { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // vpternlogd+psrld
1814  { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // vpternlogq
1815  { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // vpternlogq+psrlq
1816  { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // vpternlogq
1817  { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // vpternlogq+psrlq
1818 
1833 
1836 
1839 
1842 
1848  };
1849 
1850  static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
1871 
1877 
1880 
1882  };
1883 
1884  static const TypeConversionCostTblEntry AVXConversionTbl[] = {
1903 
1909 
1910  { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // and+extract+packuswb
1911  { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 2 }, // and+packusdw+packuswb
1915  { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 3 }, // and+extract+2*packusdw
1921 
1934 
1950  // The generic code to compute the scalar overhead is currently broken.
1951  // Work around this limitation by estimating the scalarization overhead
1952  // here. We have roughly 10 instructions per scalar element.
1953  // Multiply that by the vector width.
1954  // FIXME: remove this when PR19268 is fixed.
1957 
1962 
1968  // This node is expanded into scalarized operations, but BasicTTI is overly
1969  // optimistic when estimating its cost: it computes 3 per element (one
1970  // vector-extract, one scalar conversion and one vector-insert). The
1971  // problem is that the inserts form a read-modify-write chain, so latency
1972  // should be factored in too. Inflate the cost per element by 1.
1974 
1977  };
1978 
1979  static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
1986 
2017 
2018  // These truncates end up widening elements.
2019  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 1 }, // PMOVXZBQ
2020  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 1 }, // PMOVXZWQ
2021  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 1 }, // PMOVXZBD
2022 
2031  { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1 }, // PSHUFB
2032 
2035 
2038 
2042  };
2043 
2044  static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
2045  // These are somewhat magic numbers, justified by looking at the output of
2046  // Intel's IACA, running some kernels, and making sure that when we take
2047  // legalization into account the throughput will be overestimated.
2049  { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
2057 
2058  { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
2066 
2073 
2075 
2078 
2088 
2113 
2114  // These truncates are really widening elements.
2115  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 1 }, // PSHUFD
2116  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // PUNPCKLWD+DQ
2117  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // PUNPCKLBW+WD+PSHUFD
2118  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 1 }, // PUNPCKLWD
2119  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // PUNPCKLBW+WD
2120  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 1 }, // PUNPCKLBW
2121 
2122  { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // PAND+PACKUSWB
2123  { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // PAND+PACKUSWB
2124  { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // PAND+PACKUSWB
2126  { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 3 }, // PAND+2*PACKUSWB
2134  { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 4 }, // PAND+3*PACKUSWB
2135  { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 2 }, // PSHUFD+PSHUFLW
2136  { ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1 }, // PSHUFD
2137  };
2138 
2139  std::pair<InstructionCost, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
2140  std::pair<InstructionCost, MVT> LTDest =
2141  TLI->getTypeLegalizationCost(DL, Dst);
2142 
2143  if (ST->hasSSE41() && !ST->hasAVX())
2144  if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
2145  LTDest.second, LTSrc.second))
2146  return AdjustCost(LTSrc.first * Entry->Cost);
2147 
2148  if (ST->hasSSE2() && !ST->hasAVX())
2149  if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
2150  LTDest.second, LTSrc.second))
2151  return AdjustCost(LTSrc.first * Entry->Cost);
2152 
2153  EVT SrcTy = TLI->getValueType(DL, Src);
2154  EVT DstTy = TLI->getValueType(DL, Dst);
2155 
2156  // The function getSimpleVT only handles simple value types.
2157  if (!SrcTy.isSimple() || !DstTy.isSimple())
2158  return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind));
2159 
2160  MVT SimpleSrcTy = SrcTy.getSimpleVT();
2161  MVT SimpleDstTy = DstTy.getSimpleVT();
2162 
2163  if (ST->useAVX512Regs()) {
2164  if (ST->hasBWI())
2165  if (const auto *Entry = ConvertCostTableLookup(AVX512BWConversionTbl, ISD,
2166  SimpleDstTy, SimpleSrcTy))
2167  return AdjustCost(Entry->Cost);
2168 
2169  if (ST->hasDQI())
2170  if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD,
2171  SimpleDstTy, SimpleSrcTy))
2172  return AdjustCost(Entry->Cost);
2173 
2174  if (ST->hasAVX512())
2175  if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD,
2176  SimpleDstTy, SimpleSrcTy))
2177  return AdjustCost(Entry->Cost);
2178  }
2179 
2180  if (ST->hasBWI())
2181  if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
2182  SimpleDstTy, SimpleSrcTy))
2183  return AdjustCost(Entry->Cost);
2184 
2185  if (ST->hasDQI())
2186  if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
2187  SimpleDstTy, SimpleSrcTy))
2188  return AdjustCost(Entry->Cost);
2189 
2190  if (ST->hasAVX512())
2191  if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
2192  SimpleDstTy, SimpleSrcTy))
2193  return AdjustCost(Entry->Cost);
2194 
2195  if (ST->hasAVX2()) {
2196  if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
2197  SimpleDstTy, SimpleSrcTy))
2198  return AdjustCost(Entry->Cost);
2199  }
2200 
2201  if (ST->hasAVX()) {
2202  if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
2203  SimpleDstTy, SimpleSrcTy))
2204  return AdjustCost(Entry->Cost);
2205  }
2206 
2207  if (ST->hasSSE41()) {
2208  if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
2209  SimpleDstTy, SimpleSrcTy))
2210  return AdjustCost(Entry->Cost);
2211  }
2212 
2213  if (ST->hasSSE2()) {
2214  if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
2215  SimpleDstTy, SimpleSrcTy))
2216  return AdjustCost(Entry->Cost);
2217  }
2218 
2219  return AdjustCost(
2220  BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
2221 }
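// Worked example: a trunc <16 x i32> -> <16 x i8> on an AVX-512 target maps
// to the AVX512FConversionTbl entry { ISD::TRUNCATE, MVT::v16i8,
// MVT::v16i32, 2 } (vpmovdb), so a throughput query returns 2, while every
// other cost kind is clamped to 1 by AdjustCost.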
2222 
2223 InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
2224                                                Type *CondTy,
2225                                                CmpInst::Predicate VecPred,
2226                                                TTI::TargetCostKind CostKind,
2227                                                const Instruction *I) {
2228  // TODO: Handle other cost kinds.
2229  if (CostKind != TTI::TCK_RecipThroughput)
2230  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2231  I);
2232 
2233  // Legalize the type.
2234  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
2235 
2236  MVT MTy = LT.second;
2237 
2238  int ISD = TLI->InstructionOpcodeToISD(Opcode);
2239  assert(ISD && "Invalid opcode");
2240 
2241  unsigned ExtraCost = 0;
2242  if (I && (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp)) {
2243  // Some vector comparison predicates cost extra instructions.
2244  if (MTy.isVector() &&
2245  !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
2246  (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
2247  ST->hasBWI())) {
2248  switch (cast<CmpInst>(I)->getPredicate()) {
2249  case CmpInst::Predicate::ICMP_NE:
2250  // xor(cmpeq(x,y),-1)
2251  ExtraCost = 1;
2252  break;
2253  case CmpInst::Predicate::ICMP_SGE:
2254  case CmpInst::Predicate::ICMP_SLE:
2255  // xor(cmpgt(x,y),-1)
2256  ExtraCost = 1;
2257  break;
2258  case CmpInst::Predicate::ICMP_ULT:
2259  case CmpInst::Predicate::ICMP_UGT:
2260  // cmpgt(xor(x,signbit),xor(y,signbit))
2261  // xor(cmpeq(pmaxu(x,y),x),-1)
2262  ExtraCost = 2;
2263  break;
2264  case CmpInst::Predicate::ICMP_ULE:
2265  case CmpInst::Predicate::ICMP_UGE:
2266  if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
2267  (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
2268  // cmpeq(psubus(x,y),0)
2269  // cmpeq(pminu(x,y),x)
2270  ExtraCost = 1;
2271  } else {
2272  // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
2273  ExtraCost = 3;
2274  }
2275  break;
2276  default:
2277  break;
2278  }
2279  }
2280  }
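// Worked example: an icmp uge <8 x i16> instruction on a plain SSE2 target
// takes the ICMP_ULE/ICMP_UGE branch above. The scalar size is 16 (< 32), so
// the predicate lowers to a psubus/pminu plus pcmpeq sequence and ExtraCost
// is 1; added to the SETCC v8i16 entry of 1 in SSE2CostTbl below, the
// reported cost is 2 per legalized vector.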
2281 
2282  static const CostTblEntry SLMCostTbl[] = {
2283  // slm pcmpeq/pcmpgt throughput is 2
2284  { ISD::SETCC, MVT::v2i64, 2 },
2285  };
2286 
2287  static const CostTblEntry AVX512BWCostTbl[] = {
2288  { ISD::SETCC, MVT::v32i16, 1 },
2289  { ISD::SETCC, MVT::v64i8, 1 },
2290 
2291  { ISD::SELECT, MVT::v32i16, 1 },
2292  { ISD::SELECT, MVT::v64i8, 1 },
2293  };
2294 
2295  static const CostTblEntry AVX512CostTbl[] = {
2296  { ISD::SETCC, MVT::v8i64, 1 },
2297  { ISD::SETCC, MVT::v16i32, 1 },
2298  { ISD::SETCC, MVT::v8f64, 1 },
2299  { ISD::SETCC, MVT::v16f32, 1 },
2300 
2301  { ISD::SELECT, MVT::v8i64, 1 },
2302  { ISD::SELECT, MVT::v16i32, 1 },
2303  { ISD::SELECT, MVT::v8f64, 1 },
2304  { ISD::SELECT, MVT::v16f32, 1 },
2305 
2306  { ISD::SETCC, MVT::v32i16, 2 }, // FIXME: should probably be 4
2307  { ISD::SETCC, MVT::v64i8, 2 }, // FIXME: should probably be 4
2308 
2309  { ISD::SELECT, MVT::v32i16, 2 }, // FIXME: should be 3
2310  { ISD::SELECT, MVT::v64i8, 2 }, // FIXME: should be 3
2311  };
2312 
2313  static const CostTblEntry AVX2CostTbl[] = {
2314  { ISD::SETCC, MVT::v4i64, 1 },
2315  { ISD::SETCC, MVT::v8i32, 1 },
2316  { ISD::SETCC, MVT::v16i16, 1 },
2317  { ISD::SETCC, MVT::v32i8, 1 },
2318 
2319  { ISD::SELECT, MVT::v4i64, 1 }, // pblendvb
2320  { ISD::SELECT, MVT::v8i32, 1 }, // pblendvb
2321  { ISD::SELECT, MVT::v16i16, 1 }, // pblendvb
2322  { ISD::SELECT, MVT::v32i8, 1 }, // pblendvb
2323  };
2324 
2325  static const CostTblEntry AVX1CostTbl[] = {
2326  { ISD::SETCC, MVT::v4f64, 1 },
2327  { ISD::SETCC, MVT::v8f32, 1 },
2328  // AVX1 does not support 8-wide integer compare.
2329  { ISD::SETCC, MVT::v4i64, 4 },
2330  { ISD::SETCC, MVT::v8i32, 4 },
2331  { ISD::SETCC, MVT::v16i16, 4 },
2332  { ISD::SETCC, MVT::v32i8, 4 },
2333 
2334  { ISD::SELECT, MVT::v4f64, 1 }, // vblendvpd
2335  { ISD::SELECT, MVT::v8f32, 1 }, // vblendvps
2336  { ISD::SELECT, MVT::v4i64, 1 }, // vblendvpd
2337  { ISD::SELECT, MVT::v8i32, 1 }, // vblendvps
2338  { ISD::SELECT, MVT::v16i16, 3 }, // vandps + vandnps + vorps
2339  { ISD::SELECT, MVT::v32i8, 3 }, // vandps + vandnps + vorps
2340  };
2341 
2342  static const CostTblEntry SSE42CostTbl[] = {
2343  { ISD::SETCC, MVT::v2f64, 1 },
2344  { ISD::SETCC, MVT::v4f32, 1 },
2345  { ISD::SETCC, MVT::v2i64, 1 },
2346  };
2347 
2348  static const CostTblEntry SSE41CostTbl[] = {
2349  { ISD::SELECT, MVT::v2f64, 1 }, // blendvpd
2350  { ISD::SELECT, MVT::v4f32, 1 }, // blendvps
2351  { ISD::SELECT, MVT::v2i64, 1 }, // pblendvb
2352  { ISD::SELECT, MVT::v4i32, 1 }, // pblendvb
2353  { ISD::SELECT, MVT::v8i16, 1 }, // pblendvb
2354  { ISD::SELECT, MVT::v16i8, 1 }, // pblendvb
2355  };
2356 
2357  static const CostTblEntry SSE2CostTbl[] = {
2358  { ISD::SETCC, MVT::v2f64, 2 },
2359  { ISD::SETCC, MVT::f64, 1 },
2360  { ISD::SETCC, MVT::v2i64, 8 },
2361  { ISD::SETCC, MVT::v4i32, 1 },
2362  { ISD::SETCC, MVT::v8i16, 1 },
2363  { ISD::SETCC, MVT::v16i8, 1 },
2364 
2365  { ISD::SELECT, MVT::v2f64, 3 }, // andpd + andnpd + orpd
2366  { ISD::SELECT, MVT::v2i64, 3 }, // pand + pandn + por
2367  { ISD::SELECT, MVT::v4i32, 3 }, // pand + pandn + por
2368  { ISD::SELECT, MVT::v8i16, 3 }, // pand + pandn + por
2369  { ISD::SELECT, MVT::v16i8, 3 }, // pand + pandn + por
2370  };
2371 
2372  static const CostTblEntry SSE1CostTbl[] = {
2373  { ISD::SETCC, MVT::v4f32, 2 },
2374  { ISD::SETCC, MVT::f32, 1 },
2375 
2376  { ISD::SELECT, MVT::v4f32, 3 }, // andps + andnps + orps
2377  };
2378 
2379  if (ST->isSLM())
2380  if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
2381  return LT.first * (ExtraCost + Entry->Cost);
2382 
2383  if (ST->hasBWI())
2384  if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
2385  return LT.first * (ExtraCost + Entry->Cost);
2386 
2387  if (ST->hasAVX512())
2388  if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
2389  return LT.first * (ExtraCost + Entry->Cost);
2390 
2391  if (ST->hasAVX2())
2392  if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
2393  return LT.first * (ExtraCost + Entry->Cost);
2394 
2395  if (ST->hasAVX())
2396  if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
2397  return LT.first * (ExtraCost + Entry->Cost);
2398 
2399  if (ST->hasSSE42())
2400  if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
2401  return LT.first * (ExtraCost + Entry->Cost);
2402 
2403  if (ST->hasSSE41())
2404  if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
2405  return LT.first * (ExtraCost + Entry->Cost);
2406 
2407  if (ST->hasSSE2())
2408  if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
2409  return LT.first * (ExtraCost + Entry->Cost);
2410 
2411  if (ST->hasSSE1())
2412  if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
2413  return LT.first * (ExtraCost + Entry->Cost);
2414 
2415  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
2416 }
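// Minimal usage sketch (Ctx and TTI are assumed to be in scope; not part of
// the original source): pricing an <8 x i32> select, which on an AVX2 target
// resolves to the { ISD::SELECT, MVT::v8i32, 1 } entry above:
//
//   auto *ValTy = FixedVectorType::get(Type::getInt32Ty(Ctx), 8);
//   auto *CondTy = FixedVectorType::get(Type::getInt1Ty(Ctx), 8);
//   InstructionCost C = TTI.getCmpSelInstrCost(
//       Instruction::Select, ValTy, CondTy, CmpInst::BAD_ICMP_PREDICATE,
//       TargetTransformInfo::TCK_RecipThroughput);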
2417 
2418 InstructionCost
2419 X86TTIImpl::getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
2420                                            TTI::TargetCostKind CostKind) {
2423 
2424  // Costs should match the codegen from:
2425  // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
2426  // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
2427  // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
2428  // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
2429  // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
2430 
2431  // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
2432  // specialized in these tables yet.
2433  static const CostTblEntry AVX512CDCostTbl[] = {
2434  { ISD::CTLZ, MVT::v8i64, 1 },
2435  { ISD::CTLZ, MVT::v16i32, 1 },
2436  { ISD::CTLZ, MVT::v32i16, 8 },
2437  { ISD::CTLZ, MVT::v64i8, 20 },
2438  { ISD::CTLZ, MVT::v4i64, 1 },
2439  { ISD::CTLZ, MVT::v8i32, 1 },
2440  { ISD::CTLZ, MVT::v16i16, 4 },
2441  { ISD::CTLZ, MVT::v32i8, 10 },
2442  { ISD::CTLZ, MVT::v2i64, 1 },
2443  { ISD::CTLZ, MVT::v4i32, 1 },
2444  { ISD::CTLZ, MVT::v8i16, 4 },
2445  { ISD::CTLZ, MVT::v16i8, 4 },
2446  };
2447  static const CostTblEntry AVX512BWCostTbl[] = {
2448  { ISD::ABS, MVT::v32i16, 1 },
2449  { ISD::ABS, MVT::v64i8, 1 },
2450  { ISD::BITREVERSE, MVT::v8i64, 5 },
2451  { ISD::BITREVERSE, MVT::v16i32, 5 },
2452  { ISD::BITREVERSE, MVT::v32i16, 5 },
2453  { ISD::BITREVERSE, MVT::v64i8, 5 },
2454  { ISD::BSWAP, MVT::v8i64, 1 },
2455  { ISD::BSWAP, MVT::v16i32, 1 },
2456  { ISD::BSWAP, MVT::v32i16, 1 },
2457  { ISD::CTLZ, MVT::v8i64, 23 },
2458  { ISD::CTLZ, MVT::v16i32, 22 },
2459  { ISD::CTLZ, MVT::v32i16, 18 },
2460  { ISD::CTLZ, MVT::v64i8, 17 },
2461  { ISD::CTPOP, MVT::v8i64, 7 },
2462  { ISD::CTPOP, MVT::v16i32, 11 },
2463  { ISD::CTPOP, MVT::v32i16, 9 },
2464  { ISD::CTPOP, MVT::v64i8, 6 },
2465  { ISD::CTTZ, MVT::v8i64, 10 },
2466  { ISD::CTTZ, MVT::v16i32, 14 },
2467  { ISD::CTTZ, MVT::v32i16, 12 },
2468  { ISD::CTTZ, MVT::v64i8, 9 },
2469  { ISD::SADDSAT, MVT::v32i16, 1 },
2470  { ISD::SADDSAT, MVT::v64i8, 1 },
2471  { ISD::SMAX, MVT::v32i16, 1 },
2472  { ISD::SMAX, MVT::v64i8, 1 },
2473  { ISD::SMIN, MVT::v32i16, 1 },
2474  { ISD::SMIN, MVT::v64i8, 1 },
2475  { ISD::SSUBSAT, MVT::v32i16, 1 },
2476  { ISD::SSUBSAT, MVT::v64i8, 1 },
2477  { ISD::UADDSAT, MVT::v32i16, 1 },
2478  { ISD::UADDSAT, MVT::v64i8, 1 },
2479  { ISD::UMAX, MVT::v32i16, 1 },
2480  { ISD::UMAX, MVT::v64i8, 1 },
2481  { ISD::UMIN, MVT::v32i16, 1 },
2482  { ISD::UMIN, MVT::v64i8, 1 },
2483  { ISD::USUBSAT, MVT::v32i16, 1 },
2484  { ISD::USUBSAT, MVT::v64i8, 1 },
2485  };
2486  static const CostTblEntry AVX512CostTbl[] = {
2487  { ISD::ABS, MVT::v8i64, 1 },
2488  { ISD::ABS, MVT::v16i32, 1 },
2489  { ISD::ABS, MVT::v32i16, 2 }, // FIXME: include split
2490  { ISD::ABS, MVT::v64i8, 2 }, // FIXME: include split
2491  { ISD::ABS, MVT::v4i64, 1 },
2492  { ISD::ABS, MVT::v2i64, 1 },
2493  { ISD::BITREVERSE, MVT::v8i64, 36 },
2494  { ISD::BITREVERSE, MVT::v16i32, 24 },
2495  { ISD::BITREVERSE, MVT::v32i16, 10 },
2496  { ISD::BITREVERSE, MVT::v64i8, 10 },
2497  { ISD::BSWAP, MVT::v8i64, 4 },
2498  { ISD::BSWAP, MVT::v16i32, 4 },
2499  { ISD::BSWAP, MVT::v32i16, 4 },
2500  { ISD::CTLZ, MVT::v8i64, 29 },
2501  { ISD::CTLZ, MVT::v16i32, 35 },
2502  { ISD::CTLZ, MVT::v32i16, 28 },
2503  { ISD::CTLZ, MVT::v64i8, 18 },
2504  { ISD::CTPOP, MVT::v8i64, 16 },
2505  { ISD::CTPOP, MVT::v16i32, 24 },
2506  { ISD::CTPOP, MVT::v32i16, 18 },
2507  { ISD::CTPOP, MVT::v64i8, 12 },
2508  { ISD::CTTZ, MVT::v8i64, 20 },
2509  { ISD::CTTZ, MVT::v16i32, 28 },
2510  { ISD::CTTZ, MVT::v32i16, 24 },
2511  { ISD::CTTZ, MVT::v64i8, 18 },
2512  { ISD::SMAX, MVT::v8i64, 1 },
2513  { ISD::SMAX, MVT::v16i32, 1 },
2514  { ISD::SMAX, MVT::v32i16, 2 }, // FIXME: include split
2515  { ISD::SMAX, MVT::v64i8, 2 }, // FIXME: include split
2516  { ISD::SMAX, MVT::v4i64, 1 },
2517  { ISD::SMAX, MVT::v2i64, 1 },
2518  { ISD::SMIN, MVT::v8i64, 1 },
2519  { ISD::SMIN, MVT::v16i32, 1 },
2520  { ISD::SMIN, MVT::v32i16, 2 }, // FIXME: include split
2521  { ISD::SMIN, MVT::v64i8, 2 }, // FIXME: include split
2522  { ISD::SMIN, MVT::v4i64, 1 },
2523  { ISD::SMIN, MVT::v2i64, 1 },
2524  { ISD::UMAX, MVT::v8i64, 1 },
2525  { ISD::UMAX, MVT::v16i32, 1 },
2526  { ISD::UMAX, MVT::v32i16, 2 }, // FIXME: include split
2527  { ISD::UMAX, MVT::v64i8, 2 }, // FIXME: include split
2528  { ISD::UMAX, MVT::v4i64, 1 },
2529  { ISD::UMAX, MVT::v2i64, 1 },
2530  { ISD::UMIN, MVT::v8i64, 1 },
2531  { ISD::UMIN, MVT::v16i32, 1 },
2532  { ISD::UMIN, MVT::v32i16, 2 }, // FIXME: include split
2533  { ISD::UMIN, MVT::v64i8, 2 }, // FIXME: include split
2534  { ISD::UMIN, MVT::v4i64, 1 },
2535  { ISD::UMIN, MVT::v2i64, 1 },
2536  { ISD::USUBSAT, MVT::v16i32, 2 }, // pmaxud + psubd
2537  { ISD::USUBSAT, MVT::v2i64, 2 }, // pmaxuq + psubq
2538  { ISD::USUBSAT, MVT::v4i64, 2 }, // pmaxuq + psubq
2539  { ISD::USUBSAT, MVT::v8i64, 2 }, // pmaxuq + psubq
2540  { ISD::UADDSAT, MVT::v16i32, 3 }, // not + pminud + paddd
2541  { ISD::UADDSAT, MVT::v2i64, 3 }, // not + pminuq + paddq
2542  { ISD::UADDSAT, MVT::v4i64, 3 }, // not + pminuq + paddq
2543  { ISD::UADDSAT, MVT::v8i64, 3 }, // not + pminuq + paddq
2544  { ISD::SADDSAT, MVT::v32i16, 2 }, // FIXME: include split
2545  { ISD::SADDSAT, MVT::v64i8, 2 }, // FIXME: include split
2546  { ISD::SSUBSAT, MVT::v32i16, 2 }, // FIXME: include split
2547  { ISD::SSUBSAT, MVT::v64i8, 2 }, // FIXME: include split
2548  { ISD::UADDSAT, MVT::v32i16, 2 }, // FIXME: include split
2549  { ISD::UADDSAT, MVT::v64i8, 2 }, // FIXME: include split
2550  { ISD::USUBSAT, MVT::v32i16, 2 }, // FIXME: include split
2551  { ISD::USUBSAT, MVT::v64i8, 2 }, // FIXME: include split
2552  { ISD::FMAXNUM, MVT::f32, 2 },
2553  { ISD::FMAXNUM, MVT::v4f32, 2 },
2554  { ISD::FMAXNUM, MVT::v8f32, 2 },
2555  { ISD::FMAXNUM, MVT::v16f32, 2 },
2556  { ISD::FMAXNUM, MVT::f64, 2 },
2557  { ISD::FMAXNUM, MVT::v2f64, 2 },
2558  { ISD::FMAXNUM, MVT::v4f64, 2 },
2559  { ISD::FMAXNUM, MVT::v8f64, 2 },
2560  };
2561  static const CostTblEntry XOPCostTbl[] = {
2562  { ISD::BITREVERSE, MVT::v4i64, 4 },
2563  { ISD::BITREVERSE, MVT::v8i32, 4 },
2564  { ISD::BITREVERSE, MVT::v16i16, 4 },
2565  { ISD::BITREVERSE, MVT::v32i8, 4 },
2566  { ISD::BITREVERSE, MVT::v2i64, 1 },
2567  { ISD::BITREVERSE, MVT::v4i32, 1 },
2568  { ISD::BITREVERSE, MVT::v8i16, 1 },
2569  { ISD::BITREVERSE, MVT::v16i8, 1 },
2570  { ISD::BITREVERSE, MVT::i64, 3 },
2571  { ISD::BITREVERSE, MVT::i32, 3 },
2572  { ISD::BITREVERSE, MVT::i16, 3 },
2573  { ISD::BITREVERSE, MVT::i8, 3 }
2574  };
2575  static const CostTblEntry AVX2CostTbl[] = {
2576  { ISD::ABS, MVT::v4i64, 2 }, // VBLENDVPD(X,VPSUBQ(0,X),X)
2577  { ISD::ABS, MVT::v8i32, 1 },
2578  { ISD::ABS, MVT::v16i16, 1 },
2579  { ISD::ABS, MVT::v32i8, 1 },
2580  { ISD::BITREVERSE, MVT::v4i64, 5 },
2581  { ISD::BITREVERSE, MVT::v8i32, 5 },
2582  { ISD::BITREVERSE, MVT::v16i16, 5 },
2583  { ISD::BITREVERSE, MVT::v32i8, 5 },
2584  { ISD::BSWAP, MVT::v4i64, 1 },
2585  { ISD::BSWAP, MVT::v8i32, 1 },
2586  { ISD::BSWAP, MVT::v16i16, 1 },
2587  { ISD::CTLZ, MVT::v4i64, 23 },
2588  { ISD::CTLZ, MVT::v8i32, 18 },
2589  { ISD::CTLZ, MVT::v16i16, 14 },
2590  { ISD::CTLZ, MVT::v32i8, 9 },
2591  { ISD::CTPOP, MVT::v4i64, 7 },
2592  { ISD::CTPOP, MVT::v8i32, 11 },
2593  { ISD::CTPOP, MVT::v16i16, 9 },
2594  { ISD::CTPOP, MVT::v32i8, 6 },
2595  { ISD::CTTZ, MVT::v4i64, 10 },
2596  { ISD::CTTZ, MVT::v8i32, 14 },
2597  { ISD::CTTZ, MVT::v16i16, 12 },
2598  { ISD::CTTZ, MVT::v32i8, 9 },
2599  { ISD::SADDSAT, MVT::v16i16, 1 },
2600  { ISD::SADDSAT, MVT::v32i8, 1 },
2601  { ISD::SMAX, MVT::v8i32, 1 },
2602  { ISD::SMAX, MVT::v16i16, 1 },
2603  { ISD::SMAX, MVT::v32i8, 1 },
2604  { ISD::SMIN, MVT::v8i32, 1 },
2605  { ISD::SMIN, MVT::v16i16, 1 },
2606  { ISD::SMIN, MVT::v32i8, 1 },
2607  { ISD::SSUBSAT, MVT::v16i16, 1 },
2608  { ISD::SSUBSAT, MVT::v32i8, 1 },
2609  { ISD::UADDSAT, MVT::v16i16, 1 },
2610  { ISD::UADDSAT, MVT::v32i8, 1 },
2611  { ISD::UADDSAT, MVT::v8i32, 3 }, // not + pminud + paddd
2612  { ISD::UMAX, MVT::v8i32, 1 },
2613  { ISD::UMAX, MVT::v16i16, 1 },
2614  { ISD::UMAX, MVT::v32i8, 1 },
2615  { ISD::UMIN, MVT::v8i32, 1 },
2616  { ISD::UMIN, MVT::v16i16, 1 },
2617  { ISD::UMIN, MVT::v32i8, 1 },
2618  { ISD::USUBSAT, MVT::v16i16, 1 },
2619  { ISD::USUBSAT, MVT::v32i8, 1 },
2620  { ISD::USUBSAT, MVT::v8i32, 2 }, // pmaxud + psubd
2621  { ISD::FMAXNUM, MVT::v8f32, 3 }, // MAXPS + CMPUNORDPS + BLENDVPS
2622  { ISD::FMAXNUM, MVT::v4f64, 3 }, // MAXPD + CMPUNORDPD + BLENDVPD
2623  { ISD::FSQRT, MVT::f32, 7 }, // Haswell from http://www.agner.org/
2624  { ISD::FSQRT, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
2625  { ISD::FSQRT, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
2626  { ISD::FSQRT, MVT::f64, 14 }, // Haswell from http://www.agner.org/
2627  { ISD::FSQRT, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
2628  { ISD::FSQRT, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
2629  };
2630  static const CostTblEntry AVX1CostTbl[] = {
2631  { ISD::ABS, MVT::v4i64, 5 }, // VBLENDVPD(X,VPSUBQ(0,X),X)
2632  { ISD::ABS, MVT::v8i32, 3 },
2633  { ISD::ABS, MVT::v16i16, 3 },
2634  { ISD::ABS, MVT::v32i8, 3 },
2635  { ISD::BITREVERSE, MVT::v4i64, 12 }, // 2 x 128-bit Op + extract/insert
2636  { ISD::BITREVERSE, MVT::v8i32, 12 }, // 2 x 128-bit Op + extract/insert
2637  { ISD::BITREVERSE, MVT::v16i16, 12 }, // 2 x 128-bit Op + extract/insert
2638  { ISD::BITREVERSE, MVT::v32i8, 12 }, // 2 x 128-bit Op + extract/insert
2639  { ISD::BSWAP, MVT::v4i64, 4 },
2640  { ISD::BSWAP, MVT::v8i32, 4 },
2641  { ISD::BSWAP, MVT::v16i16, 4 },
2642  { ISD::CTLZ, MVT::v4i64, 48 }, // 2 x 128-bit Op + extract/insert
2643  { ISD::CTLZ, MVT::v8i32, 38 }, // 2 x 128-bit Op + extract/insert
2644  { ISD::CTLZ, MVT::v16i16, 30 }, // 2 x 128-bit Op + extract/insert
2645  { ISD::CTLZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
2646  { ISD::CTPOP, MVT::v4i64, 16 }, // 2 x 128-bit Op + extract/insert
2647  { ISD::CTPOP, MVT::v8i32, 24 }, // 2 x 128-bit Op + extract/insert
2648  { ISD::CTPOP, MVT::v16i16, 20 }, // 2 x 128-bit Op + extract/insert
2649  { ISD::CTPOP, MVT::v32i8, 14 }, // 2 x 128-bit Op + extract/insert
2650  { ISD::CTTZ, MVT::v4i64, 22 }, // 2 x 128-bit Op + extract/insert
2651  { ISD::CTTZ, MVT::v8i32, 30 }, // 2 x 128-bit Op + extract/insert
2652  { ISD::CTTZ, MVT::v16i16, 26 }, // 2 x 128-bit Op + extract/insert
2653  { ISD::CTTZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
2654  { ISD::SADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2655  { ISD::SADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2656  { ISD::SMAX, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
2657  { ISD::SMAX, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2658  { ISD::SMAX, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2659  { ISD::SMIN, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
2660  { ISD::SMIN, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2661  { ISD::SMIN, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2662  { ISD::SSUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2663  { ISD::SSUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2664  { ISD::UADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2665  { ISD::UADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2666  { ISD::UADDSAT, MVT::v8i32, 8 }, // 2 x 128-bit Op + extract/insert
2667  { ISD::UMAX, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
2668  { ISD::UMAX, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2669  { ISD::UMAX, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2670  { ISD::UMIN, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
2671  { ISD::UMIN, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2672  { ISD::UMIN, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2673  { ISD::USUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2674  { ISD::USUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2675  { ISD::USUBSAT, MVT::v8i32, 6 }, // 2 x 128-bit Op + extract/insert
2676  { ISD::FMAXNUM, MVT::f32, 3 }, // MAXSS + CMPUNORDSS + BLENDVPS
2677  { ISD::FMAXNUM, MVT::v4f32, 3 }, // MAXPS + CMPUNORDPS + BLENDVPS
2678  { ISD::FMAXNUM, MVT::v8f32, 5 }, // MAXPS + CMPUNORDPS + BLENDVPS + ?
2679  { ISD::FMAXNUM, MVT::f64, 3 }, // MAXSD + CMPUNORDSD + BLENDVPD
2680  { ISD::FMAXNUM, MVT::v2f64, 3 }, // MAXPD + CMPUNORDPD + BLENDVPD
2681  { ISD::FMAXNUM, MVT::v4f64, 5 }, // MAXPD + CMPUNORDPD + BLENDVPD + ?
2682  { ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/
2683  { ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
2684  { ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
2685  { ISD::FSQRT, MVT::f64, 21 }, // SNB from http://www.agner.org/
2686  { ISD::FSQRT, MVT::v2f64, 21 }, // SNB from http://www.agner.org/
2687  { ISD::FSQRT, MVT::v4f64, 43 }, // SNB from http://www.agner.org/
2688  };
2689  static const CostTblEntry GLMCostTbl[] = {
2690  { ISD::FSQRT, MVT::f32, 19 }, // sqrtss
2691  { ISD::FSQRT, MVT::v4f32, 37 }, // sqrtps
2692  { ISD::FSQRT, MVT::f64, 34 }, // sqrtsd
2693  { ISD::FSQRT, MVT::v2f64, 67 }, // sqrtpd
2694  };
2695  static const CostTblEntry SLMCostTbl[] = {
2696  { ISD::FSQRT, MVT::f32, 20 }, // sqrtss
2697  { ISD::FSQRT, MVT::v4f32, 40 }, // sqrtps
2698  { ISD::FSQRT, MVT::f64, 35 }, // sqrtsd
2699  { ISD::FSQRT, MVT::v2f64, 70 }, // sqrtpd
2700  };
2701  static const CostTblEntry SSE42CostTbl[] = {
2702  { ISD::USUBSAT, MVT::v4i32, 2 }, // pmaxud + psubd
2703  { ISD::UADDSAT, MVT::v4i32, 3 }, // not + pminud + paddd
2704  { ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/
2705  { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
2706  };
2707  static const CostTblEntry SSE41CostTbl[] = {
2708  { ISD::ABS, MVT::v2i64, 2 }, // BLENDVPD(X,PSUBQ(0,X),X)
2709  { ISD::SMAX, MVT::v4i32, 1 },
2710  { ISD::SMAX, MVT::v16i8, 1 },
2711  { ISD::SMIN, MVT::v4i32, 1 },
2712  { ISD::SMIN, MVT::v16i8, 1 },
2713  { ISD::UMAX, MVT::v4i32, 1 },
2714  { ISD::UMAX, MVT::v8i16, 1 },
2715  { ISD::UMIN, MVT::v4i32, 1 },
2716  { ISD::UMIN, MVT::v8i16, 1 },
2717  };
2718  static const CostTblEntry SSSE3CostTbl[] = {
2719  { ISD::ABS, MVT::v4i32, 1 },
2720  { ISD::ABS, MVT::v8i16, 1 },
2721  { ISD::ABS, MVT::v16i8, 1 },
2722  { ISD::BITREVERSE, MVT::v2i64, 5 },
2723  { ISD::BITREVERSE, MVT::v4i32, 5 },
2724  { ISD::BITREVERSE, MVT::v8i16, 5 },
2725  { ISD::BITREVERSE, MVT::v16i8, 5 },
2726  { ISD::BSWAP, MVT::v2i64, 1 },
2727  { ISD::BSWAP, MVT::v4i32, 1 },
2728  { ISD::BSWAP, MVT::v8i16, 1 },
2729  { ISD::CTLZ, MVT::v2i64, 23 },
2730  { ISD::CTLZ, MVT::v4i32, 18 },
2731  { ISD::CTLZ, MVT::v8i16, 14 },
2732  { ISD::CTLZ, MVT::v16i8, 9 },
2733  { ISD::CTPOP, MVT::v2i64, 7 },
2734  { ISD::CTPOP, MVT::v4i32, 11 },
2735  { ISD::CTPOP, MVT::v8i16, 9 },
2736  { ISD::CTPOP, MVT::v16i8, 6 },
2737  { ISD::CTTZ, MVT::v2i64, 10 },
2738  { ISD::CTTZ, MVT::v4i32, 14 },
2739  { ISD::CTTZ, MVT::v8i16, 12 },
2740  { ISD::CTTZ, MVT::v16i8, 9 }
2741  };
2742  static const CostTblEntry SSE2CostTbl[] = {
2743  { ISD::ABS, MVT::v2i64, 4 },
2744  { ISD::ABS, MVT::v4i32, 3 },
2745  { ISD::ABS, MVT::v8i16, 2 },
2746  { ISD::ABS, MVT::v16i8, 2 },
2747  { ISD::BITREVERSE, MVT::v2i64, 29 },
2748  { ISD::BITREVERSE, MVT::v4i32, 27 },
2749  { ISD::BITREVERSE, MVT::v8i16, 27 },
2750  { ISD::BITREVERSE, MVT::v16i8, 20 },
2751  { ISD::BSWAP, MVT::v2i64, 7 },
2752  { ISD::BSWAP, MVT::v4i32, 7 },
2753  { ISD::BSWAP, MVT::v8i16, 7 },
2754  { ISD::CTLZ, MVT::v2i64, 25 },
2755  { ISD::CTLZ, MVT::v4i32, 26 },
2756  { ISD::CTLZ, MVT::v8i16, 20 },
2757  { ISD::CTLZ, MVT::v16i8, 17 },
2758  { ISD::CTPOP, MVT::v2i64, 12 },
2759  { ISD::CTPOP, MVT::v4i32, 15 },
2760  { ISD::CTPOP, MVT::v8i16, 13 },
2761  { ISD::CTPOP, MVT::v16i8, 10 },
2762  { ISD::CTTZ, MVT::v2i64, 14 },
2763  { ISD::CTTZ, MVT::v4i32, 18 },
2764  { ISD::CTTZ, MVT::v8i16, 16 },
2765  { ISD::CTTZ, MVT::v16i8, 13 },
2766  { ISD::SADDSAT, MVT::v8i16, 1 },
2767  { ISD::SADDSAT, MVT::v16i8, 1 },
2768  { ISD::SMAX, MVT::v8i16, 1 },
2769  { ISD::SMIN, MVT::v8i16, 1 },
2770  { ISD::SSUBSAT, MVT::v8i16, 1 },
2771  { ISD::SSUBSAT, MVT::v16i8, 1 },
2772  { ISD::UADDSAT, MVT::v8i16, 1 },
2773  { ISD::UADDSAT, MVT::v16i8, 1 },
2774  { ISD::UMAX, MVT::v8i16, 2 },
2775  { ISD::UMAX, MVT::v16i8, 1 },
2776  { ISD::UMIN, MVT::v8i16, 2 },
2777  { ISD::UMIN, MVT::v16i8, 1 },
2778  { ISD::USUBSAT, MVT::v8i16, 1 },
2779  { ISD::USUBSAT, MVT::v16i8, 1 },
2780  { ISD::FMAXNUM, MVT::f64, 4 },
2781  { ISD::FMAXNUM, MVT::v2f64, 4 },
2782  { ISD::FSQRT, MVT::f64, 32 }, // Nehalem from http://www.agner.org/
2783  { ISD::FSQRT, MVT::v2f64, 32 }, // Nehalem from http://www.agner.org/
2784  };
2785  static const CostTblEntry SSE1CostTbl[] = {
2786  { ISD::FMAXNUM, MVT::f32, 4 },
2787  { ISD::FMAXNUM, MVT::v4f32, 4 },
2788  { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/
2789  { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
2790  };
2791  static const CostTblEntry BMI64CostTbl[] = { // 64-bit targets
2792  { ISD::CTTZ, MVT::i64, 1 },
2793  };
2794  static const CostTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
2795  { ISD::CTTZ, MVT::i32, 1 },
2796  { ISD::CTTZ, MVT::i16, 1 },
2797  { ISD::CTTZ, MVT::i8, 1 },
2798  };
2799  static const CostTblEntry LZCNT64CostTbl[] = { // 64-bit targets
2800  { ISD::CTLZ, MVT::i64, 1 },
2801  };
2802  static const CostTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
2803  { ISD::CTLZ, MVT::i32, 1 },
2804  { ISD::CTLZ, MVT::i16, 1 },
2805  { ISD::CTLZ, MVT::i8, 1 },
2806  };
2807  static const CostTblEntry POPCNT64CostTbl[] = { // 64-bit targets
2808  { ISD::CTPOP, MVT::i64, 1 },
2809  };
2810  static const CostTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
2811  { ISD::CTPOP, MVT::i32, 1 },
2812  { ISD::CTPOP, MVT::i16, 1 },
2813  { ISD::CTPOP, MVT::i8, 1 },
2814  };
2815  static const CostTblEntry X64CostTbl[] = { // 64-bit targets
2816  { ISD::ABS, MVT::i64, 2 }, // SUB+CMOV
2817  { ISD::BITREVERSE, MVT::i64, 14 },
2818  { ISD::BSWAP, MVT::i64, 1 },
2819  { ISD::CTLZ, MVT::i64, 4 }, // BSR+XOR or BSR+XOR+CMOV
2820  { ISD::CTTZ, MVT::i64, 3 }, // TEST+BSF+CMOV/BRANCH
2821  { ISD::CTPOP, MVT::i64, 10 },
2822  { ISD::SADDO, MVT::i64, 1 },
2823  { ISD::UADDO, MVT::i64, 1 },
2824  { ISD::UMULO, MVT::i64, 2 }, // mulq + seto
2825  };
2826  static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
2827  { ISD::ABS, MVT::i32, 2 }, // SUB+CMOV
2828  { ISD::ABS, MVT::i16, 2 }, // SUB+CMOV
2829  { ISD::BITREVERSE, MVT::i32, 14 },
2830  { ISD::BITREVERSE, MVT::i16, 14 },
2831  { ISD::BITREVERSE, MVT::i8, 11 },
2832  { ISD::BSWAP, MVT::i32, 1 },
2833  { ISD::BSWAP, MVT::i16, 1 }, // ROL
2834  { ISD::CTLZ, MVT::i32, 4 }, // BSR+XOR or BSR+XOR+CMOV
2835  { ISD::CTLZ, MVT::i16, 4 }, // BSR+XOR or BSR+XOR+CMOV
2836  { ISD::CTLZ, MVT::i8, 4 }, // BSR+XOR or BSR+XOR+CMOV
2837  { ISD::CTTZ, MVT::i32, 3 }, // TEST+BSF+CMOV/BRANCH
2838  { ISD::CTTZ, MVT::i16, 3 }, // TEST+BSF+CMOV/BRANCH
2839  { ISD::CTTZ, MVT::i8, 3 }, // TEST+BSF+CMOV/BRANCH
2840  { ISD::CTPOP, MVT::i32, 8 },
2841  { ISD::CTPOP, MVT::i16, 9 },
2842  { ISD::CTPOP, MVT::i8, 7 },
2843  { ISD::SADDO, MVT::i32, 1 },
2844  { ISD::SADDO, MVT::i16, 1 },
2845  { ISD::SADDO, MVT::i8, 1 },
2846  { ISD::UADDO, MVT::i32, 1 },
2847  { ISD::UADDO, MVT::i16, 1 },
2848  { ISD::UADDO, MVT::i8, 1 },
2849  { ISD::UMULO, MVT::i32, 2 }, // mul + seto
2850  { ISD::UMULO, MVT::i16, 2 },
2851  { ISD::UMULO, MVT::i8, 2 },
2852  };
2853 
2854  Type *RetTy = ICA.getReturnType();
2855  Type *OpTy = RetTy;
2856  Intrinsic::ID IID = ICA.getID();
2857  unsigned ISD = ISD::DELETED_NODE;
2858  switch (IID) {
2859  default:
2860  break;
2861  case Intrinsic::abs:
2862  ISD = ISD::ABS;
2863  break;
2864  case Intrinsic::bitreverse:
2865  ISD = ISD::BITREVERSE;
2866  break;
2867  case Intrinsic::bswap:
2868  ISD = ISD::BSWAP;
2869  break;
2870  case Intrinsic::ctlz:
2871  ISD = ISD::CTLZ;
2872  break;
2873  case Intrinsic::ctpop:
2874  ISD = ISD::CTPOP;
2875  break;
2876  case Intrinsic::cttz:
2877  ISD = ISD::CTTZ;
2878  break;
2879  case Intrinsic::maxnum:
2880  case Intrinsic::minnum:
2881  // FMINNUM has the same costs, so don't duplicate.
2882  ISD = ISD::FMAXNUM;
2883  break;
2884  case Intrinsic::sadd_sat:
2885  ISD = ISD::SADDSAT;
2886  break;
2887  case Intrinsic::smax:
2888  ISD = ISD::SMAX;
2889  break;
2890  case Intrinsic::smin:
2891  ISD = ISD::SMIN;
2892  break;
2893  case Intrinsic::ssub_sat:
2894  ISD = ISD::SSUBSAT;
2895  break;
2896  case Intrinsic::uadd_sat:
2897  ISD = ISD::UADDSAT;
2898  break;
2899  case Intrinsic::umax:
2900  ISD = ISD::UMAX;
2901  break;
2902  case Intrinsic::umin:
2903  ISD = ISD::UMIN;
2904  break;
2905  case Intrinsic::usub_sat:
2906  ISD = ISD::USUBSAT;
2907  break;
2908  case Intrinsic::sqrt:
2909  ISD = ISD::FSQRT;
2910  break;
2911  case Intrinsic::sadd_with_overflow:
2912  case Intrinsic::ssub_with_overflow:
2913  // SSUBO has the same costs, so don't duplicate.
2914  ISD = ISD::SADDO;
2915  OpTy = RetTy->getContainedType(0);
2916  break;
2917  case Intrinsic::uadd_with_overflow:
2918  case Intrinsic::usub_with_overflow:
2919  // USUBO has the same costs, so don't duplicate.
2920  ISD = ISD::UADDO;
2921  OpTy = RetTy->getContainedType(0);
2922  break;
2923  case Intrinsic::umul_with_overflow:
2924  case Intrinsic::smul_with_overflow:
2925  // SMULO has the same costs, so don't duplicate.
2926  ISD = ISD::UMULO;
2927  OpTy = RetTy->getContainedType(0);
2928  break;
2929  }
2930 
2931  if (ISD != ISD::DELETED_NODE) {
2932  // Legalize the type.
2933  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, OpTy);
2934  MVT MTy = LT.second;
2935 
2936  // Attempt to lookup cost.
2937  if (ISD == ISD::BITREVERSE && ST->hasGFNI() && ST->hasSSSE3() &&
2938  MTy.isVector()) {
2939  // The GFNI codegen is very similar for all types: for vXi8 we just need a
2940  // GF2P8AFFINEQB (which reverses the bits within each byte). For wider
2941  // element types we also need a PSHUFB to reverse the bytes of each element.
2942  unsigned Cost = MTy.getVectorElementType() == MVT::i8 ? 1 : 2;
2943 
2944  // Without byte operations, we need twice as many GF2P8AFFINEQB and PSHUFB
2945  // instructions. We also need an extract and an insert.
2946  if (!(MTy.is128BitVector() || (ST->hasAVX2() && MTy.is256BitVector()) ||
2947  (ST->hasBWI() && MTy.is512BitVector())))
2948  Cost = Cost * 2 + 2;
2949 
2950  return LT.first * Cost;
2951  }
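// Worked examples for the GFNI path: a v16i8 bitreverse is a single
// GF2P8AFFINEQB (Cost == 1); v4i32 needs the extra PSHUFB (Cost == 2); and a
// 256-bit type on a target without AVX2 doubles the sequence and adds the
// extract/insert pair, e.g. v32i8 costs 1 * 2 + 2 == 4 per legalized vector.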
2952 
2953  auto adjustTableCost = [](const CostTblEntry &Entry,
2954  InstructionCost LegalizationCost,
2955  FastMathFlags FMF) {
2956  // If there are no NANs to deal with, then these are reduced to a
2957  // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we
2958  // assume is used in the non-fast case.
2959  if (Entry.ISD == ISD::FMAXNUM || Entry.ISD == ISD::FMINNUM) {
2960  if (FMF.noNaNs())
2961  return LegalizationCost * 1;
2962  }
2963  return LegalizationCost * (int)Entry.Cost;
2964  };
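// For example, a @llvm.maxnum.v4f32 call carrying the nnan fast-math flag on
// an AVX1 target is charged LT.first * 1 (a single MAXPS), while the same
// call without nnan pays the full FMAXNUM table cost of 3
// (MAXPS + CMPUNORDPS + BLENDVPS).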
2965 
2966  if (ST->useGLMDivSqrtCosts())
2967  if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
2968  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2969 
2970  if (ST->isSLM())
2971  if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
2972  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2973 
2974  if (ST->hasCDI())
2975  if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
2976  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2977 
2978  if (ST->hasBWI())
2979  if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
2980  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2981 
2982  if (ST->hasAVX512())
2983  if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
2984  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2985 
2986  if (ST->hasXOP())
2987  if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
2988  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2989 
2990  if (ST->hasAVX2())
2991  if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
2992  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2993 
2994  if (ST->hasAVX())
2995  if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
2996  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2997 
2998  if (ST->hasSSE42())
2999  if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
3000  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3001 
3002  if (ST->hasSSE41())
3003  if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
3004  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3005 
3006  if (ST->hasSSSE3())
3007  if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
3008  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3009 
3010  if (ST->hasSSE2())
3011  if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
3012  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3013 
3014  if (ST->hasSSE1())
3015  if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
3016  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3017 
3018  if (ST->hasBMI()) {
3019  if (ST->is64Bit())
3020  if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
3021  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3022 
3023  if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
3024  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3025  }
3026 
3027  if (ST->hasLZCNT()) {
3028  if (ST->is64Bit())
3029  if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
3030  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3031 
3032  if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
3033  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3034  }
3035 
3036  if (ST->hasPOPCNT()) {
3037  if (ST->is64Bit())
3038  if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
3039  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3040 
3041  if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
3042  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3043  }
3044 
3045  if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) {
3046  if (const Instruction *II = ICA.getInst()) {
3047  if (II->hasOneUse() && isa<StoreInst>(II->user_back()))
3048  return TTI::TCC_Free;
3049  if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) {
3050  if (LI->hasOneUse())
3051  return TTI::TCC_Free;
3052  }
3053  }
3054  }
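// For example, on a fast-MOVBE target this IR pair is costed as free, since
// the byte swap folds into a single MOVBE store:
//
//   %b = call i64 @llvm.bswap.i64(i64 %x)
//   store i64 %b, i64* %p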
3055 
3056  // TODO - add BMI (TZCNT) scalar handling
3057 
3058  if (ST->is64Bit())
3059  if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
3060  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3061 
3062  if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
3063  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3064  }
3065 
3066  return BaseT::getTypeBasedIntrinsicInstrCost(ICA, CostKind);
3067 }
3068 
3069 InstructionCost
3070 X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
3071                                   TTI::TargetCostKind CostKind) {
3072  if (ICA.isTypeBasedOnly())
3073    return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
3074 
3075  static const CostTblEntry AVX512CostTbl[] = {
3076  { ISD::ROTL, MVT::v8i64, 1 },
3077  { ISD::ROTL, MVT::v4i64, 1 },
3078  { ISD::ROTL, MVT::v2i64, 1 },
3079  { ISD::ROTL, MVT::v16i32, 1 },
3080  { ISD::ROTL, MVT::v8i32, 1 },
3081  { ISD::ROTL, MVT::v4i32, 1 },
3082  { ISD::ROTR, MVT::v8i64, 1 },
3083  { ISD::ROTR, MVT::v4i64, 1 },
3084  { ISD::ROTR, MVT::v2i64, 1 },
3085  { ISD::ROTR, MVT::v16i32, 1 },
3086  { ISD::ROTR, MVT::v8i32, 1 },
3087  { ISD::ROTR, MVT::v4i32, 1 }
3088  };
3089  // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
3090  static const CostTblEntry XOPCostTbl[] = {
3091  { ISD::ROTL, MVT::v4i64, 4 },
3092  { ISD::ROTL, MVT::v8i32, 4 },
3093  { ISD::ROTL, MVT::v16i16, 4 },
3094  { ISD::ROTL, MVT::v32i8, 4 },
3095  { ISD::ROTL, MVT::v2i64, 1 },
3096  { ISD::ROTL, MVT::v4i32, 1 },
3097  { ISD::ROTL, MVT::v8i16, 1 },
3098  { ISD::ROTL, MVT::v16i8, 1 },
3099  { ISD::ROTR, MVT::v4i64, 6 },
3100  { ISD::ROTR, MVT::v8i32, 6 },
3101  { ISD::ROTR, MVT::v16i16, 6 },
3102  { ISD::ROTR, MVT::v32i8, 6 },
3103  { ISD::ROTR, MVT::v2i64, 2 },
3104  { ISD::ROTR, MVT::v4i32, 2 },
3105  { ISD::ROTR, MVT::v8i16, 2 },
3106  { ISD::ROTR, MVT::v16i8, 2 }
3107  };
3108  static const CostTblEntry X64CostTbl[] = { // 64-bit targets
3109  { ISD::ROTL, MVT::i64, 1 },
3110  { ISD::ROTR, MVT::i64, 1 },
3111  { ISD::FSHL, MVT::i64, 4 }
3112  };
3113  static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
3114  { ISD::ROTL, MVT::i32, 1 },
3115  { ISD::ROTL, MVT::i16, 1 },
3116  { ISD::ROTL, MVT::i8, 1 },
3117  { ISD::ROTR, MVT::i32, 1 },
3118  { ISD::ROTR, MVT::i16, 1 },
3119  { ISD::ROTR, MVT::i8, 1 },
3120  { ISD::FSHL, MVT::i32, 4 },
3121  { ISD::FSHL, MVT::i16, 4 },
3122  { ISD::FSHL, MVT::i8, 4 }
3123  };
3124 
3125  Intrinsic::ID IID = ICA.getID();
3126  Type *RetTy = ICA.getReturnType();
3127  const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
3128  unsigned ISD = ISD::DELETED_NODE;
3129  switch (IID) {
3130  default:
3131  break;
3132  case Intrinsic::fshl:
3133  ISD = ISD::FSHL;
3134  if (Args[0] == Args[1])
3135  ISD = ISD::ROTL;
3136  break;
3137  case Intrinsic::fshr:
3138  // FSHR has the same costs, so don't duplicate.
3139  ISD = ISD::FSHL;
3140  if (Args[0] == Args[1])
3141  ISD = ISD::ROTR;
3142  break;
3143  }
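// The operand test above recognizes the rotate idiom: a funnel shift whose
// two value operands are the same SSA value, e.g.
//
//   %r = call i32 @llvm.fshl.i32(i32 %x, i32 %x, i32 %s)
//
// is a rotate-left and is priced from the cheaper ROTL rows rather than the
// generic FSHL rows.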
3144 
3145  if (ISD != ISD::DELETED_NODE) {
3146  // Legalize the type.
3147  std::pair<InstructionCost, MVT> LT =
3148  TLI->getTypeLegalizationCost(DL, RetTy);
3149  MVT MTy = LT.second;
3150 
3151  // Attempt to lookup cost.
3152  if (ST->hasAVX512())
3153  if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
3154  return LT.first * Entry->Cost;
3155 
3156  if (ST->hasXOP())
3157  if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
3158  return LT.first * Entry->Cost;
3159 
3160  if (ST->is64Bit())
3161  if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
3162  return LT.first * Entry->Cost;
3163 
3164  if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
3165  return LT.first * Entry->Cost;
3166  }
3167 
3168  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
3169 }
3170 
3171 InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
3172  unsigned Index) {
3173  static const CostTblEntry SLMCostTbl[] = {
3174  { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 },
3175  { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 },
3176  { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 },
3177  { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 7 }
3178  };
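// These elevated costs presumably reflect that element extraction is a
// multi-uop operation on Silvermont (with i64 extracts splitting further);
// treat the exact numbers as tuning values rather than measured latencies.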
3179 
3180  assert(Val->isVectorTy() && "This must be a vector type");
3181  Type *ScalarType = Val->getScalarType();
3182  int RegisterFileMoveCost = 0;
3183 
3184  if (Index != -1U && (Opcode == Instruction::ExtractElement ||
3185  Opcode == Instruction::InsertElement)) {
3186  // Legalize the type.
3187  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
3188 
3189  // This type is legalized to a scalar type.
3190  if (!LT.second.isVector())
3191  return 0;
3192 
3193  // The type may be split. Normalize the index to the new type.
3194  unsigned NumElts = LT.second.getVectorNumElements();
3195  unsigned SubNumElts = NumElts;
3196  Index = Index % NumElts;
3197 
3198  // For >128-bit vectors, we need to extract higher 128-bit subvectors.
3199  // For inserts, we also need to insert the subvector back.
3200  if (LT.second.getSizeInBits() > 128) {
3201  assert((LT.second.getSizeInBits() % 128) == 0 && "Illegal vector");
3202  unsigned NumSubVecs = LT.second.getSizeInBits() / 128;
3203  SubNumElts = NumElts / NumSubVecs;
3204  if (SubNumElts <= Index) {
3205  RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
3206  Index %= SubNumElts;
3207  }
3208  }
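// Worked example (illustrative): extracting index 5 from a v8i32 on AVX
// gives NumSubVecs = 2 and SubNumElts = 4, so the element lives in the
// upper 128-bit subvector; we pay one subvector extract in
// RegisterFileMoveCost and renormalize Index to 1.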
3209 
3210  if (Index == 0) {
3211  // Floating point scalars are already located in index #0.
3212  // Many insertions to #0 can fold away for scalar fp-ops, so assume this
3213  // holds for all of them.
3214  if (ScalarType->isFloatingPointTy())
3215  return RegisterFileMoveCost;
3216 
3217  // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
3218  if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
3219  return 1 + RegisterFileMoveCost;
3220  }
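// E.g. extracting lane 0 of a v4f32 is free (the scalar already sits in
// the low lane of the XMM register), while lane 0 of a v4i32 still pays
// the single movd to a GPR costed just above.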
3221 
3222  int ISD = TLI->InstructionOpcodeToISD(Opcode);
3223  assert(ISD && "Unexpected vector opcode");
3224  MVT MScalarTy = LT.second.getScalarType();
3225  if (ST->isSLM())
3226  if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
3227  return Entry->Cost + RegisterFileMoveCost;
3228 
3229  // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
3230  if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
3231  (MScalarTy.isInteger() && ST->hasSSE41()))
3232  return 1 + RegisterFileMoveCost;
3233 
3234  // Assume insertps is relatively cheap on all targets.
3235  if (MScalarTy == MVT::f32 && ST->hasSSE41() &&
3236  Opcode == Instruction::InsertElement)
3237  return 1 + RegisterFileMoveCost;
3238 
3239  // For extractions we just need to shuffle the element to index 0, which
3240  // should be very cheap (assume cost = 1). For insertions we need to shuffle
3241  // the elements to their destination. In both cases we must handle the
3242  // subvector move(s).
3243  // If the vector type is already less than 128-bits then don't reduce it.
3244  // TODO: Under what circumstances should we shuffle using the full width?
3245  InstructionCost ShuffleCost = 1;
3246  if (Opcode == Instruction::InsertElement) {
3247  auto *SubTy = cast<VectorType>(Val);
3248  EVT VT = TLI->getValueType(DL, Val);
3249  if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
3250  SubTy = FixedVectorType::get(ScalarType, SubNumElts);
3251  ShuffleCost =
3252  getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, None, 0, SubTy);
3253  }
3254  int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
3255  return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
3256  }
3257 
3258  // Add to the base cost if we know that the extracted element of a vector is
3259  // destined to be moved to and used in the integer register file.
3260  if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy())
3261  RegisterFileMoveCost += 1;
3262 
3263  return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
3264 }
3265 
3266 InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
3267  const APInt &DemandedElts,
3268  bool Insert,
3269  bool Extract) {
3270  InstructionCost Cost = 0;
3271 
3272  // For insertions, an ISD::BUILD_VECTOR-style vector initialization can be much
3273  // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT.
3274  if (Insert) {
3275  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
3276  MVT MScalarTy = LT.second.getScalarType();
3277 
3278  if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
3279  (MScalarTy.isInteger() && ST->hasSSE41()) ||
3280  (MScalarTy == MVT::f32 && ST->hasSSE41())) {
3281  // For types we can insert directly, insertion into 128-bit subvectors is
3282  // cheap, followed by a cheap chain of concatenations.
3283  if (LT.second.getSizeInBits() <= 128) {
3284  Cost +=
3285  BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, false);
3286  } else {
3287  // In each 128-bit lane, if at least one index is demanded but not all
3288  // indices are demanded, and this 128-lane is not the first 128-lane of
3289  // the legalized vector, then this 128-lane needs an extracti128; if in
3290  // each 128-lane there is at least one demanded index, this 128-lane
3291  // needs an inserti128.
3292 
3293  // The following cases illustrate this. Assume we insert several elements
3294  // into a v8i32 vector with AVX2:
3295  // Case#1: inserting at index 1 needs vpinsrd + inserti128.
3296  // Case#2: inserting at index 5 needs extracti128 + vpinsrd +
3297  // inserti128.
3298  // Case#3: inserting at indices 4,5,6,7 needs 4*vpinsrd + inserti128.
3299  const int CostValue = *LT.first.getValue();
3300  assert(CostValue >= 0 && "Negative cost!");
3301  unsigned Num128Lanes = LT.second.getSizeInBits() / 128 * CostValue;
3302  unsigned NumElts = LT.second.getVectorNumElements() * CostValue;
3303  APInt WidenedDemandedElts = DemandedElts.zextOrSelf(NumElts);
3304  unsigned Scale = NumElts / Num128Lanes;
3305  // We iterate each 128-lane, and check if we need a
3306  // extracti128/inserti128 for this 128-lane.
3307  for (unsigned I = 0; I < NumElts; I += Scale) {
3308  APInt Mask = WidenedDemandedElts.getBitsSet(NumElts, I, I + Scale);
3309  APInt MaskedDE = Mask & WidenedDemandedElts;
3310  unsigned Population = MaskedDE.countPopulation();
3311  Cost += (Population > 0 && Population != Scale &&
3312  I % LT.second.getVectorNumElements() != 0);
3313  Cost += Population > 0;
3314  }
3315  Cost += DemandedElts.countPopulation();
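// Working Case#2 above through the loop (illustrative): only index 5 is
// demanded, so lane 1 pays the extracti128 (Population != Scale and not
// the first lane) plus the inserti128, and countPopulation() adds the one
// vpinsrd - a total cost of 3.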
3316 
3317  // For vXf32 cases, insertion into the 0th index of each v4f32
3318  // 128-bit vector is free.
3319  // NOTE: This assumes legalization widens vXf32 vectors.
3320  if (MScalarTy == MVT::f32)
3321  for (unsigned i = 0, e = cast<FixedVectorType>(Ty)->getNumElements();
3322  i < e; i += 4)
3323  if (DemandedElts[i])
3324  Cost--;
3325  }
3326  } else if (LT.second.isVector()) {
3327  // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
3328  // integer element as a SCALAR_TO_VECTOR, then we build the vector as a
3329  // series of UNPCK followed by CONCAT_VECTORS - all of these can be
3330  // considered cheap.
3331  if (Ty->isIntOrIntVectorTy())
3332  Cost += DemandedElts.countPopulation();
3333 
3334  // Get the smaller of the legalized or original pow2-extended number of
3335  // vector elements, which represents the number of unpacks we'll end up
3336  // performing.
3337  unsigned NumElts = LT.second.getVectorNumElements();
3338  unsigned Pow2Elts =
3339  PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements());
3340  Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
3341  }
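// E.g. (illustrative) building a v4i32 from four GPRs on plain SSE2 this
// way costs 4 movd (countPopulation) plus min(4, 4) - 1 = 3 unpack/concat
// steps, i.e. 7 in total.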
3342  }
3343 
3344  // TODO: Use default extraction for now, but we should investigate extending this
3345  // to handle repeated subvector extraction.
3346  if (Extract)
3347  Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, false, Extract);
3348 
3349  return Cost;
3350 }
3351 
3352 InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
3353  MaybeAlign Alignment,
3354  unsigned AddressSpace,
3355  TTI::TargetCostKind CostKind,
3356  const Instruction *I) {
3357  // TODO: Handle other cost kinds.
3358  if (CostKind != TTI::TCK_RecipThroughput) {
3359  if (auto *SI = dyn_cast_or_null<StoreInst>(I)) {
3360  // A store instruction with index and scale addressing costs 2 uops.
3361  // Check the preceding GEP to identify non-const indices.
3362  if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) {
3363  if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
3364  return TTI::TCC_Basic * 2;
3365  }
3366  }
3367  return TTI::TCC_Basic;
3368  }
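// E.g. (illustrative) a store lowered to something like
//   mov dword ptr [rax + 4*rcx], ebx
// (its GEP has a variable index) is costed at 2 * TCC_Basic above, while a
// store to a constant-offset address stays at TCC_Basic.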
3369 
3370  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
3371  "Invalid Opcode");
3372  // Type legalization can't handle structs
3373  if (TLI->getValueType(DL, Src, true) == MVT::Other)
3374  return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
3375  CostKind);
3376 
3377  // Legalize the type.
3378  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
3379 
3380  auto *VTy = dyn_cast<FixedVectorType>(Src);
3381 
3382  // Handle the simple case of non-vectors.
3383  // NOTE: this assumes that legalization never creates a vector from scalars!
3384  if (!VTy || !LT.second.isVector())
3385  // Each load/store unit costs 1.
3386  return LT.first * 1;
3387 
3388  bool IsLoad = Opcode == Instruction::Load;
3389 
3390  Type *EltTy = VTy->getElementType();
3391 
3392  const int EltTyBits = DL.getTypeSizeInBits(EltTy);
3393 
3394  InstructionCost Cost = 0;
3395 
3396  // Source of truth: how many elements were there in the original IR vector?
3397  const unsigned SrcNumElt = VTy->getNumElements();
3398 
3399  // How far have we gotten?
3400  int NumEltRemaining = SrcNumElt;
3401  // Note that we intentionally capture by reference, since NumEltRemaining changes.
3402  auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; };
3403 
3404  const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8);
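// E.g. (illustrative) a v32i8 load on plain SSE2 legalizes to 2 x v16i8,
// so LT.first is 2 and MaxLegalOpSizeBytes is 16.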
3405 
3406  // Note that even if we can store 64 bits of an XMM, we still operate on XMM.
3407  const unsigned XMMBits = 128;
3408  if (XMMBits % EltTyBits != 0)
3409  // Vector size must be a multiple of the element size. I.e. no padding.
3410  return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
3411  CostKind);
3412  const int NumEltPerXMM = XMMBits / EltTyBits;
3413 
3414  auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM);
3415 
3416  for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0;
3417  NumEltRemaining > 0; CurrOpSizeBytes /= 2) {
3418  // How many elements would a single op deal with at once?
3419  if ((8 * CurrOpSizeBytes) % EltTyBits != 0)
3420  // Vector size must be a multiple of the element size. I.e. no padding.
3421  return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
3422  CostKind);
3423  int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits;
3424 
3425  assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?");
3426  assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||
3427  (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&
3428  "Unless we haven't halved the op size yet, "