//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//
/// About the cost model numbers used below it's necessary to say the
/// following: the numbers correspond to some "generic" X86 CPU instead of a
/// concrete CPU model. Usually the numbers correspond to the CPU where the
/// feature first appeared. For example, if we do Subtarget.hasSSE42() in
/// the lookups below the cost is based on Nehalem as that was the first CPU
/// to support that feature level and thus has most likely the worst case cost.
/// Some examples of other technologies/CPUs:
///   SSE 3   - Pentium4 / Athlon64
///   SSE 4.1 - Penryn
///   SSE 4.2 - Nehalem
///   AVX     - Sandy Bridge
///   AVX2    - Haswell
///   AVX-512 - Xeon Phi / Skylake
/// And some examples of instruction target dependent costs (latency):
///                   divss   sqrtss   rsqrtss
///   AMD K7          11-16   19       3
///   Piledriver      9-24    13-15    5
///   Jaguar          14      16       2
///   Pentium II,III  18      30       2
///   Nehalem         7-14    7-18     3
///   Haswell         10-13   11       5
/// TODO: Develop and implement the target dependent cost model and
/// specialize cost numbers for different Cost Model Targets such as throughput,
/// code size, latency and uop count.
//===----------------------------------------------------------------------===//

#include "X86TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"

using namespace llvm;

#define DEBUG_TYPE "x86tti"

//===----------------------------------------------------------------------===//
//
// X86 cost model.
//
//===----------------------------------------------------------------------===//

TargetTransformInfo::PopcntSupportKind
X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  // TODO: Currently the __builtin_popcount() implementation using SSE3
  //   instructions is inefficient. Once the problem is fixed, we should
  //   call ST->hasSSE3() instead of ST->hasPOPCNT().
  return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
}

llvm::Optional<unsigned> X86TTIImpl::getCacheSize(
    TargetTransformInfo::CacheLevel Level) const {
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 32 * 1024; //  32 KByte
  case TargetTransformInfo::CacheLevel::L2D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 256 * 1024; // 256 KByte
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

llvm::Optional<unsigned> X86TTIImpl::getCacheAssociativity(
    TargetTransformInfo::CacheLevel Level) const {
  //   - Penryn
  //   - Nehalem
  //   - Westmere
  //   - Sandy Bridge
  //   - Ivy Bridge
  //   - Haswell
  //   - Broadwell
  //   - Skylake
  //   - Kabylake
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    LLVM_FALLTHROUGH;
  case TargetTransformInfo::CacheLevel::L2D:
    return 8;
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
  bool Vector = (ClassID == 1);
  if (Vector && !ST->hasSSE1())
    return 0;

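  // e.g. a 64-bit target with AVX-512 reports 32 vector registers
  // (zmm0-zmm31), other 64-bit targets report 16, and 32-bit mode is
  // limited to 8 registers of either class.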
  if (ST->is64Bit()) {
    if (Vector && ST->hasAVX512())
      return 32;
    return 16;
  }
  return 8;
}

unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) const {
  unsigned PreferVectorWidth = ST->getPreferVectorWidth();
  if (Vector) {
    if (ST->hasAVX512() && PreferVectorWidth >= 512)
      return 512;
    if (ST->hasAVX() && PreferVectorWidth >= 256)
      return 256;
    if (ST->hasSSE1() && PreferVectorWidth >= 128)
      return 128;
    return 0;
  }

  if (ST->is64Bit())
    return 64;

  return 32;
}

unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
  return getRegisterBitWidth(true);
}

unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // If the loop will not be vectorized, don't interleave the loop.
  // Let the regular unroller unroll the loop instead, which saves the
  // overflow check and memory check cost.
  if (VF == 1)
    return 1;

  if (ST->isAtom())
    return 1;

  // Sandybridge and Haswell have multiple execution ports and pipelined
  // vector units.
  if (ST->hasAVX())
    return 4;

  return 2;
}

int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
                                       TTI::TargetCostKind CostKind,
                                       TTI::OperandValueKind Op1Info,
                                       TTI::OperandValueKind Op2Info,
                                       TTI::OperandValueProperties Opd1PropInfo,
                                       TTI::OperandValueProperties Opd2PropInfo,
                                       ArrayRef<const Value *> Args,
                                       const Instruction *CxtI) {
  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info, Opd1PropInfo,
                                         Opd2PropInfo, Args, CxtI);
  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
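  // LT.first is the number of legalized operations the IR op splits into;
  // LT.second is the legalized type. Table hits below are scaled by
  // LT.first, e.g. (illustrative, from the AVX1 table further down) an
  // <8 x float> FDIV on AVX1 legalizes to one v8f32 op costing 1 * 28,
  // while a <16 x float> FDIV splits into two halves, costing 2 * 28 = 56.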

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  static const CostTblEntry GLMCostTable[] = {
    { ISD::FDIV, MVT::f32,   18 }, // divss
    { ISD::FDIV, MVT::v4f32, 35 }, // divps
    { ISD::FDIV, MVT::f64,   33 }, // divsd
    { ISD::FDIV, MVT::v2f64, 65 }, // divpd
  };

  if (ST->useGLMDivSqrtCosts())
    if (const auto *Entry = CostTableLookup(GLMCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SLMCostTable[] = {
    { ISD::MUL,  MVT::v4i32, 11 }, // pmulld
    { ISD::MUL,  MVT::v8i16, 2  }, // pmullw
    { ISD::MUL,  MVT::v16i8, 14 }, // extend/pmullw/trunc sequence.
    { ISD::FMUL, MVT::f64,   2  }, // mulsd
    { ISD::FMUL, MVT::v2f64, 4  }, // mulpd
    { ISD::FMUL, MVT::v4f32, 2  }, // mulps
    { ISD::FDIV, MVT::f32,   17 }, // divss
    { ISD::FDIV, MVT::v4f32, 39 }, // divps
    { ISD::FDIV, MVT::f64,   32 }, // divsd
    { ISD::FDIV, MVT::v2f64, 69 }, // divpd
    { ISD::FADD, MVT::v2f64, 2  }, // addpd
    { ISD::FSUB, MVT::v2f64, 2  }, // subpd
    // v2i64/v4i64 mul is custom lowered as a series of long:
    // multiplies(3), shifts(3) and adds(2)
    // slm muldq version throughput is 2 and addq throughput 4
    // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
    // 3X4 (addq throughput) = 17
    { ISD::MUL,  MVT::v2i64, 17 },
    // slm addq\subq throughput is 4
    { ISD::ADD,  MVT::v2i64, 4  },
    { ISD::SUB,  MVT::v2i64, 4  },
  };

  if (ST->isSLM()) {
    if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) {
      // Check if the operands can be shrunk into a smaller datatype.
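      // e.g. a v4i32 multiply whose operands are both zero-extended from
      // i8 has OpMinSize == 8 in unsigned mode, so it is costed below as
      // a pmullw/zext sequence (LT.first * 3).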
      bool Op1Signed = false;
      unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
      bool Op2Signed = false;
      unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);

      bool SignedMode = Op1Signed || Op2Signed;
      unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);

      if (OpMinSize <= 7)
        return LT.first * 3; // pmullw/sext
      if (!SignedMode && OpMinSize <= 8)
        return LT.first * 3; // pmullw/zext
      if (OpMinSize <= 15)
        return LT.first * 5; // pmullw/pmulhw/pshuf
      if (!SignedMode && OpMinSize <= 16)
        return LT.first * 5; // pmullw/pmulhw/pshuf
    }

    if (const auto *Entry = CostTableLookup(SLMCostTable, ISD,
                                            LT.second)) {
      return LT.first * Entry->Cost;
    }
  }

  if ((ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
       ISD == ISD::UREM) &&
      (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
    if (ISD == ISD::SDIV || ISD == ISD::SREM) {
      // On X86, vector signed division by a power-of-two constant is
      // normally expanded to the sequence SRA + SRL + ADD + SRA.
      // The OperandValue properties may not be the same as that of the previous
      // operation; conservatively assume OP_None.
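      // e.g. a v4i32 SREM by a uniform power of two therefore totals
      // 2*AShr + LShr + Add + Mul + Sub, each piece costed recursively.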
      int Cost =
          2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, Op1Info,
                                     Op2Info, TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);
      Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info,
                                     Op2Info, TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);
      Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind, Op1Info,
                                     Op2Info, TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);

      if (ISD == ISD::SREM) {
        // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
        Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info,
                                       Op2Info);
        Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info,
                                       Op2Info);
      }

      return Cost;
    }

    // Vector unsigned division/remainder will be simplified to shifts/masks.
    if (ISD == ISD::UDIV)
      return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
                                    Op1Info, Op2Info,
                                    TargetTransformInfo::OP_None,
                                    TargetTransformInfo::OP_None);

    else // UREM
      return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
                                    Op1Info, Op2Info,
                                    TargetTransformInfo::OP_None,
                                    TargetTransformInfo::OP_None);
  }

  static const CostTblEntry AVX512BWUniformConstCostTable[] = {
    { ISD::SHL, MVT::v64i8, 2 }, // psllw + pand.
    { ISD::SRL, MVT::v64i8, 2 }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8, 4 }, // psrlw, pand, pxor, psubb.
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasBWI()) {
    if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512UniformConstCostTable[] = {
    { ISD::SRA,  MVT::v2i64,  1 },
    { ISD::SRA,  MVT::v4i64,  1 },
    { ISD::SRA,  MVT::v8i64,  1 },

    { ISD::SHL,  MVT::v64i8,  4 }, // psllw + pand.
    { ISD::SRL,  MVT::v64i8,  4 }, // psrlw + pand.
    { ISD::SRA,  MVT::v64i8,  8 }, // psrlw, pand, pxor, psubb.

    { ISD::SDIV, MVT::v16i32, 6 }, // pmuludq sequence
    { ISD::SREM, MVT::v16i32, 8 }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, 5 }, // pmuludq sequence
    { ISD::UREM, MVT::v16i32, 7 }, // pmuludq+mul+sub sequence
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasAVX512()) {
    if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX2UniformConstCostTable[] = {
    { ISD::SHL,  MVT::v32i8, 2 }, // psllw + pand.
    { ISD::SRL,  MVT::v32i8, 2 }, // psrlw + pand.
    { ISD::SRA,  MVT::v32i8, 4 }, // psrlw, pand, pxor, psubb.

    { ISD::SRA,  MVT::v4i64, 4 }, // 2 x psrad + shuffle.

    { ISD::SDIV, MVT::v8i32, 6 }, // pmuludq sequence
    { ISD::SREM, MVT::v8i32, 8 }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32, 5 }, // pmuludq sequence
    { ISD::UREM, MVT::v8i32, 7 }, // pmuludq+mul+sub sequence
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasAVX2()) {
    if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2UniformConstCostTable[] = {
    { ISD::SHL,  MVT::v16i8,  2    }, // psllw + pand.
    { ISD::SRL,  MVT::v16i8,  2    }, // psrlw + pand.
    { ISD::SRA,  MVT::v16i8,  4    }, // psrlw, pand, pxor, psubb.

    { ISD::SHL,  MVT::v32i8,  4+2  }, // 2*(psllw + pand) + split.
    { ISD::SRL,  MVT::v32i8,  4+2  }, // 2*(psrlw + pand) + split.
    { ISD::SRA,  MVT::v32i8,  8+2  }, // 2*(psrlw, pand, pxor, psubb) + split.

    { ISD::SDIV, MVT::v8i32,  12+2 }, // 2*pmuludq sequence + split.
    { ISD::SREM, MVT::v8i32,  16+2 }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::SDIV, MVT::v4i32,  6    }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32,  8    }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,  10+2 }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32,  14+2 }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::UDIV, MVT::v4i32,  5    }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32,  7    }, // pmuludq+mul+sub sequence
  };

  // XOP has faster vXi8 shifts.
  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasSSE2() && !ST->hasXOP()) {
    if (const auto *Entry =
            CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512BWConstCostTable[] = {
    { ISD::SDIV, MVT::v64i8,  14 }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v64i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8,  14 }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v64i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::SDIV, MVT::v32i16, 6  }, // vpmulhw sequence
    { ISD::SREM, MVT::v32i16, 8  }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16, 6  }, // vpmulhuw sequence
    { ISD::UREM, MVT::v32i16, 8  }, // vpmulhuw+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasBWI()) {
    if (const auto *Entry =
            CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512ConstCostTable[] = {
    { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
    { ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
    { ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence
    { ISD::SDIV, MVT::v64i8,  28 }, // 4*ext+4*pmulhw sequence
    { ISD::SREM, MVT::v64i8,  32 }, // 4*ext+4*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8,  28 }, // 4*ext+4*pmulhw sequence
    { ISD::UREM, MVT::v64i8,  32 }, // 4*ext+4*pmulhw+mul+sub sequence
    { ISD::SDIV, MVT::v32i16, 12 }, // 2*vpmulhw sequence
    { ISD::SREM, MVT::v32i16, 16 }, // 2*vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16, 12 }, // 2*vpmulhuw sequence
    { ISD::UREM, MVT::v32i16, 16 }, // 2*vpmulhuw+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasAVX512()) {
    if (const auto *Entry =
            CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX2ConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8,  14 }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v32i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i8,  14 }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v32i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::SDIV, MVT::v16i16, 6  }, // vpmulhw sequence
    { ISD::SREM, MVT::v16i16, 8  }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i16, 6  }, // vpmulhuw sequence
    { ISD::UREM, MVT::v16i16, 8  }, // vpmulhuw+mul+sub sequence
    { ISD::SDIV, MVT::v8i32,  15 }, // vpmuldq sequence
    { ISD::SREM, MVT::v8i32,  19 }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,  15 }, // vpmuludq sequence
    { ISD::UREM, MVT::v8i32,  19 }, // vpmuludq+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasAVX2()) {
    if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2ConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8,  28+2 }, // 4*ext+4*pmulhw sequence + split.
    { ISD::SREM, MVT::v32i8,  32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
    { ISD::SDIV, MVT::v16i8,  14   }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v16i8,  16   }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i8,  28+2 }, // 4*ext+4*pmulhw sequence + split.
    { ISD::UREM, MVT::v32i8,  32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v16i8,  14   }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v16i8,  16   }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
    { ISD::SREM, MVT::v16i16, 16+2 }, // 2*pmulhw+mul+sub sequence + split.
    { ISD::SDIV, MVT::v8i16,  6    }, // pmulhw sequence
    { ISD::SREM, MVT::v8i16,  8    }, // pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split.
    { ISD::UREM, MVT::v16i16, 16+2 }, // 2*pmulhuw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v8i16,  6    }, // pmulhuw sequence
    { ISD::UREM, MVT::v8i16,  8    }, // pmulhuw+mul+sub sequence
    { ISD::SDIV, MVT::v8i32,  38+2 }, // 2*pmuludq sequence + split.
    { ISD::SREM, MVT::v8i32,  48+2 }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::SDIV, MVT::v4i32,  19   }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32,  24   }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,  30+2 }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32,  40+2 }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::UDIV, MVT::v4i32,  15   }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32,  20   }, // pmuludq+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasSSE2()) {
    // pmuldq sequence.
    if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
      return LT.first * 32;
    if (ISD == ISD::SREM && LT.second == MVT::v8i32 && ST->hasAVX())
      return LT.first * 38;
    if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
      return LT.first * 15;
    if (ISD == ISD::SREM && LT.second == MVT::v4i32 && ST->hasSSE41())
      return LT.first * 20;

    if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512BWShiftCostTable[] = {
    { ISD::SHL, MVT::v8i16,  1 }, // vpsllvw
    { ISD::SRL, MVT::v8i16,  1 }, // vpsrlvw
    { ISD::SRA, MVT::v8i16,  1 }, // vpsravw

    { ISD::SHL, MVT::v16i16, 1 }, // vpsllvw
    { ISD::SRL, MVT::v16i16, 1 }, // vpsrlvw
    { ISD::SRA, MVT::v16i16, 1 }, // vpsravw

    { ISD::SHL, MVT::v32i16, 1 }, // vpsllvw
    { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw
    { ISD::SRA, MVT::v32i16, 1 }, // vpsravw
  };

  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWShiftCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i16, 1 }, // psllw.
    { ISD::SRL, MVT::v16i16, 1 }, // psrlw.
    { ISD::SRA, MVT::v16i16, 1 }, // psraw.
    { ISD::SHL, MVT::v32i16, 2 }, // 2*psllw.
    { ISD::SRL, MVT::v32i16, 2 }, // 2*psrlw.
    { ISD::SRA, MVT::v32i16, 2 }, // 2*psraw.
  };

  if (ST->hasAVX2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {
    if (const auto *Entry =
            CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v8i16, 1 }, // psllw.
    { ISD::SHL, MVT::v4i32, 1 }, // pslld
    { ISD::SHL, MVT::v2i64, 1 }, // psllq.

    { ISD::SRL, MVT::v8i16, 1 }, // psrlw.
    { ISD::SRL, MVT::v4i32, 1 }, // psrld.
    { ISD::SRL, MVT::v2i64, 1 }, // psrlq.

    { ISD::SRA, MVT::v8i16, 1 }, // psraw.
    { ISD::SRA, MVT::v4i32, 1 }, // psrad.
  };

  if (ST->hasSSE2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {
    if (const auto *Entry =
            CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512DQCostTable[] = {
    { ISD::MUL, MVT::v2i64, 1 },
    { ISD::MUL, MVT::v4i64, 1 },
    { ISD::MUL, MVT::v8i64, 1 }
  };

  // Look for AVX512DQ lowering tricks for custom cases.
  if (ST->hasDQI())
    if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512BWCostTable[] = {
    { ISD::SHL, MVT::v64i8,  11 }, // vpblendvb sequence.
    { ISD::SRL, MVT::v64i8,  11 }, // vpblendvb sequence.
    { ISD::SRA, MVT::v64i8,  24 }, // vpblendvb sequence.

    { ISD::MUL, MVT::v64i8,  11 }, // extend/pmullw/trunc sequence.
    { ISD::MUL, MVT::v32i8,  4  }, // extend/pmullw/trunc sequence.
    { ISD::MUL, MVT::v16i8,  4  }, // extend/pmullw/trunc sequence.
  };

  // Look for AVX512BW lowering tricks for custom cases.
  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512CostTable[] = {
    { ISD::SHL,  MVT::v16i32, 1 },
    { ISD::SRL,  MVT::v16i32, 1 },
    { ISD::SRA,  MVT::v16i32, 1 },

    { ISD::SHL,  MVT::v8i64,  1 },
    { ISD::SRL,  MVT::v8i64,  1 },

    { ISD::SRA,  MVT::v2i64,  1 },
    { ISD::SRA,  MVT::v4i64,  1 },
    { ISD::SRA,  MVT::v8i64,  1 },

    { ISD::MUL,  MVT::v64i8,  26 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,  MVT::v32i8,  13 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,  MVT::v16i8,  5  }, // extend/pmullw/trunc sequence.
    { ISD::MUL,  MVT::v16i32, 1  }, // pmulld (Skylake from agner.org)
    { ISD::MUL,  MVT::v8i32,  1  }, // pmulld (Skylake from agner.org)
    { ISD::MUL,  MVT::v4i32,  1  }, // pmulld (Skylake from agner.org)
    { ISD::MUL,  MVT::v8i64,  8  }, // 3*pmuludq/3*shift/2*add

    { ISD::FADD, MVT::v8f64,  1 }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v8f64,  1 }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v8f64,  1 }, // Skylake from http://www.agner.org/

    { ISD::FADD, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX2ShiftCostTable[] = {
    // Shifts on v4i64/v8i32 on AVX2 are legal even though we declare them
    // custom so we can detect the cases where the shift amount is a scalar.
    { ISD::SHL, MVT::v4i32, 1 },
    { ISD::SRL, MVT::v4i32, 1 },
    { ISD::SRA, MVT::v4i32, 1 },
    { ISD::SHL, MVT::v8i32, 1 },
    { ISD::SRL, MVT::v8i32, 1 },
    { ISD::SRA, MVT::v8i32, 1 },
    { ISD::SHL, MVT::v2i64, 1 },
    { ISD::SRL, MVT::v2i64, 1 },
    { ISD::SHL, MVT::v4i64, 1 },
    { ISD::SRL, MVT::v4i64, 1 },
  };

  if (ST->hasAVX512()) {
    if (ISD == ISD::SHL && LT.second == MVT::v32i16 &&
        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
      // On AVX512, a packed v32i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info, Op2Info,
                                    TargetTransformInfo::OP_None,
                                    TargetTransformInfo::OP_None);
  }

  // Look for AVX2 lowering tricks.
  if (ST->hasAVX2()) {
    if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
      // On AVX2, a packed v16i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info, Op2Info,
                                    TargetTransformInfo::OP_None,
                                    TargetTransformInfo::OP_None);

    if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry XOPShiftCostTable[] = {
    // 128bit shifts take 1cy, but right shifts require negation beforehand.
    { ISD::SHL, MVT::v16i8,  1   },
    { ISD::SRL, MVT::v16i8,  2   },
    { ISD::SRA, MVT::v16i8,  2   },
    { ISD::SHL, MVT::v8i16,  1   },
    { ISD::SRL, MVT::v8i16,  2   },
    { ISD::SRA, MVT::v8i16,  2   },
    { ISD::SHL, MVT::v4i32,  1   },
    { ISD::SRL, MVT::v4i32,  2   },
    { ISD::SRA, MVT::v4i32,  2   },
    { ISD::SHL, MVT::v2i64,  1   },
    { ISD::SRL, MVT::v2i64,  2   },
    { ISD::SRA, MVT::v2i64,  2   },
    // 256bit shifts require splitting if AVX2 didn't catch them above.
    { ISD::SHL, MVT::v32i8,  2+2 },
    { ISD::SRL, MVT::v32i8,  4+2 },
    { ISD::SRA, MVT::v32i8,  4+2 },
    { ISD::SHL, MVT::v16i16, 2+2 },
    { ISD::SRL, MVT::v16i16, 4+2 },
    { ISD::SRA, MVT::v16i16, 4+2 },
    { ISD::SHL, MVT::v8i32,  2+2 },
    { ISD::SRL, MVT::v8i32,  4+2 },
    { ISD::SRA, MVT::v8i32,  4+2 },
    { ISD::SHL, MVT::v4i64,  2+2 },
    { ISD::SRL, MVT::v4i64,  4+2 },
    { ISD::SRA, MVT::v4i64,  4+2 },
  };

  // Look for XOP lowering tricks.
  if (ST->hasXOP()) {
    // If the right shift is constant then we'll fold the negation so
    // it's as cheap as a left shift.
    int ShiftISD = ISD;
    if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) &&
        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
      ShiftISD = ISD::SHL;
    if (const auto *Entry =
            CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2UniformShiftCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i16, 2+2 }, // 2*psllw + split.
    { ISD::SHL, MVT::v8i32,  2+2 }, // 2*pslld + split.
    { ISD::SHL, MVT::v4i64,  2+2 }, // 2*psllq + split.

    { ISD::SRL, MVT::v16i16, 2+2 }, // 2*psrlw + split.
    { ISD::SRL, MVT::v8i32,  2+2 }, // 2*psrld + split.
    { ISD::SRL, MVT::v4i64,  2+2 }, // 2*psrlq + split.

    { ISD::SRA, MVT::v16i16, 2+2 }, // 2*psraw + split.
    { ISD::SRA, MVT::v8i32,  2+2 }, // 2*psrad + split.
    { ISD::SRA, MVT::v2i64,  4   }, // 2*psrad + shuffle.
    { ISD::SRA, MVT::v4i64,  8+2 }, // 2*(2*psrad + shuffle) + split.
  };

  if (ST->hasSSE2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {

    // Handle AVX2 uniform v4i64 ISD::SRA, it's not worth a table.
    if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2())
      return LT.first * 4; // 2*psrad + shuffle.

    if (const auto *Entry =
            CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  if (ISD == ISD::SHL &&
      Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
    MVT VT = LT.second;
    // A vector shift left by a non-uniform constant can be lowered into a
    // vector multiply.
    if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
        ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
      ISD = ISD::MUL;
  }
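  // e.g. on SSE2, a v8i16 shift left by the constant vector <1, 2, 3, ...>
  // is now costed as a v8i16 multiply, which the SSE2 table below prices
  // at a single pmullw.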

  static const CostTblEntry AVX2CostTable[] = {
    { ISD::SHL,  MVT::v32i8,  11 }, // vpblendvb sequence.
    { ISD::SHL,  MVT::v64i8,  22 }, // 2*vpblendvb sequence.
    { ISD::SHL,  MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
    { ISD::SHL,  MVT::v32i16, 20 }, // 2*extend/vpsrlvd/pack sequence.

    { ISD::SRL,  MVT::v32i8,  11 }, // vpblendvb sequence.
    { ISD::SRL,  MVT::v64i8,  22 }, // 2*vpblendvb sequence.
    { ISD::SRL,  MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
    { ISD::SRL,  MVT::v32i16, 20 }, // 2*extend/vpsrlvd/pack sequence.

    { ISD::SRA,  MVT::v32i8,  24 }, // vpblendvb sequence.
    { ISD::SRA,  MVT::v64i8,  48 }, // 2*vpblendvb sequence.
    { ISD::SRA,  MVT::v16i16, 10 }, // extend/vpsravd/pack sequence.
    { ISD::SRA,  MVT::v32i16, 20 }, // 2*extend/vpsravd/pack sequence.
    { ISD::SRA,  MVT::v2i64,  4  }, // srl/xor/sub sequence.
    { ISD::SRA,  MVT::v4i64,  4  }, // srl/xor/sub sequence.

    { ISD::SUB,  MVT::v32i8,  1  }, // psubb
    { ISD::ADD,  MVT::v32i8,  1  }, // paddb
    { ISD::SUB,  MVT::v16i16, 1  }, // psubw
    { ISD::ADD,  MVT::v16i16, 1  }, // paddw
    { ISD::SUB,  MVT::v8i32,  1  }, // psubd
    { ISD::ADD,  MVT::v8i32,  1  }, // paddd
    { ISD::SUB,  MVT::v4i64,  1  }, // psubq
    { ISD::ADD,  MVT::v4i64,  1  }, // paddq

    { ISD::MUL,  MVT::v32i8,  17 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,  MVT::v16i8,  7  }, // extend/pmullw/trunc sequence.
    { ISD::MUL,  MVT::v16i16, 1  }, // pmullw
    { ISD::MUL,  MVT::v8i32,  2  }, // pmulld (Haswell from agner.org)
    { ISD::MUL,  MVT::v4i64,  8  }, // 3*pmuludq/3*shift/2*add

    { ISD::FADD, MVT::v4f64,  1  }, // Haswell from http://www.agner.org/
    { ISD::FADD, MVT::v8f32,  1  }, // Haswell from http://www.agner.org/
    { ISD::FSUB, MVT::v4f64,  1  }, // Haswell from http://www.agner.org/
    { ISD::FSUB, MVT::v8f32,  1  }, // Haswell from http://www.agner.org/
    { ISD::FMUL, MVT::v4f64,  1  }, // Haswell from http://www.agner.org/
    { ISD::FMUL, MVT::v8f32,  1  }, // Haswell from http://www.agner.org/

    { ISD::FDIV, MVT::f32,    7  }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,  7  }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32,  14 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::f64,    14 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,  14 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64,  28 }, // Haswell from http://www.agner.org/
  };

  // Look for AVX2 lowering tricks for custom cases.
  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX1CostTable[] = {
    // We don't have to scalarize unsupported ops. We can issue two half-sized
    // operations and we only need to extract the upper YMM half.
    // Two ops + 1 extract + 1 insert = 4.
    { ISD::MUL,  MVT::v16i16, 4  },
    { ISD::MUL,  MVT::v8i32,  4  },
    { ISD::SUB,  MVT::v32i8,  4  },
    { ISD::ADD,  MVT::v32i8,  4  },
    { ISD::SUB,  MVT::v16i16, 4  },
    { ISD::ADD,  MVT::v16i16, 4  },
    { ISD::SUB,  MVT::v8i32,  4  },
    { ISD::ADD,  MVT::v8i32,  4  },
    { ISD::SUB,  MVT::v4i64,  4  },
    { ISD::ADD,  MVT::v4i64,  4  },

    // A v4i64 multiply is custom lowered as two split v2i64 vectors that then
    // are lowered as a series of long multiplies(3), shifts(3) and adds(2)
    // Because we believe v4i64 to be a legal type, we must also include the
    // extract+insert in the cost table. Therefore, the cost here is 18
    // instead of 8.
    { ISD::MUL,  MVT::v4i64,  18 },

    { ISD::MUL,  MVT::v32i8,  26 }, // extend/pmullw/trunc sequence.

    { ISD::FDIV, MVT::f32,    14 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,  14 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32,  28 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::f64,    22 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,  22 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64,  44 }, // SNB from http://www.agner.org/
  };

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE42CostTable[] = {
    { ISD::FADD, MVT::f64,   1  }, // Nehalem from http://www.agner.org/
    { ISD::FADD, MVT::f32,   1  }, // Nehalem from http://www.agner.org/
    { ISD::FADD, MVT::v2f64, 1  }, // Nehalem from http://www.agner.org/
    { ISD::FADD, MVT::v4f32, 1  }, // Nehalem from http://www.agner.org/

    { ISD::FSUB, MVT::f64,   1  }, // Nehalem from http://www.agner.org/
    { ISD::FSUB, MVT::f32,   1  }, // Nehalem from http://www.agner.org/
    { ISD::FSUB, MVT::v2f64, 1  }, // Nehalem from http://www.agner.org/
    { ISD::FSUB, MVT::v4f32, 1  }, // Nehalem from http://www.agner.org/

    { ISD::FMUL, MVT::f64,   1  }, // Nehalem from http://www.agner.org/
    { ISD::FMUL, MVT::f32,   1  }, // Nehalem from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64, 1  }, // Nehalem from http://www.agner.org/
    { ISD::FMUL, MVT::v4f32, 1  }, // Nehalem from http://www.agner.org/

    { ISD::FDIV, MVT::f32,   14 }, // Nehalem from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
    { ISD::FDIV, MVT::f64,   22 }, // Nehalem from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/
  };

  if (ST->hasSSE42())
    if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE41CostTable[] = {
    { ISD::SHL, MVT::v16i8,  11     }, // pblendvb sequence.
    { ISD::SHL, MVT::v32i8,  2*11+2 }, // pblendvb sequence + split.
    { ISD::SHL, MVT::v8i16,  14     }, // pblendvb sequence.
    { ISD::SHL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
    { ISD::SHL, MVT::v4i32,  4      }, // pslld/paddd/cvttps2dq/pmulld
    { ISD::SHL, MVT::v8i32,  2*4+2  }, // pslld/paddd/cvttps2dq/pmulld + split

    { ISD::SRL, MVT::v16i8,  12     }, // pblendvb sequence.
    { ISD::SRL, MVT::v32i8,  2*12+2 }, // pblendvb sequence + split.
    { ISD::SRL, MVT::v8i16,  14     }, // pblendvb sequence.
    { ISD::SRL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
    { ISD::SRL, MVT::v4i32,  11     }, // Shift each lane + blend.
    { ISD::SRL, MVT::v8i32,  2*11+2 }, // Shift each lane + blend + split.

    { ISD::SRA, MVT::v16i8,  24     }, // pblendvb sequence.
    { ISD::SRA, MVT::v32i8,  2*24+2 }, // pblendvb sequence + split.
    { ISD::SRA, MVT::v8i16,  14     }, // pblendvb sequence.
    { ISD::SRA, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
    { ISD::SRA, MVT::v4i32,  12     }, // Shift each lane + blend.
    { ISD::SRA, MVT::v8i32,  2*12+2 }, // Shift each lane + blend + split.

    { ISD::MUL, MVT::v4i32,  2      }  // pmulld (Nehalem from agner.org)
  };

  if (ST->hasSSE41())
    if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE2CostTable[] = {
    // We don't correctly identify costs of casts because they are marked as
    // custom.
    { ISD::SHL,  MVT::v16i8,  26     }, // cmpgtb sequence.
    { ISD::SHL,  MVT::v8i16,  32     }, // cmpgtb sequence.
    { ISD::SHL,  MVT::v4i32,  2*5    }, // We optimized this using mul.
    { ISD::SHL,  MVT::v2i64,  4      }, // splat+shuffle sequence.
    { ISD::SHL,  MVT::v4i64,  2*4+2  }, // splat+shuffle sequence + split.

    { ISD::SRL,  MVT::v16i8,  26     }, // cmpgtb sequence.
    { ISD::SRL,  MVT::v8i16,  32     }, // cmpgtb sequence.
    { ISD::SRL,  MVT::v4i32,  16     }, // Shift each lane + blend.
    { ISD::SRL,  MVT::v2i64,  4      }, // splat+shuffle sequence.
    { ISD::SRL,  MVT::v4i64,  2*4+2  }, // splat+shuffle sequence + split.

    { ISD::SRA,  MVT::v16i8,  54     }, // unpacked cmpgtb sequence.
    { ISD::SRA,  MVT::v8i16,  32     }, // cmpgtb sequence.
    { ISD::SRA,  MVT::v4i32,  16     }, // Shift each lane + blend.
    { ISD::SRA,  MVT::v2i64,  12     }, // srl/xor/sub sequence.
    { ISD::SRA,  MVT::v4i64,  2*12+2 }, // srl/xor/sub sequence+split.

    { ISD::MUL,  MVT::v16i8,  12     }, // extend/pmullw/trunc sequence.
    { ISD::MUL,  MVT::v8i16,  1      }, // pmullw
    { ISD::MUL,  MVT::v4i32,  6      }, // 3*pmuludq/4*shuffle
    { ISD::MUL,  MVT::v2i64,  8      }, // 3*pmuludq/3*shift/2*add

    { ISD::FDIV, MVT::f32,    23     }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,  39     }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::f64,    38     }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,  69     }, // Pentium IV from http://www.agner.org/

    { ISD::FADD, MVT::f32,    2      }, // Pentium IV from http://www.agner.org/
    { ISD::FADD, MVT::f64,    2      }, // Pentium IV from http://www.agner.org/

    { ISD::FSUB, MVT::f32,    2      }, // Pentium IV from http://www.agner.org/
    { ISD::FSUB, MVT::f64,    2      }, // Pentium IV from http://www.agner.org/
  };

  if (ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE1CostTable[] = {
    { ISD::FDIV, MVT::f32,   17 }, // Pentium III from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/

    { ISD::FADD, MVT::f32,   1  }, // Pentium III from http://www.agner.org/
    { ISD::FADD, MVT::v4f32, 2  }, // Pentium III from http://www.agner.org/

    { ISD::FSUB, MVT::f32,   1  }, // Pentium III from http://www.agner.org/
    { ISD::FSUB, MVT::v4f32, 2  }, // Pentium III from http://www.agner.org/

    { ISD::ADD,  MVT::i8,    1  }, // Pentium III from http://www.agner.org/
    { ISD::ADD,  MVT::i16,   1  }, // Pentium III from http://www.agner.org/
    { ISD::ADD,  MVT::i32,   1  }, // Pentium III from http://www.agner.org/

    { ISD::SUB,  MVT::i8,    1  }, // Pentium III from http://www.agner.org/
    { ISD::SUB,  MVT::i16,   1  }, // Pentium III from http://www.agner.org/
    { ISD::SUB,  MVT::i32,   1  }, // Pentium III from http://www.agner.org/
  };

  if (ST->hasSSE1())
    if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  // It is not a good idea to vectorize division. We have to scalarize it and
  // in the process we will often end up having to spill regular
  // registers. The overhead of division is going to dominate most kernels
  // anyways, so try hard to prevent vectorization of division - it is
  // generally a bad idea. Assume somewhat arbitrarily that we have to be able
  // to hide "20 cycles" for each lane.
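  // e.g. a v4i32 UDIV by a non-constant divisor is charged
  // 20 * LT.first * 4 * ScalarCost below, which effectively forbids
  // vectorizing it.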
  if (LT.second.isVector() && (ISD == ISD::SDIV || ISD == ISD::SREM ||
                               ISD == ISD::UDIV || ISD == ISD::UREM)) {
    int ScalarCost = getArithmeticInstrCost(
        Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info,
        TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
    return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
  }

  // Fallback to the default implementation.
  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info);
}

int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *BaseTp,
                               int Index, VectorType *SubTp) {
  // 64-bit packed float vectors (v2f32) are widened to type v4f32.
  // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, BaseTp);

  // Treat Transpose as 2-op shuffles - there's no difference in lowering.
  if (Kind == TTI::SK_Transpose)
    Kind = TTI::SK_PermuteTwoSrc;

  // For Broadcasts we are splatting the first element from the first input
  // register, so only need to reference that input and all the output
  // registers are the same.
  if (Kind == TTI::SK_Broadcast)
    LT.first = 1;

  // Subvector extractions are free if they start at the beginning of a
  // vector and cheap if the subvectors are aligned.
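  // e.g. on an AVX2 target, extracting the upper <2 x i64> half of a
  // <4 x i64> vector (Index == 2) costs just SubLT.first, while an
  // extract at Index == 0 is free.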
  if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
    int NumElts = LT.second.getVectorNumElements();
    if ((Index % NumElts) == 0)
      return 0;
    std::pair<int, MVT> SubLT = TLI->getTypeLegalizationCost(DL, SubTp);
    if (SubLT.second.isVector()) {
      int NumSubElts = SubLT.second.getVectorNumElements();
      if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
        return SubLT.first;
      // Handle some cases for widening legalization. For now we only handle
      // cases where the original subvector was naturally aligned and evenly
      // fit in its legalized subvector type.
      // FIXME: Remove some of the alignment restrictions.
      // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
      // vectors.
      int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
      if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
          (NumSubElts % OrigSubElts) == 0 &&
          LT.second.getVectorElementType() ==
              SubLT.second.getVectorElementType() &&
          LT.second.getVectorElementType().getSizeInBits() ==
              BaseTp->getElementType()->getPrimitiveSizeInBits()) {
        assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
               "Unexpected number of elements!");
        auto *VecTy = FixedVectorType::get(BaseTp->getElementType(),
                                           LT.second.getVectorNumElements());
        auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
                                           SubLT.second.getVectorNumElements());
        int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
        int ExtractCost = getShuffleCost(TTI::SK_ExtractSubvector, VecTy,
                                         ExtractIndex, SubTy);

        // If the original size is 32-bits or more, we can use pshufd. Otherwise
        // if we have SSSE3 we can use pshufb.
        if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
          return ExtractCost + 1; // pshufd or pshufb

        assert(SubTp->getPrimitiveSizeInBits() == 16 &&
               "Unexpected vector size");

        return ExtractCost + 2; // worst case pshufhw + pshufd
      }
    }
  }

  // Handle some common (illegal) sub-vector types as they are often very cheap
  // to shuffle even on targets without PSHUFB.
  EVT VT = TLI->getValueType(DL, BaseTp);
  if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
      !ST->hasSSSE3()) {
    static const CostTblEntry SSE2SubVectorShuffleTbl[] = {
      {TTI::SK_Broadcast,        MVT::v4i16, 1}, // pshuflw
      {TTI::SK_Broadcast,        MVT::v2i16, 1}, // pshuflw
      {TTI::SK_Broadcast,        MVT::v8i8,  2}, // punpck/pshuflw
      {TTI::SK_Broadcast,        MVT::v4i8,  2}, // punpck/pshuflw
      {TTI::SK_Broadcast,        MVT::v2i8,  1}, // punpck

      {TTI::SK_Reverse,          MVT::v4i16, 1}, // pshuflw
      {TTI::SK_Reverse,          MVT::v2i16, 1}, // pshuflw
      {TTI::SK_Reverse,          MVT::v4i8,  3}, // punpck/pshuflw/packus
      {TTI::SK_Reverse,          MVT::v2i8,  1}, // punpck

      {TTI::SK_PermuteTwoSrc,    MVT::v4i16, 2}, // punpck/pshuflw
      {TTI::SK_PermuteTwoSrc,    MVT::v2i16, 2}, // punpck/pshuflw
      {TTI::SK_PermuteTwoSrc,    MVT::v8i8,  7}, // punpck/pshuflw
      {TTI::SK_PermuteTwoSrc,    MVT::v4i8,  4}, // punpck/pshuflw
      {TTI::SK_PermuteTwoSrc,    MVT::v2i8,  2}, // punpck

      {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw
      {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw
      {TTI::SK_PermuteSingleSrc, MVT::v8i8,  5}, // punpck/pshuflw
      {TTI::SK_PermuteSingleSrc, MVT::v4i8,  3}, // punpck/pshuflw
      {TTI::SK_PermuteSingleSrc, MVT::v2i8,  1}, // punpck
    };

    if (ST->hasSSE2())
      if (const auto *Entry =
              CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
        return Entry->Cost;
  }

  // We are going to permute multiple sources and the result will be in
  // multiple destinations. Provide an accurate cost only for splits where
  // the element type remains the same.
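  // e.g. a <16 x i32> single-source permute on AVX2 legalizes to two
  // v8i32 registers (NumOfSrcs == NumOfDests == 2), so it is charged
  // (2 - 1) * 2 = 2 two-input shuffles of v8i32 by the computation below.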
  if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
    MVT LegalVT = LT.second;
    if (LegalVT.isVector() &&
        LegalVT.getVectorElementType().getSizeInBits() ==
            BaseTp->getElementType()->getPrimitiveSizeInBits() &&
        LegalVT.getVectorNumElements() <
            cast<FixedVectorType>(BaseTp)->getNumElements()) {

      unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
      unsigned LegalVTSize = LegalVT.getStoreSize();
      // Number of source vectors after legalization:
      unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
      // Number of destination vectors after legalization:
      unsigned NumOfDests = LT.first;

      auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
                                              LegalVT.getVectorNumElements());

      unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
      return NumOfShuffles *
             getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr);
    }

    return BaseT::getShuffleCost(Kind, BaseTp, Index, SubTp);
  }

  // For 2-input shuffles, we must account for splitting the 2 inputs into many.
  if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
    // We assume that source and destination have the same vector type.
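    // e.g. if each source legalizes to two registers (LT.first == 2), this
    // charges 2 destinations * (2 * 2 - 1) = 6 two-input shuffles.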
    int NumOfDests = LT.first;
    int NumOfShufflesPerDest = LT.first * 2 - 1;
    LT.first = NumOfDests * NumOfShufflesPerDest;
  }

  static const CostTblEntry AVX512VBMIShuffleTbl[] = {
    {TTI::SK_Reverse,          MVT::v64i8, 1}, // vpermb
    {TTI::SK_Reverse,          MVT::v32i8, 1}, // vpermb

    {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
    {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb

    {TTI::SK_PermuteTwoSrc,    MVT::v64i8, 2}, // vpermt2b
    {TTI::SK_PermuteTwoSrc,    MVT::v32i8, 2}, // vpermt2b
    {TTI::SK_PermuteTwoSrc,    MVT::v16i8, 2}  // vpermt2b
  };

  if (ST->hasVBMI())
    if (const auto *Entry =
            CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512BWShuffleTbl[] = {
    {TTI::SK_Broadcast,        MVT::v32i16, 1}, // vpbroadcastw
    {TTI::SK_Broadcast,        MVT::v64i8,  1}, // vpbroadcastb

    {TTI::SK_Reverse,          MVT::v32i16, 2}, // vpermw
    {TTI::SK_Reverse,          MVT::v16i16, 2}, // vpermw
    {TTI::SK_Reverse,          MVT::v64i8,  2}, // pshufb + vshufi64x2

    {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw
    {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw
    {TTI::SK_PermuteSingleSrc, MVT::v64i8,  8}, // extend to v32i16

    {TTI::SK_PermuteTwoSrc,    MVT::v32i16, 2}, // vpermt2w
    {TTI::SK_PermuteTwoSrc,    MVT::v16i16, 2}, // vpermt2w
    {TTI::SK_PermuteTwoSrc,    MVT::v8i16,  2}, // vpermt2w
    {TTI::SK_PermuteTwoSrc,    MVT::v64i8, 19}, // 6 * v32i8 + 1

    {TTI::SK_Select,           MVT::v32i16, 1}, // vblendmw
    {TTI::SK_Select,           MVT::v64i8,  1}, // vblendmb
  };

  if (ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512ShuffleTbl[] = {
    {TTI::SK_Broadcast,        MVT::v8f64,  1}, // vbroadcastpd
    {TTI::SK_Broadcast,        MVT::v16f32, 1}, // vbroadcastps
    {TTI::SK_Broadcast,        MVT::v8i64,  1}, // vpbroadcastq
    {TTI::SK_Broadcast,        MVT::v16i32, 1}, // vpbroadcastd
    {TTI::SK_Broadcast,        MVT::v32i16, 1}, // vpbroadcastw
    {TTI::SK_Broadcast,        MVT::v64i8,  1}, // vpbroadcastb

    {TTI::SK_Reverse,          MVT::v8f64,  1}, // vpermpd
    {TTI::SK_Reverse,          MVT::v16f32, 1}, // vpermps
    {TTI::SK_Reverse,          MVT::v8i64,  1}, // vpermq
    {TTI::SK_Reverse,          MVT::v16i32, 1}, // vpermd

    {TTI::SK_PermuteSingleSrc, MVT::v8f64,  1}, // vpermpd
    {TTI::SK_PermuteSingleSrc, MVT::v4f64,  1}, // vpermpd
    {TTI::SK_PermuteSingleSrc, MVT::v2f64,  1}, // vpermpd
    {TTI::SK_PermuteSingleSrc, MVT::v16f32, 1}, // vpermps
    {TTI::SK_PermuteSingleSrc, MVT::v8f32,  1}, // vpermps
    {TTI::SK_PermuteSingleSrc, MVT::v4f32,  1}, // vpermps
    {TTI::SK_PermuteSingleSrc, MVT::v8i64,  1}, // vpermq
    {TTI::SK_PermuteSingleSrc, MVT::v4i64,  1}, // vpermq
    {TTI::SK_PermuteSingleSrc, MVT::v2i64,  1}, // vpermq
    {TTI::SK_PermuteSingleSrc, MVT::v16i32, 1}, // vpermd
    {TTI::SK_PermuteSingleSrc, MVT::v8i32,  1}, // vpermd
    {TTI::SK_PermuteSingleSrc, MVT::v4i32,  1}, // vpermd
    {TTI::SK_PermuteSingleSrc, MVT::v16i8,  1}, // pshufb

    {TTI::SK_PermuteTwoSrc,    MVT::v8f64,  1}, // vpermt2pd
    {TTI::SK_PermuteTwoSrc,    MVT::v16f32, 1}, // vpermt2ps
    {TTI::SK_PermuteTwoSrc,    MVT::v8i64,  1}, // vpermt2q
    {TTI::SK_PermuteTwoSrc,    MVT::v16i32, 1}, // vpermt2d
    {TTI::SK_PermuteTwoSrc,    MVT::v4f64,  1}, // vpermt2pd
    {TTI::SK_PermuteTwoSrc,    MVT::v8f32,  1}, // vpermt2ps
    {TTI::SK_PermuteTwoSrc,    MVT::v4i64,  1}, // vpermt2q
    {TTI::SK_PermuteTwoSrc,    MVT::v8i32,  1}, // vpermt2d
    {TTI::SK_PermuteTwoSrc,    MVT::v2f64,  1}, // vpermt2pd
    {TTI::SK_PermuteTwoSrc,    MVT::v4f32,  1}, // vpermt2ps
    {TTI::SK_PermuteTwoSrc,    MVT::v2i64,  1}, // vpermt2q
    {TTI::SK_PermuteTwoSrc,    MVT::v4i32,  1}, // vpermt2d

    // FIXME: This just applies the type legalization cost rules above
    // assuming these completely split.
    {TTI::SK_PermuteSingleSrc, MVT::v32i16, 14},
    {TTI::SK_PermuteSingleSrc, MVT::v64i8,  14},
    {TTI::SK_PermuteTwoSrc,    MVT::v32i16, 42},
    {TTI::SK_PermuteTwoSrc,    MVT::v64i8,  42},

    {TTI::SK_Select,           MVT::v32i16, 1}, // vpternlogq
    {TTI::SK_Select,           MVT::v64i8,  1}, // vpternlogq
    {TTI::SK_Select,           MVT::v8f64,  1}, // vblendmpd
    {TTI::SK_Select,           MVT::v16f32, 1}, // vblendmps
    {TTI::SK_Select,           MVT::v8i64,  1}, // vblendmq
    {TTI::SK_Select,           MVT::v16i32, 1}, // vblendmd
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX2ShuffleTbl[] = {
    {TTI::SK_Broadcast,        MVT::v4f64,  1}, // vbroadcastpd
    {TTI::SK_Broadcast,        MVT::v8f32,  1}, // vbroadcastps
    {TTI::SK_Broadcast,        MVT::v4i64,  1}, // vpbroadcastq
    {TTI::SK_Broadcast,        MVT::v8i32,  1}, // vpbroadcastd
    {TTI::SK_Broadcast,        MVT::v16i16, 1}, // vpbroadcastw
    {TTI::SK_Broadcast,        MVT::v32i8,  1}, // vpbroadcastb

    {TTI::SK_Reverse,          MVT::v4f64,  1}, // vpermpd
    {TTI::SK_Reverse,          MVT::v8f32,  1}, // vpermps
    {TTI::SK_Reverse,          MVT::v4i64,  1}, // vpermq
    {TTI::SK_Reverse,          MVT::v8i32,  1}, // vpermd
    {TTI::SK_Reverse,          MVT::v16i16, 2}, // vperm2i128 + pshufb
    {TTI::SK_Reverse,          MVT::v32i8,  2}, // vperm2i128 + pshufb

    {TTI::SK_Select,           MVT::v16i16, 1}, // vpblendvb
    {TTI::SK_Select,           MVT::v32i8,  1}, // vpblendvb

    {TTI::SK_PermuteSingleSrc, MVT::v4f64,  1}, // vpermpd
    {TTI::SK_PermuteSingleSrc, MVT::v8f32,  1}, // vpermps
    {TTI::SK_PermuteSingleSrc, MVT::v4i64,  1}, // vpermq
    {TTI::SK_PermuteSingleSrc, MVT::v8i32,  1}, // vpermd
    {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
                                                // + vpblendvb
    {TTI::SK_PermuteSingleSrc, MVT::v32i8,  4}, // vperm2i128 + 2*vpshufb
                                                // + vpblendvb

    {TTI::SK_PermuteTwoSrc,    MVT::v4f64,  3}, // 2*vpermpd + vblendpd
    {TTI::SK_PermuteTwoSrc,    MVT::v8f32,  3}, // 2*vpermps + vblendps
    {TTI::SK_PermuteTwoSrc,    MVT::v4i64,  3}, // 2*vpermq + vpblendd
    {TTI::SK_PermuteTwoSrc,    MVT::v8i32,  3}, // 2*vpermd + vpblendd
    {TTI::SK_PermuteTwoSrc,    MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
                                                // + vpblendvb
    {TTI::SK_PermuteTwoSrc,    MVT::v32i8,  7}, // 2*vperm2i128 + 4*vpshufb
                                                // + vpblendvb
  };

  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry XOPShuffleTbl[] = {
    {TTI::SK_PermuteSingleSrc, MVT::v4f64,  2}, // vperm2f128 + vpermil2pd
    {TTI::SK_PermuteSingleSrc, MVT::v8f32,  2}, // vperm2f128 + vpermil2ps
    {TTI::SK_PermuteSingleSrc, MVT::v4i64,  2}, // vperm2f128 + vpermil2pd
    {TTI::SK_PermuteSingleSrc, MVT::v8i32,  2}, // vperm2f128 + vpermil2ps
    {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
                                                // + vinsertf128
    {TTI::SK_PermuteSingleSrc, MVT::v32i8,  4}, // vextractf128 + 2*vpperm
                                                // + vinsertf128

    {TTI::SK_PermuteTwoSrc,    MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
                                                // + vinsertf128
    {TTI::SK_PermuteTwoSrc,    MVT::v8i16,  1}, // vpperm
    {TTI::SK_PermuteTwoSrc,    MVT::v32i8,  9}, // 2*vextractf128 + 6*vpperm
                                                // + vinsertf128
    {TTI::SK_PermuteTwoSrc,    MVT::v16i8,  1}, // vpperm
  };

  if (ST->hasXOP())
    if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX1ShuffleTbl[] = {
    {TTI::SK_Broadcast,        MVT::v4f64,  2},  // vperm2f128 + vpermilpd
    {TTI::SK_Broadcast,        MVT::v8f32,  2},  // vperm2f128 + vpermilps
    {TTI::SK_Broadcast,        MVT::v4i64,  2},  // vperm2f128 + vpermilpd
    {TTI::SK_Broadcast,        MVT::v8i32,  2},  // vperm2f128 + vpermilps
    {TTI::SK_Broadcast,        MVT::v16i16, 3},  // vpshuflw + vpshufd + vinsertf128
    {TTI::SK_Broadcast,        MVT::v32i8,  2},  // vpshufb + vinsertf128

    {TTI::SK_Reverse,          MVT::v4f64,  2},  // vperm2f128 + vpermilpd
    {TTI::SK_Reverse,          MVT::v8f32,  2},  // vperm2f128 + vpermilps
    {TTI::SK_Reverse,          MVT::v4i64,  2},  // vperm2f128 + vpermilpd
    {TTI::SK_Reverse,          MVT::v8i32,  2},  // vperm2f128 + vpermilps
    {TTI::SK_Reverse,          MVT::v16i16, 4},  // vextractf128 + 2*pshufb
                                                 // + vinsertf128
    {TTI::SK_Reverse,          MVT::v32i8,  4},  // vextractf128 + 2*pshufb
                                                 // + vinsertf128

    {TTI::SK_Select,           MVT::v4i64,  1},  // vblendpd
    {TTI::SK_Select,           MVT::v4f64,  1},  // vblendpd
    {TTI::SK_Select,           MVT::v8i32,  1},  // vblendps
    {TTI::SK_Select,           MVT::v8f32,  1},  // vblendps
    {TTI::SK_Select,           MVT::v16i16, 3},  // vpand + vpandn + vpor
    {TTI::SK_Select,           MVT::v32i8,  3},  // vpand + vpandn + vpor

    {TTI::SK_PermuteSingleSrc, MVT::v4f64,  2},  // vperm2f128 + vshufpd
    {TTI::SK_PermuteSingleSrc, MVT::v4i64,  2},  // vperm2f128 + vshufpd
    {TTI::SK_PermuteSingleSrc, MVT::v8f32,  4},  // 2*vperm2f128 + 2*vshufps
    {TTI::SK_PermuteSingleSrc, MVT::v8i32,  4},  // 2*vperm2f128 + 2*vshufps
    {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8},  // vextractf128 + 4*pshufb
                                                 // + 2*por + vinsertf128
    {TTI::SK_PermuteSingleSrc, MVT::v32i8,  8},  // vextractf128 + 4*pshufb
                                                 // + 2*por + vinsertf128

    {TTI::SK_PermuteTwoSrc,    MVT::v4f64,  3},  // 2*vperm2f128 + vshufpd
    {TTI::SK_PermuteTwoSrc,    MVT::v4i64,  3},  // 2*vperm2f128 + vshufpd
    {TTI::SK_PermuteTwoSrc,    MVT::v8f32,  4},  // 2*vperm2f128 + 2*vshufps
    {TTI::SK_PermuteTwoSrc,    MVT::v8i32,  4},  // 2*vperm2f128 + 2*vshufps
    {TTI::SK_PermuteTwoSrc,    MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
                                                 // + 4*por + vinsertf128
    {TTI::SK_PermuteTwoSrc,    MVT::v32i8,  15}, // 2*vextractf128 + 8*pshufb
                                                 // + 4*por + vinsertf128
  };

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

1310  static const CostTblEntry SSE41ShuffleTbl[] = {
1311  {TTI::SK_Select, MVT::v2i64, 1}, // pblendw
1312  {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1313  {TTI::SK_Select, MVT::v4i32, 1}, // pblendw
1314  {TTI::SK_Select, MVT::v4f32, 1}, // blendps
1315  {TTI::SK_Select, MVT::v8i16, 1}, // pblendw
1316  {TTI::SK_Select, MVT::v16i8, 1} // pblendvb
1317  };
1318 
1319  if (ST->hasSSE41())
1320  if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
1321  return LT.first * Entry->Cost;
1322 
1323  static const CostTblEntry SSSE3ShuffleTbl[] = {
1324  {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
1325  {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb
1326 
1327  {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
1328  {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb
1329 
1330  {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
1331  {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por
1332 
1333  {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
1334  {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
1335 
1336  {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
1337  {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
1338  };
1339 
1340  if (ST->hasSSSE3())
1341  if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
1342  return LT.first * Entry->Cost;
1343 
1344  static const CostTblEntry SSE2ShuffleTbl[] = {
1345  {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd
1346  {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
1347  {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
1348  {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
1349  {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd
1350 
1351  {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
1352  {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
1353  {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
1354  {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
1355  {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
1356  // + 2*pshufd + 2*unpck + packus
1357 
1358  {TTI::SK_Select, MVT::v2i64, 1}, // movsd
1359  {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1360  {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
1361  {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
1362  {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por
1363 
1364  {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
1365  {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd
1366  {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
1367  {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
1368  // + pshufd/unpck
1369  { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
1370  // + 2*pshufd + 2*unpck + 2*packus
1371 
1372  { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd
1373  { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd
1374  { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd}
1375  { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute
1376  { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
1377  };
1378 
1379  if (ST->hasSSE2())
1380  if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
1381  return LT.first * Entry->Cost;
1382 
1383  static const CostTblEntry SSE1ShuffleTbl[] = {
1384  { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
1385  { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
1386  { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps
1387  { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
1388  { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
1389  };
1390 
1391  if (ST->hasSSE1())
1392  if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
1393  return LT.first * Entry->Cost;
1394 
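  // Worked example of the lookups above: on an SSE2-only target a v16i8
  // SK_Reverse legalizes to a single v16i8 op (LT.first == 1) and the SSE2
  // table prices it at 9, so the returned cost is 1 * 9 = 9. If no table
  // matched, fall back to the generic estimate below.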
1395  return BaseT::getShuffleCost(Kind, BaseTp, Index, SubTp);
1396 }
1397 
1398 int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
1399  TTI::CastContextHint CCH,
1400  TTI::TargetCostKind CostKind,
1401  const Instruction *I) {
1402  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1403  assert(ISD && "Invalid opcode");
1404 
1405  // TODO: Allow non-throughput costs that aren't binary.
1406  auto AdjustCost = [&CostKind](int Cost) {
1407  if (CostKind != TTI::TCK_RecipThroughput)
1408  return Cost == 0 ? 0 : 1;
1409  return Cost;
1410  };
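  // E.g. for a non-throughput query (code size, latency, etc.) AdjustCost
  // collapses every non-free table entry to 1: a conversion priced at 3 in
  // the tables below is reported as 1, while 0-cost entries stay free.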
1411 
1412  // FIXME: Need a better design of the cost table to handle non-simple types and
1413  // the potentially massive number of combinations (elem_num x src_type x dst_type).
1414 
1415  static const TypeConversionCostTblEntry AVX512BWConversionTbl[] {
1418 
1419  // Mask sign extend has an instruction.
1431 
1432  // Mask zero extend is a sext + shift.
1444 
1446  { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // widen to zmm
1447  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, // widen to zmm
1448  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // widen to zmm
1449  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // widen to zmm
1450  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, // widen to zmm
1451  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, // widen to zmm
1452  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, // widen to zmm
1453  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, // widen to zmm
1454  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, // widen to zmm
1455  { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, // widen to zmm
1458  };
1459 
1460  static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
1463 
1466 
1469 
1472  };
1473 
1474  // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
1475  // 256-bit wide vectors.
1476 
1477  static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
1481 
1482  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
1483  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
1484  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
1485  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 3 }, // sext+vpslld+vptestmd
1486  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
1487  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
1488  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
1489  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 3 }, // sext+vpslld+vptestmd
1490  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // zmm vpslld+vptestmd
1491  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // zmm vpslld+vptestmd
1492  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // zmm vpslld+vptestmd
1493  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 }, // vpslld+vptestmd
1494  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // zmm vpsllq+vptestmq
1495  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // zmm vpsllq+vptestmq
1496  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, // vpsllq+vptestmq
1502  { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // zmm vpmovqd
1503  { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 5 },// 2*vpmovqd+concat+vpmovdb
1504 
1505  { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, // extend to v16i32
1507 
1508  // Sign extend is zmm vpternlogd+vptruncdb.
1509  // Zero extend is zmm broadcast load+vptruncdw.
1518 
1519  // Sign extend is zmm vpternlogd+vptruncdw.
1520  // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
1529 
1530  { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // zmm vpternlogd
1531  { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // zmm vpternlogd+psrld
1532  { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // zmm vpternlogd
1533  { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // zmm vpternlogd+psrld
1534  { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // zmm vpternlogd
1535  { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // zmm vpternlogd+psrld
1536  { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // zmm vpternlogq
1537  { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // zmm vpternlogq+psrlq
1538  { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // zmm vpternlogq
1539  { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // zmm vpternlogq+psrlq
1540 
1541  { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, // vpternlogd
1542  { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, // vpternlogd+psrld
1543  { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 }, // vpternlogq
1544  { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 }, // vpternlogq+psrlq
1545 
1556 
1557  { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
1558  { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
1559 
1568 
1579 
1584 
1591  };
1592 
1593  static const TypeConversionCostTblEntry AVX512BWVLConversionTbl[] {
1594  // Mask sign extend has an instruction.
1604 
1605  // Mask zero extend is a sext + shift.
1615 
1617  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, // vpsllw+vptestmb
1618  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // vpsllw+vptestmw
1619  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // vpsllw+vptestmb
1620  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, // vpsllw+vptestmw
1621  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, // vpsllw+vptestmb
1622  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, // vpsllw+vptestmw
1623  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, // vpsllw+vptestmb
1624  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, // vpsllw+vptestmw
1625  { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, // vpsllw+vptestmb
1626  };
1627 
1628  static const TypeConversionCostTblEntry AVX512DQVLConversionTbl[] = {
1633 
1638 
1643 
1648  };
1649 
1650  static const TypeConversionCostTblEntry AVX512VLConversionTbl[] = {
1651  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
1652  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
1653  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
1654  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 8 }, // split+2*v8i8
1655  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
1656  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
1657  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
1658  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 8 }, // split+2*v8i16
1659  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // vpslld+vptestmd
1660  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // vpslld+vptestmd
1661  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // vpslld+vptestmd
1662  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // vpsllq+vptestmq
1663  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // vpsllq+vptestmq
1664  { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // vpmovqd
1665 
1666  // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
1667  // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
1676 
1677  // sign extend is vpcmpeq+maskedmove+vpmovdw
1678  // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
1687 
1688  { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // vpternlogd
1689  { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // vpternlogd+psrld
1690  { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // vpternlogd
1691  { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // vpternlogd+psrld
1692  { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // vpternlogd
1693  { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // vpternlogd+psrld
1694  { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // vpternlogq
1695  { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // vpternlogq+psrlq
1696  { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // vpternlogq
1697  { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // vpternlogq+psrlq
1698 
1713 
1716 
1719 
1722 
1728  };
1729 
1730  static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
1751 
1754 
1759 
1762 
1764  };
1765 
1766  static const TypeConversionCostTblEntry AVXConversionTbl[] = {
1785 
1791 
1802 
1815 
1831  // The generic code to compute the scalar overhead is currently broken.
1832  // Work around this limitation by estimating the scalarization overhead
1833  // here. We have roughly 10 instructions per scalar element.
1834  // Multiply that by the vector width.
1835  // FIXME: remove this when PR19268 is fixed.
1838 
1843 
1848  // This node is expanded into scalarized operations, but BasicTTI is overly
1849  // optimistic when estimating its cost. It computes 3 per element (one
1850  // vector-extract, one scalar conversion and one vector-insert). The
1851  // problem is that the inserts form a read-modify-write chain, so latency
1852  // should be factored in too; inflate the cost per element by 1.
1855 
1858  };
1859 
1860  static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
1867 
1886 
1887  // These truncates end up widening elements.
1888  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 1 }, // PMOVXZBQ
1889  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 1 }, // PMOVXZWQ
1890  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 1 }, // PMOVXZBD
1891 
1900  { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1 }, // PSHUFB
1901 
1904 
1907 
1911  };
1912 
1913  static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
1914  // These are somewhat magic numbers, justified by looking at the output of
1915  // Intel's IACA, running some kernels, and making sure that, once
1916  // legalization is taken into account, the throughput is overestimated.
1918  { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
1926 
1927  { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
1935 
1942 
1944 
1947 
1956 
1981 
1982  // These truncates are really widening elements.
1983  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 1 }, // PSHUFD
1984  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // PUNPCKLWD+DQ
1985  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // PUNPCKLBW+WD+PSHUFD
1986  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 1 }, // PUNPCKLWD
1987  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // PUNPCKLBW+WD
1988  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 1 }, // PUNPCKLBW
1989 
1990  { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // PAND+PACKUSWB
1991  { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // PAND+PACKUSWB
1992  { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // PAND+PACKUSWB
1994  { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 3 }, // PAND+2*PACKUSWB
2002  { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 4 }, // PAND+3*PACKUSWB
2003  { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 2 }, // PSHUFD+PSHUFLW
2004  { ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1 }, // PSHUFD
2005  };
2006 
2007  std::pair<int, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
2008  std::pair<int, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst);
2009 
2010  if (ST->hasSSE2() && !ST->hasAVX()) {
2011  if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
2012  LTDest.second, LTSrc.second))
2013  return AdjustCost(LTSrc.first * Entry->Cost);
2014  }
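  // From here on, the lookups key on simple (MVT) types rather than the
  // legalized pair above, and walk from the most specific feature set
  // (AVX512BW/DQ/F, then their VL variants) down through AVX2, AVX, SSE4.1
  // and SSE2, so the most specialized matching entry wins.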
2015 
2016  EVT SrcTy = TLI->getValueType(DL, Src);
2017  EVT DstTy = TLI->getValueType(DL, Dst);
2018 
2019  // The function getSimpleVT only handles simple value types.
2020  if (!SrcTy.isSimple() || !DstTy.isSimple())
2021  return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind));
2022 
2023  MVT SimpleSrcTy = SrcTy.getSimpleVT();
2024  MVT SimpleDstTy = DstTy.getSimpleVT();
2025 
2026  if (ST->useAVX512Regs()) {
2027  if (ST->hasBWI())
2028  if (const auto *Entry = ConvertCostTableLookup(AVX512BWConversionTbl, ISD,
2029  SimpleDstTy, SimpleSrcTy))
2030  return AdjustCost(Entry->Cost);
2031 
2032  if (ST->hasDQI())
2033  if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD,
2034  SimpleDstTy, SimpleSrcTy))
2035  return AdjustCost(Entry->Cost);
2036 
2037  if (ST->hasAVX512())
2038  if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD,
2039  SimpleDstTy, SimpleSrcTy))
2040  return AdjustCost(Entry->Cost);
2041  }
2042 
2043  if (ST->hasBWI())
2044  if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
2045  SimpleDstTy, SimpleSrcTy))
2046  return AdjustCost(Entry->Cost);
2047 
2048  if (ST->hasDQI())
2049  if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
2050  SimpleDstTy, SimpleSrcTy))
2051  return AdjustCost(Entry->Cost);
2052 
2053  if (ST->hasAVX512())
2054  if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
2055  SimpleDstTy, SimpleSrcTy))
2056  return AdjustCost(Entry->Cost);
2057 
2058  if (ST->hasAVX2()) {
2059  if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
2060  SimpleDstTy, SimpleSrcTy))
2061  return AdjustCost(Entry->Cost);
2062  }
2063 
2064  if (ST->hasAVX()) {
2065  if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
2066  SimpleDstTy, SimpleSrcTy))
2067  return AdjustCost(Entry->Cost);
2068  }
2069 
2070  if (ST->hasSSE41()) {
2071  if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
2072  SimpleDstTy, SimpleSrcTy))
2073  return AdjustCost(Entry->Cost);
2074  }
2075 
2076  if (ST->hasSSE2()) {
2077  if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
2078  SimpleDstTy, SimpleSrcTy))
2079  return AdjustCost(Entry->Cost);
2080  }
2081 
2082  return AdjustCost(
2083  BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
2084 }
2085 
2086 int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
2087  CmpInst::Predicate VecPred,
2088  TTI::TargetCostKind CostKind,
2089  const Instruction *I) {
2090  // TODO: Handle other cost kinds.
2091  if (CostKind != TTI::TCK_RecipThroughput)
2092  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2093  I);
2094 
2095  // Legalize the type.
2096  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
2097 
2098  MVT MTy = LT.second;
2099 
2100  int ISD = TLI->InstructionOpcodeToISD(Opcode);
2101  assert(ISD && "Invalid opcode");
2102 
2103  unsigned ExtraCost = 0;
2104  if (I && (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp)) {
2105  // Some vector comparison predicates cost extra instructions.
2106  if (MTy.isVector() &&
2107  !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
2108  (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
2109  ST->hasBWI())) {
2110  switch (cast<CmpInst>(I)->getPredicate()) {
2111  case CmpInst::Predicate::ICMP_NE:
2112  // xor(cmpeq(x,y),-1)
2113  ExtraCost = 1;
2114  break;
2115  case CmpInst::Predicate::ICMP_SGE:
2116  case CmpInst::Predicate::ICMP_SLE:
2117  // xor(cmpgt(x,y),-1)
2118  ExtraCost = 1;
2119  break;
2120  case CmpInst::Predicate::ICMP_ULT:
2121  case CmpInst::Predicate::ICMP_UGT:
2122  // cmpgt(xor(x,signbit),xor(y,signbit))
2123  // xor(cmpeq(pmaxu(x,y),x),-1)
2124  ExtraCost = 2;
2125  break;
2126  case CmpInst::Predicate::ICMP_ULE:
2127  case CmpInst::Predicate::ICMP_UGE:
2128  if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
2129  (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
2130  // cmpeq(psubus(x,y),0)
2131  // cmpeq(pminu(x,y),x)
2132  ExtraCost = 1;
2133  } else {
2134  // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
2135  ExtraCost = 3;
2136  }
2137  break;
2138  default:
2139  break;
2140  }
2141  }
2142  }
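  // Worked example: an ICMP_ULT on v16i8 with only SSE2 has no native
  // unsigned compare, so it lowers to cmpgt(xor(x,signbit),xor(y,signbit))
  // and picks up ExtraCost = 2 on top of the SETCC entry below (1), for a
  // total of 3 per legalized vector.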
2143 
2144  static const CostTblEntry SLMCostTbl[] = {
2145  // slm pcmpeq/pcmpgt throughput is 2
2146  { ISD::SETCC, MVT::v2i64, 2 },
2147  };
2148 
2149  static const CostTblEntry AVX512BWCostTbl[] = {
2150  { ISD::SETCC, MVT::v32i16, 1 },
2151  { ISD::SETCC, MVT::v64i8, 1 },
2152 
2153  { ISD::SELECT, MVT::v32i16, 1 },
2154  { ISD::SELECT, MVT::v64i8, 1 },
2155  };
2156 
2157  static const CostTblEntry AVX512CostTbl[] = {
2158  { ISD::SETCC, MVT::v8i64, 1 },
2159  { ISD::SETCC, MVT::v16i32, 1 },
2160  { ISD::SETCC, MVT::v8f64, 1 },
2161  { ISD::SETCC, MVT::v16f32, 1 },
2162 
2163  { ISD::SELECT, MVT::v8i64, 1 },
2164  { ISD::SELECT, MVT::v16i32, 1 },
2165  { ISD::SELECT, MVT::v8f64, 1 },
2166  { ISD::SELECT, MVT::v16f32, 1 },
2167 
2168  { ISD::SETCC, MVT::v32i16, 2 }, // FIXME: should probably be 4
2169  { ISD::SETCC, MVT::v64i8, 2 }, // FIXME: should probably be 4
2170 
2171  { ISD::SELECT, MVT::v32i16, 2 }, // FIXME: should be 3
2172  { ISD::SELECT, MVT::v64i8, 2 }, // FIXME: should be 3
2173  };
2174 
2175  static const CostTblEntry AVX2CostTbl[] = {
2176  { ISD::SETCC, MVT::v4i64, 1 },
2177  { ISD::SETCC, MVT::v8i32, 1 },
2178  { ISD::SETCC, MVT::v16i16, 1 },
2179  { ISD::SETCC, MVT::v32i8, 1 },
2180 
2181  { ISD::SELECT, MVT::v4i64, 1 }, // pblendvb
2182  { ISD::SELECT, MVT::v8i32, 1 }, // pblendvb
2183  { ISD::SELECT, MVT::v16i16, 1 }, // pblendvb
2184  { ISD::SELECT, MVT::v32i8, 1 }, // pblendvb
2185  };
2186 
2187  static const CostTblEntry AVX1CostTbl[] = {
2188  { ISD::SETCC, MVT::v4f64, 1 },
2189  { ISD::SETCC, MVT::v8f32, 1 },
2190  // AVX1 does not support 8-wide integer compare.
2191  { ISD::SETCC, MVT::v4i64, 4 },
2192  { ISD::SETCC, MVT::v8i32, 4 },
2193  { ISD::SETCC, MVT::v16i16, 4 },
2194  { ISD::SETCC, MVT::v32i8, 4 },
2195 
2196  { ISD::SELECT, MVT::v4f64, 1 }, // vblendvpd
2197  { ISD::SELECT, MVT::v8f32, 1 }, // vblendvps
2198  { ISD::SELECT, MVT::v4i64, 1 }, // vblendvpd
2199  { ISD::SELECT, MVT::v8i32, 1 }, // vblendvps
2200  { ISD::SELECT, MVT::v16i16, 3 }, // vandps + vandnps + vorps
2201  { ISD::SELECT, MVT::v32i8, 3 }, // vandps + vandnps + vorps
2202  };
2203 
2204  static const CostTblEntry SSE42CostTbl[] = {
2205  { ISD::SETCC, MVT::v2f64, 1 },
2206  { ISD::SETCC, MVT::v4f32, 1 },
2207  { ISD::SETCC, MVT::v2i64, 1 },
2208  };
2209 
2210  static const CostTblEntry SSE41CostTbl[] = {
2211  { ISD::SELECT, MVT::v2f64, 1 }, // blendvpd
2212  { ISD::SELECT, MVT::v4f32, 1 }, // blendvps
2213  { ISD::SELECT, MVT::v2i64, 1 }, // pblendvb
2214  { ISD::SELECT, MVT::v4i32, 1 }, // pblendvb
2215  { ISD::SELECT, MVT::v8i16, 1 }, // pblendvb
2216  { ISD::SELECT, MVT::v16i8, 1 }, // pblendvb
2217  };
2218 
2219  static const CostTblEntry SSE2CostTbl[] = {
2220  { ISD::SETCC, MVT::v2f64, 2 },
2221  { ISD::SETCC, MVT::f64, 1 },
2222  { ISD::SETCC, MVT::v2i64, 8 },
2223  { ISD::SETCC, MVT::v4i32, 1 },
2224  { ISD::SETCC, MVT::v8i16, 1 },
2225  { ISD::SETCC, MVT::v16i8, 1 },
2226 
2227  { ISD::SELECT, MVT::v2f64, 3 }, // andpd + andnpd + orpd
2228  { ISD::SELECT, MVT::v2i64, 3 }, // pand + pandn + por
2229  { ISD::SELECT, MVT::v4i32, 3 }, // pand + pandn + por
2230  { ISD::SELECT, MVT::v8i16, 3 }, // pand + pandn + por
2231  { ISD::SELECT, MVT::v16i8, 3 }, // pand + pandn + por
2232  };
2233 
2234  static const CostTblEntry SSE1CostTbl[] = {
2235  { ISD::SETCC, MVT::v4f32, 2 },
2236  { ISD::SETCC, MVT::f32, 1 },
2237 
2238  { ISD::SELECT, MVT::v4f32, 3 }, // andps + andnps + orps
2239  };
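  // Each table hit below returns LT.first * (ExtraCost + Entry->Cost).
  // E.g. an ICMP_NE on v4i32 with SSE2 costs 1 (pcmpeqd) plus the
  // xor-with-allones ExtraCost of 1 computed above, i.e. 2 in total.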
2240 
2241  if (ST->isSLM())
2242  if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
2243  return LT.first * (ExtraCost + Entry->Cost);
2244 
2245  if (ST->hasBWI())
2246  if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
2247  return LT.first * (ExtraCost + Entry->Cost);
2248 
2249  if (ST->hasAVX512())
2250  if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
2251  return LT.first * (ExtraCost + Entry->Cost);
2252 
2253  if (ST->hasAVX2())
2254  if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
2255  return LT.first * (ExtraCost + Entry->Cost);
2256 
2257  if (ST->hasAVX())
2258  if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
2259  return LT.first * (ExtraCost + Entry->Cost);
2260 
2261  if (ST->hasSSE42())
2262  if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
2263  return LT.first * (ExtraCost + Entry->Cost);
2264 
2265  if (ST->hasSSE41())
2266  if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
2267  return LT.first * (ExtraCost + Entry->Cost);
2268 
2269  if (ST->hasSSE2())
2270  if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
2271  return LT.first * (ExtraCost + Entry->Cost);
2272 
2273  if (ST->hasSSE1())
2274  if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
2275  return LT.first * (ExtraCost + Entry->Cost);
2276 
2277  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
2278 }
2279 
2281 
2282 int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
2283  const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) {
2284 
2285  // Costs should match the codegen from:
2286  // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
2287  // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
2288  // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
2289  // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
2290  // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
2291 
2292  // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
2293  // specialized in these tables yet.
2294  static const CostTblEntry AVX512CDCostTbl[] = {
2295  { ISD::CTLZ, MVT::v8i64, 1 },
2296  { ISD::CTLZ, MVT::v16i32, 1 },
2297  { ISD::CTLZ, MVT::v32i16, 8 },
2298  { ISD::CTLZ, MVT::v64i8, 20 },
2299  { ISD::CTLZ, MVT::v4i64, 1 },
2300  { ISD::CTLZ, MVT::v8i32, 1 },
2301  { ISD::CTLZ, MVT::v16i16, 4 },
2302  { ISD::CTLZ, MVT::v32i8, 10 },
2303  { ISD::CTLZ, MVT::v2i64, 1 },
2304  { ISD::CTLZ, MVT::v4i32, 1 },
2305  { ISD::CTLZ, MVT::v8i16, 4 },
2306  { ISD::CTLZ, MVT::v16i8, 4 },
2307  };
2308  static const CostTblEntry AVX512BWCostTbl[] = {
2309  { ISD::ABS, MVT::v32i16, 1 },
2310  { ISD::ABS, MVT::v64i8, 1 },
2311  { ISD::BITREVERSE, MVT::v8i64, 5 },
2312  { ISD::BITREVERSE, MVT::v16i32, 5 },
2313  { ISD::BITREVERSE, MVT::v32i16, 5 },
2314  { ISD::BITREVERSE, MVT::v64i8, 5 },
2315  { ISD::CTLZ, MVT::v8i64, 23 },
2316  { ISD::CTLZ, MVT::v16i32, 22 },
2317  { ISD::CTLZ, MVT::v32i16, 18 },
2318  { ISD::CTLZ, MVT::v64i8, 17 },
2319  { ISD::CTPOP, MVT::v8i64, 7 },
2320  { ISD::CTPOP, MVT::v16i32, 11 },
2321  { ISD::CTPOP, MVT::v32i16, 9 },
2322  { ISD::CTPOP, MVT::v64i8, 6 },
2323  { ISD::CTTZ, MVT::v8i64, 10 },
2324  { ISD::CTTZ, MVT::v16i32, 14 },
2325  { ISD::CTTZ, MVT::v32i16, 12 },
2326  { ISD::CTTZ, MVT::v64i8, 9 },
2327  { ISD::SADDSAT, MVT::v32i16, 1 },
2328  { ISD::SADDSAT, MVT::v64i8, 1 },
2329  { ISD::SMAX, MVT::v32i16, 1 },
2330  { ISD::SMAX, MVT::v64i8, 1 },
2331  { ISD::SMIN, MVT::v32i16, 1 },
2332  { ISD::SMIN, MVT::v64i8, 1 },
2333  { ISD::SSUBSAT, MVT::v32i16, 1 },
2334  { ISD::SSUBSAT, MVT::v64i8, 1 },
2335  { ISD::UADDSAT, MVT::v32i16, 1 },
2336  { ISD::UADDSAT, MVT::v64i8, 1 },
2337  { ISD::UMAX, MVT::v32i16, 1 },
2338  { ISD::UMAX, MVT::v64i8, 1 },
2339  { ISD::UMIN, MVT::v32i16, 1 },
2340  { ISD::UMIN, MVT::v64i8, 1 },
2341  { ISD::USUBSAT, MVT::v32i16, 1 },
2342  { ISD::USUBSAT, MVT::v64i8, 1 },
2343  };
2344  static const CostTblEntry AVX512CostTbl[] = {
2345  { ISD::ABS, MVT::v8i64, 1 },
2346  { ISD::ABS, MVT::v16i32, 1 },
2347  { ISD::ABS, MVT::v32i16, 2 }, // FIXME: include split
2348  { ISD::ABS, MVT::v64i8, 2 }, // FIXME: include split
2349  { ISD::ABS, MVT::v4i64, 1 },
2350  { ISD::ABS, MVT::v2i64, 1 },
2351  { ISD::BITREVERSE, MVT::v8i64, 36 },
2352  { ISD::BITREVERSE, MVT::v16i32, 24 },
2353  { ISD::BITREVERSE, MVT::v32i16, 10 },
2354  { ISD::BITREVERSE, MVT::v64i8, 10 },
2355  { ISD::CTLZ, MVT::v8i64, 29 },
2356  { ISD::CTLZ, MVT::v16i32, 35 },
2357  { ISD::CTLZ, MVT::v32i16, 28 },
2358  { ISD::CTLZ, MVT::v64i8, 18 },
2359  { ISD::CTPOP, MVT::v8i64, 16 },
2360  { ISD::CTPOP, MVT::v16i32, 24 },
2361  { ISD::CTPOP, MVT::v32i16, 18 },
2362  { ISD::CTPOP, MVT::v64i8, 12 },
2363  { ISD::CTTZ, MVT::v8i64, 20 },
2364  { ISD::CTTZ, MVT::v16i32, 28 },
2365  { ISD::CTTZ, MVT::v32i16, 24 },
2366  { ISD::CTTZ, MVT::v64i8, 18 },
2367  { ISD::SMAX, MVT::v8i64, 1 },
2368  { ISD::SMAX, MVT::v16i32, 1 },
2369  { ISD::SMAX, MVT::v32i16, 2 }, // FIXME: include split
2370  { ISD::SMAX, MVT::v64i8, 2 }, // FIXME: include split
2371  { ISD::SMAX, MVT::v4i64, 1 },
2372  { ISD::SMAX, MVT::v2i64, 1 },
2373  { ISD::SMIN, MVT::v8i64, 1 },
2374  { ISD::SMIN, MVT::v16i32, 1 },
2375  { ISD::SMIN, MVT::v32i16, 2 }, // FIXME: include split
2376  { ISD::SMIN, MVT::v64i8, 2 }, // FIXME: include split
2377  { ISD::SMIN, MVT::v4i64, 1 },
2378  { ISD::SMIN, MVT::v2i64, 1 },
2379  { ISD::UMAX, MVT::v8i64, 1 },
2380  { ISD::UMAX, MVT::v16i32, 1 },
2381  { ISD::UMAX, MVT::v32i16, 2 }, // FIXME: include split
2382  { ISD::UMAX, MVT::v64i8, 2 }, // FIXME: include split
2383  { ISD::UMAX, MVT::v4i64, 1 },
2384  { ISD::UMAX, MVT::v2i64, 1 },
2385  { ISD::UMIN, MVT::v8i64, 1 },
2386  { ISD::UMIN, MVT::v16i32, 1 },
2387  { ISD::UMIN, MVT::v32i16, 2 }, // FIXME: include split
2388  { ISD::UMIN, MVT::v64i8, 2 }, // FIXME: include split
2389  { ISD::UMIN, MVT::v4i64, 1 },
2390  { ISD::UMIN, MVT::v2i64, 1 },
2391  { ISD::USUBSAT, MVT::v16i32, 2 }, // pmaxud + psubd
2392  { ISD::USUBSAT, MVT::v2i64, 2 }, // pmaxuq + psubq
2393  { ISD::USUBSAT, MVT::v4i64, 2 }, // pmaxuq + psubq
2394  { ISD::USUBSAT, MVT::v8i64, 2 }, // pmaxuq + psubq
2395  { ISD::UADDSAT, MVT::v16i32, 3 }, // not + pminud + paddd
2396  { ISD::UADDSAT, MVT::v2i64, 3 }, // not + pminuq + paddq
2397  { ISD::UADDSAT, MVT::v4i64, 3 }, // not + pminuq + paddq
2398  { ISD::UADDSAT, MVT::v8i64, 3 }, // not + pminuq + paddq
2399  { ISD::SADDSAT, MVT::v32i16, 2 }, // FIXME: include split
2400  { ISD::SADDSAT, MVT::v64i8, 2 }, // FIXME: include split
2401  { ISD::SSUBSAT, MVT::v32i16, 2 }, // FIXME: include split
2402  { ISD::SSUBSAT, MVT::v64i8, 2 }, // FIXME: include split
2403  { ISD::UADDSAT, MVT::v32i16, 2 }, // FIXME: include split
2404  { ISD::UADDSAT, MVT::v64i8, 2 }, // FIXME: include split
2405  { ISD::USUBSAT, MVT::v32i16, 2 }, // FIXME: include split
2406  { ISD::USUBSAT, MVT::v64i8, 2 }, // FIXME: include split
2407  { ISD::FMAXNUM, MVT::f32, 2 },
2408  { ISD::FMAXNUM, MVT::v4f32, 2 },
2409  { ISD::FMAXNUM, MVT::v8f32, 2 },
2410  { ISD::FMAXNUM, MVT::v16f32, 2 },
2411  { ISD::FMAXNUM, MVT::f64, 2 },
2412  { ISD::FMAXNUM, MVT::v2f64, 2 },
2413  { ISD::FMAXNUM, MVT::v4f64, 2 },
2414  { ISD::FMAXNUM, MVT::v8f64, 2 },
2415  };
2416  static const CostTblEntry XOPCostTbl[] = {
2417  { ISD::BITREVERSE, MVT::v4i64, 4 },
2418  { ISD::BITREVERSE, MVT::v8i32, 4 },
2419  { ISD::BITREVERSE, MVT::v16i16, 4 },
2420  { ISD::BITREVERSE, MVT::v32i8, 4 },
2421  { ISD::BITREVERSE, MVT::v2i64, 1 },
2422  { ISD::BITREVERSE, MVT::v4i32, 1 },
2423  { ISD::BITREVERSE, MVT::v8i16, 1 },
2424  { ISD::BITREVERSE, MVT::v16i8, 1 },
2425  { ISD::BITREVERSE, MVT::i64, 3 },
2426  { ISD::BITREVERSE, MVT::i32, 3 },
2427  { ISD::BITREVERSE, MVT::i16, 3 },
2428  { ISD::BITREVERSE, MVT::i8, 3 }
2429  };
2430  static const CostTblEntry AVX2CostTbl[] = {
2431  { ISD::ABS, MVT::v4i64, 2 }, // VBLENDVPD(X,VPSUBQ(0,X),X)
2432  { ISD::ABS, MVT::v8i32, 1 },
2433  { ISD::ABS, MVT::v16i16, 1 },
2434  { ISD::ABS, MVT::v32i8, 1 },
2435  { ISD::BITREVERSE, MVT::v4i64, 5 },
2436  { ISD::BITREVERSE, MVT::v8i32, 5 },
2437  { ISD::BITREVERSE, MVT::v16i16, 5 },
2438  { ISD::BITREVERSE, MVT::v32i8, 5 },
2439  { ISD::BSWAP, MVT::v4i64, 1 },
2440  { ISD::BSWAP, MVT::v8i32, 1 },
2441  { ISD::BSWAP, MVT::v16i16, 1 },
2442  { ISD::CTLZ, MVT::v4i64, 23 },
2443  { ISD::CTLZ, MVT::v8i32, 18 },
2444  { ISD::CTLZ, MVT::v16i16, 14 },
2445  { ISD::CTLZ, MVT::v32i8, 9 },
2446  { ISD::CTPOP, MVT::v4i64, 7 },
2447  { ISD::CTPOP, MVT::v8i32, 11 },
2448  { ISD::CTPOP, MVT::v16i16, 9 },
2449  { ISD::CTPOP, MVT::v32i8, 6 },
2450  { ISD::CTTZ, MVT::v4i64, 10 },
2451  { ISD::CTTZ, MVT::v8i32, 14 },
2452  { ISD::CTTZ, MVT::v16i16, 12 },
2453  { ISD::CTTZ, MVT::v32i8, 9 },
2454  { ISD::SADDSAT, MVT::v16i16, 1 },
2455  { ISD::SADDSAT, MVT::v32i8, 1 },
2456  { ISD::SMAX, MVT::v8i32, 1 },
2457  { ISD::SMAX, MVT::v16i16, 1 },
2458  { ISD::SMAX, MVT::v32i8, 1 },
2459  { ISD::SMIN, MVT::v8i32, 1 },
2460  { ISD::SMIN, MVT::v16i16, 1 },
2461  { ISD::SMIN, MVT::v32i8, 1 },
2462  { ISD::SSUBSAT, MVT::v16i16, 1 },
2463  { ISD::SSUBSAT, MVT::v32i8, 1 },
2464  { ISD::UADDSAT, MVT::v16i16, 1 },
2465  { ISD::UADDSAT, MVT::v32i8, 1 },
2466  { ISD::UADDSAT, MVT::v8i32, 3 }, // not + pminud + paddd
2467  { ISD::UMAX, MVT::v8i32, 1 },
2468  { ISD::UMAX, MVT::v16i16, 1 },
2469  { ISD::UMAX, MVT::v32i8, 1 },
2470  { ISD::UMIN, MVT::v8i32, 1 },
2471  { ISD::UMIN, MVT::v16i16, 1 },
2472  { ISD::UMIN, MVT::v32i8, 1 },
2473  { ISD::USUBSAT, MVT::v16i16, 1 },
2474  { ISD::USUBSAT, MVT::v32i8, 1 },
2475  { ISD::USUBSAT, MVT::v8i32, 2 }, // pmaxud + psubd
2476  { ISD::FMAXNUM, MVT::v8f32, 3 }, // MAXPS + CMPUNORDPS + BLENDVPS
2477  { ISD::FMAXNUM, MVT::v4f64, 3 }, // MAXPD + CMPUNORDPD + BLENDVPD
2478  { ISD::FSQRT, MVT::f32, 7 }, // Haswell from http://www.agner.org/
2479  { ISD::FSQRT, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
2480  { ISD::FSQRT, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
2481  { ISD::FSQRT, MVT::f64, 14 }, // Haswell from http://www.agner.org/
2482  { ISD::FSQRT, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
2483  { ISD::FSQRT, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
2484  };
2485  static const CostTblEntry AVX1CostTbl[] = {
2486  { ISD::ABS, MVT::v4i64, 5 }, // VBLENDVPD(X,VPSUBQ(0,X),X)
2487  { ISD::ABS, MVT::v8i32, 3 },
2488  { ISD::ABS, MVT::v16i16, 3 },
2489  { ISD::ABS, MVT::v32i8, 3 },
2490  { ISD::BITREVERSE, MVT::v4i64, 12 }, // 2 x 128-bit Op + extract/insert
2491  { ISD::BITREVERSE, MVT::v8i32, 12 }, // 2 x 128-bit Op + extract/insert
2492  { ISD::BITREVERSE, MVT::v16i16, 12 }, // 2 x 128-bit Op + extract/insert
2493  { ISD::BITREVERSE, MVT::v32i8, 12 }, // 2 x 128-bit Op + extract/insert
2494  { ISD::BSWAP, MVT::v4i64, 4 },
2495  { ISD::BSWAP, MVT::v8i32, 4 },
2496  { ISD::BSWAP, MVT::v16i16, 4 },
2497  { ISD::CTLZ, MVT::v4i64, 48 }, // 2 x 128-bit Op + extract/insert
2498  { ISD::CTLZ, MVT::v8i32, 38 }, // 2 x 128-bit Op + extract/insert
2499  { ISD::CTLZ, MVT::v16i16, 30 }, // 2 x 128-bit Op + extract/insert
2500  { ISD::CTLZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
2501  { ISD::CTPOP, MVT::v4i64, 16 }, // 2 x 128-bit Op + extract/insert
2502  { ISD::CTPOP, MVT::v8i32, 24 }, // 2 x 128-bit Op + extract/insert
2503  { ISD::CTPOP, MVT::v16i16, 20 }, // 2 x 128-bit Op + extract/insert
2504  { ISD::CTPOP, MVT::v32i8, 14 }, // 2 x 128-bit Op + extract/insert
2505  { ISD::CTTZ, MVT::v4i64, 22 }, // 2 x 128-bit Op + extract/insert
2506  { ISD::CTTZ, MVT::v8i32, 30 }, // 2 x 128-bit Op + extract/insert
2507  { ISD::CTTZ, MVT::v16i16, 26 }, // 2 x 128-bit Op + extract/insert
2508  { ISD::CTTZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
2509  { ISD::SADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2510  { ISD::SADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2511  { ISD::SMAX, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
2512  { ISD::SMAX, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2513  { ISD::SMAX, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2514  { ISD::SMIN, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
2515  { ISD::SMIN, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2516  { ISD::SMIN, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2517  { ISD::SSUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2518  { ISD::SSUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2519  { ISD::UADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2520  { ISD::UADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2521  { ISD::UADDSAT, MVT::v8i32, 8 }, // 2 x 128-bit Op + extract/insert
2522  { ISD::UMAX, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
2523  { ISD::UMAX, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2524  { ISD::UMAX, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2525  { ISD::UMIN, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
2526  { ISD::UMIN, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2527  { ISD::UMIN, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2528  { ISD::USUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2529  { ISD::USUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2530  { ISD::USUBSAT, MVT::v8i32, 6 }, // 2 x 128-bit Op + extract/insert
2531  { ISD::FMAXNUM, MVT::f32, 3 }, // MAXSS + CMPUNORDSS + BLENDVPS
2532  { ISD::FMAXNUM, MVT::v4f32, 3 }, // MAXPS + CMPUNORDPS + BLENDVPS
2533  { ISD::FMAXNUM, MVT::v8f32, 5 }, // MAXPS + CMPUNORDPS + BLENDVPS + ?
2534  { ISD::FMAXNUM, MVT::f64, 3 }, // MAXSD + CMPUNORDSD + BLENDVPD
2535  { ISD::FMAXNUM, MVT::v2f64, 3 }, // MAXPD + CMPUNORDPD + BLENDVPD
2536  { ISD::FMAXNUM, MVT::v4f64, 5 }, // MAXPD + CMPUNORDPD + BLENDVPD + ?
2537  { ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/
2538  { ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
2539  { ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
2540  { ISD::FSQRT, MVT::f64, 21 }, // SNB from http://www.agner.org/
2541  { ISD::FSQRT, MVT::v2f64, 21 }, // SNB from http://www.agner.org/
2542  { ISD::FSQRT, MVT::v4f64, 43 }, // SNB from http://www.agner.org/
2543  };
2544  static const CostTblEntry GLMCostTbl[] = {
2545  { ISD::FSQRT, MVT::f32, 19 }, // sqrtss
2546  { ISD::FSQRT, MVT::v4f32, 37 }, // sqrtps
2547  { ISD::FSQRT, MVT::f64, 34 }, // sqrtsd
2548  { ISD::FSQRT, MVT::v2f64, 67 }, // sqrtpd
2549  };
2550  static const CostTblEntry SLMCostTbl[] = {
2551  { ISD::FSQRT, MVT::f32, 20 }, // sqrtss
2552  { ISD::FSQRT, MVT::v4f32, 40 }, // sqrtps
2553  { ISD::FSQRT, MVT::f64, 35 }, // sqrtsd
2554  { ISD::FSQRT, MVT::v2f64, 70 }, // sqrtpd
2555  };
2556  static const CostTblEntry SSE42CostTbl[] = {
2557  { ISD::USUBSAT, MVT::v4i32, 2 }, // pmaxud + psubd
2558  { ISD::UADDSAT, MVT::v4i32, 3 }, // not + pminud + paddd
2559  { ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/
2560  { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
2561  };
2562  static const CostTblEntry SSE41CostTbl[] = {
2563  { ISD::ABS, MVT::v2i64, 2 }, // BLENDVPD(X,PSUBQ(0,X),X)
2564  { ISD::SMAX, MVT::v4i32, 1 },
2565  { ISD::SMAX, MVT::v16i8, 1 },
2566  { ISD::SMIN, MVT::v4i32, 1 },
2567  { ISD::SMIN, MVT::v16i8, 1 },
2568  { ISD::UMAX, MVT::v4i32, 1 },
2569  { ISD::UMAX, MVT::v8i16, 1 },
2570  { ISD::UMIN, MVT::v4i32, 1 },
2571  { ISD::UMIN, MVT::v8i16, 1 },
2572  };
2573  static const CostTblEntry SSSE3CostTbl[] = {
2574  { ISD::ABS, MVT::v4i32, 1 },
2575  { ISD::ABS, MVT::v8i16, 1 },
2576  { ISD::ABS, MVT::v16i8, 1 },
2577  { ISD::BITREVERSE, MVT::v2i64, 5 },
2578  { ISD::BITREVERSE, MVT::v4i32, 5 },
2579  { ISD::BITREVERSE, MVT::v8i16, 5 },
2580  { ISD::BITREVERSE, MVT::v16i8, 5 },
2581  { ISD::BSWAP, MVT::v2i64, 1 },
2582  { ISD::BSWAP, MVT::v4i32, 1 },
2583  { ISD::BSWAP, MVT::v8i16, 1 },
2584  { ISD::CTLZ, MVT::v2i64, 23 },
2585  { ISD::CTLZ, MVT::v4i32, 18 },
2586  { ISD::CTLZ, MVT::v8i16, 14 },
2587  { ISD::CTLZ, MVT::v16i8, 9 },
2588  { ISD::CTPOP, MVT::v2i64, 7 },
2589  { ISD::CTPOP, MVT::v4i32, 11 },
2590  { ISD::CTPOP, MVT::v8i16, 9 },
2591  { ISD::CTPOP, MVT::v16i8, 6 },
2592  { ISD::CTTZ, MVT::v2i64, 10 },
2593  { ISD::CTTZ, MVT::v4i32, 14 },
2594  { ISD::CTTZ, MVT::v8i16, 12 },
2595  { ISD::CTTZ, MVT::v16i8, 9 }
2596  };
2597  static const CostTblEntry SSE2CostTbl[] = {
2598  { ISD::ABS, MVT::v2i64, 4 },
2599  { ISD::ABS, MVT::v4i32, 3 },
2600  { ISD::ABS, MVT::v8i16, 2 },
2601  { ISD::ABS, MVT::v16i8, 2 },
2602  { ISD::BITREVERSE, MVT::v2i64, 29 },
2603  { ISD::BITREVERSE, MVT::v4i32, 27 },
2604  { ISD::BITREVERSE, MVT::v8i16, 27 },
2605  { ISD::BITREVERSE, MVT::v16i8, 20 },
2606  { ISD::BSWAP, MVT::v2i64, 7 },
2607  { ISD::BSWAP, MVT::v4i32, 7 },
2608  { ISD::BSWAP, MVT::v8i16, 7 },
2609  { ISD::CTLZ, MVT::v2i64, 25 },
2610  { ISD::CTLZ, MVT::v4i32, 26 },
2611  { ISD::CTLZ, MVT::v8i16, 20 },
2612  { ISD::CTLZ, MVT::v16i8, 17 },
2613  { ISD::CTPOP, MVT::v2i64, 12 },
2614  { ISD::CTPOP, MVT::v4i32, 15 },
2615  { ISD::CTPOP, MVT::v8i16, 13 },
2616  { ISD::CTPOP, MVT::v16i8, 10 },
2617  { ISD::CTTZ, MVT::v2i64, 14 },
2618  { ISD::CTTZ, MVT::v4i32, 18 },
2619  { ISD::CTTZ, MVT::v8i16, 16 },
2620  { ISD::CTTZ, MVT::v16i8, 13 },
2621  { ISD::SADDSAT, MVT::v8i16, 1 },
2622  { ISD::SADDSAT, MVT::v16i8, 1 },
2623  { ISD::SMAX, MVT::v8i16, 1 },
2624  { ISD::SMIN, MVT::v8i16, 1 },
2625  { ISD::SSUBSAT, MVT::v8i16, 1 },
2626  { ISD::SSUBSAT, MVT::v16i8, 1 },
2627  { ISD::UADDSAT, MVT::v8i16, 1 },
2628  { ISD::UADDSAT, MVT::v16i8, 1 },
2629  { ISD::UMAX, MVT::v8i16, 2 },
2630  { ISD::UMAX, MVT::v16i8, 1 },
2631  { ISD::UMIN, MVT::v8i16, 2 },
2632  { ISD::UMIN, MVT::v16i8, 1 },
2633  { ISD::USUBSAT, MVT::v8i16, 1 },
2634  { ISD::USUBSAT, MVT::v16i8, 1 },
2635  { ISD::FMAXNUM, MVT::f64, 4 },
2636  { ISD::FMAXNUM, MVT::v2f64, 4 },
2637  { ISD::FSQRT, MVT::f64, 32 }, // Nehalem from http://www.agner.org/
2638  { ISD::FSQRT, MVT::v2f64, 32 }, // Nehalem from http://www.agner.org/
2639  };
2640  static const CostTblEntry SSE1CostTbl[] = {
2641  { ISD::FMAXNUM, MVT::f32, 4 },
2642  { ISD::FMAXNUM, MVT::v4f32, 4 },
2643  { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/
2644  { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
2645  };
2646  static const CostTblEntry BMI64CostTbl[] = { // 64-bit targets
2647  { ISD::CTTZ, MVT::i64, 1 },
2648  };
2649  static const CostTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
2650  { ISD::CTTZ, MVT::i32, 1 },
2651  { ISD::CTTZ, MVT::i16, 1 },
2652  { ISD::CTTZ, MVT::i8, 1 },
2653  };
2654  static const CostTblEntry LZCNT64CostTbl[] = { // 64-bit targets
2655  { ISD::CTLZ, MVT::i64, 1 },
2656  };
2657  static const CostTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
2658  { ISD::CTLZ, MVT::i32, 1 },
2659  { ISD::CTLZ, MVT::i16, 1 },
2660  { ISD::CTLZ, MVT::i8, 1 },
2661  };
2662  static const CostTblEntry POPCNT64CostTbl[] = { // 64-bit targets
2663  { ISD::CTPOP, MVT::i64, 1 },
2664  };
2665  static const CostTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
2666  { ISD::CTPOP, MVT::i32, 1 },
2667  { ISD::CTPOP, MVT::i16, 1 },
2668  { ISD::CTPOP, MVT::i8, 1 },
2669  };
2670  static const CostTblEntry X64CostTbl[] = { // 64-bit targets
2671  { ISD::ABS, MVT::i64, 2 }, // SUB+CMOV
2672  { ISD::BITREVERSE, MVT::i64, 14 },
2673  { ISD::CTLZ, MVT::i64, 4 }, // BSR+XOR or BSR+XOR+CMOV
2674  { ISD::CTTZ, MVT::i64, 3 }, // TEST+BSF+CMOV/BRANCH
2675  { ISD::CTPOP, MVT::i64, 10 },
2676  { ISD::SADDO, MVT::i64, 1 },
2677  { ISD::UADDO, MVT::i64, 1 },
2678  { ISD::UMULO, MVT::i64, 2 }, // mulq + seto
2679  };
2680  static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
2681  { ISD::ABS, MVT::i32, 2 }, // SUB+CMOV
2682  { ISD::ABS, MVT::i16, 2 }, // SUB+CMOV
2683  { ISD::BITREVERSE, MVT::i32, 14 },
2684  { ISD::BITREVERSE, MVT::i16, 14 },
2685  { ISD::BITREVERSE, MVT::i8, 11 },
2686  { ISD::CTLZ, MVT::i32, 4 }, // BSR+XOR or BSR+XOR+CMOV
2687  { ISD::CTLZ, MVT::i16, 4 }, // BSR+XOR or BSR+XOR+CMOV
2688  { ISD::CTLZ, MVT::i8, 4 }, // BSR+XOR or BSR+XOR+CMOV
2689  { ISD::CTTZ, MVT::i32, 3 }, // TEST+BSF+CMOV/BRANCH
2690  { ISD::CTTZ, MVT::i16, 3 }, // TEST+BSF+CMOV/BRANCH
2691  { ISD::CTTZ, MVT::i8, 3 }, // TEST+BSF+CMOV/BRANCH
2692  { ISD::CTPOP, MVT::i32, 8 },
2693  { ISD::CTPOP, MVT::i16, 9 },
2694  { ISD::CTPOP, MVT::i8, 7 },
2695  { ISD::SADDO, MVT::i32, 1 },
2696  { ISD::SADDO, MVT::i16, 1 },
2697  { ISD::SADDO, MVT::i8, 1 },
2698  { ISD::UADDO, MVT::i32, 1 },
2699  { ISD::UADDO, MVT::i16, 1 },
2700  { ISD::UADDO, MVT::i8, 1 },
2701  { ISD::UMULO, MVT::i32, 2 }, // mul + seto
2702  { ISD::UMULO, MVT::i16, 2 },
2703  { ISD::UMULO, MVT::i8, 2 },
2704  };
2705 
2706  Type *RetTy = ICA.getReturnType();
2707  Type *OpTy = RetTy;
2708  Intrinsic::ID IID = ICA.getID();
2709  unsigned ISD = ISD::DELETED_NODE;
2710  switch (IID) {
2711  default:
2712  break;
2713  case Intrinsic::abs:
2714  ISD = ISD::ABS;
2715  break;
2716  case Intrinsic::bitreverse:
2717  ISD = ISD::BITREVERSE;
2718  break;
2719  case Intrinsic::bswap:
2720  ISD = ISD::BSWAP;
2721  break;
2722  case Intrinsic::ctlz:
2723  ISD = ISD::CTLZ;
2724  break;
2725  case Intrinsic::ctpop:
2726  ISD = ISD::CTPOP;
2727  break;
2728  case Intrinsic::cttz:
2729  ISD = ISD::CTTZ;
2730  break;
2731  case Intrinsic::maxnum:
2732  case Intrinsic::minnum:
2733  // FMINNUM has the same costs, so don't duplicate.
2734  ISD = ISD::FMAXNUM;
2735  break;
2736  case Intrinsic::sadd_sat:
2737  ISD = ISD::SADDSAT;
2738  break;
2739  case Intrinsic::smax:
2740  ISD = ISD::SMAX;
2741  break;
2742  case Intrinsic::smin:
2743  ISD = ISD::SMIN;
2744  break;
2745  case Intrinsic::ssub_sat:
2746  ISD = ISD::SSUBSAT;
2747  break;
2748  case Intrinsic::uadd_sat:
2749  ISD = ISD::UADDSAT;
2750  break;
2751  case Intrinsic::umax:
2752  ISD = ISD::UMAX;
2753  break;
2754  case Intrinsic::umin:
2755  ISD = ISD::UMIN;
2756  break;
2757  case Intrinsic::usub_sat:
2758  ISD = ISD::USUBSAT;
2759  break;
2760  case Intrinsic::sqrt:
2761  ISD = ISD::FSQRT;
2762  break;
2763  case Intrinsic::sadd_with_overflow:
2764  case Intrinsic::ssub_with_overflow:
2765  // SSUBO has the same costs, so don't duplicate.
2766  ISD = ISD::SADDO;
2767  OpTy = RetTy->getContainedType(0);
2768  break;
2769  case Intrinsic::uadd_with_overflow:
2770  case Intrinsic::usub_with_overflow:
2771  // USUBO has the same costs, so don't duplicate.
2772  ISD = ISD::UADDO;
2773  OpTy = RetTy->getContainedType(0);
2774  break;
2775  case Intrinsic::umul_with_overflow:
2776  case Intrinsic::smul_with_overflow:
2777  // SMULO has the same costs, so don't duplicate.
2778  ISD = ISD::UMULO;
2779  OpTy = RetTy->getContainedType(0);
2780  break;
2781  }
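  // Once the intrinsic is mapped to an ISD opcode, the first matching
  // subtarget table decides the cost. E.g. ctpop on i32 costs 1 when the
  // POPCNT instruction is available, but 8 (from the generic X86 table)
  // when it must be emulated.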
2782 
2783  if (ISD != ISD::DELETED_NODE) {
2784  // Legalize the type.
2785  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, OpTy);
2786  MVT MTy = LT.second;
2787 
2788  // Attempt to look up the cost.
2789  if (ISD == ISD::BITREVERSE && ST->hasGFNI() && ST->hasSSSE3() &&
2790  MTy.isVector()) {
2791  // With PSHUFB the code is very similar for all types. If we have integer
2792  // byte operations, we just need a GF2P8AFFINEQB for vXi8. For other types
2793  // we also need a PSHUFB.
2794  unsigned Cost = MTy.getVectorElementType() == MVT::i8 ? 1 : 2;
2795 
2796  // Without byte operations, we need twice as many GF2P8AFFINEQB and PSHUFB
2797  // instructions. We also need an extract and an insert.
2798  if (!(MTy.is128BitVector() || (ST->hasAVX2() && MTy.is256BitVector()) ||
2799  (ST->hasBWI() && MTy.is512BitVector())))
2800  Cost = Cost * 2 + 2;
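  // E.g. a v16i8 bitreverse is a single GF2P8AFFINEQB (Cost = 1), a
  // v4i32 needs an extra PSHUFB (Cost = 2), and a v8i32 without AVX2
  // doubles that and adds the extract/insert: 2 * 2 + 2 = 6.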
2801 
2802  return LT.first * Cost;
2803  }
2804 
2805  auto adjustTableCost = [](const CostTblEntry &Entry, int LegalizationCost,
2806  FastMathFlags FMF) {
2807  // If there are no NaNs to deal with, then these are reduced to a
2808  // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we
2809  // assume is used in the non-fast case.
2810  if (Entry.ISD == ISD::FMAXNUM || Entry.ISD == ISD::FMINNUM) {
2811  if (FMF.noNaNs())
2812  return LegalizationCost * 1;
2813  }
2814  return LegalizationCost * (int)Entry.Cost;
2815  };
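  // E.g. a minnum/maxnum call marked nnan is charged only the legalization
  // cost, since a bare MINPS/MAXPS suffices when NaNs cannot occur; without
  // the flag the full table cost (the MIN/CMP/SELECT sequence) applies.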
2816 
2817  if (ST->useGLMDivSqrtCosts())
2818  if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
2819  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2820 
2821  if (ST->isSLM())
2822  if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
2823  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2824 
2825  if (ST->hasCDI())
2826  if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
2827  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2828 
2829  if (ST->hasBWI())
2830  if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
2831  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2832 
2833  if (ST->hasAVX512())
2834  if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
2835  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2836 
2837  if (ST->hasXOP())
2838  if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
2839  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2840 
2841  if (ST->hasAVX2())
2842  if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
2843  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2844 
2845  if (ST->hasAVX())
2846  if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
2847  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2848 
2849  if (ST->hasSSE42())
2850  if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
2851  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2852 
2853  if (ST->hasSSE41())
2854  if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
2855  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2856 
2857  if (ST->hasSSSE3())
2858  if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
2859  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2860 
2861  if (ST->hasSSE2())
2862  if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
2863  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2864 
2865  if (ST->hasSSE1())
2866  if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
2867  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2868 
2869  if (ST->hasBMI()) {
2870  if (ST->is64Bit())
2871  if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
2872  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2873 
2874  if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
2875  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2876  }
2877 
2878  if (ST->hasLZCNT()) {
2879  if (ST->is64Bit())
2880  if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
2881  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2882 
2883  if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
2884  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2885  }
2886 
2887  if (ST->hasPOPCNT()) {
2888  if (ST->is64Bit())
2889  if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
2890  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2891 
2892  if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
2893  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2894  }
2895 
2896  // TODO - add BMI (TZCNT) scalar handling
2897 
2898  if (ST->is64Bit())
2899  if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
2900  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2901 
2902  if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
2903  return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2904  }
2905 
2906  return BaseT::getTypeBasedIntrinsicInstrCost(ICA, CostKind);
2907 }
2908 
2909 int X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
2910  TTI::TargetCostKind CostKind) {
2911  if (ICA.isTypeBasedOnly())
2912  return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
2913 
2914  static const CostTblEntry AVX512CostTbl[] = {
2915  { ISD::ROTL, MVT::v8i64, 1 },
2916  { ISD::ROTL, MVT::v4i64, 1 },
2917  { ISD::ROTL, MVT::v2i64, 1 },
2918  { ISD::ROTL, MVT::v16i32, 1 },
2919  { ISD::ROTL, MVT::v8i32, 1 },
2920  { ISD::ROTL, MVT::v4i32, 1 },
2921  { ISD::ROTR, MVT::v8i64, 1 },
2922  { ISD::ROTR, MVT::v4i64, 1 },
2923  { ISD::ROTR, MVT::v2i64, 1 },
2924  { ISD::ROTR, MVT::v16i32, 1 },
2925  { ISD::ROTR, MVT::v8i32, 1 },
2926  { ISD::ROTR, MVT::v4i32, 1 }
2927  };
2928  // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
2929  static const CostTblEntry XOPCostTbl[] = {
2930  { ISD::ROTL, MVT::v4i64, 4 },
2931  { ISD::ROTL, MVT::v8i32, 4 },
2932  { ISD::ROTL, MVT::v16i16, 4 },
2933  { ISD::ROTL, MVT::v32i8, 4 },
2934  { ISD::ROTL, MVT::v2i64, 1 },
2935  { ISD::ROTL, MVT::v4i32, 1 },
2936  { ISD::ROTL, MVT::v8i16, 1 },
2937  { ISD::ROTL, MVT::v16i8, 1 },
2938  { ISD::ROTR, MVT::v4i64, 6 },
2939  { ISD::ROTR, MVT::v8i32, 6 },
2940  { ISD::ROTR, MVT::v16i16, 6 },
2941  { ISD::ROTR, MVT::v32i8, 6 },
2942  { ISD::ROTR, MVT::v2i64, 2 },
2943  { ISD::ROTR, MVT::v4i32, 2 },
2944  { ISD::ROTR, MVT::v8i16, 2 },
2945  { ISD::ROTR, MVT::v16i8, 2 }
2946  };
2947  static const CostTblEntry X64CostTbl[] = { // 64-bit targets
2948  { ISD::ROTL, MVT::i64, 1 },
2949  { ISD::ROTR, MVT::i64, 1 },
2950  { ISD::FSHL, MVT::i64, 4 }
2951  };
2952  static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
2953  { ISD::ROTL, MVT::i32, 1 },
2954  { ISD::ROTL, MVT::i16, 1 },
2955  { ISD::ROTL, MVT::i8, 1 },
2956  { ISD::ROTR, MVT::i32, 1 },
2957  { ISD::ROTR, MVT::i16, 1 },
2958  { ISD::ROTR, MVT::i8, 1 },
2959  { ISD::FSHL, MVT::i32, 4 },
2960  { ISD::FSHL, MVT::i16, 4 },
2961  { ISD::FSHL, MVT::i8, 4 }
2962  };
2963 
2964  Intrinsic::ID IID = ICA.getID();
2965  Type *RetTy = ICA.getReturnType();
2966  const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
2967  unsigned ISD = ISD::DELETED_NODE;
2968  switch (IID) {
2969  default:
2970  break;
2971  case Intrinsic::fshl:
2972  ISD = ISD::FSHL;
2973  if (Args[0] == Args[1])
2974  ISD = ISD::ROTL;
2975  break;
2976  case Intrinsic::fshr:
2977  // FSHR has the same costs, so don't duplicate.
2978  ISD = ISD::FSHL;
2979  if (Args[0] == Args[1])
2980  ISD = ISD::ROTR;
2981  break;
2982  }
2983 
2984  if (ISD != ISD::DELETED_NODE) {
2985  // Legalize the type.
2986  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
2987  MVT MTy = LT.second;
2988 
2989  // Attempt to look up the cost.
2990  if (ST->hasAVX512())
2991  if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
2992  return LT.first * Entry->Cost;
2993 
2994  if (ST->hasXOP())
2995  if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
2996  return LT.first * Entry->Cost;
2997 
2998  if (ST->is64Bit())
2999  if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
3000  return LT.first * Entry->Cost;
3001 
3002  if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
3003  return LT.first * Entry->Cost;
3004  }
3005 
3005 
3006  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
3007 }
3008 
3009 int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
3010  static const CostTblEntry SLMCostTbl[] = {
3011  { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 },
3012  { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 },
3013  { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 },
3014  { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 4 }
3015  };
3016 
3017  assert(Val->isVectorTy() && "This must be a vector type");
3018  Type *ScalarType = Val->getScalarType();
3019  int RegisterFileMoveCost = 0;
3020 
3021  if (Index != -1U && (Opcode == Instruction::ExtractElement ||
3022  Opcode == Instruction::InsertElement)) {
3023  // Legalize the type.
3024  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
3025 
3026  // This type is legalized to a scalar type.
3027  if (!LT.second.isVector())
3028  return 0;
3029 
3030  // The type may be split. Normalize the index to the new type.
3031  unsigned NumElts = LT.second.getVectorNumElements();
3032  unsigned SubNumElts = NumElts;
3033  Index = Index % NumElts;
3034 
3035  // For >128-bit vectors, we need to extract higher 128-bit subvectors.
3036  // For inserts, we also need to insert the subvector back.
3037  if (LT.second.getSizeInBits() > 128) {
3038  assert((LT.second.getSizeInBits() % 128) == 0 && "Illegal vector");
3039  unsigned NumSubVecs = LT.second.getSizeInBits() / 128;
3040  SubNumElts = NumElts / NumSubVecs;
3041  if (SubNumElts <= Index) {
3042  RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
3043  Index %= SubNumElts;
3044  }
3045  }
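  // Worked example: extracting element 9 from a v16i16 on AVX2 targets a
  // 256-bit register, so NumSubVecs = 2 and SubNumElts = 8; index 9 lives
  // in the upper subvector, so it pays one extract (RegisterFileMoveCost)
  // and the index is normalized to 1.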
3046 
3047  if (Index == 0) {
3048  // Floating point scalars are already located in index #0.
3049  // Many insertions into element #0 can be folded away for scalar fp-ops, so
3050  // assume this holds for all of them.
3051  if (ScalarType->isFloatingPointTy())
3052  return RegisterFileMoveCost;
3053 
3054  // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
3055  if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
3056  return 1 + RegisterFileMoveCost;
3057  }
3058 
3059  int ISD = TLI->InstructionOpcodeToISD(Opcode);
3060  assert(ISD && "Unexpected vector opcode");
3061  MVT MScalarTy = LT.second.getScalarType();
3062  if (ST->isSLM())
3063  if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
3064  return Entry->Cost + RegisterFileMoveCost;
3065 
3066  // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
3067  if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
3068  (MScalarTy.isInteger() && ST->hasSSE41()))
3069  return 1 + RegisterFileMoveCost;
3070 
3071  // Assume insertps is relatively cheap on all targets.
3072  if (MScalarTy == MVT::f32 && ST->hasSSE41() &&
3073  Opcode == Instruction::InsertElement)
3074  return 1 + RegisterFileMoveCost;
3075 
3076  // For extractions we just need to shuffle the element to index 0, which
3077  // should be very cheap (assume cost = 1). For insertions we need to shuffle
3078  // the element to its destination. In both cases we must handle the
3079  // subvector move(s).
3080  // If the vector type is already less than 128 bits wide, don't reduce it.
3081  // TODO: Under what circumstances should we shuffle using the full width?
3082  int ShuffleCost = 1;
3083  if (Opcode == Instruction::InsertElement) {
3084  auto *SubTy = cast<VectorType>(Val);
3085  EVT VT = TLI->getValueType(DL, Val);
3086  if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
3087  SubTy = FixedVectorType::get(ScalarType, SubNumElts);
3088  ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, 0, SubTy);
3089  }
3090  int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
3091  return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
3092  }
3093 
3094  // Add to the base cost if we know that the extracted element of a vector is
3095  // destined to be moved to and used in the integer register file.
3096  if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy())
3097  RegisterFileMoveCost += 1;
3098 
3099  return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
3100 }
3101 
3102 unsigned X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
3103  const APInt &DemandedElts,
3104  bool Insert, bool Extract) {
3105  unsigned Cost = 0;
3106 
3107  // For insertions, an ISD::BUILD_VECTOR style vector initialization can be
3108  // much cheaper than an accumulation of ISD::INSERT_VECTOR_ELT.
3109  if (Insert) {
3110  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
3111  MVT MScalarTy = LT.second.getScalarType();
3112 
3113  if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
3114  (MScalarTy.isInteger() && ST->hasSSE41()) ||
3115  (MScalarTy == MVT::f32 && ST->hasSSE41())) {
3116  // For types we can insert directly, insertion into 128-bit subvectors is
3117  // cheap, followed by a cheap chain of concatenations.
3118  if (LT.second.getSizeInBits() <= 128) {
3119  Cost +=
3120  BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, false);
3121  } else {
3122  // For each 128-bit lane: if at least one index is demanded but not all
3123  // indices are demanded, and this lane is not the first 128-bit lane of
3124  // the legalized vector, then this lane needs an extracti128; and if a
3125  // lane has at least one demanded index, that lane needs an
3126  // inserti128.
3127 
3128  // The following cases illustrate this:
3129  // Assume we insert several elements into a v8i32 vector with AVX2.
3130  // Case#1: inserting into index 1 needs vpinsrd + inserti128.
3131  // Case#2: inserting into index 5 needs extracti128 + vpinsrd +
3132  // inserti128.
3133  // Case#3: inserting into indices 4,5,6,7 needs 4*vpinsrd + inserti128.
3134  unsigned Num128Lanes = LT.second.getSizeInBits() / 128 * LT.first;
3135  unsigned NumElts = LT.second.getVectorNumElements() * LT.first;
3136  APInt WidenedDemandedElts = DemandedElts.zextOrSelf(NumElts);
3137  unsigned Scale = NumElts / Num128Lanes;
3138  // We iterate over each 128-bit lane and check whether it needs an
3139  // extracti128/inserti128.
3140  for (unsigned I = 0; I < NumElts; I += Scale) {
3141  APInt Mask = WidenedDemandedElts.getBitsSet(NumElts, I, I + Scale);
3142  APInt MaskedDE = Mask & WidenedDemandedElts;
3143  unsigned Population = MaskedDE.countPopulation();
3144  Cost += (Population > 0 && Population != Scale &&
3145  I % LT.second.getVectorNumElements() != 0);
3146  Cost += Population > 0;
3147  }
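  // E.g. for Case#2 above (demanding only index 5 of a v8i32): lane 1 has
  // Population == 1, which is demanded-but-not-full and not the first lane,
  // so it pays extracti128 + inserti128, plus one vpinsrd from the
  // countPopulation() term below: three ops in total.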
3148  Cost += DemandedElts.countPopulation();
3149 
3150  // For vXf32 cases, insertion into the 0th index in each v4f32
3151  // 128-bit vector is free.
3152  // NOTE: This assumes legalization widens vXf32 vectors.
3153  if (MScalarTy == MVT::f32)
3154  for (unsigned i = 0, e = cast<FixedVectorType>(Ty)->getNumElements();
3155  i < e; i += 4)
3156  if (DemandedElts[i])
3157  Cost--;
3158  }
3159  } else if (LT.second.isVector()) {
3160  // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
3161  // integer element as a SCALAR_TO_VECTOR, then we build the vector as a
3162  // series of UNPCK followed by CONCAT_VECTORS - all of these can be
3163  // considered cheap.
3164  if (Ty->isIntOrIntVectorTy())
3165  Cost += DemandedElts.countPopulation();
3166 
3167  // Get the smaller of the legalized or original pow2-extended number of
3168  // vector elements, which represents the number of unpacks we'll end up
3169  // performing.
3170  unsigned NumElts = LT.second.getVectorNumElements();
3171  unsigned Pow2Elts =
3172  PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements());
3173  Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
3174  }
3175  }
3176 
3177  // TODO: Use the default extraction for now, but we should investigate
3178  // extending this to handle repeated subvector extraction.
3179  if (Extract)
3180  Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, false, Extract);
3181 
3182  return Cost;
3183 }
3184 
3185 int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
3186  MaybeAlign Alignment, unsigned AddressSpace,
3187  TTI::TargetCostKind CostKind,
3188  const Instruction *I) {
3189  // TODO: Handle other cost kinds.
3190  if (CostKind != TTI::TCK_RecipThroughput) {
3191  if (auto *SI = dyn_cast_or_null<StoreInst>(I)) {
3192  // A store instruction with index and scale addressing costs 2 uops.
3193  // Check the preceding GEP to identify non-const indices.
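  // E.g. (illustrative): a store through "gep %base, %i" with a
  // loop-varying %i keeps the index in the addressing mode, so it is
  // costed as 2 * TCC_Basic below.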
3194  if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) {
3195  if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
3196  return TTI::TCC_Basic * 2;
3197  }
3198  }
3199  return TTI::TCC_Basic;
3200  }
3201 
3202  // Handle non-power-of-two vectors such as <3 x float>
3203  if (auto *VTy = dyn_cast<FixedVectorType>(Src)) {
3204  unsigned NumElem = VTy->getNumElements();
3205 
3206  // Handle a few common cases:
3207  // <3 x float>
3208  if (NumElem == 3 && VTy->getScalarSizeInBits() == 32)
3209  // Cost = 64 bit store + extract + 32 bit store.
3210  return 3;
3211 
3212  // <3 x double>
3213  if (NumElem == 3 && VTy->getScalarSizeInBits() == 64)
3214  // Cost = 128 bit store + unpack + 64 bit store.
3215  return 3;
3216 
3217  // Assume that all other non-power-of-two numbers are scalarized.
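  // E.g. (illustrative): a <5 x float> load is costed as 5 scalar loads
  // plus the insertelement overhead needed to reassemble the vector.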
3218  if (!isPowerOf2_32(NumElem)) {
3219  APInt DemandedElts = APInt::getAllOnesValue(NumElem);
3220  int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment,
3221  AddressSpace, CostKind);
3222  int SplitCost = getScalarizationOverhead(VTy, DemandedElts,
3223  Opcode == Instruction::Load,
3224  Opcode == Instruction::Store);
3225  return NumElem * Cost + SplitCost;
3226  }
3227  }
3228 
3229  // Type legalization can't handle structs
3230  if (TLI->getValueType(DL, Src, true) == MVT::Other)
3231  return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
3232  CostKind);
3233 
3234  // Legalize the type.
3235  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
3236  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
3237  "Invalid Opcode");
3238 
3239  // Each load/store unit costs 1.
3240  int Cost = LT.first * 1;
3241 
3242  // This isn't exactly right. We're using slow unaligned 32-byte accesses as a
3243  // proxy for a double-pumped AVX memory interface such as on Sandy Bridge.
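  // E.g. (illustrative): an unaligned v8f32 access on such a target is
  // effectively issued as two 16-byte halves, hence the doubled cost.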
3244  if (LT.second.getStoreSize() == 32 && ST->isUnalignedMem32Slow())
3245  Cost *= 2;
3246 
3247  return Cost;
3248 }
3249 
3250 int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
3251  Align Alignment, unsigned AddressSpace,
3252  TTI::TargetCostKind CostKind) {
3253  bool IsLoad = (Instruction::Load == Opcode);
3254  bool IsStore = (Instruction::Store == Opcode);
3255 
3256  auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy);
3257  if (!SrcVTy)
3258  // For a scalar, take the regular cost without the mask.
3259  return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind);
3260 
3261  unsigned NumElem = SrcVTy->getNumElements();
3262  auto *MaskTy =
3263  FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
3264  if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) ||
3265  (IsStore && !isLegalMaskedStore(SrcVTy, Alignment)) ||
3266  !isPowerOf2_32(NumElem)) {
3267  // Scalarization
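  // E.g. (illustrative): a masked <3 x float> load has a non-power-of-two
  // element count, so it is costed as 3 scalar loads, 3 mask-bit
  // compare + branch pairs, plus the mask-extract and value-insert
  // overhead computed below.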
3268  APInt DemandedElts = APInt::getAllOnesValue(NumElem);
3269  int MaskSplitCost =
3270  getScalarizationOverhead(MaskTy, DemandedElts, false, true);
3271  int ScalarCompareCost = getCmpSelInstrCost(
3272  Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
3273  CmpInst::BAD_ICMP_PREDICATE, CostKind);
3274  int BranchCost = getCFInstrCost(Instruction::Br, CostKind);
3275  int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
3276  int ValueSplitCost =
3277  getScalarizationOverhead(SrcVTy, DemandedElts, IsLoad, IsStore);
3278  int MemopCost =
3279  NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
3280  Alignment, AddressSpace, CostKind);
3281  return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
3282  }
3283 
3284  // Legalize the type.
3285  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy);
3286  auto VT = TLI->getValueType(DL, SrcVTy);
3287  int Cost = 0;
3288  if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
3289  LT.second.getVectorNumElements() == NumElem)
3290  // Promotion requires an expand/truncate of the data and a shuffle of the mask.
3291  Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, 0, nullptr) +
3292  getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, 0, nullptr);
3293 
3294  else if (LT.second.getVectorNumElements() > NumElem) {
3295  auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
3296  LT.second.getVectorNumElements());
3297  // Expanding requires filling the mask with zeroes.
3298  Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy);
3299  }
3300 
3301  // Pre-AVX512 - each maskmov load costs 2; each maskmov store costs ~8.
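  // E.g. (illustrative): a legal v8f32 masked load lowers to a single
  // vmaskmovps here (costed 2), while AVX-512's native mask registers
  // (below) make it a single cheap masked move (costed LT.first).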
3302  if (!ST->hasAVX512())
3303  return Cost + LT.first * (IsLoad ? 2 : 8);
3304 
3305  // AVX-512 masked load/store is cheaper.
3306  return Cost + LT.first;
3307 }
3308 
3309 int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
3310  const SCEV *Ptr) {
3311  // Address computations in vectorized code with non-consecutive addresses will
3312  // likely result in more instructions compared to scalar code where the
3313  // computation can more often be merged into the index mode. The resulting
3314  // extra micro-ops can significantly decrease throughput.
3315  const unsigned NumVectorInstToHideOverhead = 10;
3316 
3317  // Cost modeling of strided access computation is hidden by the indexing
3318  // modes of X86 regardless of the stride value. We don't believe that there
3319  // is a difference between constant strided access in general and a constant
3320  // strided value which is less than or equal to 64.
3321  // Even in the case of a (loop-invariant) stride whose value is not known at
3322  // compile time, the address computation will not incur more than one extra
3323  // ADD instruction.
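  // E.g. (illustrative): vector addresses with no identifiable stride
  // (a gather-like pattern) are charged 10 each; a loop-invariant but
  // unknown stride costs one extra ADD (1); a known constant stride falls
  // through to the cheap default cost.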
3324  if (Ty->isVectorTy() && SE) {
3325  if (!BaseT::isStridedAccess(Ptr))
3326  return NumVectorInstToHideOverhead;
3327  if (!BaseT::getConstantStrideStep(SE, Ptr))
3328  return 1;
3329  }
3330 
3331  return BaseT::getAddressComputationCost(Ty, SE, Ptr);
3332 }
3333 
3334 int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
3335  bool IsPairwise,
3336  TTI::TargetCostKind CostKind) {
3337  // Just use the default implementation for pair reductions.
3338  if (IsPairwise)
3339  return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise, CostKind);
3340 
3341  // We use the Intel Architecture Code Analyzer (IACA) to measure the
3342  // throughput and use that as the cost.
3343 
3344  static const CostTblEntry SLMCostTblNoPairWise[] = {
3345  { ISD::FADD, MVT::v2f64, 3 },
3346  { ISD::ADD, MVT::v2i64, 5 },
3347  };
3348 
3349  static const CostTblEntry SSE2CostTblNoPairWise[] = {
3350  { ISD::FADD, MVT::v2f64, 2 },
3351  { ISD::FADD, MVT::v4f32, 4 },
3352  { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
3353  { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
3354  { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
3355  { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3".
3356  { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3".
3357  { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
3358  { ISD::ADD, MVT::v2i8, 2 },
3359  { ISD::ADD, MVT::v4i8, 2 },
3360  { ISD::ADD, MVT::v8i8, 2 },
3361  { ISD::ADD, MVT::v16i8, 3 },
3362  };
3363 
3364  static const CostTblEntry AVX1CostTblNoPairWise[] = {
3365  { ISD::FADD, MVT::v4f64, 3 },
3366  { ISD::FADD, MVT::v4f32, 3 },
3367  { ISD::FADD, MVT::v8f32, 4 },
3368  { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
3369  { ISD::ADD, MVT::v4i64, 3 },
3370  { ISD::ADD, MVT::v8i32, 5 },
3371  { ISD::ADD, MVT::v16i16, 5 },
3372  { ISD::ADD, MVT::v32i8, 4 },
3373  };
3374 
3375  int ISD = TLI->InstructionOpcodeToISD(Opcode);
3376  assert(ISD && "Invalid opcode");
3377 
3378  // Before legalizing the type, give a chance to look up illegal narrow types
3379  // in the table.
3380  // FIXME: Is there a better way to do this?
3381  EVT VT = TLI->getValueType(DL, ValTy);
3382  if (VT.isSimple()) {
3383  MVT MTy = VT.getSimpleVT();
3384  if (ST->isSLM())
3385  if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy))
3386  return Entry->Cost;
3387 
3388  if (ST->hasAVX())
3389  if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
3390  return Entry->Cost;
3391 
3392  if (ST->hasSSE2())
3393  if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
3394  return Entry->Cost;
3395  }
3396 
3397  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
3398 
3399  MVT MTy = LT.second;
3400 
3401  auto *ValVTy = cast<FixedVectorType>(ValTy);
3402 
3403  unsigned ArithmeticCost = 0;
3404  if (LT.first != 1 && MTy.isVector() &&
3405  MTy.getVectorNumElements() < ValVTy->getNumElements()) {
3406  // Type needs to be split. We need LT.first - 1 arithmetic ops.
3407  auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
3408  MTy.getVectorNumElements());
3409  ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
3410  ArithmeticCost *= LT.first - 1;
3411  }
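  // E.g. (illustrative): a v16f32 FADD reduction on AVX1 splits into two
  // v8f32 halves, paying LT.first - 1 == 1 full-width v8f32 FADD here
  // plus the v8f32 FADD entry (4) from AVX1CostTblNoPairWise above.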
3412 
3413  if (ST->isSLM())
3414  if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy))
3415  return ArithmeticCost + Entry->Cost;
3416 
3417  if (ST->hasAVX())
3418  if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
3419  return ArithmeticCost + Entry->Cost;
3420 
3421  if (ST->hasSSE2())
3422  if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
3423  return ArithmeticCost + Entry->Cost;
3424 
3425  // FIXME: These assume a naive kshift+binop lowering, which is probably
3426  // conservative in most cases.
3427  static const CostTblEntry AVX512BoolReduction[] = {
3428  { ISD::AND, MVT::v2i1, 3 },
3429  { ISD::AND, MVT::v4i1, 5 },
3430  { ISD::AND, MVT::v8i1, 7 },
3431  { ISD::AND, MVT::v16i1, 9 },
3432  { ISD::AND, MVT::v32i1, 11 },
3433  { ISD::AND, MVT::v64i1, 13 },
3434  { ISD::OR, MVT::v2i1, 3 },
3435  { ISD::OR, MVT::v4i1, 5 },
3436  { ISD::OR, MVT::v8i1, 7 },
3437  { ISD::OR, MVT::v16i1, 9 },
3438  { ISD::OR, MVT::v32i1, 11 },
3439  { ISD::OR, MVT::v64i1, 13 },
3440  };
3441 
3442  static const CostTblEntry AVX2BoolReduction[] = {
3443  { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp
3444  { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp
3445  { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp
3446  { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp
3447  };
3448 
3449  static const CostTblEntry AVX1BoolReduction[] = {
3450  { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp
3451  { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp
3452  { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
3453  { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
3454  { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp
3455  { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp
3456  { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
3457  { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
3458  };
3459 
3460  static const CostTblEntry SSE2BoolReduction[] = {
3461  { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp
3462  { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp
3463  { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp
3464  { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp
3465  { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp
3466  { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp
3467  { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp
3468  { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp
3469  };
3470 
3471  // Handle bool allof/anyof patterns.
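  // E.g. (illustrative): an all-of test over a v16i1 compare result on an
  // SSE2 target legalizes to v16i8 and maps to PMOVMSKB + CMP, cost 2,
  // per SSE2BoolReduction above.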
3472  if (ValVTy->getElementType()->isIntegerTy(1)) {
3473  unsigned ArithmeticCost = 0;
3474  if (LT.first != 1 && MTy.isVector() &&
3475  MTy.getVectorNumElements() < ValVTy->getNumElements()) {
3476  // Type needs to be split. We need LT.first - 1 arithmetic ops.
3477  auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
3478  MTy.getVectorNumElements());
3479  ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
3480  ArithmeticCost *= LT.first - 1;
3481  }
3482 
3483  if (ST->hasAVX512())
3484  if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy))
3485  return ArithmeticCost + Entry->Cost;
3486  if (ST->hasAVX2())
3487  if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
3488  return ArithmeticCost + Entry->Cost;
3489  if (ST->hasAVX())
3490  if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
3491  return ArithmeticCost + Entry->Cost;
3492  if (ST->hasSSE2())
3493  if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
3494  return ArithmeticCost + Entry->Cost;
3495 
3496  return BaseT::getArithmeticReductionCost(Opcode, ValVTy, IsPairwise,
3497  CostKind);
3498  }
3499 
3500  unsigned NumVecElts = ValVTy->getNumElements();
3501  unsigned ScalarSize = ValVTy->getScalarSizeInBits();