X86TargetTransformInfo.cpp
1 //===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 /// \file
10 /// This file implements a TargetTransformInfo analysis pass specific to the
11 /// X86 target machine. It uses the target's detailed information to provide
12 /// more precise answers to certain TTI queries, while letting the target
13 /// independent and default TTI implementations handle the rest.
14 ///
15 //===----------------------------------------------------------------------===//
16 /// A note about the cost model numbers used below: they correspond to some
17 /// "generic" X86 CPU rather than to a concrete CPU model. Usually a number
18 /// corresponds to the CPU where the feature first appeared. For example, if we
19 /// check Subtarget.hasSSE42() in the lookups below, the cost is based on
20 /// Nehalem, as that was the first CPU to support that feature level and thus
21 /// most likely has the worst-case cost.
22 /// Some examples of other technologies/CPUs:
23 /// SSE 3 - Pentium4 / Athlon64
24 /// SSE 4.1 - Penryn
25 /// SSE 4.2 - Nehalem
26 /// AVX - Sandy Bridge
27 /// AVX2 - Haswell
28 /// AVX-512 - Xeon Phi / Skylake
29 /// And some examples of instruction target dependent costs (latency)
30 ///                    divss    sqrtss   rsqrtss
31 ///   AMD K7           11-16    19       3
32 ///   Piledriver       9-24     13-15    5
33 ///   Jaguar           14       16       2
34 ///   Pentium II,III   18       30       2
35 ///   Nehalem          7-14     7-18     3
36 ///   Haswell          10-13    11       5
37 /// TODO: Develop and implement the target dependent cost model and
38 /// specialize cost numbers for different Cost Model Targets such as throughput,
39 /// code size, latency and uop count.
40 //===----------------------------------------------------------------------===//
41 
42 #include "X86TargetTransformInfo.h"
43 #include "llvm/Analysis/TargetTransformInfo.h"
44 #include "llvm/CodeGen/BasicTTIImpl.h"
45 #include "llvm/CodeGen/CostTable.h"
46 #include "llvm/CodeGen/TargetLowering.h"
47 #include "llvm/IR/IntrinsicInst.h"
48 #include "llvm/Support/Debug.h"
49 
50 using namespace llvm;
51 
52 #define DEBUG_TYPE "x86tti"
53 
54 //===----------------------------------------------------------------------===//
55 //
56 // X86 cost model.
57 //
58 //===----------------------------------------------------------------------===//
59 
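// Illustrative sketch (editor-added; not part of the original file): how a
// client pass, e.g. the loop vectorizer, reaches the hooks below through the
// generic TargetTransformInfo wrapper. The helper name is hypothetical and the
// snippet relies only on headers already included above.
static int exampleVectorMulCost(const TargetTransformInfo &TTI,
                                LLVMContext &Ctx) {
  // Throughput cost of a <8 x i32> multiply. On an AVX2 subtarget this is
  // answered by the { ISD::MUL, MVT::v8i32, 2 } entry further below once the
  // type has been legalized to v8i32.
  Type *VecTy = VectorType::get(Type::getInt32Ty(Ctx), 8);
  return TTI.getArithmeticInstrCost(Instruction::Mul, VecTy);
}
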
60 TargetTransformInfo::PopcntSupportKind
61 X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
62   assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
63   // TODO: Currently the __builtin_popcount() implementation using SSE3
64   //   instructions is inefficient. Once the problem is fixed, we should
65   //   call ST->hasSSE3() instead of ST->hasPOPCNT().
66   return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
67 }
68 
69 llvm::Optional<unsigned> X86TTIImpl::getCacheSize(
70     TargetTransformInfo::CacheLevel Level) const {
71   switch (Level) {
72   case TargetTransformInfo::CacheLevel::L1D:
73  // - Penryn
74  // - Nehalem
75  // - Westmere
76  // - Sandy Bridge
77  // - Ivy Bridge
78  // - Haswell
79  // - Broadwell
80  // - Skylake
81  // - Kabylake
82     return 32 * 1024; //  32 KByte
83   case TargetTransformInfo::CacheLevel::L2D:
84  // - Penryn
85  // - Nehalem
86  // - Westmere
87  // - Sandy Bridge
88  // - Ivy Bridge
89  // - Haswell
90  // - Broadwell
91  // - Skylake
92  // - Kabylake
93  return 256 * 1024; // 256 KByte
94  }
95 
96  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
97 }
98 
99 llvm::Optional<unsigned> X86TTIImpl::getCacheAssociativity(
100     TargetTransformInfo::CacheLevel Level) const {
101   //   - Penryn
102  // - Nehalem
103  // - Westmere
104  // - Sandy Bridge
105  // - Ivy Bridge
106  // - Haswell
107  // - Broadwell
108  // - Skylake
109  // - Kabylake
110  switch (Level) {
111   case TargetTransformInfo::CacheLevel::L1D:
112     LLVM_FALLTHROUGH;
113   case TargetTransformInfo::CacheLevel::L2D:
114     return 8;
115  }
116 
117  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
118 }
119 
120 unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) {
121  if (Vector && !ST->hasSSE1())
122  return 0;
123 
124  if (ST->is64Bit()) {
125  if (Vector && ST->hasAVX512())
126  return 32;
127  return 16;
128  }
129  return 8;
130 }
131 
132 unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) const {
133  unsigned PreferVectorWidth = ST->getPreferVectorWidth();
134  if (Vector) {
135  if (ST->hasAVX512() && PreferVectorWidth >= 512)
136  return 512;
137  if (ST->hasAVX() && PreferVectorWidth >= 256)
138  return 256;
139  if (ST->hasSSE1() && PreferVectorWidth >= 128)
140  return 128;
141  return 0;
142  }
143 
144  if (ST->is64Bit())
145  return 64;
146 
147  return 32;
148 }
149 
150 unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
151  return getRegisterBitWidth(true);
152 }
153 
154 unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
155   // If the loop will not be vectorized, don't interleave it. Let the regular
156   // unroller handle it instead, which saves the overflow-check and
157   // memory-check cost.
158  if (VF == 1)
159  return 1;
160 
161  if (ST->isAtom())
162  return 1;
163 
164  // Sandybridge and Haswell have multiple execution ports and pipelined
165  // vector units.
166  if (ST->hasAVX())
167  return 4;
168 
169  return 2;
170 }
171 
172 int X86TTIImpl::getArithmeticInstrCost(
173     unsigned Opcode, Type *Ty,
174     TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
175     TTI::OperandValueProperties Opd1PropInfo,
176     TTI::OperandValueProperties Opd2PropInfo,
177     ArrayRef<const Value *> Args) {
178   // Legalize the type.
179  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
180 
181  int ISD = TLI->InstructionOpcodeToISD(Opcode);
182  assert(ISD && "Invalid opcode");
183 
184  static const CostTblEntry GLMCostTable[] = {
185  { ISD::FDIV, MVT::f32, 18 }, // divss
186  { ISD::FDIV, MVT::v4f32, 35 }, // divps
187  { ISD::FDIV, MVT::f64, 33 }, // divsd
188  { ISD::FDIV, MVT::v2f64, 65 }, // divpd
189  };
190 
191  if (ST->isGLM())
192  if (const auto *Entry = CostTableLookup(GLMCostTable, ISD,
193  LT.second))
194  return LT.first * Entry->Cost;
195 
196  static const CostTblEntry SLMCostTable[] = {
197  { ISD::MUL, MVT::v4i32, 11 }, // pmulld
198  { ISD::MUL, MVT::v8i16, 2 }, // pmullw
199  { ISD::MUL, MVT::v16i8, 14 }, // extend/pmullw/trunc sequence.
200  { ISD::FMUL, MVT::f64, 2 }, // mulsd
201  { ISD::FMUL, MVT::v2f64, 4 }, // mulpd
202  { ISD::FMUL, MVT::v4f32, 2 }, // mulps
203  { ISD::FDIV, MVT::f32, 17 }, // divss
204  { ISD::FDIV, MVT::v4f32, 39 }, // divps
205  { ISD::FDIV, MVT::f64, 32 }, // divsd
206  { ISD::FDIV, MVT::v2f64, 69 }, // divpd
207  { ISD::FADD, MVT::v2f64, 2 }, // addpd
208  { ISD::FSUB, MVT::v2f64, 2 }, // subpd
209   // v2i64/v4i64 mul is custom lowered as a series of long
210   // multiplies(3), shifts(3) and adds(2).
211   // slm muldq version throughput is 2 and addq throughput 4
212   // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
213   // 2X4 (addq throughput) = 17
214  { ISD::MUL, MVT::v2i64, 17 },
215  // slm addq\subq throughput is 4
216  { ISD::ADD, MVT::v2i64, 4 },
217  { ISD::SUB, MVT::v2i64, 4 },
218  };
219 
220  if (ST->isSLM()) {
221  if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) {
222       // Check if the operands can be shrunk into a smaller datatype.
223  bool Op1Signed = false;
224  unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
225  bool Op2Signed = false;
226  unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
227 
228  bool signedMode = Op1Signed | Op2Signed;
229  unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
230 
231  if (OpMinSize <= 7)
232  return LT.first * 3; // pmullw/sext
233  if (!signedMode && OpMinSize <= 8)
234  return LT.first * 3; // pmullw/zext
235  if (OpMinSize <= 15)
236  return LT.first * 5; // pmullw/pmulhw/pshuf
237  if (!signedMode && OpMinSize <= 16)
238  return LT.first * 5; // pmullw/pmulhw/pshuf
239  }
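      // Illustration (editor-added): on SLM a v4i32 multiply whose operands are
      // known to need at most 15 signed bits is costed as the 5-instruction
      // pmullw/pmulhw/pshuf sequence above rather than the 11-cycle pmulld
      // entry in the SLM cost table.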
240 
241  if (const auto *Entry = CostTableLookup(SLMCostTable, ISD,
242  LT.second)) {
243  return LT.first * Entry->Cost;
244  }
245  }
246 
247  if ((ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
248        ISD == ISD::UREM) &&
249       (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
250        Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
251       Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
252  if (ISD == ISD::SDIV || ISD == ISD::SREM) {
253     // On X86, vector signed division by a constant power-of-two is
254     // normally expanded to the sequence SRA + SRL + ADD + SRA.
255     // The OperandValue properties may not be the same as those of the
256     // previous operation; conservatively assume OP_None.
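    // For illustration (editor-added), with a divisor of 4 (k = 2) the
    // expansion computes, per 32-bit lane:
    //   Sign = X >> 31;                    // SRA: all-ones if X < 0
    //   Bias = (unsigned)Sign >> (32 - 2); // SRL: 3 if X < 0, else 0
    //   Res  = (X + Bias) >> 2;            // ADD + SRA: rounds toward zero
    // which matches the semantics of a signed division by 4.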
257  int Cost =
258         2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info, Op2Info,
259                                    TargetTransformInfo::OP_None,
260                                    TargetTransformInfo::OP_None);
261     Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
262                                    TargetTransformInfo::OP_None,
263                                    TargetTransformInfo::OP_None);
264     Cost += getArithmeticInstrCost(Instruction::Add, Ty, Op1Info, Op2Info,
265                                    TargetTransformInfo::OP_None,
266                                    TargetTransformInfo::OP_None);
267 
268  if (ISD == ISD::SREM) {
269  // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
270  Cost += getArithmeticInstrCost(Instruction::Mul, Ty, Op1Info, Op2Info);
271  Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Op1Info, Op2Info);
272  }
273 
274  return Cost;
275  }
276 
277  // Vector unsigned division/remainder will be simplified to shifts/masks.
278  if (ISD == ISD::UDIV)
279       return getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
280                                     TargetTransformInfo::OP_None,
281                                     TargetTransformInfo::OP_None);
282 
283     if (ISD == ISD::UREM)
284       return getArithmeticInstrCost(Instruction::And, Ty, Op1Info, Op2Info,
285                                     TargetTransformInfo::OP_None,
286                                     TargetTransformInfo::OP_None);
287  }
288 
289  static const CostTblEntry AVX512BWUniformConstCostTable[] = {
290  { ISD::SHL, MVT::v64i8, 2 }, // psllw + pand.
291  { ISD::SRL, MVT::v64i8, 2 }, // psrlw + pand.
292  { ISD::SRA, MVT::v64i8, 4 }, // psrlw, pand, pxor, psubb.
293  };
294 
295   if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
296       ST->hasBWI()) {
297  if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD,
298  LT.second))
299  return LT.first * Entry->Cost;
300  }
301 
302  static const CostTblEntry AVX512UniformConstCostTable[] = {
303  { ISD::SRA, MVT::v2i64, 1 },
304  { ISD::SRA, MVT::v4i64, 1 },
305  { ISD::SRA, MVT::v8i64, 1 },
306  };
307 
308   if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
309       ST->hasAVX512()) {
310  if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD,
311  LT.second))
312  return LT.first * Entry->Cost;
313  }
314 
315  static const CostTblEntry AVX2UniformConstCostTable[] = {
316  { ISD::SHL, MVT::v32i8, 2 }, // psllw + pand.
317  { ISD::SRL, MVT::v32i8, 2 }, // psrlw + pand.
318  { ISD::SRA, MVT::v32i8, 4 }, // psrlw, pand, pxor, psubb.
319 
320  { ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle.
321  };
322 
323   if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
324       ST->hasAVX2()) {
325  if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD,
326  LT.second))
327  return LT.first * Entry->Cost;
328  }
329 
330  static const CostTblEntry SSE2UniformConstCostTable[] = {
331  { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand.
332  { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand.
333  { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb.
334 
335  { ISD::SHL, MVT::v32i8, 4+2 }, // 2*(psllw + pand) + split.
336  { ISD::SRL, MVT::v32i8, 4+2 }, // 2*(psrlw + pand) + split.
337  { ISD::SRA, MVT::v32i8, 8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.
338  };
339 
340  // XOP has faster vXi8 shifts.
341   if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
342       ST->hasSSE2() && !ST->hasXOP()) {
343  if (const auto *Entry =
344  CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
345  return LT.first * Entry->Cost;
346  }
347 
348  static const CostTblEntry AVX512BWConstCostTable[] = {
349  { ISD::SDIV, MVT::v64i8, 14 }, // 2*ext+2*pmulhw sequence
350  { ISD::SREM, MVT::v64i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
351  { ISD::UDIV, MVT::v64i8, 14 }, // 2*ext+2*pmulhw sequence
352  { ISD::UREM, MVT::v64i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
353  { ISD::SDIV, MVT::v32i16, 6 }, // vpmulhw sequence
354  { ISD::SREM, MVT::v32i16, 8 }, // vpmulhw+mul+sub sequence
355  { ISD::UDIV, MVT::v32i16, 6 }, // vpmulhuw sequence
356  { ISD::UREM, MVT::v32i16, 8 }, // vpmulhuw+mul+sub sequence
357  };
358 
359   if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
360        Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
361       ST->hasBWI()) {
362  if (const auto *Entry =
363  CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
364  return LT.first * Entry->Cost;
365  }
366 
367  static const CostTblEntry AVX512ConstCostTable[] = {
368  { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
369  { ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence
370  { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
371  { ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence
372  };
373 
374   if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
375        Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
376       ST->hasAVX512()) {
377  if (const auto *Entry =
378  CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
379  return LT.first * Entry->Cost;
380  }
381 
382  static const CostTblEntry AVX2ConstCostTable[] = {
383  { ISD::SDIV, MVT::v32i8, 14 }, // 2*ext+2*pmulhw sequence
384  { ISD::SREM, MVT::v32i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
385  { ISD::UDIV, MVT::v32i8, 14 }, // 2*ext+2*pmulhw sequence
386  { ISD::UREM, MVT::v32i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
387  { ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence
388  { ISD::SREM, MVT::v16i16, 8 }, // vpmulhw+mul+sub sequence
389  { ISD::UDIV, MVT::v16i16, 6 }, // vpmulhuw sequence
390  { ISD::UREM, MVT::v16i16, 8 }, // vpmulhuw+mul+sub sequence
391  { ISD::SDIV, MVT::v8i32, 15 }, // vpmuldq sequence
392  { ISD::SREM, MVT::v8i32, 19 }, // vpmuldq+mul+sub sequence
393  { ISD::UDIV, MVT::v8i32, 15 }, // vpmuludq sequence
394  { ISD::UREM, MVT::v8i32, 19 }, // vpmuludq+mul+sub sequence
395  };
396 
397   if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
398        Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
399       ST->hasAVX2()) {
400  if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
401  return LT.first * Entry->Cost;
402  }
403 
404  static const CostTblEntry SSE2ConstCostTable[] = {
405  { ISD::SDIV, MVT::v32i8, 28+2 }, // 4*ext+4*pmulhw sequence + split.
406  { ISD::SREM, MVT::v32i8, 32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
407  { ISD::SDIV, MVT::v16i8, 14 }, // 2*ext+2*pmulhw sequence
408  { ISD::SREM, MVT::v16i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
409  { ISD::UDIV, MVT::v32i8, 28+2 }, // 4*ext+4*pmulhw sequence + split.
410  { ISD::UREM, MVT::v32i8, 32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
411  { ISD::UDIV, MVT::v16i8, 14 }, // 2*ext+2*pmulhw sequence
412  { ISD::UREM, MVT::v16i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
413  { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
414  { ISD::SREM, MVT::v16i16, 16+2 }, // 2*pmulhw+mul+sub sequence + split.
415  { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence
416  { ISD::SREM, MVT::v8i16, 8 }, // pmulhw+mul+sub sequence
417  { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split.
418  { ISD::UREM, MVT::v16i16, 16+2 }, // 2*pmulhuw+mul+sub sequence + split.
419  { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence
420  { ISD::UREM, MVT::v8i16, 8 }, // pmulhuw+mul+sub sequence
421  { ISD::SDIV, MVT::v8i32, 38+2 }, // 2*pmuludq sequence + split.
422  { ISD::SREM, MVT::v8i32, 48+2 }, // 2*pmuludq+mul+sub sequence + split.
423  { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence
424  { ISD::SREM, MVT::v4i32, 24 }, // pmuludq+mul+sub sequence
425  { ISD::UDIV, MVT::v8i32, 30+2 }, // 2*pmuludq sequence + split.
426  { ISD::UREM, MVT::v8i32, 40+2 }, // 2*pmuludq+mul+sub sequence + split.
427  { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence
428  { ISD::UREM, MVT::v4i32, 20 }, // pmuludq+mul+sub sequence
429  };
430 
431   if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
432        Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
433       ST->hasSSE2()) {
434  // pmuldq sequence.
435  if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
436  return LT.first * 32;
437  if (ISD == ISD::SREM && LT.second == MVT::v8i32 && ST->hasAVX())
438  return LT.first * 38;
439  if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
440  return LT.first * 15;
441  if (ISD == ISD::SREM && LT.second == MVT::v4i32 && ST->hasSSE41())
442  return LT.first * 20;
443 
444  if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
445  return LT.first * Entry->Cost;
446  }
447 
448  static const CostTblEntry AVX2UniformCostTable[] = {
449  // Uniform splats are cheaper for the following instructions.
450  { ISD::SHL, MVT::v16i16, 1 }, // psllw.
451  { ISD::SRL, MVT::v16i16, 1 }, // psrlw.
452  { ISD::SRA, MVT::v16i16, 1 }, // psraw.
453  };
454 
455  if (ST->hasAVX2() &&
456       ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
457        (Op2Info == TargetTransformInfo::OK_UniformValue))) {
458  if (const auto *Entry =
459  CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
460  return LT.first * Entry->Cost;
461  }
462 
463  static const CostTblEntry SSE2UniformCostTable[] = {
464  // Uniform splats are cheaper for the following instructions.
465  { ISD::SHL, MVT::v8i16, 1 }, // psllw.
466  { ISD::SHL, MVT::v4i32, 1 }, // pslld
467  { ISD::SHL, MVT::v2i64, 1 }, // psllq.
468 
469  { ISD::SRL, MVT::v8i16, 1 }, // psrlw.
470  { ISD::SRL, MVT::v4i32, 1 }, // psrld.
471  { ISD::SRL, MVT::v2i64, 1 }, // psrlq.
472 
473  { ISD::SRA, MVT::v8i16, 1 }, // psraw.
474  { ISD::SRA, MVT::v4i32, 1 }, // psrad.
475  };
476 
477  if (ST->hasSSE2() &&
478       ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
479        (Op2Info == TargetTransformInfo::OK_UniformValue))) {
480  if (const auto *Entry =
481  CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
482  return LT.first * Entry->Cost;
483  }
484 
485  static const CostTblEntry AVX512DQCostTable[] = {
486  { ISD::MUL, MVT::v2i64, 1 },
487  { ISD::MUL, MVT::v4i64, 1 },
488  { ISD::MUL, MVT::v8i64, 1 }
489  };
490 
491  // Look for AVX512DQ lowering tricks for custom cases.
492  if (ST->hasDQI())
493  if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
494  return LT.first * Entry->Cost;
495 
496  static const CostTblEntry AVX512BWCostTable[] = {
497  { ISD::SHL, MVT::v8i16, 1 }, // vpsllvw
498  { ISD::SRL, MVT::v8i16, 1 }, // vpsrlvw
499  { ISD::SRA, MVT::v8i16, 1 }, // vpsravw
500 
501  { ISD::SHL, MVT::v16i16, 1 }, // vpsllvw
502  { ISD::SRL, MVT::v16i16, 1 }, // vpsrlvw
503  { ISD::SRA, MVT::v16i16, 1 }, // vpsravw
504 
505  { ISD::SHL, MVT::v32i16, 1 }, // vpsllvw
506  { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw
507  { ISD::SRA, MVT::v32i16, 1 }, // vpsravw
508 
509  { ISD::SHL, MVT::v64i8, 11 }, // vpblendvb sequence.
510  { ISD::SRL, MVT::v64i8, 11 }, // vpblendvb sequence.
511  { ISD::SRA, MVT::v64i8, 24 }, // vpblendvb sequence.
512 
513  { ISD::MUL, MVT::v64i8, 11 }, // extend/pmullw/trunc sequence.
514  { ISD::MUL, MVT::v32i8, 4 }, // extend/pmullw/trunc sequence.
515  { ISD::MUL, MVT::v16i8, 4 }, // extend/pmullw/trunc sequence.
516  };
517 
518  // Look for AVX512BW lowering tricks for custom cases.
519  if (ST->hasBWI())
520  if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
521  return LT.first * Entry->Cost;
522 
523  static const CostTblEntry AVX512CostTable[] = {
524  { ISD::SHL, MVT::v16i32, 1 },
525  { ISD::SRL, MVT::v16i32, 1 },
526  { ISD::SRA, MVT::v16i32, 1 },
527 
528  { ISD::SHL, MVT::v8i64, 1 },
529  { ISD::SRL, MVT::v8i64, 1 },
530 
531  { ISD::SRA, MVT::v2i64, 1 },
532  { ISD::SRA, MVT::v4i64, 1 },
533  { ISD::SRA, MVT::v8i64, 1 },
534 
535  { ISD::MUL, MVT::v32i8, 13 }, // extend/pmullw/trunc sequence.
536  { ISD::MUL, MVT::v16i8, 5 }, // extend/pmullw/trunc sequence.
537  { ISD::MUL, MVT::v16i32, 1 }, // pmulld (Skylake from agner.org)
538  { ISD::MUL, MVT::v8i32, 1 }, // pmulld (Skylake from agner.org)
539  { ISD::MUL, MVT::v4i32, 1 }, // pmulld (Skylake from agner.org)
540  { ISD::MUL, MVT::v8i64, 8 }, // 3*pmuludq/3*shift/2*add
541 
542  { ISD::FADD, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
543  { ISD::FSUB, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
544  { ISD::FMUL, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
545 
546  { ISD::FADD, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
547  { ISD::FSUB, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
548  { ISD::FMUL, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
549  };
550 
551  if (ST->hasAVX512())
552  if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
553  return LT.first * Entry->Cost;
554 
555  static const CostTblEntry AVX2ShiftCostTable[] = {
556   // Shifts on v4i64/v8i32 on AVX2 are legal even though we declare them
557   // custom in order to detect the cases where the shift amount is a scalar.
558  { ISD::SHL, MVT::v4i32, 1 },
559  { ISD::SRL, MVT::v4i32, 1 },
560  { ISD::SRA, MVT::v4i32, 1 },
561  { ISD::SHL, MVT::v8i32, 1 },
562  { ISD::SRL, MVT::v8i32, 1 },
563  { ISD::SRA, MVT::v8i32, 1 },
564  { ISD::SHL, MVT::v2i64, 1 },
565  { ISD::SRL, MVT::v2i64, 1 },
566  { ISD::SHL, MVT::v4i64, 1 },
567  { ISD::SRL, MVT::v4i64, 1 },
568  };
569 
570  // Look for AVX2 lowering tricks.
571  if (ST->hasAVX2()) {
572     if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
573         (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
574          Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
575       // On AVX2, a packed v16i16 shift left by a constant build_vector
576       // is lowered into a vector multiply (vpmullw).
577       return getArithmeticInstrCost(Instruction::Mul, Ty, Op1Info, Op2Info,
578                                     TargetTransformInfo::OP_None,
579                                     TargetTransformInfo::OP_None);
580 
581  if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
582  return LT.first * Entry->Cost;
583  }
584 
585  static const CostTblEntry XOPShiftCostTable[] = {
586  // 128bit shifts take 1cy, but right shifts require negation beforehand.
587  { ISD::SHL, MVT::v16i8, 1 },
588  { ISD::SRL, MVT::v16i8, 2 },
589  { ISD::SRA, MVT::v16i8, 2 },
590  { ISD::SHL, MVT::v8i16, 1 },
591  { ISD::SRL, MVT::v8i16, 2 },
592  { ISD::SRA, MVT::v8i16, 2 },
593  { ISD::SHL, MVT::v4i32, 1 },
594  { ISD::SRL, MVT::v4i32, 2 },
595  { ISD::SRA, MVT::v4i32, 2 },
596  { ISD::SHL, MVT::v2i64, 1 },
597  { ISD::SRL, MVT::v2i64, 2 },
598  { ISD::SRA, MVT::v2i64, 2 },
599  // 256bit shifts require splitting if AVX2 didn't catch them above.
600  { ISD::SHL, MVT::v32i8, 2+2 },
601  { ISD::SRL, MVT::v32i8, 4+2 },
602  { ISD::SRA, MVT::v32i8, 4+2 },
603  { ISD::SHL, MVT::v16i16, 2+2 },
604  { ISD::SRL, MVT::v16i16, 4+2 },
605  { ISD::SRA, MVT::v16i16, 4+2 },
606  { ISD::SHL, MVT::v8i32, 2+2 },
607  { ISD::SRL, MVT::v8i32, 4+2 },
608  { ISD::SRA, MVT::v8i32, 4+2 },
609  { ISD::SHL, MVT::v4i64, 2+2 },
610  { ISD::SRL, MVT::v4i64, 4+2 },
611  { ISD::SRA, MVT::v4i64, 4+2 },
612  };
613 
614  // Look for XOP lowering tricks.
615  if (ST->hasXOP()) {
616  // If the right shift is constant then we'll fold the negation so
617  // it's as cheap as a left shift.
618  int ShiftISD = ISD;
619     if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) &&
620         (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
621          Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
622       ShiftISD = ISD::SHL;
623  if (const auto *Entry =
624  CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
625  return LT.first * Entry->Cost;
626  }
627 
628  static const CostTblEntry SSE2UniformShiftCostTable[] = {
629  // Uniform splats are cheaper for the following instructions.
630  { ISD::SHL, MVT::v16i16, 2+2 }, // 2*psllw + split.
631  { ISD::SHL, MVT::v8i32, 2+2 }, // 2*pslld + split.
632  { ISD::SHL, MVT::v4i64, 2+2 }, // 2*psllq + split.
633 
634  { ISD::SRL, MVT::v16i16, 2+2 }, // 2*psrlw + split.
635  { ISD::SRL, MVT::v8i32, 2+2 }, // 2*psrld + split.
636  { ISD::SRL, MVT::v4i64, 2+2 }, // 2*psrlq + split.
637 
638  { ISD::SRA, MVT::v16i16, 2+2 }, // 2*psraw + split.
639  { ISD::SRA, MVT::v8i32, 2+2 }, // 2*psrad + split.
640  { ISD::SRA, MVT::v2i64, 4 }, // 2*psrad + shuffle.
641  { ISD::SRA, MVT::v4i64, 8+2 }, // 2*(2*psrad + shuffle) + split.
642  };
643 
644  if (ST->hasSSE2() &&
645       ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
646        (Op2Info == TargetTransformInfo::OK_UniformValue))) {
647 
648  // Handle AVX2 uniform v4i64 ISD::SRA, it's not worth a table.
649  if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2())
650  return LT.first * 4; // 2*psrad + shuffle.
651 
652  if (const auto *Entry =
653  CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second))
654  return LT.first * Entry->Cost;
655  }
656 
657  if (ISD == ISD::SHL &&
658       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
659     MVT VT = LT.second;
660     // A vector shift left by a non-uniform constant can be lowered into a
661     // vector multiply.
662  if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
663  ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
664  ISD = ISD::MUL;
665  }
666 
667  static const CostTblEntry AVX2CostTable[] = {
668  { ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence.
669  { ISD::SHL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
670 
671  { ISD::SRL, MVT::v32i8, 11 }, // vpblendvb sequence.
672  { ISD::SRL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
673 
674  { ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence.
675  { ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence.
676  { ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence.
677  { ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence.
678 
679  { ISD::SUB, MVT::v32i8, 1 }, // psubb
680  { ISD::ADD, MVT::v32i8, 1 }, // paddb
681  { ISD::SUB, MVT::v16i16, 1 }, // psubw
682  { ISD::ADD, MVT::v16i16, 1 }, // paddw
683  { ISD::SUB, MVT::v8i32, 1 }, // psubd
684  { ISD::ADD, MVT::v8i32, 1 }, // paddd
685  { ISD::SUB, MVT::v4i64, 1 }, // psubq
686  { ISD::ADD, MVT::v4i64, 1 }, // paddq
687 
688  { ISD::MUL, MVT::v32i8, 17 }, // extend/pmullw/trunc sequence.
689  { ISD::MUL, MVT::v16i8, 7 }, // extend/pmullw/trunc sequence.
690  { ISD::MUL, MVT::v16i16, 1 }, // pmullw
691  { ISD::MUL, MVT::v8i32, 2 }, // pmulld (Haswell from agner.org)
692  { ISD::MUL, MVT::v4i64, 8 }, // 3*pmuludq/3*shift/2*add
693 
694  { ISD::FADD, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
695  { ISD::FADD, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
696  { ISD::FSUB, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
697  { ISD::FSUB, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
698  { ISD::FMUL, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
699  { ISD::FMUL, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
700 
701  { ISD::FDIV, MVT::f32, 7 }, // Haswell from http://www.agner.org/
702  { ISD::FDIV, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
703  { ISD::FDIV, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
704  { ISD::FDIV, MVT::f64, 14 }, // Haswell from http://www.agner.org/
705  { ISD::FDIV, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
706  { ISD::FDIV, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
707  };
708 
709  // Look for AVX2 lowering tricks for custom cases.
710  if (ST->hasAVX2())
711  if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
712  return LT.first * Entry->Cost;
713 
714  static const CostTblEntry AVX1CostTable[] = {
715  // We don't have to scalarize unsupported ops. We can issue two half-sized
716  // operations and we only need to extract the upper YMM half.
717  // Two ops + 1 extract + 1 insert = 4.
718  { ISD::MUL, MVT::v16i16, 4 },
719  { ISD::MUL, MVT::v8i32, 4 },
720  { ISD::SUB, MVT::v32i8, 4 },
721  { ISD::ADD, MVT::v32i8, 4 },
722  { ISD::SUB, MVT::v16i16, 4 },
723  { ISD::ADD, MVT::v16i16, 4 },
724  { ISD::SUB, MVT::v8i32, 4 },
725  { ISD::ADD, MVT::v8i32, 4 },
726  { ISD::SUB, MVT::v4i64, 4 },
727  { ISD::ADD, MVT::v4i64, 4 },
728 
729     // A v4i64 multiply is custom lowered as two split v2i64 vectors that are
730     // then lowered as a series of long multiplies(3), shifts(3) and adds(2).
731  // Because we believe v4i64 to be a legal type, we must also include the
732  // extract+insert in the cost table. Therefore, the cost here is 18
733  // instead of 8.
734  { ISD::MUL, MVT::v4i64, 18 },
735 
736  { ISD::MUL, MVT::v32i8, 26 }, // extend/pmullw/trunc sequence.
737 
738  { ISD::FDIV, MVT::f32, 14 }, // SNB from http://www.agner.org/
739  { ISD::FDIV, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
740  { ISD::FDIV, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
741  { ISD::FDIV, MVT::f64, 22 }, // SNB from http://www.agner.org/
742  { ISD::FDIV, MVT::v2f64, 22 }, // SNB from http://www.agner.org/
743  { ISD::FDIV, MVT::v4f64, 44 }, // SNB from http://www.agner.org/
744  };
745 
746  if (ST->hasAVX())
747  if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
748  return LT.first * Entry->Cost;
749 
750  static const CostTblEntry SSE42CostTable[] = {
751  { ISD::FADD, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
752  { ISD::FADD, MVT::f32, 1 }, // Nehalem from http://www.agner.org/
753  { ISD::FADD, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
754  { ISD::FADD, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
755 
756  { ISD::FSUB, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
757  { ISD::FSUB, MVT::f32 , 1 }, // Nehalem from http://www.agner.org/
758  { ISD::FSUB, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
759  { ISD::FSUB, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
760 
761  { ISD::FMUL, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
762  { ISD::FMUL, MVT::f32, 1 }, // Nehalem from http://www.agner.org/
763  { ISD::FMUL, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
764  { ISD::FMUL, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
765 
766  { ISD::FDIV, MVT::f32, 14 }, // Nehalem from http://www.agner.org/
767  { ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
768  { ISD::FDIV, MVT::f64, 22 }, // Nehalem from http://www.agner.org/
769  { ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/
770  };
771 
772  if (ST->hasSSE42())
773  if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
774  return LT.first * Entry->Cost;
775 
776  static const CostTblEntry SSE41CostTable[] = {
777  { ISD::SHL, MVT::v16i8, 11 }, // pblendvb sequence.
778  { ISD::SHL, MVT::v32i8, 2*11+2 }, // pblendvb sequence + split.
779  { ISD::SHL, MVT::v8i16, 14 }, // pblendvb sequence.
780  { ISD::SHL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
781  { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld
782  { ISD::SHL, MVT::v8i32, 2*4+2 }, // pslld/paddd/cvttps2dq/pmulld + split
783 
784  { ISD::SRL, MVT::v16i8, 12 }, // pblendvb sequence.
785  { ISD::SRL, MVT::v32i8, 2*12+2 }, // pblendvb sequence + split.
786  { ISD::SRL, MVT::v8i16, 14 }, // pblendvb sequence.
787  { ISD::SRL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
788  { ISD::SRL, MVT::v4i32, 11 }, // Shift each lane + blend.
789  { ISD::SRL, MVT::v8i32, 2*11+2 }, // Shift each lane + blend + split.
790 
791  { ISD::SRA, MVT::v16i8, 24 }, // pblendvb sequence.
792  { ISD::SRA, MVT::v32i8, 2*24+2 }, // pblendvb sequence + split.
793  { ISD::SRA, MVT::v8i16, 14 }, // pblendvb sequence.
794  { ISD::SRA, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
795  { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend.
796  { ISD::SRA, MVT::v8i32, 2*12+2 }, // Shift each lane + blend + split.
797 
798  { ISD::MUL, MVT::v4i32, 2 } // pmulld (Nehalem from agner.org)
799  };
800 
801  if (ST->hasSSE41())
802  if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
803  return LT.first * Entry->Cost;
804 
805  static const CostTblEntry SSE2CostTable[] = {
806  // We don't correctly identify costs of casts because they are marked as
807  // custom.
808  { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence.
809  { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence.
810  { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul.
811  { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence.
812  { ISD::SHL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split.
813 
814  { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence.
815  { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence.
816  { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend.
817  { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence.
818  { ISD::SRL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split.
819 
820  { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence.
821  { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence.
822  { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend.
823  { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence.
824  { ISD::SRA, MVT::v4i64, 2*12+2 }, // srl/xor/sub sequence+split.
825 
826  { ISD::MUL, MVT::v16i8, 12 }, // extend/pmullw/trunc sequence.
827  { ISD::MUL, MVT::v8i16, 1 }, // pmullw
828  { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle
829  { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add
830 
831  { ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/
832  { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/
833  { ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/
834  { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/
835  };
836 
837  if (ST->hasSSE2())
838  if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
839  return LT.first * Entry->Cost;
840 
841  static const CostTblEntry SSE1CostTable[] = {
842  { ISD::FDIV, MVT::f32, 17 }, // Pentium III from http://www.agner.org/
843  { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/
844  };
845 
846  if (ST->hasSSE1())
847  if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
848  return LT.first * Entry->Cost;
849 
850   // It is not a good idea to vectorize division. We have to scalarize it and
851   // in the process we will often end up having to spill regular registers.
852   // The overhead of division is going to dominate most kernels anyway, so try
853   // hard to prevent vectorization of division - it is generally a bad idea.
854   // Assume somewhat arbitrarily that we have to be able to hide "20 cycles"
855   // for each lane.
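  // Example (editor-added): a v4i32 sdiv by a non-constant divisor on SSE2
  // legalizes with LT.first = 1 and 4 lanes, so the cost returned below is
  // 20 * 1 * 4 * ScalarCost - large enough to steer the vectorizers away
  // from vectorizing the division.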
856  if (LT.second.isVector() && (ISD == ISD::SDIV || ISD == ISD::SREM ||
857  ISD == ISD::UDIV || ISD == ISD::UREM)) {
858  int ScalarCost = getArithmeticInstrCost(
859         Opcode, Ty->getScalarType(), Op1Info, Op2Info,
860         TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
861     return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
862  }
863 
864  // Fallback to the default implementation.
865  return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info);
866 }
867 
868 int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
869                                Type *SubTp) {
870  // 64-bit packed float vectors (v2f32) are widened to type v4f32.
871  // 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
872  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
873 
874  // Treat Transpose as 2-op shuffles - there's no difference in lowering.
875  if (Kind == TTI::SK_Transpose)
876  Kind = TTI::SK_PermuteTwoSrc;
877 
878   // For Broadcasts we are splatting the first element from the first input
879   // register, so we only need to reference that input; all of the output
880   // registers are the same.
881  if (Kind == TTI::SK_Broadcast)
882  LT.first = 1;
883 
884  // Subvector extractions are free if they start at the beginning of a
885  // vector and cheap if the subvectors are aligned.
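  // For instance (editor-added): extracting the upper <4 x float> half of a
  // legalized v8f32 (Index = 4) is not free, but falls through to the aligned
  // subvector case below and costs SubLT.first = 1, while an extract at
  // Index = 0 returns 0.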
886  if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
887  int NumElts = LT.second.getVectorNumElements();
888  if ((Index % NumElts) == 0)
889  return 0;
890  std::pair<int, MVT> SubLT = TLI->getTypeLegalizationCost(DL, SubTp);
891  if (SubLT.second.isVector()) {
892  int NumSubElts = SubLT.second.getVectorNumElements();
893  if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
894  return SubLT.first;
895  }
896  }
897 
898   // We are going to permute multiple sources and the result will be in
899   // multiple destinations. Provide an accurate cost only for splits where
900   // the element type remains the same.
901  if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
902  MVT LegalVT = LT.second;
903  if (LegalVT.isVector() &&
904       LegalVT.getVectorElementType().getSizeInBits() ==
905           Tp->getVectorElementType()->getPrimitiveSizeInBits() &&
906       LegalVT.getVectorNumElements() < Tp->getVectorNumElements()) {
907 
908  unsigned VecTySize = DL.getTypeStoreSize(Tp);
909  unsigned LegalVTSize = LegalVT.getStoreSize();
910  // Number of source vectors after legalization:
911  unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
912  // Number of destination vectors after legalization:
913  unsigned NumOfDests = LT.first;
914 
915  Type *SingleOpTy = VectorType::get(Tp->getVectorElementType(),
916  LegalVT.getVectorNumElements());
917 
918  unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
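      // Worked example (editor-added): a v16i32 single-source permute on an
      // AVX2 subtarget legalizes to LegalVT = v8i32 with LT.first = 2, so
      // NumOfSrcs = 2 and NumOfDests = 2, giving (2 - 1) * 2 = 2 two-source
      // v8i32 shuffles at cost 3 each (see the AVX2 table below), i.e. 6 total.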
919  return NumOfShuffles *
920  getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr);
921  }
922 
923  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
924  }
925 
926  // For 2-input shuffles, we must account for splitting the 2 inputs into many.
927  if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
928  // We assume that source and destination have the same vector type.
929  int NumOfDests = LT.first;
930  int NumOfShufflesPerDest = LT.first * 2 - 1;
931  LT.first = NumOfDests * NumOfShufflesPerDest;
932  }
933 
934  static const CostTblEntry AVX512VBMIShuffleTbl[] = {
935  {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
936  {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb
937 
938  {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
939  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb
940 
941  {TTI::SK_PermuteTwoSrc, MVT::v64i8, 1}, // vpermt2b
942  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 1}, // vpermt2b
943  {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1} // vpermt2b
944  };
945 
946  if (ST->hasVBMI())
947  if (const auto *Entry =
948  CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
949  return LT.first * Entry->Cost;
950 
951  static const CostTblEntry AVX512BWShuffleTbl[] = {
952  {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
953  {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
954 
955  {TTI::SK_Reverse, MVT::v32i16, 1}, // vpermw
956  {TTI::SK_Reverse, MVT::v16i16, 1}, // vpermw
957  {TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2
958 
959  {TTI::SK_PermuteSingleSrc, MVT::v32i16, 1}, // vpermw
960  {TTI::SK_PermuteSingleSrc, MVT::v16i16, 1}, // vpermw
961  {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // vpermw
962  {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16
963  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 3}, // vpermw + zext/trunc
964 
965  {TTI::SK_PermuteTwoSrc, MVT::v32i16, 1}, // vpermt2w
966  {TTI::SK_PermuteTwoSrc, MVT::v16i16, 1}, // vpermt2w
967  {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpermt2w
968  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 3}, // zext + vpermt2w + trunc
969  {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
970  {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3} // zext + vpermt2w + trunc
971  };
972 
973  if (ST->hasBWI())
974  if (const auto *Entry =
975  CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
976  return LT.first * Entry->Cost;
977 
978  static const CostTblEntry AVX512ShuffleTbl[] = {
979  {TTI::SK_Broadcast, MVT::v8f64, 1}, // vbroadcastpd
980  {TTI::SK_Broadcast, MVT::v16f32, 1}, // vbroadcastps
981  {TTI::SK_Broadcast, MVT::v8i64, 1}, // vpbroadcastq
982  {TTI::SK_Broadcast, MVT::v16i32, 1}, // vpbroadcastd
983 
984  {TTI::SK_Reverse, MVT::v8f64, 1}, // vpermpd
985  {TTI::SK_Reverse, MVT::v16f32, 1}, // vpermps
986  {TTI::SK_Reverse, MVT::v8i64, 1}, // vpermq
987  {TTI::SK_Reverse, MVT::v16i32, 1}, // vpermd
988 
989  {TTI::SK_PermuteSingleSrc, MVT::v8f64, 1}, // vpermpd
990  {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
991  {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // vpermpd
992  {TTI::SK_PermuteSingleSrc, MVT::v16f32, 1}, // vpermps
993  {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
994  {TTI::SK_PermuteSingleSrc, MVT::v4f32, 1}, // vpermps
995  {TTI::SK_PermuteSingleSrc, MVT::v8i64, 1}, // vpermq
996  {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
997  {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // vpermq
998  {TTI::SK_PermuteSingleSrc, MVT::v16i32, 1}, // vpermd
999  {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
1000  {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // vpermd
1001  {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
1002 
1003  {TTI::SK_PermuteTwoSrc, MVT::v8f64, 1}, // vpermt2pd
1004  {TTI::SK_PermuteTwoSrc, MVT::v16f32, 1}, // vpermt2ps
1005  {TTI::SK_PermuteTwoSrc, MVT::v8i64, 1}, // vpermt2q
1006  {TTI::SK_PermuteTwoSrc, MVT::v16i32, 1}, // vpermt2d
1007  {TTI::SK_PermuteTwoSrc, MVT::v4f64, 1}, // vpermt2pd
1008  {TTI::SK_PermuteTwoSrc, MVT::v8f32, 1}, // vpermt2ps
1009  {TTI::SK_PermuteTwoSrc, MVT::v4i64, 1}, // vpermt2q
1010  {TTI::SK_PermuteTwoSrc, MVT::v8i32, 1}, // vpermt2d
1011  {TTI::SK_PermuteTwoSrc, MVT::v2f64, 1}, // vpermt2pd
1012  {TTI::SK_PermuteTwoSrc, MVT::v4f32, 1}, // vpermt2ps
1013  {TTI::SK_PermuteTwoSrc, MVT::v2i64, 1}, // vpermt2q
1014  {TTI::SK_PermuteTwoSrc, MVT::v4i32, 1} // vpermt2d
1015  };
1016 
1017  if (ST->hasAVX512())
1018  if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
1019  return LT.first * Entry->Cost;
1020 
1021  static const CostTblEntry AVX2ShuffleTbl[] = {
1022  {TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastpd
1023  {TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastps
1024  {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq
1025  {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd
1026  {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
1027  {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb
1028 
1029  {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd
1030  {TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps
1031  {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq
1032  {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd
1033  {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
1034  {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb
1035 
1036  {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
1037  {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb
1038 
1039  {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
1040  {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
1041  {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
1042  {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
1043  {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
1044  // + vpblendvb
1045  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb
1046  // + vpblendvb
1047 
1048  {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd
1049  {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3}, // 2*vpermps + vblendps
1050  {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd
1051  {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd
1052  {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
1053  // + vpblendvb
1054  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb
1055  // + vpblendvb
1056  };
1057 
1058  if (ST->hasAVX2())
1059  if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
1060  return LT.first * Entry->Cost;
1061 
1062  static const CostTblEntry XOPShuffleTbl[] = {
1063  {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd
1064  {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps
1065  {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd
1066  {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps
1067  {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
1068  // + vinsertf128
1069  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm
1070  // + vinsertf128
1071 
1072  {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
1073  // + vinsertf128
1074  {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm
1075  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm
1076  // + vinsertf128
1077  {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm
1078  };
1079 
1080  if (ST->hasXOP())
1081  if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
1082  return LT.first * Entry->Cost;
1083 
1084  static const CostTblEntry AVX1ShuffleTbl[] = {
1085  {TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1086  {TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1087  {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1088  {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1089  {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
1090  {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128
1091 
1092  {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1093  {TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1094  {TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1095  {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1096  {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
1097  // + vinsertf128
1098  {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb
1099  // + vinsertf128
1100 
1101  {TTI::SK_Select, MVT::v4i64, 1}, // vblendpd
1102  {TTI::SK_Select, MVT::v4f64, 1}, // vblendpd
1103  {TTI::SK_Select, MVT::v8i32, 1}, // vblendps
1104  {TTI::SK_Select, MVT::v8f32, 1}, // vblendps
1105  {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
1106  {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor
1107 
1108  {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd
1109  {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd
1110  {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1111  {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1112  {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
1113  // + 2*por + vinsertf128
1114  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb
1115  // + 2*por + vinsertf128
1116 
1117  {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd
1118  {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd
1119  {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1120  {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1121  {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
1122  // + 4*por + vinsertf128
1123  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb
1124  // + 4*por + vinsertf128
1125  };
1126 
1127  if (ST->hasAVX())
1128  if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
1129  return LT.first * Entry->Cost;
1130 
1131  static const CostTblEntry SSE41ShuffleTbl[] = {
1132  {TTI::SK_Select, MVT::v2i64, 1}, // pblendw
1133  {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1134  {TTI::SK_Select, MVT::v4i32, 1}, // pblendw
1135  {TTI::SK_Select, MVT::v4f32, 1}, // blendps
1136  {TTI::SK_Select, MVT::v8i16, 1}, // pblendw
1137  {TTI::SK_Select, MVT::v16i8, 1} // pblendvb
1138  };
1139 
1140  if (ST->hasSSE41())
1141  if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
1142  return LT.first * Entry->Cost;
1143 
1144  static const CostTblEntry SSSE3ShuffleTbl[] = {
1145  {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
1146  {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb
1147 
1148  {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
1149  {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb
1150 
1151  {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
1152  {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por
1153 
1154  {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
1155  {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
1156 
1157  {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
1158  {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
1159  };
1160 
1161  if (ST->hasSSSE3())
1162  if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
1163  return LT.first * Entry->Cost;
1164 
1165  static const CostTblEntry SSE2ShuffleTbl[] = {
1166  {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd
1167  {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
1168  {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
1169  {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
1170  {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd
1171 
1172  {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
1173  {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
1174  {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
1175  {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
1176  {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
1177  // + 2*pshufd + 2*unpck + packus
1178 
1179  {TTI::SK_Select, MVT::v2i64, 1}, // movsd
1180  {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1181  {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
1182  {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
1183  {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por
1184 
1185  {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
1186  {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd
1187  {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
1188  {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
1189  // + pshufd/unpck
1190  { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
1191  // + 2*pshufd + 2*unpck + 2*packus
1192 
1193  { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd
1194  { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd
1195  { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd}
1196  { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute
1197  { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
1198  };
1199 
1200  if (ST->hasSSE2())
1201  if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
1202  return LT.first * Entry->Cost;
1203 
1204  static const CostTblEntry SSE1ShuffleTbl[] = {
1205  { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
1206  { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
1207  { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps
1208  { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
1209  { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
1210  };
1211 
1212  if (ST->hasSSE1())
1213  if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
1214  return LT.first * Entry->Cost;
1215 
1216  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
1217 }
1218 
1219 int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
1220  const Instruction *I) {
1221  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1222  assert(ISD && "Invalid opcode");
1223 
1224   // FIXME: Need a better design of the cost table to handle non-simple types
1225   // and the potentially massive number of combinations (elem_num x src_type x dst_type).
1226 
1227  static const TypeConversionCostTblEntry AVX512BWConversionTbl[] {
1230 
1231  // Mask sign extend has an instruction.
1238 
1239  // Mask zero extend is a load + broadcast.
1246  };
1247 
1248  static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
1255 
1262 
1269 
1276  };
1277 
1278  // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
1279  // 256-bit wide vectors.
1280 
1281  static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
1285 
1290 
1291  // v16i1 -> v16i32 - load + broadcast
1302 
1311 
1336 
1338 
1348  };
1349 
1350  static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
1367 
1374 
1377 
1379  };
1380 
1381  static const TypeConversionCostTblEntry AVXConversionTbl[] = {
1398 
1406 
1419 
1435  // The generic code to compute the scalar overhead is currently broken.
1436   // Work around this limitation by estimating the scalarization overhead
1437  // here. We have roughly 10 instructions per scalar element.
1438  // Multiply that by the vector width.
1439  // FIXME: remove that when PR19268 is fixed.
1442 
1445  // This node is expanded into scalarized operations but BasicTTI is overly
1446   // optimistic when estimating its cost. It computes 3 per element (one
1447  // vector-extract, one scalar conversion and one vector-insert). The
1448  // problem is that the inserts form a read-modify-write chain so latency
1449  // should be factored in too. Inflating the cost per element by 1.
1452 
1455  };
1456 
1457  static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
1464 
1483 
1491 
1493  };
1494 
1495  static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
1496  // These are somewhat magic numbers justified by looking at the output of
1497  // Intel's IACA, running some kernels and making sure when we take
1498  // legalization into account the throughput will be overestimated.
1500  { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
1507 
1508  { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
1516 
1518 
1520 
1545 
1555  };
1556 
1557  std::pair<int, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
1558  std::pair<int, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst);
1559 
1560  if (ST->hasSSE2() && !ST->hasAVX()) {
1561  if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
1562  LTDest.second, LTSrc.second))
1563  return LTSrc.first * Entry->Cost;
1564  }
1565 
1566  EVT SrcTy = TLI->getValueType(DL, Src);
1567  EVT DstTy = TLI->getValueType(DL, Dst);
1568 
1569  // The function getSimpleVT only handles simple value types.
1570  if (!SrcTy.isSimple() || !DstTy.isSimple())
1571  return BaseT::getCastInstrCost(Opcode, Dst, Src);
1572 
1573  MVT SimpleSrcTy = SrcTy.getSimpleVT();
1574  MVT SimpleDstTy = DstTy.getSimpleVT();
1575 
1576  // Make sure that neither type is going to be split before using the
1577  // AVX512 tables. This handles -mprefer-vector-width=256
1578  // with -min-legal-vector-width<=256
1579  if (TLI->getTypeAction(SimpleSrcTy) != TargetLowering::TypeSplitVector &&
1580  TLI->getTypeAction(SimpleDstTy) != TargetLowering::TypeSplitVector) {
1581  if (ST->hasBWI())
1582  if (const auto *Entry = ConvertCostTableLookup(AVX512BWConversionTbl, ISD,
1583  SimpleDstTy, SimpleSrcTy))
1584  return Entry->Cost;
1585 
1586  if (ST->hasDQI())
1587  if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD,
1588  SimpleDstTy, SimpleSrcTy))
1589  return Entry->Cost;
1590 
1591  if (ST->hasAVX512())
1592  if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD,
1593  SimpleDstTy, SimpleSrcTy))
1594  return Entry->Cost;
1595  }
1596 
1597  if (ST->hasAVX2()) {
1598  if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
1599  SimpleDstTy, SimpleSrcTy))
1600  return Entry->Cost;
1601  }
1602 
1603  if (ST->hasAVX()) {
1604  if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
1605  SimpleDstTy, SimpleSrcTy))
1606  return Entry->Cost;
1607  }
1608 
1609  if (ST->hasSSE41()) {
1610  if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
1611  SimpleDstTy, SimpleSrcTy))
1612  return Entry->Cost;
1613  }
1614 
1615  if (ST->hasSSE2()) {
1616  if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
1617  SimpleDstTy, SimpleSrcTy))
1618  return Entry->Cost;
1619  }
1620 
1621  return BaseT::getCastInstrCost(Opcode, Dst, Src, I);
1622 }
1623 
1624 int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
1625  const Instruction *I) {
1626  // Legalize the type.
1627  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
1628 
1629  MVT MTy = LT.second;
1630 
1631  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1632  assert(ISD && "Invalid opcode");
1633 
1634  static const CostTblEntry SSE2CostTbl[] = {
1635  { ISD::SETCC, MVT::v2i64, 8 },
1636  { ISD::SETCC, MVT::v4i32, 1 },
1637  { ISD::SETCC, MVT::v8i16, 1 },
1638  { ISD::SETCC, MVT::v16i8, 1 },
1639  };
1640 
1641  static const CostTblEntry SSE42CostTbl[] = {
1642  { ISD::SETCC, MVT::v2f64, 1 },
1643  { ISD::SETCC, MVT::v4f32, 1 },
1644  { ISD::SETCC, MVT::v2i64, 1 },
1645  };
1646 
1647  static const CostTblEntry AVX1CostTbl[] = {
1648  { ISD::SETCC, MVT::v4f64, 1 },
1649  { ISD::SETCC, MVT::v8f32, 1 },
1650  // AVX1 does not support 8-wide integer compare.
1651  { ISD::SETCC, MVT::v4i64, 4 },
1652  { ISD::SETCC, MVT::v8i32, 4 },
1653  { ISD::SETCC, MVT::v16i16, 4 },
1654  { ISD::SETCC, MVT::v32i8, 4 },
1655  };
1656 
1657  static const CostTblEntry AVX2CostTbl[] = {
1658  { ISD::SETCC, MVT::v4i64, 1 },
1659  { ISD::SETCC, MVT::v8i32, 1 },
1660  { ISD::SETCC, MVT::v16i16, 1 },
1661  { ISD::SETCC, MVT::v32i8, 1 },
1662  };
1663 
1664  static const CostTblEntry AVX512CostTbl[] = {
1665  { ISD::SETCC, MVT::v8i64, 1 },
1666  { ISD::SETCC, MVT::v16i32, 1 },
1667  { ISD::SETCC, MVT::v8f64, 1 },
1668  { ISD::SETCC, MVT::v16f32, 1 },
1669  };
1670 
1671  static const CostTblEntry AVX512BWCostTbl[] = {
1672  { ISD::SETCC, MVT::v32i16, 1 },
1673  { ISD::SETCC, MVT::v64i8, 1 },
1674  };
1675 
1676  if (ST->hasBWI())
1677  if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
1678  return LT.first * Entry->Cost;
1679 
1680  if (ST->hasAVX512())
1681  if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
1682  return LT.first * Entry->Cost;
1683 
1684  if (ST->hasAVX2())
1685  if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
1686  return LT.first * Entry->Cost;
1687 
1688  if (ST->hasAVX())
1689  if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
1690  return LT.first * Entry->Cost;
1691 
1692  if (ST->hasSSE42())
1693  if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
1694  return LT.first * Entry->Cost;
1695 
1696  if (ST->hasSSE2())
1697  if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
1698  return LT.first * Entry->Cost;
1699 
1700  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
1701 }
1702 
1704 
1705 int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
1706                                       ArrayRef<Type *> Tys, FastMathFlags FMF,
1707                                       unsigned ScalarizationCostPassed) {
1708  // Costs should match the codegen from:
1709  // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
1710  // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
1711  // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
1712  // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
1713  // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
1714  static const CostTblEntry AVX512CDCostTbl[] = {
1715  { ISD::CTLZ, MVT::v8i64, 1 },
1716  { ISD::CTLZ, MVT::v16i32, 1 },
1717  { ISD::CTLZ, MVT::v32i16, 8 },
1718  { ISD::CTLZ, MVT::v64i8, 20 },
1719  { ISD::CTLZ, MVT::v4i64, 1 },
1720  { ISD::CTLZ, MVT::v8i32, 1 },
1721  { ISD::CTLZ, MVT::v16i16, 4 },
1722  { ISD::CTLZ, MVT::v32i8, 10 },
1723  { ISD::CTLZ, MVT::v2i64, 1 },
1724  { ISD::CTLZ, MVT::v4i32, 1 },
1725  { ISD::CTLZ, MVT::v8i16, 4 },
1726  { ISD::CTLZ, MVT::v16i8, 4 },
1727  };
1728  static const CostTblEntry AVX512BWCostTbl[] = {
1729  { ISD::BITREVERSE, MVT::v8i64, 5 },
1730  { ISD::BITREVERSE, MVT::v16i32, 5 },
1731  { ISD::BITREVERSE, MVT::v32i16, 5 },
1732  { ISD::BITREVERSE, MVT::v64i8, 5 },
1733  { ISD::CTLZ, MVT::v8i64, 23 },
1734  { ISD::CTLZ, MVT::v16i32, 22 },
1735  { ISD::CTLZ, MVT::v32i16, 18 },
1736  { ISD::CTLZ, MVT::v64i8, 17 },
1737  { ISD::CTPOP, MVT::v8i64, 7 },
1738  { ISD::CTPOP, MVT::v16i32, 11 },
1739  { ISD::CTPOP, MVT::v32i16, 9 },
1740  { ISD::CTPOP, MVT::v64i8, 6 },
1741  { ISD::CTTZ, MVT::v8i64, 10 },
1742  { ISD::CTTZ, MVT::v16i32, 14 },
1743  { ISD::CTTZ, MVT::v32i16, 12 },
1744  { ISD::CTTZ, MVT::v64i8, 9 },
1745  };
1746  static const CostTblEntry AVX512CostTbl[] = {
1747  { ISD::BITREVERSE, MVT::v8i64, 36 },
1748  { ISD::BITREVERSE, MVT::v16i32, 24 },
1749  { ISD::CTLZ, MVT::v8i64, 29 },
1750  { ISD::CTLZ, MVT::v16i32, 35 },
1751  { ISD::CTPOP, MVT::v8i64, 16 },
1752  { ISD::CTPOP, MVT::v16i32, 24 },
1753  { ISD::CTTZ, MVT::v8i64, 20 },
1754  { ISD::CTTZ, MVT::v16i32, 28 },
1755  };
1756  static const CostTblEntry XOPCostTbl[] = {
1757  { ISD::BITREVERSE, MVT::v4i64, 4 },
1758  { ISD::BITREVERSE, MVT::v8i32, 4 },
1759  { ISD::BITREVERSE, MVT::v16i16, 4 },
1760  { ISD::BITREVERSE, MVT::v32i8, 4 },
1761  { ISD::BITREVERSE, MVT::v2i64, 1 },
1762  { ISD::BITREVERSE, MVT::v4i32, 1 },
1763  { ISD::BITREVERSE, MVT::v8i16, 1 },
1764  { ISD::BITREVERSE, MVT::v16i8, 1 },
1765  { ISD::BITREVERSE, MVT::i64, 3 },
1766  { ISD::BITREVERSE, MVT::i32, 3 },
1767  { ISD::BITREVERSE, MVT::i16, 3 },
1768  { ISD::BITREVERSE, MVT::i8, 3 }
1769  };
1770  static const CostTblEntry AVX2CostTbl[] = {
1771  { ISD::BITREVERSE, MVT::v4i64, 5 },
1772  { ISD::BITREVERSE, MVT::v8i32, 5 },
1773  { ISD::BITREVERSE, MVT::v16i16, 5 },
1774  { ISD::BITREVERSE, MVT::v32i8, 5 },
1775  { ISD::BSWAP, MVT::v4i64, 1 },
1776  { ISD::BSWAP, MVT::v8i32, 1 },
1777  { ISD::BSWAP, MVT::v16i16, 1 },
1778  { ISD::CTLZ, MVT::v4i64, 23 },
1779  { ISD::CTLZ, MVT::v8i32, 18 },
1780  { ISD::CTLZ, MVT::v16i16, 14 },
1781  { ISD::CTLZ, MVT::v32i8, 9 },
1782  { ISD::CTPOP, MVT::v4i64, 7 },
1783  { ISD::CTPOP, MVT::v8i32, 11 },
1784  { ISD::CTPOP, MVT::v16i16, 9 },
1785  { ISD::CTPOP, MVT::v32i8, 6 },
1786  { ISD::CTTZ, MVT::v4i64, 10 },
1787  { ISD::CTTZ, MVT::v8i32, 14 },
1788  { ISD::CTTZ, MVT::v16i16, 12 },
1789  { ISD::CTTZ, MVT::v32i8, 9 },
1790  { ISD::FSQRT, MVT::f32, 7 }, // Haswell from http://www.agner.org/
1791  { ISD::FSQRT, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
1792  { ISD::FSQRT, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
1793  { ISD::FSQRT, MVT::f64, 14 }, // Haswell from http://www.agner.org/
1794  { ISD::FSQRT, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
1795  { ISD::FSQRT, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
1796  };
1797  static const CostTblEntry AVX1CostTbl[] = {
1798  { ISD::BITREVERSE, MVT::v4i64, 12 }, // 2 x 128-bit Op + extract/insert
1799  { ISD::BITREVERSE, MVT::v8i32, 12 }, // 2 x 128-bit Op + extract/insert
1800  { ISD::BITREVERSE, MVT::v16i16, 12 }, // 2 x 128-bit Op + extract/insert
1801  { ISD::BITREVERSE, MVT::v32i8, 12 }, // 2 x 128-bit Op + extract/insert
1802  { ISD::BSWAP, MVT::v4i64, 4 },
1803  { ISD::BSWAP, MVT::v8i32, 4 },
1804  { ISD::BSWAP, MVT::v16i16, 4 },
1805  { ISD::CTLZ, MVT::v4i64, 48 }, // 2 x 128-bit Op + extract/insert
1806  { ISD::CTLZ, MVT::v8i32, 38 }, // 2 x 128-bit Op + extract/insert
1807  { ISD::CTLZ, MVT::v16i16, 30 }, // 2 x 128-bit Op + extract/insert
1808  { ISD::CTLZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
1809  { ISD::CTPOP, MVT::v4i64, 16 }, // 2 x 128-bit Op + extract/insert
1810  { ISD::CTPOP, MVT::v8i32, 24 }, // 2 x 128-bit Op + extract/insert
1811  { ISD::CTPOP, MVT::v16i16, 20 }, // 2 x 128-bit Op + extract/insert
1812  { ISD::CTPOP, MVT::v32i8, 14 }, // 2 x 128-bit Op + extract/insert
1813  { ISD::CTTZ, MVT::v4i64, 22 }, // 2 x 128-bit Op + extract/insert
1814  { ISD::CTTZ, MVT::v8i32, 30 }, // 2 x 128-bit Op + extract/insert
1815  { ISD::CTTZ, MVT::v16i16, 26 }, // 2 x 128-bit Op + extract/insert
1816  { ISD::CTTZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
1817  { ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/
1818  { ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
1819  { ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
1820  { ISD::FSQRT, MVT::f64, 21 }, // SNB from http://www.agner.org/
1821  { ISD::FSQRT, MVT::v2f64, 21 }, // SNB from http://www.agner.org/
1822  { ISD::FSQRT, MVT::v4f64, 43 }, // SNB from http://www.agner.org/
1823  };
1824  static const CostTblEntry GLMCostTbl[] = {
1825  { ISD::FSQRT, MVT::f32, 19 }, // sqrtss
1826  { ISD::FSQRT, MVT::v4f32, 37 }, // sqrtps
1827  { ISD::FSQRT, MVT::f64, 34 }, // sqrtsd
1828  { ISD::FSQRT, MVT::v2f64, 67 }, // sqrtpd
1829  };
1830  static const CostTblEntry SLMCostTbl[] = {
1831  { ISD::FSQRT, MVT::f32, 20 }, // sqrtss
1832  { ISD::FSQRT, MVT::v4f32, 40 }, // sqrtps
1833  { ISD::FSQRT, MVT::f64, 35 }, // sqrtsd
1834  { ISD::FSQRT, MVT::v2f64, 70 }, // sqrtpd
1835  };
1836  static const CostTblEntry SSE42CostTbl[] = {
1837  { ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/
1838  { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
1839  };
1840  static const CostTblEntry SSSE3CostTbl[] = {
1841  { ISD::BITREVERSE, MVT::v2i64, 5 },
1842  { ISD::BITREVERSE, MVT::v4i32, 5 },
1843  { ISD::BITREVERSE, MVT::v8i16, 5 },
1844  { ISD::BITREVERSE, MVT::v16i8, 5 },
1845  { ISD::BSWAP, MVT::v2i64, 1 },
1846  { ISD::BSWAP, MVT::v4i32, 1 },
1847  { ISD::BSWAP, MVT::v8i16, 1 },
1848  { ISD::CTLZ, MVT::v2i64, 23 },
1849  { ISD::CTLZ, MVT::v4i32, 18 },
1850  { ISD::CTLZ, MVT::v8i16, 14 },
1851  { ISD::CTLZ, MVT::v16i8, 9 },
1852  { ISD::CTPOP, MVT::v2i64, 7 },
1853  { ISD::CTPOP, MVT::v4i32, 11 },
1854  { ISD::CTPOP, MVT::v8i16, 9 },
1855  { ISD::CTPOP, MVT::v16i8, 6 },
1856  { ISD::CTTZ, MVT::v2i64, 10 },
1857  { ISD::CTTZ, MVT::v4i32, 14 },
1858  { ISD::CTTZ, MVT::v8i16, 12 },
1859  { ISD::CTTZ, MVT::v16i8, 9 }
1860  };
1861  static const CostTblEntry SSE2CostTbl[] = {
1862  { ISD::BITREVERSE, MVT::v2i64, 29 },
1863  { ISD::BITREVERSE, MVT::v4i32, 27 },
1864  { ISD::BITREVERSE, MVT::v8i16, 27 },
1865  { ISD::BITREVERSE, MVT::v16i8, 20 },
1866  { ISD::BSWAP, MVT::v2i64, 7 },
1867  { ISD::BSWAP, MVT::v4i32, 7 },
1868  { ISD::BSWAP, MVT::v8i16, 7 },
1869  { ISD::CTLZ, MVT::v2i64, 25 },
1870  { ISD::CTLZ, MVT::v4i32, 26 },
1871  { ISD::CTLZ, MVT::v8i16, 20 },
1872  { ISD::CTLZ, MVT::v16i8, 17 },
1873  { ISD::CTPOP, MVT::v2i64, 12 },
1874  { ISD::CTPOP, MVT::v4i32, 15 },
1875  { ISD::CTPOP, MVT::v8i16, 13 },
1876  { ISD::CTPOP, MVT::v16i8, 10 },
1877  { ISD::CTTZ, MVT::v2i64, 14 },
1878  { ISD::CTTZ, MVT::v4i32, 18 },
1879  { ISD::CTTZ, MVT::v8i16, 16 },
1880  { ISD::CTTZ, MVT::v16i8, 13 },
1881  { ISD::FSQRT, MVT::f64, 32 }, // Nehalem from http://www.agner.org/
1882  { ISD::FSQRT, MVT::v2f64, 32 }, // Nehalem from http://www.agner.org/
1883  };
1884  static const CostTblEntry SSE1CostTbl[] = {
1885  { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/
1886  { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
1887  };
1888  static const CostTblEntry X64CostTbl[] = { // 64-bit targets
1889  { ISD::BITREVERSE, MVT::i64, 14 }
1890  };
1891  static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
1892  { ISD::BITREVERSE, MVT::i32, 14 },
1893  { ISD::BITREVERSE, MVT::i16, 14 },
1894  { ISD::BITREVERSE, MVT::i8, 11 }
1895  };
1896 
1897  unsigned ISD = ISD::DELETED_NODE;
1898  switch (IID) {
1899  default:
1900  break;
1901  case Intrinsic::bitreverse:
1902  ISD = ISD::BITREVERSE;
1903  break;
1904  case Intrinsic::bswap:
1905  ISD = ISD::BSWAP;
1906  break;
1907  case Intrinsic::ctlz:
1908  ISD = ISD::CTLZ;
1909  break;
1910  case Intrinsic::ctpop:
1911  ISD = ISD::CTPOP;
1912  break;
1913  case Intrinsic::cttz:
1914  ISD = ISD::CTTZ;
1915  break;
1916  case Intrinsic::sqrt:
1917  ISD = ISD::FSQRT;
1918  break;
1919  }
1920 
1921  if (ISD != ISD::DELETED_NODE) {
1922  // Legalize the type.
1923  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
1924  MVT MTy = LT.second;
1925 
1926  // Attempt to lookup cost.
1927  if (ST->isGLM())
1928  if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
1929  return LT.first * Entry->Cost;
1930 
1931  if (ST->isSLM())
1932  if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
1933  return LT.first * Entry->Cost;
1934 
1935  if (ST->hasCDI())
1936  if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
1937  return LT.first * Entry->Cost;
1938 
1939  if (ST->hasBWI())
1940  if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
1941  return LT.first * Entry->Cost;
1942 
1943  if (ST->hasAVX512())
1944  if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
1945  return LT.first * Entry->Cost;
1946 
1947  if (ST->hasXOP())
1948  if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
1949  return LT.first * Entry->Cost;
1950 
1951  if (ST->hasAVX2())
1952  if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
1953  return LT.first * Entry->Cost;
1954 
1955  if (ST->hasAVX())
1956  if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
1957  return LT.first * Entry->Cost;
1958 
1959  if (ST->hasSSE42())
1960  if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
1961  return LT.first * Entry->Cost;
1962 
1963  if (ST->hasSSSE3())
1964  if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
1965  return LT.first * Entry->Cost;
1966 
1967  if (ST->hasSSE2())
1968  if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
1969  return LT.first * Entry->Cost;
1970 
1971  if (ST->hasSSE1())
1972  if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
1973  return LT.first * Entry->Cost;
1974 
1975  if (ST->is64Bit())
1976  if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
1977  return LT.first * Entry->Cost;
1978 
1979  if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
1980  return LT.first * Entry->Cost;
1981  }
1982 
1983  return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF, ScalarizationCostPassed);
1984 }
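// Illustrative note (annotation, not part of the original source): the
// subtarget checks above run from the most specific feature to the most
// generic, so the first matching table wins. For example, a ctpop of v8i64 is
// charged 7 on an AVX512BW subtarget (AVX512BWCostTbl) but 16 on plain
// AVX-512 (AVX512CostTbl); sqrt costs on Goldmont/Silvermont are handled even
// earlier by the CPU-specific GLM/SLM tables.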
1985 
1986 int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
1987  ArrayRef<Value *> Args, FastMathFlags FMF,
1988  unsigned VF) {
1989  static const CostTblEntry AVX512CostTbl[] = {
1990  { ISD::ROTL, MVT::v8i64, 1 },
1991  { ISD::ROTL, MVT::v4i64, 1 },
1992  { ISD::ROTL, MVT::v2i64, 1 },
1993  { ISD::ROTL, MVT::v16i32, 1 },
1994  { ISD::ROTL, MVT::v8i32, 1 },
1995  { ISD::ROTL, MVT::v4i32, 1 },
1996  { ISD::ROTR, MVT::v8i64, 1 },
1997  { ISD::ROTR, MVT::v4i64, 1 },
1998  { ISD::ROTR, MVT::v2i64, 1 },
1999  { ISD::ROTR, MVT::v16i32, 1 },
2000  { ISD::ROTR, MVT::v8i32, 1 },
2001  { ISD::ROTR, MVT::v4i32, 1 }
2002  };
2003  // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
2004  static const CostTblEntry XOPCostTbl[] = {
2005  { ISD::ROTL, MVT::v4i64, 4 },
2006  { ISD::ROTL, MVT::v8i32, 4 },
2007  { ISD::ROTL, MVT::v16i16, 4 },
2008  { ISD::ROTL, MVT::v32i8, 4 },
2009  { ISD::ROTL, MVT::v2i64, 1 },
2010  { ISD::ROTL, MVT::v4i32, 1 },
2011  { ISD::ROTL, MVT::v8i16, 1 },
2012  { ISD::ROTL, MVT::v16i8, 1 },
2013  { ISD::ROTR, MVT::v4i64, 6 },
2014  { ISD::ROTR, MVT::v8i32, 6 },
2015  { ISD::ROTR, MVT::v16i16, 6 },
2016  { ISD::ROTR, MVT::v32i8, 6 },
2017  { ISD::ROTR, MVT::v2i64, 2 },
2018  { ISD::ROTR, MVT::v4i32, 2 },
2019  { ISD::ROTR, MVT::v8i16, 2 },
2020  { ISD::ROTR, MVT::v16i8, 2 }
2021  };
2022  static const CostTblEntry X64CostTbl[] = { // 64-bit targets
2023  { ISD::ROTL, MVT::i64, 1 },
2024  { ISD::ROTR, MVT::i64, 1 },
2025  { ISD::FSHL, MVT::i64, 4 }
2026  };
2027  static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
2028  { ISD::ROTL, MVT::i32, 1 },
2029  { ISD::ROTL, MVT::i16, 1 },
2030  { ISD::ROTL, MVT::i8, 1 },
2031  { ISD::ROTR, MVT::i32, 1 },
2032  { ISD::ROTR, MVT::i16, 1 },
2033  { ISD::ROTR, MVT::i8, 1 },
2034  { ISD::FSHL, MVT::i32, 4 },
2035  { ISD::FSHL, MVT::i16, 4 },
2036  { ISD::FSHL, MVT::i8, 4 }
2037  };
2038 
2039  unsigned ISD = ISD::DELETED_NODE;
2040  switch (IID) {
2041  default:
2042  break;
2043  case Intrinsic::fshl:
2044  ISD = ISD::FSHL;
2045  if (Args[0] == Args[1])
2046  ISD = ISD::ROTL;
2047  break;
2048  case Intrinsic::fshr:
2049  // FSHR has same costs so don't duplicate.
2050  ISD = ISD::FSHL;
2051  if (Args[0] == Args[1])
2052  ISD = ISD::ROTR;
2053  break;
2054  }
2055 
2056  if (ISD != ISD::DELETED_NODE) {
2057  // Legalize the type.
2058  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
2059  MVT MTy = LT.second;
2060 
2061  // Attempt to lookup cost.
2062  if (ST->hasAVX512())
2063  if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
2064  return LT.first * Entry->Cost;
2065 
2066  if (ST->hasXOP())
2067  if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
2068  return LT.first * Entry->Cost;
2069 
2070  if (ST->is64Bit())
2071  if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
2072  return LT.first * Entry->Cost;
2073 
2074  if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
2075  return LT.first * Entry->Cost;
2076  }
2077 
2078  return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF, VF);
2079 }
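// Illustrative note (annotation, not part of the original source): a funnel
// shift whose two value operands are the same SSA value is a rotate, e.g.
//   %r = call i32 @llvm.fshl.i32(i32 %x, i32 %x, i32 %s)
// which is why the switch above remaps fshl/fshr with Args[0] == Args[1] to
// ROTL/ROTR before the table lookups.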
2080 
2081 int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
2082  assert(Val->isVectorTy() && "This must be a vector type");
2083 
2084  Type *ScalarType = Val->getScalarType();
2085 
2086  if (Index != -1U) {
2087  // Legalize the type.
2088  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
2089 
2090  // This type is legalized to a scalar type.
2091  if (!LT.second.isVector())
2092  return 0;
2093 
2094  // The type may be split. Normalize the index to the new type.
2095  unsigned Width = LT.second.getVectorNumElements();
2096  Index = Index % Width;
2097 
2098  // Floating point scalars are already located in index #0.
2099  if (ScalarType->isFloatingPointTy() && Index == 0)
2100  return 0;
2101  }
2102 
2103  // Add to the base cost if we know that the extracted element of a vector is
2104  // destined to be moved to and used in the integer register file.
2105  int RegisterFileMoveCost = 0;
2106  if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy())
2107  RegisterFileMoveCost = 1;
2108 
2109  return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
2110 }
2111 
2112 int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
2113  unsigned AddressSpace, const Instruction *I) {
2114  // Handle non-power-of-two vectors such as <3 x float>
2115  if (VectorType *VTy = dyn_cast<VectorType>(Src)) {
2116  unsigned NumElem = VTy->getVectorNumElements();
2117 
2118  // Handle a few common cases:
2119  // <3 x float>
2120  if (NumElem == 3 && VTy->getScalarSizeInBits() == 32)
2121  // Cost = 64 bit store + extract + 32 bit store.
2122  return 3;
2123 
2124  // <3 x double>
2125  if (NumElem == 3 && VTy->getScalarSizeInBits() == 64)
2126  // Cost = 128 bit store + unpack + 64 bit store.
2127  return 3;
2128 
2129  // Assume that all other non-power-of-two numbers are scalarized.
2130  if (!isPowerOf2_32(NumElem)) {
2131  int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment,
2132  AddressSpace);
2133  int SplitCost = getScalarizationOverhead(Src, Opcode == Instruction::Load,
2134  Opcode == Instruction::Store);
2135  return NumElem * Cost + SplitCost;
2136  }
2137  }
2138 
2139  // Legalize the type.
2140  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
2141  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
2142  "Invalid Opcode");
2143 
2144  // Each load/store unit costs 1.
2145  int Cost = LT.first * 1;
2146 
2147  // This isn't exactly right. We're using slow unaligned 32-byte accesses as a
2148  // proxy for a double-pumped AVX memory interface such as on Sandybridge.
2149  if (LT.second.getStoreSize() == 32 && ST->isUnalignedMem32Slow())
2150  Cost *= 2;
2151 
2152  return Cost;
2153 }
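// Illustrative note (annotation, not part of the original source): a load of
// <16 x float> legalizes to two 256-bit pieces on an AVX subtarget, so
// LT.first == 2 and the base cost is 2; on a subtarget where unaligned
// 32-byte accesses are slow, the doubling above raises that to 4.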
2154 
2155 int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
2156  unsigned Alignment,
2157  unsigned AddressSpace) {
2158  VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy);
2159  if (!SrcVTy)
2160  // For a scalar type, take the regular cost without the mask.
2161  return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace);
2162 
2163  unsigned NumElem = SrcVTy->getVectorNumElements();
2164  VectorType *MaskTy =
2165  VectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
2166  if ((Opcode == Instruction::Load && !isLegalMaskedLoad(SrcVTy)) ||
2167  (Opcode == Instruction::Store && !isLegalMaskedStore(SrcVTy)) ||
2168  !isPowerOf2_32(NumElem)) {
2169  // Scalarization
2170  int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true);
2171  int ScalarCompareCost = getCmpSelInstrCost(
2172  Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr);
2173  int BranchCost = getCFInstrCost(Instruction::Br);
2174  int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
2175 
2176  int ValueSplitCost = getScalarizationOverhead(
2177  SrcVTy, Opcode == Instruction::Load, Opcode == Instruction::Store);
2178  int MemopCost =
2179  NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
2180  Alignment, AddressSpace);
2181  return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
2182  }
2183 
2184  // Legalize the type.
2185  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy);
2186  auto VT = TLI->getValueType(DL, SrcVTy);
2187  int Cost = 0;
2188  if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
2189  LT.second.getVectorNumElements() == NumElem)
2190  // Promotion requires expand/truncate for data and a shuffle for mask.
2191  Cost += getShuffleCost(TTI::SK_Select, SrcVTy, 0, nullptr) +
2192  getShuffleCost(TTI::SK_Select, MaskTy, 0, nullptr);
2193 
2194  else if (LT.second.getVectorNumElements() > NumElem) {
2195  VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(),
2196  LT.second.getVectorNumElements());
2197  // Expanding requires filling the mask with zeroes.
2198  Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy);
2199  }
2200  if (!ST->hasAVX512())
2201  return Cost + LT.first*4; // Each maskmov costs 4
2202 
2203  // AVX-512 masked load/store is cheaper.
2204  return Cost+LT.first;
2205 }
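// Illustrative note (annotation, not part of the original source): when the
// masked op stays legal, the non-AVX-512 path above charges 4 per legalized
// vector (roughly one vmaskmov-style operation each), while AVX-512 masked
// loads/stores are charged only LT.first, i.e. 1 per legalized vector, on top
// of any promotion/expansion shuffle cost already accumulated in Cost.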
2206 
2207 int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
2208  const SCEV *Ptr) {
2209  // Address computations in vectorized code with non-consecutive addresses will
2210  // likely result in more instructions compared to scalar code where the
2211  // computation can more often be merged into the index mode. The resulting
2212  // extra micro-ops can significantly decrease throughput.
2213  unsigned NumVectorInstToHideOverhead = 10;
2214 
2215  // Cost modeling of Strided Access Computation is hidden by the indexing
2216  // modes of X86 regardless of the stride value. We don't believe that there
2217  // is a difference between constant strided access in general and a constant
2218  // stride value which is less than or equal to 64.
2219  // Even in the case of (loop invariant) stride whose value is not known at
2220  // compile time, the address computation will not incur more than one extra
2221  // ADD instruction.
2222  if (Ty->isVectorTy() && SE) {
2223  if (!BaseT::isStridedAccess(Ptr))
2224  return NumVectorInstToHideOverhead;
2225  if (!BaseT::getConstantStrideStep(SE, Ptr))
2226  return 1;
2227  }
2228 
2229  return BaseT::getAddressComputationCost(Ty, SE, Ptr);
2230 }
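// Illustrative note (annotation, not part of the original source): for a
// vector of addresses this returns 10 when the access is not strided at all,
// 1 when it is strided but the step is not a compile-time constant, and
// otherwise falls through to the base implementation (constant strides fold
// into x86 addressing modes).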
2231 
2232 int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
2233  bool IsPairwise) {
2234 
2235  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
2236 
2237  MVT MTy = LT.second;
2238 
2239  int ISD = TLI->InstructionOpcodeToISD(Opcode);
2240  assert(ISD && "Invalid opcode");
2241 
2242  // We use the Intel Architecture Code Analyzer (IACA) to measure the
2243  // throughput and use it as the cost.
2244 
2245  static const CostTblEntry SSE42CostTblPairWise[] = {
2246  { ISD::FADD, MVT::v2f64, 2 },
2247  { ISD::FADD, MVT::v4f32, 4 },
2248  { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
2249  { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5".
2250  { ISD::ADD, MVT::v8i16, 5 },
2251  };
2252 
2253  static const CostTblEntry AVX1CostTblPairWise[] = {
2254  { ISD::FADD, MVT::v4f32, 4 },
2255  { ISD::FADD, MVT::v4f64, 5 },
2256  { ISD::FADD, MVT::v8f32, 7 },
2257  { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
2258  { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5".
2259  { ISD::ADD, MVT::v4i64, 5 }, // The data reported by the IACA tool is "4.8".
2260  { ISD::ADD, MVT::v8i16, 5 },
2261  { ISD::ADD, MVT::v8i32, 5 },
2262  };
2263 
2264  static const CostTblEntry SSE42CostTblNoPairWise[] = {
2265  { ISD::FADD, MVT::v2f64, 2 },
2266  { ISD::FADD, MVT::v4f32, 4 },
2267  { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
2268  { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
2269  { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
2270  };
2271 
2272  static const CostTblEntry AVX1CostTblNoPairWise[] = {
2273  { ISD::FADD, MVT::v4f32, 3 },
2274  { ISD::FADD, MVT::v4f64, 3 },
2275  { ISD::FADD, MVT::v8f32, 4 },
2276  { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
2277  { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "2.8".
2278  { ISD::ADD, MVT::v4i64, 3 },
2279  { ISD::ADD, MVT::v8i16, 4 },
2280  { ISD::ADD, MVT::v8i32, 5 },
2281  };
2282 
2283  if (IsPairwise) {
2284  if (ST->hasAVX())
2285  if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
2286  return LT.first * Entry->Cost;
2287 
2288  if (ST->hasSSE42())
2289  if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
2290  return LT.first * Entry->Cost;
2291  } else {
2292  if (ST->hasAVX())
2293  if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
2294  return LT.first * Entry->Cost;
2295 
2296  if (ST->hasSSE42())
2297  if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
2298  return LT.first * Entry->Cost;
2299  }
2300 
2301  return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise);
2302 }
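// Illustrative note (annotation, not part of the original source): the
// pairwise tables are generally a bit more expensive than the non-pairwise
// (tree) ones, e.g. an ADD reduction of v8i16 on SSE4.2 is charged 5 pairwise
// versus 4 non-pairwise, which gives callers a reason to prefer the tree form
// on X86.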
2303 
2304 int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy,
2305  bool IsPairwise, bool IsUnsigned) {
2306  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
2307 
2308  MVT MTy = LT.second;
2309 
2310  int ISD;
2311  if (ValTy->isIntOrIntVectorTy()) {
2312  ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN;
2313  } else {
2314  assert(ValTy->isFPOrFPVectorTy() &&
2315  "Expected floating point or integer vector type.");
2316  ISD = ISD::FMINNUM;
2317  }
2318 
2319  // We use the Intel Architecture Code Analyzer (IACA) to measure the
2320  // throughput and use it as the cost.
2321 
2322  static const CostTblEntry SSE42CostTblPairWise[] = {
2323  {ISD::FMINNUM, MVT::v2f64, 3},
2324  {ISD::FMINNUM, MVT::v4f32, 2},
2325  {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8"
2326  {ISD::UMIN, MVT::v2i64, 8}, // The data reported by the IACA is "8.6"
2327  {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5"
2328  {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8"
2329  {ISD::SMIN, MVT::v8i16, 2},
2330  {ISD::UMIN, MVT::v8i16, 2},
2331  };
2332 
2333  static const CostTblEntry AVX1CostTblPairWise[] = {
2334  {ISD::FMINNUM, MVT::v4f32, 1},
2335  {ISD::FMINNUM, MVT::v4f64, 1},
2336  {ISD::FMINNUM, MVT::v8f32, 2},
2337  {ISD::SMIN, MVT::v2i64, 3},
2338  {ISD::UMIN, MVT::v2i64, 3},
2339  {ISD::SMIN, MVT::v4i32, 1},
2340  {ISD::UMIN, MVT::v4i32, 1},
2341  {ISD::SMIN, MVT::v8i16, 1},
2342  {ISD::UMIN, MVT::v8i16, 1},
2343  {ISD::SMIN, MVT::v8i32, 3},
2344  {ISD::UMIN, MVT::v8i32, 3},
2345  };
2346 
2347  static const CostTblEntry AVX2CostTblPairWise[] = {
2348  {ISD::SMIN, MVT::v4i64, 2},
2349  {ISD::UMIN, MVT::v4i64, 2},
2350  {ISD::SMIN, MVT::v8i32, 1},
2351  {ISD::UMIN, MVT::v8i32, 1},
2352  {ISD::SMIN, MVT::v16i16, 1},
2353  {ISD::UMIN, MVT::v16i16, 1},
2354  {ISD::SMIN, MVT::v32i8, 2},
2355  {ISD::UMIN, MVT::v32i8, 2},
2356  };
2357 
2358  static const CostTblEntry AVX512CostTblPairWise[] = {
2359  {ISD::FMINNUM, MVT::v8f64, 1},
2360  {ISD::FMINNUM, MVT::v16f32, 2},
2361  {ISD::SMIN, MVT::v8i64, 2},
2362  {ISD::UMIN, MVT::v8i64, 2},
2363  {ISD::SMIN, MVT::v16i32, 1},
2364  {ISD::UMIN, MVT::v16i32, 1},
2365  };
2366 
2367  static const CostTblEntry SSE42CostTblNoPairWise[] = {
2368  {ISD::FMINNUM, MVT::v2f64, 3},
2369  {ISD::FMINNUM, MVT::v4f32, 3},
2370  {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8"
2371  {ISD::UMIN, MVT::v2i64, 9}, // The data reported by the IACA is "8.6"
2372  {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5"
2373  {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8"
2374  {ISD::SMIN, MVT::v8i16, 1}, // The data reported by the IACA is "1.5"
2375  {ISD::UMIN, MVT::v8i16, 2}, // The data reported by the IACA is "1.8"
2376  };
2377 
2378  static const CostTblEntry AVX1CostTblNoPairWise[] = {
2379  {ISD::FMINNUM, MVT::v4f32, 1},
2380  {ISD::FMINNUM, MVT::v4f64, 1},
2381  {ISD::FMINNUM, MVT::v8f32, 1},
2382  {ISD::SMIN, MVT::v2i64, 3},
2383  {ISD::UMIN, MVT::v2i64, 3},
2384  {ISD::SMIN, MVT::v4i32, 1},
2385  {ISD::UMIN, MVT::v4i32, 1},
2386  {ISD::SMIN, MVT::v8i16, 1},
2387  {ISD::UMIN, MVT::v8i16, 1},
2388  {ISD::SMIN, MVT::v8i32, 2},
2389  {ISD::UMIN, MVT::v8i32, 2},
2390  };
2391 
2392  static const CostTblEntry AVX2CostTblNoPairWise[] = {
2393  {ISD::SMIN, MVT::v4i64, 1},
2394  {ISD::UMIN, MVT::v4i64, 1},
2395  {ISD::SMIN, MVT::v8i32, 1},
2396  {ISD::UMIN, MVT::v8i32, 1},
2397  {ISD::SMIN, MVT::v16i16, 1},
2398  {ISD::UMIN, MVT::v16i16, 1},
2399  {ISD::SMIN, MVT::v32i8, 1},
2400  {ISD::UMIN, MVT::v32i8, 1},
2401  };
2402 
2403  static const CostTblEntry AVX512CostTblNoPairWise[] = {
2404  {ISD::FMINNUM, MVT::v8f64, 1},
2405  {ISD::FMINNUM, MVT::v16f32, 2},
2406  {ISD::SMIN, MVT::v8i64, 1},
2407  {ISD::UMIN, MVT::v8i64, 1},
2408  {ISD::SMIN, MVT::v16i32, 1},
2409  {ISD::UMIN, MVT::v16i32, 1},
2410  };
2411 
2412  if (IsPairwise) {
2413  if (ST->hasAVX512())
2414  if (const auto *Entry = CostTableLookup(AVX512CostTblPairWise, ISD, MTy))
2415  return LT.first * Entry->Cost;
2416 
2417  if (ST->hasAVX2())
2418  if (const auto *Entry = CostTableLookup(AVX2CostTblPairWise, ISD, MTy))
2419  return LT.first * Entry->Cost;
2420 
2421  if (ST->hasAVX())
2422  if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
2423  return LT.first * Entry->Cost;
2424 
2425  if (ST->hasSSE42())
2426  if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
2427  return LT.first * Entry->Cost;
2428  } else {
2429  if (ST->hasAVX512())
2430  if (const auto *Entry =
2431  CostTableLookup(AVX512CostTblNoPairWise, ISD, MTy))
2432  return LT.first * Entry->Cost;
2433 
2434  if (ST->hasAVX2())
2435  if (const auto *Entry = CostTableLookup(AVX2CostTblNoPairWise, ISD, MTy))
2436  return LT.first * Entry->Cost;
2437 
2438  if (ST->hasAVX())
2439  if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
2440  return LT.first * Entry->Cost;
2441 
2442  if (ST->hasSSE42())
2443  if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
2444  return LT.first * Entry->Cost;
2445  }
2446 
2447  return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned);
2448 }
2449 
2450 /// Calculate the cost of materializing a 64-bit value. This helper
2451 /// method might only calculate a fraction of a larger immediate. Therefore it
2452 /// is valid to return a cost of ZERO.
2453 int X86TTIImpl::getIntImmCost(int64_t Val) {
2454  if (Val == 0)
2455  return TTI::TCC_Free;
2456 
2457  if (isInt<32>(Val))
2458  return TTI::TCC_Basic;
2459 
2460  return 2 * TTI::TCC_Basic;
2461 }
2462 
2463 int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
2464  assert(Ty->isIntegerTy());
2465 
2466  unsigned BitSize = Ty->getPrimitiveSizeInBits();
2467  if (BitSize == 0)
2468  return ~0U;
2469 
2470  // Never hoist constants larger than 128bit, because this might lead to
2471  // incorrect code generation or assertions in codegen.
2472  // Fixme: Create a cost model for types larger than i128 once the codegen
2473  // issues have been fixed.
2474  if (BitSize > 128)
2475  return TTI::TCC_Free;
2476 
2477  if (Imm == 0)
2478  return TTI::TCC_Free;
2479 
2480  // Sign-extend all constants to a multiple of 64-bit.
2481  APInt ImmVal = Imm;
2482  if (BitSize % 64 != 0)
2483  ImmVal = Imm.sext(alignTo(BitSize, 64));
2484 
2485  // Split the constant into 64-bit chunks and calculate the cost for each
2486  // chunk.
2487  int Cost = 0;
2488  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
2489  APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
2490  int64_t Val = Tmp.getSExtValue();
2491  Cost += getIntImmCost(Val);
2492  }
2493  // We need at least one instruction to materialize the constant.
2494  return std::max(1, Cost);
2495 }
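// Illustrative note (annotation, not part of the original source): the
// constant is split into 64-bit chunks and each chunk is priced separately.
// For an i128 value of 5, the low chunk (5) fits in 32 bits and costs
// TCC_Basic while the high chunk is zero and costs TCC_Free, so the total is
// max(1, 1 + 0) == 1.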
2496 
2497 int X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
2498  Type *Ty) {
2499  assert(Ty->isIntegerTy());
2500 
2501  unsigned BitSize = Ty->getPrimitiveSizeInBits();
2502  // There is no cost model for constants with a bit size of 0. Return TCC_Free
2503  // here, so that constant hoisting will ignore this constant.
2504  if (BitSize == 0)
2505  return TTI::TCC_Free;
2506 
2507  unsigned ImmIdx = ~0U;
2508  switch (Opcode) {
2509  default:
2510  return TTI::TCC_Free;
2511  case Instruction::GetElementPtr:
2512  // Always hoist the base address of a GetElementPtr. This prevents the
2513  // creation of new constants for every base constant that gets constant
2514  // folded with the offset.
2515  if (Idx == 0)
2516  return 2 * TTI::TCC_Basic;
2517  return TTI::TCC_Free;
2518  case Instruction::Store:
2519  ImmIdx = 0;
2520  break;
2521  case Instruction::ICmp:
2522  // This is an imperfect hack to prevent constant hoisting of
2523  // compares that might be trying to check if a 64-bit value fits in
2524  // 32-bits. The backend can optimize these cases using a right shift by 32.
2525  // Ideally we would check the compare predicate here. There also other
2526  // Ideally we would check the compare predicate here. There are also other
2527  if (Idx == 1 && Imm.getBitWidth() == 64) {
2528  uint64_t ImmVal = Imm.getZExtValue();
2529  if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
2530  return TTI::TCC_Free;
2531  }
2532  ImmIdx = 1;
2533  break;
2534  case Instruction::And:
2535  // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
2536  // by using a 32-bit operation with implicit zero extension. Detect such
2537  // immediates here as the normal path expects bit 31 to be sign extended.
2538  if (Idx == 1 && Imm.getBitWidth() == 64 && isUInt<32>(Imm.getZExtValue()))
2539  return TTI::TCC_Free;
2540  ImmIdx = 1;
2541  break;
2542  case Instruction::Add:
2543  case Instruction::Sub:
2544  // For add/sub, we can use the opposite instruction for INT32_MIN.
2545  if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.getZExtValue() == 0x80000000)
2546  return TTI::TCC_Free;
2547  ImmIdx = 1;
2548  break;
2549  case Instruction::UDiv:
2550  case Instruction::SDiv:
2551  case Instruction::URem:
2552  case Instruction::SRem:
2553  // Division by constant is typically expanded later into a different
2554  // instruction sequence. This completely changes the constants.
2555  // Report them as "free" to stop ConstantHoist from marking them as opaque.
2556  return TTI::TCC_Free;
2557  case Instruction::Mul:
2558  case Instruction::Or:
2559  case Instruction::Xor:
2560  ImmIdx = 1;
2561  break;
2562  // Always return TCC_Free for the shift value of a shift instruction.
2563  case Instruction::Shl:
2564  case Instruction::LShr:
2565  case Instruction::AShr:
2566  if (Idx == 1)
2567  return TTI::TCC_Free;
2568  break;
2569  case Instruction::Trunc:
2570  case Instruction::ZExt:
2571  case Instruction::SExt:
2572  case Instruction::IntToPtr:
2573  case Instruction::PtrToInt:
2574  case Instruction::BitCast:
2575  case Instruction::PHI:
2576  case Instruction::Call:
2577  case Instruction::Select:
2578  case Instruction::Ret:
2579  case Instruction::Load:
2580  break;
2581  }
2582 
2583  if (Idx == ImmIdx) {
2584  int NumConstants = divideCeil(BitSize, 64);
2585  int Cost = X86TTIImpl::getIntImmCost(Imm, Ty);
2586  return (Cost <= NumConstants * TTI::TCC_Basic)
2587  ? static_cast<int>(TTI::TCC_Free)
2588  : Cost;
2589  }
2590 
2591  return X86TTIImpl::getIntImmCost(Imm, Ty);
2592 }
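// Illustrative note (annotation, not part of the original source): the opcode
// switch above marks immediates as free when the instruction can encode them
// directly, e.g. `and i64 %x, 4294967295` is free because it can be emitted
// as a 32-bit AND with implicit zero extension, whereas a 64-bit immediate
// that really needs a movabs falls through to the chunked cost computed by
// getIntImmCost(Imm, Ty) above.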
2593 
2594 int X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
2595  Type *Ty) {
2596  assert(Ty->isIntegerTy());
2597 
2598  unsigned BitSize = Ty->getPrimitiveSizeInBits();
2599  // There is no cost model for constants with a bit size of 0. Return TCC_Free
2600  // here, so that constant hoisting will ignore this constant.
2601  if (BitSize == 0)
2602  return TTI::TCC_Free;
2603 
2604  switch (IID) {
2605  default:
2606  return TTI::TCC_Free;
2607  case Intrinsic::sadd_with_overflow:
2608  case Intrinsic::uadd_with_overflow:
2609  case Intrinsic::ssub_with_overflow:
2610  case Intrinsic::usub_with_overflow:
2611  case Intrinsic::smul_with_overflow:
2612  case Intrinsic::umul_with_overflow:
2613  if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
2614  return TTI::TCC_Free;
2615  break;
2616  case Intrinsic::experimental_stackmap:
2617  if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
2618  return TTI::TCC_Free;
2619  break;
2620  case Intrinsic::experimental_patchpoint_void:
2621  case Intrinsic::experimental_patchpoint_i64:
2622  if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
2623  return TTI::TCC_Free;
2624  break;
2625  }
2626  return X86TTIImpl::getIntImmCost(Imm, Ty);
2627 }
2628 
2629 unsigned X86TTIImpl::getUserCost(const User *U,
2630  ArrayRef<const Value *> Operands) {
2631  if (isa<StoreInst>(U)) {
2632  Value *Ptr = U->getOperand(1);
2633  // Store instruction with index and scale costs 2 Uops.
2634  // Check the preceding GEP to identify non-const indices.
2635  if (auto GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
2636  if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
2637  return TTI::TCC_Basic * 2;
2638  }
2639  return TTI::TCC_Basic;
2640  }
2641  return BaseT::getUserCost(U, Operands);
2642 }
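// Illustrative note (annotation, not part of the original source): a store
// whose address comes straight from a GEP with a non-constant index, e.g.
//   %p = getelementptr i32, i32* %base, i64 %i
//   store i32 %v, i32* %p
// is charged 2 * TCC_Basic above, modelling the extra uop a scaled-index
// addressing mode costs on a store; stores with constant addressing stay at
// TCC_Basic.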
2643 
2644 // Return an average cost of Gather / Scatter instruction, maybe improved later
2645 int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
2646  unsigned Alignment, unsigned AddressSpace) {
2647 
2648  assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
2649  unsigned VF = SrcVTy->getVectorNumElements();
2650 
2651  // Try to reduce index size from 64 bit (default for GEP)
2652  // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
2653  // operation will use 16 x 64 indices which do not fit in a zmm and needs
2654  // operation will use 16 x 64 indices which do not fit in a zmm and need
2655  // to be split. Also check that the base pointer is the same for all lanes,
2656  auto getIndexSizeInBits = [](Value *Ptr, const DataLayout& DL) {
2657  unsigned IndexSize = DL.getPointerSizeInBits();
2658  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
2659  if (IndexSize < 64 || !GEP)
2660  return IndexSize;
2661 
2662  unsigned NumOfVarIndices = 0;
2663  Value *Ptrs = GEP->getPointerOperand();
2664  if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
2665  return IndexSize;
2666  for (unsigned i = 1; i < GEP->getNumOperands(); ++i) {
2667  if (isa<Constant>(GEP->getOperand(i)))
2668  continue;
2669  Type *IndxTy = GEP->getOperand(i)->getType();
2670  if (IndxTy->isVectorTy())
2671  IndxTy = IndxTy->getVectorElementType();
2672  if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
2673  !isa<SExtInst>(GEP->getOperand(i))) ||
2674  ++NumOfVarIndices > 1)
2675  return IndexSize; // 64
2676  }
2677  return (unsigned)32;
2678  };
2679 
2680 
2681  // Trying to reduce IndexSize to 32 bits for vector 16.
2682  // By default the IndexSize is equal to pointer size.
2683  unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
2684  ? getIndexSizeInBits(Ptr, DL)
2685  : DL.getPointerSizeInBits();
2686 
2687  Type *IndexVTy = VectorType::get(IntegerType::get(SrcVTy->getContext(),
2688  IndexSize), VF);
2689  std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy);
2690  std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy);
2691  int SplitFactor = std::max(IdxsLT.first, SrcLT.first);
2692  if (SplitFactor > 1) {
2693  // Handle splitting of vector of pointers
2694  Type *SplitSrcTy = VectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
2695  return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment,
2696  AddressSpace);
2697  }
2698 
2699  // The gather / scatter cost is given by Intel architects. It is a rough
2700  // number since we are looking at one instruction at a time.
2701  const int GSOverhead = (Opcode == Instruction::Load)
2702  ? ST->getGatherOverhead()
2703  : ST->getScatterOverhead();
2704  return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
2705  Alignment, AddressSpace);
2706 }
2707 
2708 /// Return the cost of full scalarization of gather / scatter operation.
2709 ///
2710 /// Opcode - Load or Store instruction.
2711 /// SrcVTy - The type of the data vector that should be gathered or scattered.
2712 /// VariableMask - The mask is non-constant at compile time.
2713 /// Alignment - Alignment for one element.
2714 /// AddressSpace - pointer[s] address space.
2715 ///
2716 int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
2717  bool VariableMask, unsigned Alignment,
2718  unsigned AddressSpace) {
2719  unsigned VF = SrcVTy->getVectorNumElements();
2720 
2721  int MaskUnpackCost = 0;
2722  if (VariableMask) {
2723  VectorType *MaskTy =
2724  VectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
2725  MaskUnpackCost = getScalarizationOverhead(MaskTy, false, true);
2726  int ScalarCompareCost =
2727  getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()),
2728  nullptr);
2729  int BranchCost = getCFInstrCost(Instruction::Br);
2730  MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
2731  }
2732 
2733  // The cost of the scalar loads/stores.
2734  int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
2735  Alignment, AddressSpace);
2736 
2737  int InsertExtractCost = 0;
2738  if (Opcode == Instruction::Load)
2739  for (unsigned i = 0; i < VF; ++i)
2740  // Add the cost of inserting each scalar load into the vector
2741  InsertExtractCost +=
2742  getVectorInstrCost(Instruction::InsertElement, SrcVTy, i);
2743  else
2744  for (unsigned i = 0; i < VF; ++i)
2745  // Add the cost of extracting each element out of the data vector
2746  InsertExtractCost +=
2747  getVectorInstrCost(Instruction::ExtractElement, SrcVTy, i);
2748 
2749  return MemoryOpCost + MaskUnpackCost + InsertExtractCost;
2750 }
2751 
2752 /// Calculate the cost of Gather / Scatter operation
2753 int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
2754  Value *Ptr, bool VariableMask,
2755  unsigned Alignment) {
2756  assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
2757  unsigned VF = SrcVTy->getVectorNumElements();
2758  PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
2759  if (!PtrTy && Ptr->getType()->isVectorTy())
2760  PtrTy = dyn_cast<PointerType>(Ptr->getType()->getVectorElementType());
2761  assert(PtrTy && "Unexpected type for Ptr argument");
2762  unsigned AddressSpace = PtrTy->getAddressSpace();
2763 
2764  bool Scalarize = false;
2765  if ((Opcode == Instruction::Load && !isLegalMaskedGather(SrcVTy)) ||
2766  (Opcode == Instruction::Store && !isLegalMaskedScatter(SrcVTy)))
2767  Scalarize = true;
2768  // Gather / Scatter for vector 2 is not profitable on KNL / SKX
2769  // Vector-4 of gather/scatter instruction does not exist on KNL.
2770  // We can extend it to 8 elements, but zeroing upper bits of
2771  // the mask vector will add more instructions. Right now we give the scalar
2772  // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction
2773  // is better in the VariableMask case.
2774  if (ST->hasAVX512() && (VF == 2 || (VF == 4 && !ST->hasVLX())))
2775  Scalarize = true;
2776 
2777  if (Scalarize)
2778  return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment,
2779  AddressSpace);
2780 
2781  return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
2782 }
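// Illustrative note (annotation, not part of the original source): even with
// AVX-512, a 2-element gather/scatter is always costed as scalarized above,
// and a 4-element one is only costed as a real gather/scatter when VLX is
// available; otherwise the scalar path (getGSScalarCost) is used, which sums
// the per-lane loads/stores, mask unpacking, and insert/extract costs.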
2783 
2784 bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
2785  TargetTransformInfo::LSRCost &C2) {
2786  // X86 specific here are "instruction number 1st priority".
2787  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
2788  C1.NumIVMuls, C1.NumBaseAdds,
2789  C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
2790  std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
2791  C2.NumIVMuls, C2.NumBaseAdds,
2792  C2.ScaleCost, C2.ImmCost, C2.SetupCost);
2793 }
2794 
2795 bool X86TTIImpl::canMacroFuseCmp() {
2796  return ST->hasMacroFusion();
2797 }
2798 
2799 bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
2800  // The backend can't handle a single element vector.
2801  if (isa<VectorType>(DataTy) && DataTy->getVectorNumElements() == 1)
2802  return false;
2803  Type *ScalarTy = DataTy->getScalarType();
2804  int DataWidth = isa<PointerType>(ScalarTy) ?
2805  DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();
2806 
2807  return ((DataWidth == 32 || DataWidth == 64) && ST->hasAVX()) ||
2808  ((DataWidth == 8 || DataWidth == 16) && ST->hasBWI());
2809 }
2810 
2811 bool X86TTIImpl::isLegalMaskedStore(Type *DataType) {
2812  return isLegalMaskedLoad(DataType);
2813 }
2814 
2815 bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
2816  // This function is called now in two cases: from the Loop Vectorizer
2817  // and from the Scalarizer.
2818  // When the Loop Vectorizer asks about legality of the feature,
2819  // the vectorization factor is not calculated yet. The Loop Vectorizer
2820  // sends a scalar type and the decision is based on the width of the
2821  // scalar element.
2822  // Later on, the cost model will estimate usage this intrinsic based on
2823  // the vector type.
2824  // The Scalarizer asks again about legality. It sends a vector type.
2825  // In this case we can reject non-power-of-2 vectors.
2826  // We also reject single element vectors as the type legalizer can't
2827  // scalarize it.
2828  if (isa<VectorType>(DataTy)) {
2829  unsigned NumElts = DataTy->getVectorNumElements();
2830  if (NumElts == 1 || !isPowerOf2_32(NumElts))
2831  return false;
2832  }
2833  Type *ScalarTy = DataTy->getScalarType();
2834  int DataWidth = isa<PointerType>(ScalarTy) ?
2835  DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();
2836 
2837  // Some CPUs have better gather performance than others.
2838  // TODO: Remove the explicit ST->hasAVX512()? That would mean we would only
2839  // enable gather with a -march.
2840  return (DataWidth == 32 || DataWidth == 64) &&
2841  (ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2()));
2842 }
2843 
2844 bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) {
2845  // AVX2 doesn't support scatter.
2846  if (!ST->hasAVX512())
2847  return false;
2848  return isLegalMaskedGather(DataType);
2849 }
2850 
2851 bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
2852  EVT VT = TLI->getValueType(DL, DataType);
2853  return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
2854 }
2855 
2856 bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
2857  return false;
2858 }
2859 
2860 bool X86TTIImpl::areInlineCompatible(const Function *Caller,
2861  const Function *Callee) const {
2862  const TargetMachine &TM = getTLI()->getTargetMachine();
2863 
2864  // Work this as a subsetting of subtarget features.
2865  const FeatureBitset &CallerBits =
2866  TM.getSubtargetImpl(*Caller)->getFeatureBits();
2867  const FeatureBitset &CalleeBits =
2868  TM.getSubtargetImpl(*Callee)->getFeatureBits();
2869 
2870  // FIXME: This is likely too limiting as it will include subtarget features
2871  // that we might not care about for inlining, but it is conservatively
2872  // correct.
2873  return (CallerBits & CalleeBits) == CalleeBits;
2874 }
2875 
2876 const X86TTIImpl::TTI::MemCmpExpansionOptions *
2877 X86TTIImpl::enableMemCmpExpansion(bool IsZeroCmp) const {
2878  // Only enable vector loads for equality comparison.
2879  // Right now the vector version is not as fast, see #33329.
2880  static const auto ThreeWayOptions = [this]() {
2881  TTI::MemCmpExpansionOptions Options;
2882  if (ST->is64Bit()) {
2883  Options.LoadSizes.push_back(8);
2884  }
2885  Options.LoadSizes.push_back(4);
2886  Options.LoadSizes.push_back(2);
2887  Options.LoadSizes.push_back(1);
2888  return Options;
2889  }();
2890  static const auto EqZeroOptions = [this]() {
2891  TTI::MemCmpExpansionOptions Options;
2892  // TODO: enable AVX512 when the DAG is ready.
2893  // if (ST->hasAVX512()) Options.LoadSizes.push_back(64);
2894  if (ST->hasAVX2()) Options.LoadSizes.push_back(32);
2895  if (ST->hasSSE2()) Options.LoadSizes.push_back(16);
2896  if (ST->is64Bit()) {
2897  Options.LoadSizes.push_back(8);
2898  }
2899  Options.LoadSizes.push_back(4);
2900  Options.LoadSizes.push_back(2);
2901  Options.LoadSizes.push_back(1);
2902  return Options;
2903  }();
2904  return IsZeroCmp ? &EqZeroOptions : &ThreeWayOptions;
2905 }
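// Illustrative note (annotation, not part of the original source): on an
// AVX2-capable 64-bit subtarget the equality-only option list built above is
// {32, 16, 8, 4, 2, 1}, so memcmp()==0 style comparisons may be expanded with
// 32- and 16-byte vector loads, while the three-way (ordering) variant stays
// limited to the scalar load sizes {8, 4, 2, 1}.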
2906 
2907 bool X86TTIImpl::enableInterleavedAccessVectorization() {
2908  // TODO: We expect this to be beneficial regardless of arch,
2909  // but there are currently some unexplained performance artifacts on Atom.
2910  // As a temporary solution, disable on Atom.
2911  return !(ST->isAtom());
2912 }
2913 
2914 // Get estimation for interleaved load/store operations for AVX2.
2915 // \p Factor is the interleaved-access factor (stride) - number of
2916 // (interleaved) elements in the group.
2917 // \p Indices contains the indices for a strided load: when the
2918 // interleaved load has gaps they indicate which elements are used.
2919 // If Indices is empty (or if the number of indices is equal to the size
2920 // of the interleaved-access as given in \p Factor) the access has no gaps.
2921 //
2922 // As opposed to AVX-512, AVX2 does not have generic shuffles that allow
2923 // computing the cost using a generic formula as a function of generic
2924 // shuffles. We therefore use a lookup table instead, filled according to
2925 // the instruction sequences that codegen currently generates.
2926 int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
2927  unsigned Factor,
2928  ArrayRef<unsigned> Indices,
2929  unsigned Alignment,
2930  unsigned AddressSpace,
2931  bool UseMaskForCond,
2932  bool UseMaskForGaps) {
2933 
2934  if (UseMaskForCond || UseMaskForGaps)
2935  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
2936  Alignment, AddressSpace,
2937  UseMaskForCond, UseMaskForGaps);
2938 
2939  // We currently support only fully-interleaved groups, with no gaps.
2940  // TODO: Support also strided loads (interleaved-groups with gaps).
2941  if (Indices.size() && Indices.size() != Factor)
2942  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
2943  Alignment, AddressSpace);
2944 
2945  // VecTy for interleave memop is <VF*Factor x Elt>.
2946  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
2947  // VecTy = <12 x i32>.
2948  MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
2949 
2950  // This function can be called with VecTy=<6xi128>, Factor=3, in which case
2951  // the VF=2, while v2i128 is an unsupported MVT vector type
2952  // (see MachineValueType.h::getVectorVT()).
2953  if (!LegalVT.isVector())
2954  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
2955  Alignment, AddressSpace);
2956 
2957  unsigned VF = VecTy->getVectorNumElements() / Factor;
2958  Type *ScalarTy = VecTy->getVectorElementType();
2959 
2960  // Calculate the number of memory operations (NumOfMemOps), required
2961  // for load/store the VecTy.
2962  unsigned VecTySize = DL.getTypeStoreSize(VecTy);
2963  unsigned LegalVTSize = LegalVT.getStoreSize();
2964  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
2965 
2966  // Get the cost of one memory operation.
2967  Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
2968  LegalVT.getVectorNumElements());
2969  unsigned MemOpCost =
2970  getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
2971 
2972  VectorType *VT = VectorType::get(ScalarTy, VF);
2973  EVT ETy = TLI->getValueType(DL, VT);
2974  if (!ETy.isSimple())
2975  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
2976  Alignment, AddressSpace);
2977 
2978  // TODO: Complete for other data-types and strides.
2979  // Each combination of Stride, ElementTy and VF results in a different
2980  // sequence; The cost tables are therefore accessed with:
2981  // Factor (stride) and VectorType=VFxElemType.
2982  // The Cost accounts only for the shuffle sequence;
2983  // The cost of the loads/stores is accounted for separately.
2984  //
2985  static const CostTblEntry AVX2InterleavedLoadTbl[] = {
2986  { 2, MVT::v4i64, 6 }, //(load 8i64 and) deinterleave into 2 x 4i64
2987  { 2, MVT::v4f64, 6 }, //(load 8f64 and) deinterleave into 2 x 4f64
2988 
2989  { 3, MVT::v2i8, 10 }, //(load 6i8 and) deinterleave into 3 x 2i8
2990  { 3, MVT::v4i8, 4 }, //(load 12i8 and) deinterleave into 3 x 4i8
2991  { 3, MVT::v8i8, 9 }, //(load 24i8 and) deinterleave into 3 x 8i8
2992  { 3, MVT::v16i8, 11}, //(load 48i8 and) deinterleave into 3 x 16i8
2993  { 3, MVT::v32i8, 13}, //(load 96i8 and) deinterleave into 3 x 32i8
2994  { 3, MVT::v8f32, 17 }, //(load 24f32 and)deinterleave into 3 x 8f32
2995 
2996  { 4, MVT::v2i8, 12 }, //(load 8i8 and) deinterleave into 4 x 2i8
2997  { 4, MVT::v4i8, 4 }, //(load 16i8 and) deinterleave into 4 x 4i8
2998  { 4, MVT::v8i8, 20 }, //(load 32i8 and) deinterleave into 4 x 8i8
2999  { 4, MVT::v16i8, 39 }, //(load 64i8 and) deinterleave into 4 x 16i8
3000  { 4, MVT::v32i8, 80 }, //(load 128i8 and) deinterleave into 4 x 32i8
3001 
3002  { 8, MVT::v8f32, 40 } //(load 64f32 and)deinterleave into 8 x 8f32
3003  };
3004 
3005  static const CostTblEntry AVX2InterleavedStoreTbl[] = {
3006  { 2, MVT::v4i64, 6 }, //interleave into 2 x 4i64 into 8i64 (and store)
3007  { 2, MVT::v4f64, 6 }, //interleave into 2 x 4f64 into 8f64 (and store)
3008 
3009  { 3, MVT::v2i8, 7 }, //interleave 3 x 2i8 into 6i8 (and store)
3010  { 3, MVT::v4i8, 8 }, //interleave 3 x 4i8 into 12i8 (and store)
3011  { 3, MVT::v8i8, 11 }, //interleave 3 x 8i8 into 24i8 (and store)
3012  { 3, MVT::v16i8, 11 }, //interleave 3 x 16i8 into 48i8 (and store)
3013  { 3, MVT::v32i8, 13 }, //interleave 3 x 32i8 into 96i8 (and store)
3014 
3015  { 4, MVT::v2i8, 12 }, //interleave 4 x 2i8 into 8i8 (and store)
3016  { 4, MVT::v4i8, 9 }, //interleave 4 x 4i8 into 16i8 (and store)
3017  { 4, MVT::v8i8, 10 }, //interleave 4 x 8i8 into 32i8 (and store)
3018  { 4, MVT::v16i8, 10 }, //interleave 4 x 16i8 into 64i8 (and store)
3019  { 4, MVT::v32i8, 12 } //interleave 4 x 32i8 into 128i8 (and store)
3020  };
3021 
3022  if (Opcode == Instruction::Load) {
3023  if (const auto *Entry =
3024  CostTableLookup(AVX2InterleavedLoadTbl, Factor, ETy.getSimpleVT()))
3025  return NumOfMemOps * MemOpCost + Entry->Cost;
3026  } else {
3027  assert(Opcode == Instruction::Store &&
3028  "Expected Store Instruction at this point");
3029  if (const auto *Entry =
3030  CostTableLookup(AVX2InterleavedStoreTbl, Factor, ETy.getSimpleVT()))
3031  return NumOfMemOps * MemOpCost + Entry->Cost;
3032  }
3033 
3034  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3035  Alignment, AddressSpace);
3036 }
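// Illustrative note (annotation, not part of the original source): for a
// stride-4 interleaved load of <64 x i8> on AVX2 the legalized memory type is
// v32i8, so NumOfMemOps == 2, and the table above adds 39 for the shuffle
// sequence that deinterleaves into 4 x v16i8; the total reported cost is
// 2 * MemOpCost + 39.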
3037 
3038 // Get estimation for interleaved load/store operations and strided load.
3039 // \p Indices contains indices for strided load.
3040 // \p Factor - the factor of interleaving.
3041 // AVX-512 provides 3-src shuffles that significantly reduces the cost.
3042 int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
3043  unsigned Factor,
3044  ArrayRef<unsigned> Indices,
3045  unsigned Alignment,
3046  unsigned AddressSpace,
3047  bool UseMaskForCond,
3048  bool UseMaskForGaps) {
3049 
3050  if (UseMaskForCond || UseMaskForGaps)
3051  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3052  Alignment, AddressSpace,
3053  UseMaskForCond, UseMaskForGaps);
3054 
3055  // VecTy for interleave memop is <VF*Factor x Elt>.
3056  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
3057  // VecTy = <12 x i32>.
3058 
3059  // Calculate the number of memory operations (NumOfMemOps), required
3060  // for load/store the VecTy.
3061  MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
3062  unsigned VecTySize = DL.getTypeStoreSize(VecTy);
3063  unsigned LegalVTSize = LegalVT.getStoreSize();
3064  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
3065 
3066  // Get the cost of one memory operation.
3067  Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
3068  LegalVT.getVectorNumElements());
3069  unsigned MemOpCost =
3070  getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
3071 
3072  unsigned VF = VecTy->getVectorNumElements() / Factor;
3073  MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);
3074 
3075  if (Opcode == Instruction::Load) {
3076  // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
3077  // contain the cost of the optimized shuffle sequence that the
3078  // X86InterleavedAccess pass will generate.
3079  // The cost of loads and stores are computed separately from the table.
3080 
3081  // X86InterleavedAccess support only the following interleaved-access group.
3082  static const CostTblEntry AVX512InterleavedLoadTbl[] = {
3083  {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8
3084  {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8
3085  {3, MVT::v64i8, 22}, //(load 96i8 and) deinterleave into 3 x 32i8
3086  };
3087 
3088  if (const auto *Entry =
3089  CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
3090  return NumOfMemOps * MemOpCost + Entry->Cost;
3091  // If an entry does not exist, fall back to the default implementation.
3092 
3093  // Kind of shuffle depends on number of loaded values.
3094  // If we load the entire data in one register, we can use a 1-src shuffle.
3095  // Otherwise, we'll merge 2 sources in each operation.
3096  TTI::ShuffleKind ShuffleKind =
3097  (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
3098 
3099  unsigned ShuffleCost =
3100  getShuffleCost(ShuffleKind, SingleMemOpTy, 0, nullptr);
3101 
3102  unsigned NumOfLoadsInInterleaveGrp =
3103  Indices.size() ? Indices.size() : Factor;
3104  Type *ResultTy = VectorType::get(VecTy->getVectorElementType(),
3105  VecTy->getVectorNumElements() / Factor);
3106  unsigned NumOfResults =
3107  getTLI()->getTypeLegalizationCost(DL, ResultTy).first *
3108  NumOfLoadsInInterleaveGrp;
3109 
3110  // About half of the loads may be folded into shuffles when we have only
3111  // one result. If we have more than one result, we do not fold loads at all.
3112  unsigned NumOfUnfoldedLoads =
3113  NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
3114 
3115  // Get a number of shuffle operations per result.
3116  unsigned NumOfShufflesPerResult =
3117  std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
3118 
3119  // The SK_PermuteTwoSrc shuffle clobbers one of the src operands.
3120  // When we have more than one destination, we need additional instructions
3121  // to keep sources.
3122  unsigned NumOfMoves = 0;
3123  if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
3124  NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
3125 
3126  int Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
3127  NumOfUnfoldedLoads * MemOpCost + NumOfMoves;
3128 
3129  return Cost;
3130  }
3131 
3132  // Store.
3133  assert(Opcode == Instruction::Store &&
3134  "Expected Store Instruction at this point");
3135  // X86InterleavedAccess support only the following interleaved-access group.
3136  static const CostTblEntry AVX512InterleavedStoreTbl[] = {
3137  {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
3138  {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
3139  {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 96i8 (and store)
3140 
3141  {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store)
3142  {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
3143  {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
3144  {4, MVT::v64i8, 24} // interleave 4 x 32i8 into 256i8 (and store)
3145  };
3146 
3147  if (const auto *Entry =
3148  CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
3149  return NumOfMemOps * MemOpCost + Entry->Cost;
3150  // If an entry does not exist, fall back to the default implementation.
3151 
3152  // There are no strided stores at the moment, and a store can't be folded
3153  // into a shuffle.
3154  unsigned NumOfSources = Factor; // The number of values to be merged.
3155  unsigned ShuffleCost =
3156  getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, 0, nullptr);
3157  unsigned NumOfShufflesPerStore = NumOfSources - 1;
3158 
3159  // The SK_PermuteTwoSrc shuffle clobbers one of its source operands.
3160  // We need additional instructions to preserve the sources.
3161  unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
3162  int Cost = NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
3163  NumOfMoves;
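  // A worked example of this formula (hypothetical, again assuming unit costs
  // of MemOpCost == 1 and ShuffleCost == 1): a Factor == 2 store of <32 x i32>
  // on AVX-512 legalizes to two 512-bit stores, so NumOfMemOps == 2,
  // NumOfSources == 2, NumOfShufflesPerStore == 1 and NumOfMoves == 1,
  // giving Cost = 2 * (1 + 1 * 1) + 1 = 5.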
3164  return Cost;
3165 }
3166 
3167 int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
3168  unsigned Factor,
3169  ArrayRef<unsigned> Indices,
3170  unsigned Alignment,
3171  unsigned AddressSpace,
3172  bool UseMaskForCond,
3173  bool UseMaskForGaps) {
3174  auto isSupportedOnAVX512 = [](Type *VecTy, bool HasBW) {
3175  Type *EltTy = VecTy->getVectorElementType();
3176  if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
3177  EltTy->isIntegerTy(32) || EltTy->isPointerTy())
3178  return true;
3179  if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8))
3180  return HasBW;
3181  return false;
3182  };
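  // For example, an i32 or double element type is accepted on any AVX-512
  // subtarget, i8 and i16 element types additionally require BWI, and other
  // element types (e.g. half) fall through to the AVX2 or base implementation
  // below.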
3183  if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI()))
3184  return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices,
3185  Alignment, AddressSpace,
3186  UseMaskForCond, UseMaskForGaps);
3187  if (ST->hasAVX2())
3188  return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices,
3189  Alignment, AddressSpace,
3190  UseMaskForCond, UseMaskForGaps);
3191 
3192  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3193  Alignment, AddressSpace,
3194  UseMaskForCond, UseMaskForGaps);
3195 }