1 //===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 /// \file
10 /// This file implements a TargetTransformInfo analysis pass specific to the
11 /// X86 target machine. It uses the target's detailed information to provide
12 /// more precise answers to certain TTI queries, while letting the target
13 /// independent and default TTI implementations handle the rest.
14 ///
15 //===----------------------------------------------------------------------===//
16 /// About the Cost Model numbers used below it's necessary to say the following:
17 /// the numbers correspond to some "generic" X86 CPU instead of a concrete CPU
18 /// model. Usually the numbers correspond to the CPU where the feature first
19 /// appeared. For example, if we do Subtarget.hasSSE42() in the lookups below,
20 /// the cost is based on Nehalem as that was the first CPU to support that
21 /// feature level and thus most likely has the worst-case cost.
22 /// Some examples of other technologies/CPUs:
23 /// SSE 3 - Pentium4 / Athlon64
24 /// SSE 4.1 - Penryn
25 /// SSE 4.2 - Nehalem
26 /// AVX - Sandy Bridge
27 /// AVX2 - Haswell
28 /// AVX-512 - Xeon Phi / Skylake
29 /// And some examples of instruction target dependent costs (latency)
30 ///                      divss     sqrtss     rsqrtss
31 ///   AMD K7             11-16     19         3
32 ///   Piledriver         9-24      13-15      5
33 ///   Jaguar             14        16         2
34 ///   Pentium II,III     18        30         2
35 ///   Nehalem            7-14      7-18       3
36 ///   Haswell            10-13     11         5
37 /// TODO: Develop and implement the target dependent cost model and
38 /// specialize cost numbers for different Cost Model Targets such as throughput,
39 /// code size, latency and uop count.
40 //===----------------------------------------------------------------------===//
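// For illustration of the "generic CPU per feature level" rule above: on a
// subtarget where ST->hasAVX() is true but ST->hasAVX2() is not, an FDIV on
// f32 hits the AVX1CostTable later in this file and is costed at 14 (the
// Sandy Bridge number), regardless of which concrete AVX-capable CPU is
// actually being targeted.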
41 
42 #include "X86TargetTransformInfo.h"
43 #include "llvm/Analysis/TargetTransformInfo.h"
44 #include "llvm/CodeGen/BasicTTIImpl.h"
45 #include "llvm/CodeGen/CostTable.h"
46 #include "llvm/CodeGen/TargetLowering.h"
47 #include "llvm/IR/IntrinsicInst.h"
48 #include "llvm/Support/Debug.h"
49 
50 using namespace llvm;
51 
52 #define DEBUG_TYPE "x86tti"
53 
54 //===----------------------------------------------------------------------===//
55 //
56 // X86 cost model.
57 //
58 //===----------------------------------------------------------------------===//
59 
60 TargetTransformInfo::PopcntSupportKind
61 X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
62   assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
63   // TODO: Currently the __builtin_popcount() implementation using SSE3
64   //   instructions is inefficient. Once the problem is fixed, we should
65   //   call ST->hasSSE3() instead of ST->hasPOPCNT().
66   return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
67 }
68 
69 llvm::Optional<unsigned> X86TTIImpl::getCacheSize(
70     TargetTransformInfo::CacheLevel Level) const {
71   switch (Level) {
72   case TargetTransformInfo::CacheLevel::L1D:
73     // - Penryn
74     // - Nehalem
75     // - Westmere
76     // - Sandy Bridge
77     // - Ivy Bridge
78     // - Haswell
79     // - Broadwell
80     // - Skylake
81     // - Kabylake
82     return 32 * 1024; // 32 KByte
83   case TargetTransformInfo::CacheLevel::L2D:
84     // - Penryn
85  // - Nehalem
86  // - Westmere
87  // - Sandy Bridge
88  // - Ivy Bridge
89  // - Haswell
90  // - Broadwell
91  // - Skylake
92  // - Kabylake
93  return 256 * 1024; // 256 KByte
94  }
95 
96  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
97 }
98 
99 llvm::Optional<unsigned> X86TTIImpl::getCacheAssociativity(
100     TargetTransformInfo::CacheLevel Level) const {
101   // - Penryn
102  // - Nehalem
103  // - Westmere
104  // - Sandy Bridge
105  // - Ivy Bridge
106  // - Haswell
107  // - Broadwell
108  // - Skylake
109  // - Kabylake
110   switch (Level) {
111   case TargetTransformInfo::CacheLevel::L1D:
112     LLVM_FALLTHROUGH;
113   case TargetTransformInfo::CacheLevel::L2D:
114     return 8;
115  }
116 
117  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
118 }
119 
120 unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) {
121  if (Vector && !ST->hasSSE1())
122  return 0;
123 
124  if (ST->is64Bit()) {
125  if (Vector && ST->hasAVX512())
126  return 32;
127  return 16;
128  }
129  return 8;
130 }
131 
132 unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) const {
133  unsigned PreferVectorWidth = ST->getPreferVectorWidth();
134  if (Vector) {
135  if (ST->hasAVX512() && PreferVectorWidth >= 512)
136  return 512;
137  if (ST->hasAVX() && PreferVectorWidth >= 256)
138  return 256;
139  if (ST->hasSSE1() && PreferVectorWidth >= 128)
140  return 128;
141  return 0;
142  }
143 
144  if (ST->is64Bit())
145  return 64;
146 
147  return 32;
148 }
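// Example of how PreferVectorWidth interacts with the checks above: on an
// AVX-512 subtarget built with a preferred vector width of 256 (e.g. via
// clang's -mprefer-vector-width=256), the 512-bit case is skipped and 256 is
// reported, so the vectorizers default to YMM-sized vectors even though ZMM
// registers exist.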
149 
150 unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
151  return getRegisterBitWidth(true);
152 }
153 
154 unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
155  // If the loop will not be vectorized, don't interleave the loop.
156   // Let the regular unroller handle the loop, which saves the overflow
157   // check and memory check cost.
158  if (VF == 1)
159  return 1;
160 
161  if (ST->isAtom())
162  return 1;
163 
164  // Sandybridge and Haswell have multiple execution ports and pipelined
165  // vector units.
166  if (ST->hasAVX())
167  return 4;
168 
169  return 2;
170 }
171 
172 int X86TTIImpl::getArithmeticInstrCost(
173     unsigned Opcode, Type *Ty,
174     TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
175     TTI::OperandValueProperties Opd1PropInfo,
176     TTI::OperandValueProperties Opd2PropInfo,
177     ArrayRef<const Value *> Args) {
178   // Legalize the type.
179  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
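  // Note on the pattern used throughout this function: getTypeLegalizationCost
  // returns a pair where LT.second is the legal MVT the IR type is lowered to
  // and LT.first is how many such legal-type operations are needed (e.g. an
  // <8 x i32> on a 128-bit-only SSE2 target legalizes to 2 x v4i32, so
  // LT.first == 2). Every table hit below is therefore scaled by LT.first.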
180 
181  int ISD = TLI->InstructionOpcodeToISD(Opcode);
182  assert(ISD && "Invalid opcode");
183 
184  static const CostTblEntry GLMCostTable[] = {
185  { ISD::FDIV, MVT::f32, 18 }, // divss
186  { ISD::FDIV, MVT::v4f32, 35 }, // divps
187  { ISD::FDIV, MVT::f64, 33 }, // divsd
188  { ISD::FDIV, MVT::v2f64, 65 }, // divpd
189  };
190 
191  if (ST->isGLM())
192  if (const auto *Entry = CostTableLookup(GLMCostTable, ISD,
193  LT.second))
194  return LT.first * Entry->Cost;
195 
196  static const CostTblEntry SLMCostTable[] = {
197  { ISD::MUL, MVT::v4i32, 11 }, // pmulld
198  { ISD::MUL, MVT::v8i16, 2 }, // pmullw
199  { ISD::MUL, MVT::v16i8, 14 }, // extend/pmullw/trunc sequence.
200  { ISD::FMUL, MVT::f64, 2 }, // mulsd
201  { ISD::FMUL, MVT::v2f64, 4 }, // mulpd
202  { ISD::FMUL, MVT::v4f32, 2 }, // mulps
203  { ISD::FDIV, MVT::f32, 17 }, // divss
204  { ISD::FDIV, MVT::v4f32, 39 }, // divps
205  { ISD::FDIV, MVT::f64, 32 }, // divsd
206  { ISD::FDIV, MVT::v2f64, 69 }, // divpd
207  { ISD::FADD, MVT::v2f64, 2 }, // addpd
208  { ISD::FSUB, MVT::v2f64, 2 }, // subpd
209     // v2i64/v4i64 mul is custom lowered as a series of long
210     // multiplies(3), shifts(3) and adds(2).
211     // slm muldq throughput is 2 and addq throughput is 4,
212     // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
213     // 2X4 (addq throughput) = 17
214  { ISD::MUL, MVT::v2i64, 17 },
215  // slm addq\subq throughput is 4
216  { ISD::ADD, MVT::v2i64, 4 },
217  { ISD::SUB, MVT::v2i64, 4 },
218  };
219 
220  if (ST->isSLM()) {
221  if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) {
222       // Check if the operands can be shrunk into a smaller datatype.
223  bool Op1Signed = false;
224  unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
225  bool Op2Signed = false;
226  unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
227 
228  bool signedMode = Op1Signed | Op2Signed;
229  unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
230 
231  if (OpMinSize <= 7)
232  return LT.first * 3; // pmullw/sext
233  if (!signedMode && OpMinSize <= 8)
234  return LT.first * 3; // pmullw/zext
235  if (OpMinSize <= 15)
236  return LT.first * 5; // pmullw/pmulhw/pshuf
237  if (!signedMode && OpMinSize <= 16)
238  return LT.first * 5; // pmullw/pmulhw/pshuf
239  }
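    // Worked example of the shrinking heuristic above: a v4i32 multiply whose
    // operands are both known to need at most 15 significant bits can be done
    // with a pmullw/pmulhw/pshuf sequence (cost LT.first * 5), which on
    // Silvermont beats the pmulld cost of 11 from SLMCostTable.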
240 
241  if (const auto *Entry = CostTableLookup(SLMCostTable, ISD,
242  LT.second)) {
243  return LT.first * Entry->Cost;
244  }
245  }
246 
247   if ((ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
248        ISD == ISD::UREM) &&
249       (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
250        Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
251       Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
252  if (ISD == ISD::SDIV || ISD == ISD::SREM) {
253       // On X86, vector signed division by a power-of-two constant is
254       // normally expanded to the sequence SRA + SRL + ADD + SRA.
255  // The OperandValue properties may not be the same as that of the previous
256  // operation; conservatively assume OP_None.
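      // Illustrative scalar sketch of that expansion for a 32-bit lane X and
      // divisor 1 << K (the vector form applies it per lane):
      //   T = X >> 31;                  // SRA: splat the sign bit
      //   T = (unsigned)T >> (32 - K);  // SRL: keep K rounding bits
      //   X = X + T;                    // ADD: bias negative values
      //   Res = X >> K;                 // SRA: the actual divide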
257       int Cost =
258           2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info, Op2Info,
259                                      TargetTransformInfo::OP_None,
260                                      TargetTransformInfo::OP_None);
261       Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
262                                      TargetTransformInfo::OP_None,
263                                      TargetTransformInfo::OP_None);
264       Cost += getArithmeticInstrCost(Instruction::Add, Ty, Op1Info, Op2Info,
265                                      TargetTransformInfo::OP_None,
266                                      TargetTransformInfo::OP_None);
267 
268  if (ISD == ISD::SREM) {
269  // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
270  Cost += getArithmeticInstrCost(Instruction::Mul, Ty, Op1Info, Op2Info);
271  Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Op1Info, Op2Info);
272  }
273 
274  return Cost;
275  }
276 
277  // Vector unsigned division/remainder will be simplified to shifts/masks.
278  if (ISD == ISD::UDIV)
279       return getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
280                                     TargetTransformInfo::OP_None,
281                                     TargetTransformInfo::OP_None);
282 
283     if (ISD == ISD::UREM)
284       return getArithmeticInstrCost(Instruction::And, Ty, Op1Info, Op2Info,
285                                     TargetTransformInfo::OP_None,
286                                     TargetTransformInfo::OP_None);
287   }
288 
289  static const CostTblEntry AVX512BWUniformConstCostTable[] = {
290  { ISD::SHL, MVT::v64i8, 2 }, // psllw + pand.
291  { ISD::SRL, MVT::v64i8, 2 }, // psrlw + pand.
292  { ISD::SRA, MVT::v64i8, 4 }, // psrlw, pand, pxor, psubb.
293 
294  { ISD::SDIV, MVT::v32i16, 6 }, // vpmulhw sequence
295  { ISD::SREM, MVT::v32i16, 8 }, // vpmulhw+mul+sub sequence
296  { ISD::UDIV, MVT::v32i16, 6 }, // vpmulhuw sequence
297  { ISD::UREM, MVT::v32i16, 8 }, // vpmulhuw+mul+sub sequence
298  };
299 
300   if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
301       ST->hasBWI()) {
302  if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD,
303  LT.second))
304  return LT.first * Entry->Cost;
305  }
306 
307  static const CostTblEntry AVX512UniformConstCostTable[] = {
308  { ISD::SRA, MVT::v2i64, 1 },
309  { ISD::SRA, MVT::v4i64, 1 },
310  { ISD::SRA, MVT::v8i64, 1 },
311 
312  { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
313  { ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence
314  { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
315  { ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence
316  };
317 
318   if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
319       ST->hasAVX512()) {
320  if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD,
321  LT.second))
322  return LT.first * Entry->Cost;
323  }
324 
325  static const CostTblEntry AVX2UniformConstCostTable[] = {
326  { ISD::SHL, MVT::v32i8, 2 }, // psllw + pand.
327  { ISD::SRL, MVT::v32i8, 2 }, // psrlw + pand.
328  { ISD::SRA, MVT::v32i8, 4 }, // psrlw, pand, pxor, psubb.
329 
330  { ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle.
331 
332  { ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence
333  { ISD::SREM, MVT::v16i16, 8 }, // vpmulhw+mul+sub sequence
334  { ISD::UDIV, MVT::v16i16, 6 }, // vpmulhuw sequence
335  { ISD::UREM, MVT::v16i16, 8 }, // vpmulhuw+mul+sub sequence
336  { ISD::SDIV, MVT::v8i32, 15 }, // vpmuldq sequence
337  { ISD::SREM, MVT::v8i32, 19 }, // vpmuldq+mul+sub sequence
338  { ISD::UDIV, MVT::v8i32, 15 }, // vpmuludq sequence
339  { ISD::UREM, MVT::v8i32, 19 }, // vpmuludq+mul+sub sequence
340  };
341 
342   if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
343       ST->hasAVX2()) {
344  if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD,
345  LT.second))
346  return LT.first * Entry->Cost;
347  }
348 
349  static const CostTblEntry SSE2UniformConstCostTable[] = {
350  { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand.
351  { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand.
352  { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb.
353 
354  { ISD::SHL, MVT::v32i8, 4+2 }, // 2*(psllw + pand) + split.
355  { ISD::SRL, MVT::v32i8, 4+2 }, // 2*(psrlw + pand) + split.
356  { ISD::SRA, MVT::v32i8, 8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.
357 
358  { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
359  { ISD::SREM, MVT::v16i16, 16+2 }, // 2*pmulhw+mul+sub sequence + split.
360  { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence
361  { ISD::SREM, MVT::v8i16, 8 }, // pmulhw+mul+sub sequence
362  { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split.
363  { ISD::UREM, MVT::v16i16, 16+2 }, // 2*pmulhuw+mul+sub sequence + split.
364  { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence
365  { ISD::UREM, MVT::v8i16, 8 }, // pmulhuw+mul+sub sequence
366  { ISD::SDIV, MVT::v8i32, 38+2 }, // 2*pmuludq sequence + split.
367  { ISD::SREM, MVT::v8i32, 48+2 }, // 2*pmuludq+mul+sub sequence + split.
368  { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence
369  { ISD::SREM, MVT::v4i32, 24 }, // pmuludq+mul+sub sequence
370  { ISD::UDIV, MVT::v8i32, 30+2 }, // 2*pmuludq sequence + split.
371  { ISD::UREM, MVT::v8i32, 40+2 }, // 2*pmuludq+mul+sub sequence + split.
372  { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence
373  { ISD::UREM, MVT::v4i32, 20 }, // pmuludq+mul+sub sequence
374  };
375 
376   if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
377       ST->hasSSE2()) {
378  // pmuldq sequence.
379  if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
380  return LT.first * 32;
381  if (ISD == ISD::SREM && LT.second == MVT::v8i32 && ST->hasAVX())
382  return LT.first * 38;
383  if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
384  return LT.first * 15;
385  if (ISD == ISD::SREM && LT.second == MVT::v4i32 && ST->hasSSE41())
386  return LT.first * 20;
387 
388  // XOP has faster vXi8 shifts.
389  if ((ISD != ISD::SHL && ISD != ISD::SRL && ISD != ISD::SRA) ||
390  !ST->hasXOP())
391  if (const auto *Entry =
392  CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
393  return LT.first * Entry->Cost;
394  }
395 
396  static const CostTblEntry AVX2UniformCostTable[] = {
397  // Uniform splats are cheaper for the following instructions.
398  { ISD::SHL, MVT::v16i16, 1 }, // psllw.
399  { ISD::SRL, MVT::v16i16, 1 }, // psrlw.
400  { ISD::SRA, MVT::v16i16, 1 }, // psraw.
401  };
402 
403   if (ST->hasAVX2() &&
404       ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
405        (Op2Info == TargetTransformInfo::OK_UniformValue))) {
406  if (const auto *Entry =
407  CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
408  return LT.first * Entry->Cost;
409  }
410 
411  static const CostTblEntry SSE2UniformCostTable[] = {
412  // Uniform splats are cheaper for the following instructions.
413  { ISD::SHL, MVT::v8i16, 1 }, // psllw.
414  { ISD::SHL, MVT::v4i32, 1 }, // pslld
415  { ISD::SHL, MVT::v2i64, 1 }, // psllq.
416 
417  { ISD::SRL, MVT::v8i16, 1 }, // psrlw.
418  { ISD::SRL, MVT::v4i32, 1 }, // psrld.
419  { ISD::SRL, MVT::v2i64, 1 }, // psrlq.
420 
421  { ISD::SRA, MVT::v8i16, 1 }, // psraw.
422  { ISD::SRA, MVT::v4i32, 1 }, // psrad.
423  };
424 
425   if (ST->hasSSE2() &&
426       ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
427        (Op2Info == TargetTransformInfo::OK_UniformValue))) {
428  if (const auto *Entry =
429  CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
430  return LT.first * Entry->Cost;
431  }
432 
433  static const CostTblEntry AVX512DQCostTable[] = {
434  { ISD::MUL, MVT::v2i64, 1 },
435  { ISD::MUL, MVT::v4i64, 1 },
436  { ISD::MUL, MVT::v8i64, 1 }
437  };
438 
439  // Look for AVX512DQ lowering tricks for custom cases.
440  if (ST->hasDQI())
441  if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
442  return LT.first * Entry->Cost;
443 
444  static const CostTblEntry AVX512BWCostTable[] = {
445  { ISD::SHL, MVT::v8i16, 1 }, // vpsllvw
446  { ISD::SRL, MVT::v8i16, 1 }, // vpsrlvw
447  { ISD::SRA, MVT::v8i16, 1 }, // vpsravw
448 
449  { ISD::SHL, MVT::v16i16, 1 }, // vpsllvw
450  { ISD::SRL, MVT::v16i16, 1 }, // vpsrlvw
451  { ISD::SRA, MVT::v16i16, 1 }, // vpsravw
452 
453  { ISD::SHL, MVT::v32i16, 1 }, // vpsllvw
454  { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw
455  { ISD::SRA, MVT::v32i16, 1 }, // vpsravw
456 
457  { ISD::SHL, MVT::v64i8, 11 }, // vpblendvb sequence.
458  { ISD::SRL, MVT::v64i8, 11 }, // vpblendvb sequence.
459  { ISD::SRA, MVT::v64i8, 24 }, // vpblendvb sequence.
460 
461  { ISD::MUL, MVT::v64i8, 11 }, // extend/pmullw/trunc sequence.
462  { ISD::MUL, MVT::v32i8, 4 }, // extend/pmullw/trunc sequence.
463  { ISD::MUL, MVT::v16i8, 4 }, // extend/pmullw/trunc sequence.
464  };
465 
466  // Look for AVX512BW lowering tricks for custom cases.
467  if (ST->hasBWI())
468  if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
469  return LT.first * Entry->Cost;
470 
471  static const CostTblEntry AVX512CostTable[] = {
472  { ISD::SHL, MVT::v16i32, 1 },
473  { ISD::SRL, MVT::v16i32, 1 },
474  { ISD::SRA, MVT::v16i32, 1 },
475 
476  { ISD::SHL, MVT::v8i64, 1 },
477  { ISD::SRL, MVT::v8i64, 1 },
478 
479  { ISD::SRA, MVT::v2i64, 1 },
480  { ISD::SRA, MVT::v4i64, 1 },
481  { ISD::SRA, MVT::v8i64, 1 },
482 
483  { ISD::MUL, MVT::v32i8, 13 }, // extend/pmullw/trunc sequence.
484  { ISD::MUL, MVT::v16i8, 5 }, // extend/pmullw/trunc sequence.
485  { ISD::MUL, MVT::v16i32, 1 }, // pmulld (Skylake from agner.org)
486  { ISD::MUL, MVT::v8i32, 1 }, // pmulld (Skylake from agner.org)
487  { ISD::MUL, MVT::v4i32, 1 }, // pmulld (Skylake from agner.org)
488  { ISD::MUL, MVT::v8i64, 8 }, // 3*pmuludq/3*shift/2*add
489 
490  { ISD::FADD, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
491  { ISD::FSUB, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
492  { ISD::FMUL, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
493 
494  { ISD::FADD, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
495  { ISD::FSUB, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
496  { ISD::FMUL, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
497  };
498 
499  if (ST->hasAVX512())
500  if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
501  return LT.first * Entry->Cost;
502 
503  static const CostTblEntry AVX2ShiftCostTable[] = {
504     // Shifts on v4i64/v8i32 on AVX2 are legal even though we declare them
505     // custom so we can detect the cases where the shift amount is a uniform
506     // scalar.
506  { ISD::SHL, MVT::v4i32, 1 },
507  { ISD::SRL, MVT::v4i32, 1 },
508  { ISD::SRA, MVT::v4i32, 1 },
509  { ISD::SHL, MVT::v8i32, 1 },
510  { ISD::SRL, MVT::v8i32, 1 },
511  { ISD::SRA, MVT::v8i32, 1 },
512  { ISD::SHL, MVT::v2i64, 1 },
513  { ISD::SRL, MVT::v2i64, 1 },
514  { ISD::SHL, MVT::v4i64, 1 },
515  { ISD::SRL, MVT::v4i64, 1 },
516  };
517 
518  // Look for AVX2 lowering tricks.
519  if (ST->hasAVX2()) {
520     if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
521         (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
522          Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
523       // On AVX2, a packed v16i16 shift left by a constant build_vector
524       // is lowered into a vector multiply (vpmullw).
525       return getArithmeticInstrCost(Instruction::Mul, Ty, Op1Info, Op2Info,
526                                     TargetTransformInfo::OP_None,
527                                     TargetTransformInfo::OP_None);
528 
529  if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
530  return LT.first * Entry->Cost;
531  }
532 
533  static const CostTblEntry XOPShiftCostTable[] = {
534  // 128bit shifts take 1cy, but right shifts require negation beforehand.
535  { ISD::SHL, MVT::v16i8, 1 },
536  { ISD::SRL, MVT::v16i8, 2 },
537  { ISD::SRA, MVT::v16i8, 2 },
538  { ISD::SHL, MVT::v8i16, 1 },
539  { ISD::SRL, MVT::v8i16, 2 },
540  { ISD::SRA, MVT::v8i16, 2 },
541  { ISD::SHL, MVT::v4i32, 1 },
542  { ISD::SRL, MVT::v4i32, 2 },
543  { ISD::SRA, MVT::v4i32, 2 },
544  { ISD::SHL, MVT::v2i64, 1 },
545  { ISD::SRL, MVT::v2i64, 2 },
546  { ISD::SRA, MVT::v2i64, 2 },
547  // 256bit shifts require splitting if AVX2 didn't catch them above.
548  { ISD::SHL, MVT::v32i8, 2+2 },
549  { ISD::SRL, MVT::v32i8, 4+2 },
550  { ISD::SRA, MVT::v32i8, 4+2 },
551  { ISD::SHL, MVT::v16i16, 2+2 },
552  { ISD::SRL, MVT::v16i16, 4+2 },
553  { ISD::SRA, MVT::v16i16, 4+2 },
554  { ISD::SHL, MVT::v8i32, 2+2 },
555  { ISD::SRL, MVT::v8i32, 4+2 },
556  { ISD::SRA, MVT::v8i32, 4+2 },
557  { ISD::SHL, MVT::v4i64, 2+2 },
558  { ISD::SRL, MVT::v4i64, 4+2 },
559  { ISD::SRA, MVT::v4i64, 4+2 },
560  };
561 
562  // Look for XOP lowering tricks.
563  if (ST->hasXOP())
564  if (const auto *Entry = CostTableLookup(XOPShiftCostTable, ISD, LT.second))
565  return LT.first * Entry->Cost;
566 
567  static const CostTblEntry SSE2UniformShiftCostTable[] = {
568  // Uniform splats are cheaper for the following instructions.
569  { ISD::SHL, MVT::v16i16, 2+2 }, // 2*psllw + split.
570  { ISD::SHL, MVT::v8i32, 2+2 }, // 2*pslld + split.
571  { ISD::SHL, MVT::v4i64, 2+2 }, // 2*psllq + split.
572 
573  { ISD::SRL, MVT::v16i16, 2+2 }, // 2*psrlw + split.
574  { ISD::SRL, MVT::v8i32, 2+2 }, // 2*psrld + split.
575  { ISD::SRL, MVT::v4i64, 2+2 }, // 2*psrlq + split.
576 
577  { ISD::SRA, MVT::v16i16, 2+2 }, // 2*psraw + split.
578  { ISD::SRA, MVT::v8i32, 2+2 }, // 2*psrad + split.
579  { ISD::SRA, MVT::v2i64, 4 }, // 2*psrad + shuffle.
580  { ISD::SRA, MVT::v4i64, 8+2 }, // 2*(2*psrad + shuffle) + split.
581  };
582 
583   if (ST->hasSSE2() &&
584       ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
585        (Op2Info == TargetTransformInfo::OK_UniformValue))) {
586 
587  // Handle AVX2 uniform v4i64 ISD::SRA, it's not worth a table.
588  if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2())
589  return LT.first * 4; // 2*psrad + shuffle.
590 
591  if (const auto *Entry =
592  CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second))
593  return LT.first * Entry->Cost;
594  }
595 
596   if (ISD == ISD::SHL &&
597       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
598     MVT VT = LT.second;
599     // Vector shift left by a non-uniform constant can be lowered
600     // into a vector multiply.
601  if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
602  ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
603  ISD = ISD::MUL;
604  }
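  // E.g. a v8i16 shl by the constant vector <1,2,3,...> is lowered as a
  // pmullw by <2,4,8,...>, so from this point on such shifts are costed via
  // the MUL entries in the tables below.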
605 
606  static const CostTblEntry AVX2CostTable[] = {
607  { ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence.
608  { ISD::SHL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
609 
610  { ISD::SRL, MVT::v32i8, 11 }, // vpblendvb sequence.
611  { ISD::SRL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
612 
613  { ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence.
614  { ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence.
615  { ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence.
616  { ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence.
617 
618  { ISD::SUB, MVT::v32i8, 1 }, // psubb
619  { ISD::ADD, MVT::v32i8, 1 }, // paddb
620  { ISD::SUB, MVT::v16i16, 1 }, // psubw
621  { ISD::ADD, MVT::v16i16, 1 }, // paddw
622  { ISD::SUB, MVT::v8i32, 1 }, // psubd
623  { ISD::ADD, MVT::v8i32, 1 }, // paddd
624  { ISD::SUB, MVT::v4i64, 1 }, // psubq
625  { ISD::ADD, MVT::v4i64, 1 }, // paddq
626 
627  { ISD::MUL, MVT::v32i8, 17 }, // extend/pmullw/trunc sequence.
628  { ISD::MUL, MVT::v16i8, 7 }, // extend/pmullw/trunc sequence.
629  { ISD::MUL, MVT::v16i16, 1 }, // pmullw
630  { ISD::MUL, MVT::v8i32, 2 }, // pmulld (Haswell from agner.org)
631  { ISD::MUL, MVT::v4i64, 8 }, // 3*pmuludq/3*shift/2*add
632 
633  { ISD::FADD, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
634  { ISD::FADD, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
635  { ISD::FSUB, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
636  { ISD::FSUB, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
637  { ISD::FMUL, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
638  { ISD::FMUL, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
639 
640  { ISD::FDIV, MVT::f32, 7 }, // Haswell from http://www.agner.org/
641  { ISD::FDIV, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
642  { ISD::FDIV, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
643  { ISD::FDIV, MVT::f64, 14 }, // Haswell from http://www.agner.org/
644  { ISD::FDIV, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
645  { ISD::FDIV, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
646  };
647 
648  // Look for AVX2 lowering tricks for custom cases.
649  if (ST->hasAVX2())
650  if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
651  return LT.first * Entry->Cost;
652 
653  static const CostTblEntry AVX1CostTable[] = {
654  // We don't have to scalarize unsupported ops. We can issue two half-sized
655  // operations and we only need to extract the upper YMM half.
656  // Two ops + 1 extract + 1 insert = 4.
657  { ISD::MUL, MVT::v16i16, 4 },
658  { ISD::MUL, MVT::v8i32, 4 },
659  { ISD::SUB, MVT::v32i8, 4 },
660  { ISD::ADD, MVT::v32i8, 4 },
661  { ISD::SUB, MVT::v16i16, 4 },
662  { ISD::ADD, MVT::v16i16, 4 },
663  { ISD::SUB, MVT::v8i32, 4 },
664  { ISD::ADD, MVT::v8i32, 4 },
665  { ISD::SUB, MVT::v4i64, 4 },
666  { ISD::ADD, MVT::v4i64, 4 },
667 
668  // A v4i64 multiply is custom lowered as two split v2i64 vectors that then
669  // are lowered as a series of long multiplies(3), shifts(3) and adds(2)
670  // Because we believe v4i64 to be a legal type, we must also include the
671  // extract+insert in the cost table. Therefore, the cost here is 18
672  // instead of 8.
673  { ISD::MUL, MVT::v4i64, 18 },
674 
675  { ISD::MUL, MVT::v32i8, 26 }, // extend/pmullw/trunc sequence.
676 
677  { ISD::FDIV, MVT::f32, 14 }, // SNB from http://www.agner.org/
678  { ISD::FDIV, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
679  { ISD::FDIV, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
680  { ISD::FDIV, MVT::f64, 22 }, // SNB from http://www.agner.org/
681  { ISD::FDIV, MVT::v2f64, 22 }, // SNB from http://www.agner.org/
682  { ISD::FDIV, MVT::v4f64, 44 }, // SNB from http://www.agner.org/
683  };
684 
685  if (ST->hasAVX())
686  if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
687  return LT.first * Entry->Cost;
688 
689  static const CostTblEntry SSE42CostTable[] = {
690  { ISD::FADD, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
691  { ISD::FADD, MVT::f32, 1 }, // Nehalem from http://www.agner.org/
692  { ISD::FADD, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
693  { ISD::FADD, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
694 
695  { ISD::FSUB, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
696  { ISD::FSUB, MVT::f32 , 1 }, // Nehalem from http://www.agner.org/
697  { ISD::FSUB, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
698  { ISD::FSUB, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
699 
700  { ISD::FMUL, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
701  { ISD::FMUL, MVT::f32, 1 }, // Nehalem from http://www.agner.org/
702  { ISD::FMUL, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
703  { ISD::FMUL, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
704 
705  { ISD::FDIV, MVT::f32, 14 }, // Nehalem from http://www.agner.org/
706  { ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
707  { ISD::FDIV, MVT::f64, 22 }, // Nehalem from http://www.agner.org/
708  { ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/
709  };
710 
711  if (ST->hasSSE42())
712  if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
713  return LT.first * Entry->Cost;
714 
715  static const CostTblEntry SSE41CostTable[] = {
716  { ISD::SHL, MVT::v16i8, 11 }, // pblendvb sequence.
717  { ISD::SHL, MVT::v32i8, 2*11+2 }, // pblendvb sequence + split.
718  { ISD::SHL, MVT::v8i16, 14 }, // pblendvb sequence.
719  { ISD::SHL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
720  { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld
721  { ISD::SHL, MVT::v8i32, 2*4+2 }, // pslld/paddd/cvttps2dq/pmulld + split
722 
723  { ISD::SRL, MVT::v16i8, 12 }, // pblendvb sequence.
724  { ISD::SRL, MVT::v32i8, 2*12+2 }, // pblendvb sequence + split.
725  { ISD::SRL, MVT::v8i16, 14 }, // pblendvb sequence.
726  { ISD::SRL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
727  { ISD::SRL, MVT::v4i32, 11 }, // Shift each lane + blend.
728  { ISD::SRL, MVT::v8i32, 2*11+2 }, // Shift each lane + blend + split.
729 
730  { ISD::SRA, MVT::v16i8, 24 }, // pblendvb sequence.
731  { ISD::SRA, MVT::v32i8, 2*24+2 }, // pblendvb sequence + split.
732  { ISD::SRA, MVT::v8i16, 14 }, // pblendvb sequence.
733  { ISD::SRA, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
734  { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend.
735  { ISD::SRA, MVT::v8i32, 2*12+2 }, // Shift each lane + blend + split.
736 
737  { ISD::MUL, MVT::v4i32, 2 } // pmulld (Nehalem from agner.org)
738  };
739 
740  if (ST->hasSSE41())
741  if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
742  return LT.first * Entry->Cost;
743 
744  static const CostTblEntry SSE2CostTable[] = {
745  // We don't correctly identify costs of casts because they are marked as
746  // custom.
747  { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence.
748  { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence.
749  { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul.
750  { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence.
751  { ISD::SHL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split.
752 
753  { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence.
754  { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence.
755  { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend.
756  { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence.
757  { ISD::SRL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split.
758 
759  { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence.
760  { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence.
761  { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend.
762  { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence.
763  { ISD::SRA, MVT::v4i64, 2*12+2 }, // srl/xor/sub sequence+split.
764 
765  { ISD::MUL, MVT::v16i8, 12 }, // extend/pmullw/trunc sequence.
766  { ISD::MUL, MVT::v8i16, 1 }, // pmullw
767  { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle
768  { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add
769 
770  { ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/
771  { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/
772  { ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/
773  { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/
774  };
775 
776  if (ST->hasSSE2())
777  if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
778  return LT.first * Entry->Cost;
779 
780  static const CostTblEntry SSE1CostTable[] = {
781  { ISD::FDIV, MVT::f32, 17 }, // Pentium III from http://www.agner.org/
782  { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/
783  };
784 
785  if (ST->hasSSE1())
786  if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
787  return LT.first * Entry->Cost;
788 
789  // It is not a good idea to vectorize division. We have to scalarize it and
790   // in the process we will often end up having to spill regular
791   // registers. The overhead of division is going to dominate most kernels
792   // anyway, so try hard to prevent vectorization of division - it is
793  // generally a bad idea. Assume somewhat arbitrarily that we have to be able
794  // to hide "20 cycles" for each lane.
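  // For illustration, with the arbitrary 20-cycle figure above: a <4 x i32>
  // sdiv by a non-constant divisor on a 128-bit target has LT.first == 1 and
  // 4 lanes, so it is costed at 20 * 1 * 4 * ScalarCost - large enough that
  // the vectorizers will essentially never pick the vector form.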
795  if (LT.second.isVector() && (ISD == ISD::SDIV || ISD == ISD::SREM ||
796  ISD == ISD::UDIV || ISD == ISD::UREM)) {
797  int ScalarCost = getArithmeticInstrCost(
798         Opcode, Ty->getScalarType(), Op1Info, Op2Info,
799         TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
800     return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
801  }
802 
803  // Fallback to the default implementation.
804  return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info);
805 }
806 
807 int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
808                                Type *SubTp) {
809  // 64-bit packed float vectors (v2f32) are widened to type v4f32.
810  // 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
811  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
812 
813  // For Broadcasts we are splatting the first element from the first input
814   // register, so we only need to reference that input and all the output
815  // registers are the same.
816  if (Kind == TTI::SK_Broadcast)
817  LT.first = 1;
818 
819  // We are going to permute multiple sources and the result will be in multiple
820   // destinations. We provide an accurate cost only for splits where the element
821  // type remains the same.
822  if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
823  MVT LegalVT = LT.second;
824  if (LegalVT.isVector() &&
825         LegalVT.getVectorElementType().getSizeInBits() ==
826             Tp->getVectorElementType()->getPrimitiveSizeInBits() &&
827         LegalVT.getVectorNumElements() < Tp->getVectorNumElements()) {
828 
829  unsigned VecTySize = DL.getTypeStoreSize(Tp);
830  unsigned LegalVTSize = LegalVT.getStoreSize();
831  // Number of source vectors after legalization:
832  unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
833  // Number of destination vectors after legalization:
834  unsigned NumOfDests = LT.first;
835 
836  Type *SingleOpTy = VectorType::get(Tp->getVectorElementType(),
837  LegalVT.getVectorNumElements());
838 
839  unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
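      // For illustration: a single-source permute of <16 x i32> on an AVX2
      // target legalizes to 2 x v8i32, so NumOfSrcs == 2 and NumOfDests ==
      // LT.first == 2, giving 2 two-source v8i32 shuffles (each costed at 3
      // in AVX2ShuffleTbl below, for a total of 6).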
840  return NumOfShuffles *
841  getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr);
842  }
843 
844  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
845  }
846 
847  // For 2-input shuffles, we must account for splitting the 2 inputs into many.
848  if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
849  // We assume that source and destination have the same vector type.
850  int NumOfDests = LT.first;
851  int NumOfShufflesPerDest = LT.first * 2 - 1;
852  LT.first = NumOfDests * NumOfShufflesPerDest;
853  }
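  // The 2 * LT.first - 1 factor assumes each legal-width destination may need
  // elements from any of the 2 * LT.first legalized source registers, and
  // combining N registers takes N - 1 two-source shuffles; e.g. with
  // LT.first == 2 each destination is built from up to 4 registers via 3
  // shuffles.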
854 
855  static const CostTblEntry AVX512VBMIShuffleTbl[] = {
856  { TTI::SK_Reverse, MVT::v64i8, 1 }, // vpermb
857  { TTI::SK_Reverse, MVT::v32i8, 1 }, // vpermb
858 
859  { TTI::SK_PermuteSingleSrc, MVT::v64i8, 1 }, // vpermb
860  { TTI::SK_PermuteSingleSrc, MVT::v32i8, 1 }, // vpermb
861 
862  { TTI::SK_PermuteTwoSrc, MVT::v64i8, 1 }, // vpermt2b
863  { TTI::SK_PermuteTwoSrc, MVT::v32i8, 1 }, // vpermt2b
864  { TTI::SK_PermuteTwoSrc, MVT::v16i8, 1 } // vpermt2b
865  };
866 
867  if (ST->hasVBMI())
868  if (const auto *Entry =
869  CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
870  return LT.first * Entry->Cost;
871 
872  static const CostTblEntry AVX512BWShuffleTbl[] = {
873  { TTI::SK_Broadcast, MVT::v32i16, 1 }, // vpbroadcastw
874  { TTI::SK_Broadcast, MVT::v64i8, 1 }, // vpbroadcastb
875 
876  { TTI::SK_Reverse, MVT::v32i16, 1 }, // vpermw
877  { TTI::SK_Reverse, MVT::v16i16, 1 }, // vpermw
878  { TTI::SK_Reverse, MVT::v64i8, 2 }, // pshufb + vshufi64x2
879 
880  { TTI::SK_PermuteSingleSrc, MVT::v32i16, 1 }, // vpermw
881  { TTI::SK_PermuteSingleSrc, MVT::v16i16, 1 }, // vpermw
882  { TTI::SK_PermuteSingleSrc, MVT::v8i16, 1 }, // vpermw
883  { TTI::SK_PermuteSingleSrc, MVT::v64i8, 8 }, // extend to v32i16
884  { TTI::SK_PermuteSingleSrc, MVT::v32i8, 3 }, // vpermw + zext/trunc
885 
886  { TTI::SK_PermuteTwoSrc, MVT::v32i16, 1 }, // vpermt2w
887  { TTI::SK_PermuteTwoSrc, MVT::v16i16, 1 }, // vpermt2w
888  { TTI::SK_PermuteTwoSrc, MVT::v8i16, 1 }, // vpermt2w
889  { TTI::SK_PermuteTwoSrc, MVT::v32i8, 3 }, // zext + vpermt2w + trunc
890  { TTI::SK_PermuteTwoSrc, MVT::v64i8, 19 }, // 6 * v32i8 + 1
891  { TTI::SK_PermuteTwoSrc, MVT::v16i8, 3 } // zext + vpermt2w + trunc
892  };
893 
894  if (ST->hasBWI())
895  if (const auto *Entry =
896  CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
897  return LT.first * Entry->Cost;
898 
899  static const CostTblEntry AVX512ShuffleTbl[] = {
900  { TTI::SK_Broadcast, MVT::v8f64, 1 }, // vbroadcastpd
901  { TTI::SK_Broadcast, MVT::v16f32, 1 }, // vbroadcastps
902  { TTI::SK_Broadcast, MVT::v8i64, 1 }, // vpbroadcastq
903  { TTI::SK_Broadcast, MVT::v16i32, 1 }, // vpbroadcastd
904 
905  { TTI::SK_Reverse, MVT::v8f64, 1 }, // vpermpd
906  { TTI::SK_Reverse, MVT::v16f32, 1 }, // vpermps
907  { TTI::SK_Reverse, MVT::v8i64, 1 }, // vpermq
908  { TTI::SK_Reverse, MVT::v16i32, 1 }, // vpermd
909 
910  { TTI::SK_PermuteSingleSrc, MVT::v8f64, 1 }, // vpermpd
911  { TTI::SK_PermuteSingleSrc, MVT::v4f64, 1 }, // vpermpd
912  { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // vpermpd
913  { TTI::SK_PermuteSingleSrc, MVT::v16f32, 1 }, // vpermps
914  { TTI::SK_PermuteSingleSrc, MVT::v8f32, 1 }, // vpermps
915  { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // vpermps
916  { TTI::SK_PermuteSingleSrc, MVT::v8i64, 1 }, // vpermq
917  { TTI::SK_PermuteSingleSrc, MVT::v4i64, 1 }, // vpermq
918  { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // vpermq
919  { TTI::SK_PermuteSingleSrc, MVT::v16i32, 1 }, // vpermd
920  { TTI::SK_PermuteSingleSrc, MVT::v8i32, 1 }, // vpermd
921  { TTI::SK_PermuteSingleSrc, MVT::v4i32, 1 }, // vpermd
922  { TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 }, // pshufb
923 
924  { TTI::SK_PermuteTwoSrc, MVT::v8f64, 1 }, // vpermt2pd
925  { TTI::SK_PermuteTwoSrc, MVT::v16f32, 1 }, // vpermt2ps
926  { TTI::SK_PermuteTwoSrc, MVT::v8i64, 1 }, // vpermt2q
927  { TTI::SK_PermuteTwoSrc, MVT::v16i32, 1 }, // vpermt2d
928  { TTI::SK_PermuteTwoSrc, MVT::v4f64, 1 }, // vpermt2pd
929  { TTI::SK_PermuteTwoSrc, MVT::v8f32, 1 }, // vpermt2ps
930  { TTI::SK_PermuteTwoSrc, MVT::v4i64, 1 }, // vpermt2q
931  { TTI::SK_PermuteTwoSrc, MVT::v8i32, 1 }, // vpermt2d
932  { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // vpermt2pd
933  { TTI::SK_PermuteTwoSrc, MVT::v4f32, 1 }, // vpermt2ps
934  { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // vpermt2q
935  { TTI::SK_PermuteTwoSrc, MVT::v4i32, 1 } // vpermt2d
936  };
937 
938  if (ST->hasAVX512())
939  if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
940  return LT.first * Entry->Cost;
941 
942  static const CostTblEntry AVX2ShuffleTbl[] = {
943  { TTI::SK_Broadcast, MVT::v4f64, 1 }, // vbroadcastpd
944  { TTI::SK_Broadcast, MVT::v8f32, 1 }, // vbroadcastps
945  { TTI::SK_Broadcast, MVT::v4i64, 1 }, // vpbroadcastq
946  { TTI::SK_Broadcast, MVT::v8i32, 1 }, // vpbroadcastd
947  { TTI::SK_Broadcast, MVT::v16i16, 1 }, // vpbroadcastw
948  { TTI::SK_Broadcast, MVT::v32i8, 1 }, // vpbroadcastb
949 
950  { TTI::SK_Reverse, MVT::v4f64, 1 }, // vpermpd
951  { TTI::SK_Reverse, MVT::v8f32, 1 }, // vpermps
952  { TTI::SK_Reverse, MVT::v4i64, 1 }, // vpermq
953  { TTI::SK_Reverse, MVT::v8i32, 1 }, // vpermd
954  { TTI::SK_Reverse, MVT::v16i16, 2 }, // vperm2i128 + pshufb
955  { TTI::SK_Reverse, MVT::v32i8, 2 }, // vperm2i128 + pshufb
956 
957  { TTI::SK_Select, MVT::v16i16, 1 }, // vpblendvb
958  { TTI::SK_Select, MVT::v32i8, 1 }, // vpblendvb
959 
960  { TTI::SK_PermuteSingleSrc, MVT::v4f64, 1 }, // vpermpd
961  { TTI::SK_PermuteSingleSrc, MVT::v8f32, 1 }, // vpermps
962  { TTI::SK_PermuteSingleSrc, MVT::v4i64, 1 }, // vpermq
963  { TTI::SK_PermuteSingleSrc, MVT::v8i32, 1 }, // vpermd
964  { TTI::SK_PermuteSingleSrc, MVT::v16i16, 4 }, // vperm2i128 + 2*vpshufb
965  // + vpblendvb
966  { TTI::SK_PermuteSingleSrc, MVT::v32i8, 4 }, // vperm2i128 + 2*vpshufb
967  // + vpblendvb
968 
969  { TTI::SK_PermuteTwoSrc, MVT::v4f64, 3 }, // 2*vpermpd + vblendpd
970  { TTI::SK_PermuteTwoSrc, MVT::v8f32, 3 }, // 2*vpermps + vblendps
971  { TTI::SK_PermuteTwoSrc, MVT::v4i64, 3 }, // 2*vpermq + vpblendd
972  { TTI::SK_PermuteTwoSrc, MVT::v8i32, 3 }, // 2*vpermd + vpblendd
973  { TTI::SK_PermuteTwoSrc, MVT::v16i16, 7 }, // 2*vperm2i128 + 4*vpshufb
974  // + vpblendvb
975  { TTI::SK_PermuteTwoSrc, MVT::v32i8, 7 }, // 2*vperm2i128 + 4*vpshufb
976  // + vpblendvb
977  };
978 
979  if (ST->hasAVX2())
980  if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
981  return LT.first * Entry->Cost;
982 
983  static const CostTblEntry XOPShuffleTbl[] = {
984  { TTI::SK_PermuteSingleSrc, MVT::v4f64, 2 }, // vperm2f128 + vpermil2pd
985  { TTI::SK_PermuteSingleSrc, MVT::v8f32, 2 }, // vperm2f128 + vpermil2ps
986  { TTI::SK_PermuteSingleSrc, MVT::v4i64, 2 }, // vperm2f128 + vpermil2pd
987  { TTI::SK_PermuteSingleSrc, MVT::v8i32, 2 }, // vperm2f128 + vpermil2ps
988  { TTI::SK_PermuteSingleSrc, MVT::v16i16, 4 }, // vextractf128 + 2*vpperm
989  // + vinsertf128
990  { TTI::SK_PermuteSingleSrc, MVT::v32i8, 4 }, // vextractf128 + 2*vpperm
991  // + vinsertf128
992 
993  { TTI::SK_PermuteTwoSrc, MVT::v16i16, 9 }, // 2*vextractf128 + 6*vpperm
994  // + vinsertf128
995  { TTI::SK_PermuteTwoSrc, MVT::v8i16, 1 }, // vpperm
996  { TTI::SK_PermuteTwoSrc, MVT::v32i8, 9 }, // 2*vextractf128 + 6*vpperm
997  // + vinsertf128
998  { TTI::SK_PermuteTwoSrc, MVT::v16i8, 1 }, // vpperm
999  };
1000 
1001  if (ST->hasXOP())
1002  if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
1003  return LT.first * Entry->Cost;
1004 
1005  static const CostTblEntry AVX1ShuffleTbl[] = {
1006  { TTI::SK_Broadcast, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd
1007  { TTI::SK_Broadcast, MVT::v8f32, 2 }, // vperm2f128 + vpermilps
1008  { TTI::SK_Broadcast, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd
1009  { TTI::SK_Broadcast, MVT::v8i32, 2 }, // vperm2f128 + vpermilps
1010  { TTI::SK_Broadcast, MVT::v16i16, 3 }, // vpshuflw + vpshufd + vinsertf128
1011  { TTI::SK_Broadcast, MVT::v32i8, 2 }, // vpshufb + vinsertf128
1012 
1013  { TTI::SK_Reverse, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd
1014  { TTI::SK_Reverse, MVT::v8f32, 2 }, // vperm2f128 + vpermilps
1015  { TTI::SK_Reverse, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd
1016  { TTI::SK_Reverse, MVT::v8i32, 2 }, // vperm2f128 + vpermilps
1017  { TTI::SK_Reverse, MVT::v16i16, 4 }, // vextractf128 + 2*pshufb
1018  // + vinsertf128
1019  { TTI::SK_Reverse, MVT::v32i8, 4 }, // vextractf128 + 2*pshufb
1020  // + vinsertf128
1021 
1022  { TTI::SK_Select, MVT::v4i64, 1 }, // vblendpd
1023  { TTI::SK_Select, MVT::v4f64, 1 }, // vblendpd
1024  { TTI::SK_Select, MVT::v8i32, 1 }, // vblendps
1025  { TTI::SK_Select, MVT::v8f32, 1 }, // vblendps
1026  { TTI::SK_Select, MVT::v16i16, 3 }, // vpand + vpandn + vpor
1027  { TTI::SK_Select, MVT::v32i8, 3 }, // vpand + vpandn + vpor
1028 
1029  { TTI::SK_PermuteSingleSrc, MVT::v4f64, 2 }, // vperm2f128 + vshufpd
1030  { TTI::SK_PermuteSingleSrc, MVT::v4i64, 2 }, // vperm2f128 + vshufpd
1031  { TTI::SK_PermuteSingleSrc, MVT::v8f32, 4 }, // 2*vperm2f128 + 2*vshufps
1032  { TTI::SK_PermuteSingleSrc, MVT::v8i32, 4 }, // 2*vperm2f128 + 2*vshufps
1033  { TTI::SK_PermuteSingleSrc, MVT::v16i16, 8 }, // vextractf128 + 4*pshufb
1034  // + 2*por + vinsertf128
1035  { TTI::SK_PermuteSingleSrc, MVT::v32i8, 8 }, // vextractf128 + 4*pshufb
1036  // + 2*por + vinsertf128
1037 
1038  { TTI::SK_PermuteTwoSrc, MVT::v4f64, 3 }, // 2*vperm2f128 + vshufpd
1039  { TTI::SK_PermuteTwoSrc, MVT::v4i64, 3 }, // 2*vperm2f128 + vshufpd
1040  { TTI::SK_PermuteTwoSrc, MVT::v8f32, 4 }, // 2*vperm2f128 + 2*vshufps
1041  { TTI::SK_PermuteTwoSrc, MVT::v8i32, 4 }, // 2*vperm2f128 + 2*vshufps
1042  { TTI::SK_PermuteTwoSrc, MVT::v16i16, 15 }, // 2*vextractf128 + 8*pshufb
1043  // + 4*por + vinsertf128
1044  { TTI::SK_PermuteTwoSrc, MVT::v32i8, 15 }, // 2*vextractf128 + 8*pshufb
1045  // + 4*por + vinsertf128
1046  };
1047 
1048  if (ST->hasAVX())
1049  if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
1050  return LT.first * Entry->Cost;
1051 
1052  static const CostTblEntry SSE41ShuffleTbl[] = {
1053  { TTI::SK_Select, MVT::v2i64, 1 }, // pblendw
1054  { TTI::SK_Select, MVT::v2f64, 1 }, // movsd
1055  { TTI::SK_Select, MVT::v4i32, 1 }, // pblendw
1056  { TTI::SK_Select, MVT::v4f32, 1 }, // blendps
1057  { TTI::SK_Select, MVT::v8i16, 1 }, // pblendw
1058  { TTI::SK_Select, MVT::v16i8, 1 } // pblendvb
1059  };
1060 
1061  if (ST->hasSSE41())
1062  if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
1063  return LT.first * Entry->Cost;
1064 
1065  static const CostTblEntry SSSE3ShuffleTbl[] = {
1066  { TTI::SK_Broadcast, MVT::v8i16, 1 }, // pshufb
1067  { TTI::SK_Broadcast, MVT::v16i8, 1 }, // pshufb
1068 
1069  { TTI::SK_Reverse, MVT::v8i16, 1 }, // pshufb
1070  { TTI::SK_Reverse, MVT::v16i8, 1 }, // pshufb
1071 
1072  { TTI::SK_Select, MVT::v8i16, 3 }, // 2*pshufb + por
1073  { TTI::SK_Select, MVT::v16i8, 3 }, // 2*pshufb + por
1074 
1075  { TTI::SK_PermuteSingleSrc, MVT::v8i16, 1 }, // pshufb
1076  { TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 }, // pshufb
1077 
1078  { TTI::SK_PermuteTwoSrc, MVT::v8i16, 3 }, // 2*pshufb + por
1079  { TTI::SK_PermuteTwoSrc, MVT::v16i8, 3 }, // 2*pshufb + por
1080  };
1081 
1082  if (ST->hasSSSE3())
1083  if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
1084  return LT.first * Entry->Cost;
1085 
1086  static const CostTblEntry SSE2ShuffleTbl[] = {
1087  { TTI::SK_Broadcast, MVT::v2f64, 1 }, // shufpd
1088  { TTI::SK_Broadcast, MVT::v2i64, 1 }, // pshufd
1089  { TTI::SK_Broadcast, MVT::v4i32, 1 }, // pshufd
1090  { TTI::SK_Broadcast, MVT::v8i16, 2 }, // pshuflw + pshufd
1091  { TTI::SK_Broadcast, MVT::v16i8, 3 }, // unpck + pshuflw + pshufd
1092 
1093  { TTI::SK_Reverse, MVT::v2f64, 1 }, // shufpd
1094  { TTI::SK_Reverse, MVT::v2i64, 1 }, // pshufd
1095  { TTI::SK_Reverse, MVT::v4i32, 1 }, // pshufd
1096  { TTI::SK_Reverse, MVT::v8i16, 3 }, // pshuflw + pshufhw + pshufd
1097  { TTI::SK_Reverse, MVT::v16i8, 9 }, // 2*pshuflw + 2*pshufhw
1098  // + 2*pshufd + 2*unpck + packus
1099 
1100  { TTI::SK_Select, MVT::v2i64, 1 }, // movsd
1101  { TTI::SK_Select, MVT::v2f64, 1 }, // movsd
1102  { TTI::SK_Select, MVT::v4i32, 2 }, // 2*shufps
1103  { TTI::SK_Select, MVT::v8i16, 3 }, // pand + pandn + por
1104  { TTI::SK_Select, MVT::v16i8, 3 }, // pand + pandn + por
1105 
1106  { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // shufpd
1107  { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // pshufd
1108  { TTI::SK_PermuteSingleSrc, MVT::v4i32, 1 }, // pshufd
1109  { TTI::SK_PermuteSingleSrc, MVT::v8i16, 5 }, // 2*pshuflw + 2*pshufhw
1110  // + pshufd/unpck
1111  { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
1112  // + 2*pshufd + 2*unpck + 2*packus
1113 
1114  { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd
1115  { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd
1116  { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd}
1117  { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute
1118  { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
1119  };
1120 
1121  if (ST->hasSSE2())
1122  if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
1123  return LT.first * Entry->Cost;
1124 
1125  static const CostTblEntry SSE1ShuffleTbl[] = {
1126  { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
1127  { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
1128  { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps
1129  { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
1130  { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
1131  };
1132 
1133  if (ST->hasSSE1())
1134  if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
1135  return LT.first * Entry->Cost;
1136 
1137  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
1138 }
1139 
1140 int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
1141  const Instruction *I) {
1142  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1143  assert(ISD && "Invalid opcode");
1144 
1145  // FIXME: Need a better design of the cost table to handle non-simple types of
1146  // potential massive combinations (elem_num x src_type x dst_type).
1147 
1148  static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
1155 
1162 
1169 
1176  };
1177 
1178  // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
1179  // 256-bit wide vectors.
1180 
1181  static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
1185 
1190 
1191  // v16i1 -> v16i32 - load + broadcast
1202 
1213 
1237 
1247  };
1248 
1249  static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
1266 
1273 
1276 
1278  };
1279 
1280  static const TypeConversionCostTblEntry AVXConversionTbl[] = {
1297 
1305 
1318 
1332  // The generic code to compute the scalar overhead is currently broken.
1333  // Workaround this limitation by estimating the scalarization overhead
1334  // here. We have roughly 10 instructions per scalar element.
1335  // Multiply that by the vector width.
1336  // FIXME: remove that when PR19268 is fixed.
1341 
1344  // This node is expanded into scalarized operations but BasicTTI is overly
1345  // optimistic estimating its cost. It computes 3 per element (one
1346  // vector-extract, one scalar conversion and one vector-insert). The
1347  // problem is that the inserts form a read-modify-write chain so latency
1348  // should be factored in too. Inflating the cost per element by 1.
1351 
1354  };
1355 
1356  static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
1363 
1382 
1390 
1391  };
1392 
1393  static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
1394  // These are somewhat magic numbers justified by looking at the output of
1395  // Intel's IACA, running some kernels and making sure when we take
1396  // legalization into account the throughput will be overestimated.
1398  { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
1405 
1406  { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
1414 
1416 
1441 
1451  };
1452 
1453  std::pair<int, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
1454  std::pair<int, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst);
1455 
1456  if (ST->hasSSE2() && !ST->hasAVX()) {
1457  if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
1458  LTDest.second, LTSrc.second))
1459  return LTSrc.first * Entry->Cost;
1460  }
1461 
1462  EVT SrcTy = TLI->getValueType(DL, Src);
1463  EVT DstTy = TLI->getValueType(DL, Dst);
1464 
1465  // The function getSimpleVT only handles simple value types.
1466  if (!SrcTy.isSimple() || !DstTy.isSimple())
1467  return BaseT::getCastInstrCost(Opcode, Dst, Src);
1468 
1469  if (ST->hasDQI())
1470  if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD,
1471  DstTy.getSimpleVT(),
1472  SrcTy.getSimpleVT()))
1473  return Entry->Cost;
1474 
1475  if (ST->hasAVX512())
1476  if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD,
1477  DstTy.getSimpleVT(),
1478  SrcTy.getSimpleVT()))
1479  return Entry->Cost;
1480 
1481  if (ST->hasAVX2()) {
1482  if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
1483  DstTy.getSimpleVT(),
1484  SrcTy.getSimpleVT()))
1485  return Entry->Cost;
1486  }
1487 
1488  if (ST->hasAVX()) {
1489  if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
1490  DstTy.getSimpleVT(),
1491  SrcTy.getSimpleVT()))
1492  return Entry->Cost;
1493  }
1494 
1495  if (ST->hasSSE41()) {
1496  if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
1497  DstTy.getSimpleVT(),
1498  SrcTy.getSimpleVT()))
1499  return Entry->Cost;
1500  }
1501 
1502  if (ST->hasSSE2()) {
1503  if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
1504  DstTy.getSimpleVT(),
1505  SrcTy.getSimpleVT()))
1506  return Entry->Cost;
1507  }
1508 
1509  return BaseT::getCastInstrCost(Opcode, Dst, Src, I);
1510 }
1511 
1512 int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
1513  const Instruction *I) {
1514  // Legalize the type.
1515  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
1516 
1517  MVT MTy = LT.second;
1518 
1519  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1520  assert(ISD && "Invalid opcode");
1521 
1522  static const CostTblEntry SSE2CostTbl[] = {
1523  { ISD::SETCC, MVT::v2i64, 8 },
1524  { ISD::SETCC, MVT::v4i32, 1 },
1525  { ISD::SETCC, MVT::v8i16, 1 },
1526  { ISD::SETCC, MVT::v16i8, 1 },
1527  };
1528 
1529  static const CostTblEntry SSE42CostTbl[] = {
1530  { ISD::SETCC, MVT::v2f64, 1 },
1531  { ISD::SETCC, MVT::v4f32, 1 },
1532  { ISD::SETCC, MVT::v2i64, 1 },
1533  };
1534 
1535  static const CostTblEntry AVX1CostTbl[] = {
1536  { ISD::SETCC, MVT::v4f64, 1 },
1537  { ISD::SETCC, MVT::v8f32, 1 },
1538  // AVX1 does not support 8-wide integer compare.
1539  { ISD::SETCC, MVT::v4i64, 4 },
1540  { ISD::SETCC, MVT::v8i32, 4 },
1541  { ISD::SETCC, MVT::v16i16, 4 },
1542  { ISD::SETCC, MVT::v32i8, 4 },
1543  };
1544 
1545  static const CostTblEntry AVX2CostTbl[] = {
1546  { ISD::SETCC, MVT::v4i64, 1 },
1547  { ISD::SETCC, MVT::v8i32, 1 },
1548  { ISD::SETCC, MVT::v16i16, 1 },
1549  { ISD::SETCC, MVT::v32i8, 1 },
1550  };
1551 
1552  static const CostTblEntry AVX512CostTbl[] = {
1553  { ISD::SETCC, MVT::v8i64, 1 },
1554  { ISD::SETCC, MVT::v16i32, 1 },
1555  { ISD::SETCC, MVT::v8f64, 1 },
1556  { ISD::SETCC, MVT::v16f32, 1 },
1557  };
1558 
1559  static const CostTblEntry AVX512BWCostTbl[] = {
1560  { ISD::SETCC, MVT::v32i16, 1 },
1561  { ISD::SETCC, MVT::v64i8, 1 },
1562  };
1563 
1564  if (ST->hasBWI())
1565  if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
1566  return LT.first * Entry->Cost;
1567 
1568  if (ST->hasAVX512())
1569  if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
1570  return LT.first * Entry->Cost;
1571 
1572  if (ST->hasAVX2())
1573  if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
1574  return LT.first * Entry->Cost;
1575 
1576  if (ST->hasAVX())
1577  if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
1578  return LT.first * Entry->Cost;
1579 
1580  if (ST->hasSSE42())
1581  if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
1582  return LT.first * Entry->Cost;
1583 
1584  if (ST->hasSSE2())
1585  if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
1586  return LT.first * Entry->Cost;
1587 
1588  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
1589 }
1590 
1592 
1593 int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
1594                                       ArrayRef<Type *> Tys, FastMathFlags FMF,
1595                                       unsigned ScalarizationCostPassed) {
1596  // Costs should match the codegen from:
1597  // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
1598  // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
1599  // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
1600  // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
1601  // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
1602  static const CostTblEntry AVX512CDCostTbl[] = {
1603  { ISD::CTLZ, MVT::v8i64, 1 },
1604  { ISD::CTLZ, MVT::v16i32, 1 },
1605  { ISD::CTLZ, MVT::v32i16, 8 },
1606  { ISD::CTLZ, MVT::v64i8, 20 },
1607  { ISD::CTLZ, MVT::v4i64, 1 },
1608  { ISD::CTLZ, MVT::v8i32, 1 },
1609  { ISD::CTLZ, MVT::v16i16, 4 },
1610  { ISD::CTLZ, MVT::v32i8, 10 },
1611  { ISD::CTLZ, MVT::v2i64, 1 },
1612  { ISD::CTLZ, MVT::v4i32, 1 },
1613  { ISD::CTLZ, MVT::v8i16, 4 },
1614  { ISD::CTLZ, MVT::v16i8, 4 },
1615  };
1616  static const CostTblEntry AVX512BWCostTbl[] = {
1617  { ISD::BITREVERSE, MVT::v8i64, 5 },
1618  { ISD::BITREVERSE, MVT::v16i32, 5 },
1619  { ISD::BITREVERSE, MVT::v32i16, 5 },
1620  { ISD::BITREVERSE, MVT::v64i8, 5 },
1621  { ISD::CTLZ, MVT::v8i64, 23 },
1622  { ISD::CTLZ, MVT::v16i32, 22 },
1623  { ISD::CTLZ, MVT::v32i16, 18 },
1624  { ISD::CTLZ, MVT::v64i8, 17 },
1625  { ISD::CTPOP, MVT::v8i64, 7 },
1626  { ISD::CTPOP, MVT::v16i32, 11 },
1627  { ISD::CTPOP, MVT::v32i16, 9 },
1628  { ISD::CTPOP, MVT::v64i8, 6 },
1629  { ISD::CTTZ, MVT::v8i64, 10 },
1630  { ISD::CTTZ, MVT::v16i32, 14 },
1631  { ISD::CTTZ, MVT::v32i16, 12 },
1632  { ISD::CTTZ, MVT::v64i8, 9 },
1633  };
1634  static const CostTblEntry AVX512CostTbl[] = {
1635  { ISD::BITREVERSE, MVT::v8i64, 36 },
1636  { ISD::BITREVERSE, MVT::v16i32, 24 },
1637  { ISD::CTLZ, MVT::v8i64, 29 },
1638  { ISD::CTLZ, MVT::v16i32, 35 },
1639  { ISD::CTPOP, MVT::v8i64, 16 },
1640  { ISD::CTPOP, MVT::v16i32, 24 },
1641  { ISD::CTTZ, MVT::v8i64, 20 },
1642  { ISD::CTTZ, MVT::v16i32, 28 },
1643  };
1644  static const CostTblEntry XOPCostTbl[] = {
1645  { ISD::BITREVERSE, MVT::v4i64, 4 },
1646  { ISD::BITREVERSE, MVT::v8i32, 4 },
1647  { ISD::BITREVERSE, MVT::v16i16, 4 },
1648  { ISD::BITREVERSE, MVT::v32i8, 4 },
1649  { ISD::BITREVERSE, MVT::v2i64, 1 },
1650  { ISD::BITREVERSE, MVT::v4i32, 1 },
1651  { ISD::BITREVERSE, MVT::v8i16, 1 },
1652  { ISD::BITREVERSE, MVT::v16i8, 1 },
1653  { ISD::BITREVERSE, MVT::i64, 3 },
1654  { ISD::BITREVERSE, MVT::i32, 3 },
1655  { ISD::BITREVERSE, MVT::i16, 3 },
1656  { ISD::BITREVERSE, MVT::i8, 3 }
1657  };
1658  static const CostTblEntry AVX2CostTbl[] = {
1659  { ISD::BITREVERSE, MVT::v4i64, 5 },
1660  { ISD::BITREVERSE, MVT::v8i32, 5 },
1661  { ISD::BITREVERSE, MVT::v16i16, 5 },
1662  { ISD::BITREVERSE, MVT::v32i8, 5 },
1663  { ISD::BSWAP, MVT::v4i64, 1 },
1664  { ISD::BSWAP, MVT::v8i32, 1 },
1665  { ISD::BSWAP, MVT::v16i16, 1 },
1666  { ISD::CTLZ, MVT::v4i64, 23 },
1667  { ISD::CTLZ, MVT::v8i32, 18 },
1668  { ISD::CTLZ, MVT::v16i16, 14 },
1669  { ISD::CTLZ, MVT::v32i8, 9 },
1670  { ISD::CTPOP, MVT::v4i64, 7 },
1671  { ISD::CTPOP, MVT::v8i32, 11 },
1672  { ISD::CTPOP, MVT::v16i16, 9 },
1673  { ISD::CTPOP, MVT::v32i8, 6 },
1674  { ISD::CTTZ, MVT::v4i64, 10 },
1675  { ISD::CTTZ, MVT::v8i32, 14 },
1676  { ISD::CTTZ, MVT::v16i16, 12 },
1677  { ISD::CTTZ, MVT::v32i8, 9 },
1678  { ISD::FSQRT, MVT::f32, 7 }, // Haswell from http://www.agner.org/
1679  { ISD::FSQRT, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
1680  { ISD::FSQRT, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
1681  { ISD::FSQRT, MVT::f64, 14 }, // Haswell from http://www.agner.org/
1682  { ISD::FSQRT, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
1683  { ISD::FSQRT, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
1684  };
1685  static const CostTblEntry AVX1CostTbl[] = {
1686  { ISD::BITREVERSE, MVT::v4i64, 12 }, // 2 x 128-bit Op + extract/insert
1687  { ISD::BITREVERSE, MVT::v8i32, 12 }, // 2 x 128-bit Op + extract/insert
1688  { ISD::BITREVERSE, MVT::v16i16, 12 }, // 2 x 128-bit Op + extract/insert
1689  { ISD::BITREVERSE, MVT::v32i8, 12 }, // 2 x 128-bit Op + extract/insert
1690  { ISD::BSWAP, MVT::v4i64, 4 },
1691  { ISD::BSWAP, MVT::v8i32, 4 },
1692  { ISD::BSWAP, MVT::v16i16, 4 },
1693  { ISD::CTLZ, MVT::v4i64, 48 }, // 2 x 128-bit Op + extract/insert
1694  { ISD::CTLZ, MVT::v8i32, 38 }, // 2 x 128-bit Op + extract/insert
1695  { ISD::CTLZ, MVT::v16i16, 30 }, // 2 x 128-bit Op + extract/insert
1696  { ISD::CTLZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
1697  { ISD::CTPOP, MVT::v4i64, 16 }, // 2 x 128-bit Op + extract/insert
1698  { ISD::CTPOP, MVT::v8i32, 24 }, // 2 x 128-bit Op + extract/insert
1699  { ISD::CTPOP, MVT::v16i16, 20 }, // 2 x 128-bit Op + extract/insert
1700  { ISD::CTPOP, MVT::v32i8, 14 }, // 2 x 128-bit Op + extract/insert
1701  { ISD::CTTZ, MVT::v4i64, 22 }, // 2 x 128-bit Op + extract/insert
1702  { ISD::CTTZ, MVT::v8i32, 30 }, // 2 x 128-bit Op + extract/insert
1703  { ISD::CTTZ, MVT::v16i16, 26 }, // 2 x 128-bit Op + extract/insert
1704  { ISD::CTTZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
1705  { ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/
1706  { ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
1707  { ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
1708  { ISD::FSQRT, MVT::f64, 21 }, // SNB from http://www.agner.org/
1709  { ISD::FSQRT, MVT::v2f64, 21 }, // SNB from http://www.agner.org/
1710  { ISD::FSQRT, MVT::v4f64, 43 }, // SNB from http://www.agner.org/
1711  };
1712  static const CostTblEntry GLMCostTbl[] = {
1713  { ISD::FSQRT, MVT::f32, 19 }, // sqrtss
1714  { ISD::FSQRT, MVT::v4f32, 37 }, // sqrtps
1715  { ISD::FSQRT, MVT::f64, 34 }, // sqrtsd
1716  { ISD::FSQRT, MVT::v2f64, 67 }, // sqrtpd
1717  };
1718  static const CostTblEntry SLMCostTbl[] = {
1719  { ISD::FSQRT, MVT::f32, 20 }, // sqrtss
1720  { ISD::FSQRT, MVT::v4f32, 40 }, // sqrtps
1721  { ISD::FSQRT, MVT::f64, 35 }, // sqrtsd
1722  { ISD::FSQRT, MVT::v2f64, 70 }, // sqrtpd
1723  };
1724  static const CostTblEntry SSE42CostTbl[] = {
1725  { ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/
1726  { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
1727  };
1728  static const CostTblEntry SSSE3CostTbl[] = {
1729  { ISD::BITREVERSE, MVT::v2i64, 5 },
1730  { ISD::BITREVERSE, MVT::v4i32, 5 },
1731  { ISD::BITREVERSE, MVT::v8i16, 5 },
1732  { ISD::BITREVERSE, MVT::v16i8, 5 },
1733  { ISD::BSWAP, MVT::v2i64, 1 },
1734  { ISD::BSWAP, MVT::v4i32, 1 },
1735  { ISD::BSWAP, MVT::v8i16, 1 },
1736  { ISD::CTLZ, MVT::v2i64, 23 },
1737  { ISD::CTLZ, MVT::v4i32, 18 },
1738  { ISD::CTLZ, MVT::v8i16, 14 },
1739  { ISD::CTLZ, MVT::v16i8, 9 },
1740  { ISD::CTPOP, MVT::v2i64, 7 },
1741  { ISD::CTPOP, MVT::v4i32, 11 },
1742  { ISD::CTPOP, MVT::v8i16, 9 },
1743  { ISD::CTPOP, MVT::v16i8, 6 },
1744  { ISD::CTTZ, MVT::v2i64, 10 },
1745  { ISD::CTTZ, MVT::v4i32, 14 },
1746  { ISD::CTTZ, MVT::v8i16, 12 },
1747  { ISD::CTTZ, MVT::v16i8, 9 }
1748  };
1749  static const CostTblEntry SSE2CostTbl[] = {
1750  { ISD::BITREVERSE, MVT::v2i64, 29 },
1751  { ISD::BITREVERSE, MVT::v4i32, 27 },
1752  { ISD::BITREVERSE, MVT::v8i16, 27 },
1753  { ISD::BITREVERSE, MVT::v16i8, 20 },
1754  { ISD::BSWAP, MVT::v2i64, 7 },
1755  { ISD::BSWAP, MVT::v4i32, 7 },
1756  { ISD::BSWAP, MVT::v8i16, 7 },
1757  { ISD::CTLZ, MVT::v2i64, 25 },
1758  { ISD::CTLZ, MVT::v4i32, 26 },
1759  { ISD::CTLZ, MVT::v8i16, 20 },
1760  { ISD::CTLZ, MVT::v16i8, 17 },
1761  { ISD::CTPOP, MVT::v2i64, 12 },
1762  { ISD::CTPOP, MVT::v4i32, 15 },
1763  { ISD::CTPOP, MVT::v8i16, 13 },
1764  { ISD::CTPOP, MVT::v16i8, 10 },
1765  { ISD::CTTZ, MVT::v2i64, 14 },
1766  { ISD::CTTZ, MVT::v4i32, 18 },
1767  { ISD::CTTZ, MVT::v8i16, 16 },
1768  { ISD::CTTZ, MVT::v16i8, 13 },
1769  { ISD::FSQRT, MVT::f64, 32 }, // Nehalem from http://www.agner.org/
1770  { ISD::FSQRT, MVT::v2f64, 32 }, // Nehalem from http://www.agner.org/
1771  };
1772  static const CostTblEntry SSE1CostTbl[] = {
1773  { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/
1774  { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
1775  };
1776  static const CostTblEntry X64CostTbl[] = { // 64-bit targets
1777  { ISD::BITREVERSE, MVT::i64, 14 }
1778  };
1779  static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
1780  { ISD::BITREVERSE, MVT::i32, 14 },
1781  { ISD::BITREVERSE, MVT::i16, 14 },
1782  { ISD::BITREVERSE, MVT::i8, 11 }
1783  };
1784 
1785  unsigned ISD = ISD::DELETED_NODE;
1786  switch (IID) {
1787  default:
1788  break;
1789  case Intrinsic::bitreverse:
1790  ISD = ISD::BITREVERSE;
1791  break;
1792  case Intrinsic::bswap:
1793  ISD = ISD::BSWAP;
1794  break;
1795  case Intrinsic::ctlz:
1796  ISD = ISD::CTLZ;
1797  break;
1798  case Intrinsic::ctpop:
1799  ISD = ISD::CTPOP;
1800  break;
1801  case Intrinsic::cttz:
1802  ISD = ISD::CTTZ;
1803  break;
1804  case Intrinsic::sqrt:
1805  ISD = ISD::FSQRT;
1806  break;
1807  }
1808 
1809  // Legalize the type.
1810  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
1811  MVT MTy = LT.second;
1812 
1813  // Attempt to lookup cost.
1814  if (ST->isGLM())
1815  if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
1816  return LT.first * Entry->Cost;
1817 
1818  if (ST->isSLM())
1819  if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
1820  return LT.first * Entry->Cost;
1821 
1822  if (ST->hasCDI())
1823  if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
1824  return LT.first * Entry->Cost;
1825 
1826  if (ST->hasBWI())
1827  if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
1828  return LT.first * Entry->Cost;
1829 
1830  if (ST->hasAVX512())
1831  if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
1832  return LT.first * Entry->Cost;
1833 
1834  if (ST->hasXOP())
1835  if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
1836  return LT.first * Entry->Cost;
1837 
1838  if (ST->hasAVX2())
1839  if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
1840  return LT.first * Entry->Cost;
1841 
1842  if (ST->hasAVX())
1843  if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
1844  return LT.first * Entry->Cost;
1845 
1846  if (ST->hasSSE42())
1847  if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
1848  return LT.first * Entry->Cost;
1849 
1850  if (ST->hasSSSE3())
1851  if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
1852  return LT.first * Entry->Cost;
1853 
1854  if (ST->hasSSE2())
1855  if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
1856  return LT.first * Entry->Cost;
1857 
1858  if (ST->hasSSE1())
1859  if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
1860  return LT.first * Entry->Cost;
1861 
1862  if (ST->is64Bit())
1863  if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
1864  return LT.first * Entry->Cost;
1865 
1866  if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
1867  return LT.first * Entry->Cost;
1868 
1869  return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF, ScalarizationCostPassed);
1870 }
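The lookup above walks the tables from the most specific feature level down to the generic 32/64-bit tables and scales the matching entry by the legalization split factor. A purely illustrative, self-contained sketch of that pattern follows; the struct, table contents and fallback value are made-up stand-ins, not LLVM's CostTblEntry / CostTableLookup.

#include <optional>
#include <string>
#include <vector>

struct SketchCostEntry { int ISD; std::string Ty; int Cost; };

// Linear scan of one table; returns nothing so the caller can try the next,
// less specific table.
static std::optional<int> lookup(const std::vector<SketchCostEntry> &Tbl,
                                 int ISD, const std::string &Ty) {
  for (const auto &E : Tbl)
    if (E.ISD == ISD && E.Ty == Ty)
      return E.Cost;
  return std::nullopt;
}

// Mirrors the dispatch above: most specific subtarget table first, cost
// scaled by the number of legalized parts (LT.first in the real code).
static int intrinsicCostSketch(int ISD, const std::string &LegalTy,
                               int SplitFactor, bool HasAVX512BW, bool HasAVX2) {
  const std::vector<SketchCostEntry> AVX512BWTbl = {{/*CTPOP*/ 0, "v32i16", 9}};
  const std::vector<SketchCostEntry> AVX2Tbl     = {{/*CTPOP*/ 0, "v16i16", 9}};
  if (HasAVX512BW)
    if (auto C = lookup(AVX512BWTbl, ISD, LegalTy))
      return SplitFactor * *C;
  if (HasAVX2)
    if (auto C = lookup(AVX2Tbl, ISD, LegalTy))
      return SplitFactor * *C;
  return SplitFactor * 10; // stand-in for the BaseT::getIntrinsicInstrCost fallback
}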
1871 
1872 int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
1873  ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) {
1874  return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF, VF);
1875 }
1876 
1877 int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
1878  assert(Val->isVectorTy() && "This must be a vector type");
1879 
1880  Type *ScalarType = Val->getScalarType();
1881 
1882  if (Index != -1U) {
1883  // Legalize the type.
1884  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
1885 
1886  // This type is legalized to a scalar type.
1887  if (!LT.second.isVector())
1888  return 0;
1889 
1890  // The type may be split. Normalize the index to the new type.
1891  unsigned Width = LT.second.getVectorNumElements();
1892  Index = Index % Width;
1893 
1894  // Floating point scalars are already located in index #0.
1895  if (ScalarType->isFloatingPointTy() && Index == 0)
1896  return 0;
1897  }
1898 
1899  // Add to the base cost if we know that the extracted element of a vector is
1900  // destined to be moved to and used in the integer register file.
1901  int RegisterFileMoveCost = 0;
1902  if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy())
1903  RegisterFileMoveCost = 1;
1904 
1905  return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
1906 }
1907 
1908 int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
1909  unsigned AddressSpace, const Instruction *I) {
1910  // Handle non-power-of-two vectors such as <3 x float>
1911  if (VectorType *VTy = dyn_cast<VectorType>(Src)) {
1912  unsigned NumElem = VTy->getVectorNumElements();
1913 
1914  // Handle a few common cases:
1915  // <3 x float>
1916  if (NumElem == 3 && VTy->getScalarSizeInBits() == 32)
1917  // Cost = 64 bit store + extract + 32 bit store.
1918  return 3;
1919 
1920  // <3 x double>
1921  if (NumElem == 3 && VTy->getScalarSizeInBits() == 64)
1922  // Cost = 128 bit store + unpack + 64 bit store.
1923  return 3;
1924 
1925  // Assume that all other non-power-of-two numbers are scalarized.
1926  if (!isPowerOf2_32(NumElem)) {
1927  int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment,
1928  AddressSpace);
1929  int SplitCost = getScalarizationOverhead(Src, Opcode == Instruction::Load,
1930  Opcode == Instruction::Store);
1931  return NumElem * Cost + SplitCost;
1932  }
1933  }
1934 
1935  // Legalize the type.
1936  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
1937  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
1938  "Invalid Opcode");
1939 
1940  // Each load/store unit costs 1.
1941  int Cost = LT.first * 1;
1942 
1943  // This isn't exactly right. We're using slow unaligned 32-byte accesses as a
1944  // proxy for a double-pumped AVX memory interface such as on Sandybridge.
1945  if (LT.second.getStoreSize() == 32 && ST->isUnalignedMem32Slow())
1946  Cost *= 2;
1947 
1948  return Cost;
1949 }
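As a rough worked example of the arithmetic above (the inputs are assumed, not taken from a specific CPU): a store of <16 x float> on an AVX machine legalizes into two 256-bit halves (LT.first == 2); if the subtarget reports slow unaligned 32-byte accesses, each half is doubled, for a total cost of 4. A minimal standalone sketch:

// Sketch only; SplitFactor corresponds to LT.first, and the 32-byte check to
// LT.second.getStoreSize() == 32 && ST->isUnalignedMem32Slow() above.
static int memoryOpCostSketch(int SplitFactor, unsigned LegalStoreSizeBytes,
                              bool SlowUnalignedMem32) {
  int Cost = SplitFactor * 1;      // one load/store unit per legalized part
  if (LegalStoreSizeBytes == 32 && SlowUnalignedMem32)
    Cost *= 2;                     // proxy for a double-pumped 32-byte interface
  return Cost;
}
// memoryOpCostSketch(2, 32, true) == 4 for the <16 x float> example.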
1950 
1951 int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
1952  unsigned Alignment,
1953  unsigned AddressSpace) {
1954  VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy);
1955  if (!SrcVTy)
1956  // For a scalar access, take the regular memory op cost, without a mask.
1957  return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace);
1958 
1959  unsigned NumElem = SrcVTy->getVectorNumElements();
1960  VectorType *MaskTy =
1961  VectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
1962  if ((Opcode == Instruction::Load && !isLegalMaskedLoad(SrcVTy)) ||
1963  (Opcode == Instruction::Store && !isLegalMaskedStore(SrcVTy)) ||
1964  !isPowerOf2_32(NumElem)) {
1965  // Scalarization
1966  int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true);
1967  int ScalarCompareCost = getCmpSelInstrCost(
1968  Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr);
1969  int BranchCost = getCFInstrCost(Instruction::Br);
1970  int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
1971 
1972  int ValueSplitCost = getScalarizationOverhead(
1973  SrcVTy, Opcode == Instruction::Load, Opcode == Instruction::Store);
1974  int MemopCost =
1975  NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
1976  Alignment, AddressSpace);
1977  return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
1978  }
1979 
1980  // Legalize the type.
1981  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy);
1982  auto VT = TLI->getValueType(DL, SrcVTy);
1983  int Cost = 0;
1984  if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
1985  LT.second.getVectorNumElements() == NumElem)
1986  // Promotion requires an expand/truncate for the data and a shuffle for the mask.
1987  Cost += getShuffleCost(TTI::SK_Select, SrcVTy, 0, nullptr) +
1988  getShuffleCost(TTI::SK_Select, MaskTy, 0, nullptr);
1989 
1990  else if (LT.second.getVectorNumElements() > NumElem) {
1991  VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(),
1992  LT.second.getVectorNumElements());
1993  // Expanding requires filling the mask with zeroes.
1994  Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy);
1995  }
1996  if (!ST->hasAVX512())
1997  return Cost + LT.first*4; // Each maskmov costs 4
1998 
1999  // AVX-512 masked load/store is cheaper.
2000  return Cost + LT.first;
2001 }
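To make the scalarization branch above concrete, here is the same arithmetic as a standalone sketch; every component cost is an assumed input rather than an X86-derived number.

// Scalarized masked load/store: extract the mask bits, compare + branch per
// lane, split/assemble the data vector, and issue one scalar memory op per lane.
static int maskedMemOpScalarizedSketch(unsigned NumElem, int MaskSplitCost,
                                       int ScalarCompareCost, int BranchCost,
                                       int ValueSplitCost, int ScalarMemOpCost) {
  int MaskCmpCost = static_cast<int>(NumElem) * (BranchCost + ScalarCompareCost);
  int MemopCost = static_cast<int>(NumElem) * ScalarMemOpCost;
  return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
}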
2002 
2003 int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
2004  const SCEV *Ptr) {
2005  // Address computations in vectorized code with non-consecutive addresses will
2006  // likely result in more instructions compared to scalar code where the
2007  // computation can more often be merged into the index mode. The resulting
2008  // extra micro-ops can significantly decrease throughput.
2009  unsigned NumVectorInstToHideOverhead = 10;
2010 
2011  // Cost modeling of Strided Access Computation is hidden by the indexing
2012  // modes of X86 regardless of the stride value. We don't believe that there
2013  // is a difference between constant strided access in general and a constant
2014  // stride value which is less than or equal to 64.
2015  // Even in the case of a (loop invariant) stride whose value is not known at
2016  // compile time, the address computation will not incur more than one extra
2017  // ADD instruction.
2018  if (Ty->isVectorTy() && SE) {
2019  if (!BaseT::isStridedAccess(Ptr))
2020  return NumVectorInstToHideOverhead;
2021  if (!BaseT::getConstantStrideStep(SE, Ptr))
2022  return 1;
2023  }
2024 
2025  return BaseT::getAddressComputationCost(Ty, SE, Ptr);
2026 }
2027 
2028 int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
2029  bool IsPairwise) {
2030 
2031  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
2032 
2033  MVT MTy = LT.second;
2034 
2035  int ISD = TLI->InstructionOpcodeToISD(Opcode);
2036  assert(ISD && "Invalid opcode");
2037 
2038  // We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
2039  // and use it as the cost.
2040 
2041  static const CostTblEntry SSE42CostTblPairWise[] = {
2042  { ISD::FADD, MVT::v2f64, 2 },
2043  { ISD::FADD, MVT::v4f32, 4 },
2044  { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
2045  { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5".
2046  { ISD::ADD, MVT::v8i16, 5 },
2047  };
2048 
2049  static const CostTblEntry AVX1CostTblPairWise[] = {
2050  { ISD::FADD, MVT::v4f32, 4 },
2051  { ISD::FADD, MVT::v4f64, 5 },
2052  { ISD::FADD, MVT::v8f32, 7 },
2053  { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
2054  { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5".
2055  { ISD::ADD, MVT::v4i64, 5 }, // The data reported by the IACA tool is "4.8".
2056  { ISD::ADD, MVT::v8i16, 5 },
2057  { ISD::ADD, MVT::v8i32, 5 },
2058  };
2059 
2060  static const CostTblEntry SSE42CostTblNoPairWise[] = {
2061  { ISD::FADD, MVT::v2f64, 2 },
2062  { ISD::FADD, MVT::v4f32, 4 },
2063  { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
2064  { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
2065  { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
2066  };
2067 
2068  static const CostTblEntry AVX1CostTblNoPairWise[] = {
2069  { ISD::FADD, MVT::v4f32, 3 },
2070  { ISD::FADD, MVT::v4f64, 3 },
2071  { ISD::FADD, MVT::v8f32, 4 },
2072  { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
2073  { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "2.8".
2074  { ISD::ADD, MVT::v4i64, 3 },
2075  { ISD::ADD, MVT::v8i16, 4 },
2076  { ISD::ADD, MVT::v8i32, 5 },
2077  };
2078 
2079  if (IsPairwise) {
2080  if (ST->hasAVX())
2081  if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
2082  return LT.first * Entry->Cost;
2083 
2084  if (ST->hasSSE42())
2085  if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
2086  return LT.first * Entry->Cost;
2087  } else {
2088  if (ST->hasAVX())
2089  if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
2090  return LT.first * Entry->Cost;
2091 
2092  if (ST->hasSSE42())
2093  if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
2094  return LT.first * Entry->Cost;
2095  }
2096 
2097  return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise);
2098 }
2099 
2100 int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy,
2101  bool IsPairwise, bool IsUnsigned) {
2102  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
2103 
2104  MVT MTy = LT.second;
2105 
2106  int ISD;
2107  if (ValTy->isIntOrIntVectorTy()) {
2108  ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN;
2109  } else {
2110  assert(ValTy->isFPOrFPVectorTy() &&
2111  "Expected float point or integer vector type.");
2112  ISD = ISD::FMINNUM;
2113  }
2114 
2115  // We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
2116  // and use it as the cost.
2117 
2118  static const CostTblEntry SSE42CostTblPairWise[] = {
2119  {ISD::FMINNUM, MVT::v2f64, 3},
2120  {ISD::FMINNUM, MVT::v4f32, 2},
2121  {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8"
2122  {ISD::UMIN, MVT::v2i64, 8}, // The data reported by the IACA is "8.6"
2123  {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5"
2124  {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8"
2125  {ISD::SMIN, MVT::v8i16, 2},
2126  {ISD::UMIN, MVT::v8i16, 2},
2127  };
2128 
2129  static const CostTblEntry AVX1CostTblPairWise[] = {
2130  {ISD::FMINNUM, MVT::v4f32, 1},
2131  {ISD::FMINNUM, MVT::v4f64, 1},
2132  {ISD::FMINNUM, MVT::v8f32, 2},
2133  {ISD::SMIN, MVT::v2i64, 3},
2134  {ISD::UMIN, MVT::v2i64, 3},
2135  {ISD::SMIN, MVT::v4i32, 1},
2136  {ISD::UMIN, MVT::v4i32, 1},
2137  {ISD::SMIN, MVT::v8i16, 1},
2138  {ISD::UMIN, MVT::v8i16, 1},
2139  {ISD::SMIN, MVT::v8i32, 3},
2140  {ISD::UMIN, MVT::v8i32, 3},
2141  };
2142 
2143  static const CostTblEntry AVX2CostTblPairWise[] = {
2144  {ISD::SMIN, MVT::v4i64, 2},
2145  {ISD::UMIN, MVT::v4i64, 2},
2146  {ISD::SMIN, MVT::v8i32, 1},
2147  {ISD::UMIN, MVT::v8i32, 1},
2148  {ISD::SMIN, MVT::v16i16, 1},
2149  {ISD::UMIN, MVT::v16i16, 1},
2150  {ISD::SMIN, MVT::v32i8, 2},
2151  {ISD::UMIN, MVT::v32i8, 2},
2152  };
2153 
2154  static const CostTblEntry AVX512CostTblPairWise[] = {
2155  {ISD::FMINNUM, MVT::v8f64, 1},
2156  {ISD::FMINNUM, MVT::v16f32, 2},
2157  {ISD::SMIN, MVT::v8i64, 2},
2158  {ISD::UMIN, MVT::v8i64, 2},
2159  {ISD::SMIN, MVT::v16i32, 1},
2160  {ISD::UMIN, MVT::v16i32, 1},
2161  };
2162 
2163  static const CostTblEntry SSE42CostTblNoPairWise[] = {
2164  {ISD::FMINNUM, MVT::v2f64, 3},
2165  {ISD::FMINNUM, MVT::v4f32, 3},
2166  {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8"
2167  {ISD::UMIN, MVT::v2i64, 9}, // The data reported by the IACA is "8.6"
2168  {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5"
2169  {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8"
2170  {ISD::SMIN, MVT::v8i16, 1}, // The data reported by the IACA is "1.5"
2171  {ISD::UMIN, MVT::v8i16, 2}, // The data reported by the IACA is "1.8"
2172  };
2173 
2174  static const CostTblEntry AVX1CostTblNoPairWise[] = {
2175  {ISD::FMINNUM, MVT::v4f32, 1},
2176  {ISD::FMINNUM, MVT::v4f64, 1},
2177  {ISD::FMINNUM, MVT::v8f32, 1},
2178  {ISD::SMIN, MVT::v2i64, 3},
2179  {ISD::UMIN, MVT::v2i64, 3},
2180  {ISD::SMIN, MVT::v4i32, 1},
2181  {ISD::UMIN, MVT::v4i32, 1},
2182  {ISD::SMIN, MVT::v8i16, 1},
2183  {ISD::UMIN, MVT::v8i16, 1},
2184  {ISD::SMIN, MVT::v8i32, 2},
2185  {ISD::UMIN, MVT::v8i32, 2},
2186  };
2187 
2188  static const CostTblEntry AVX2CostTblNoPairWise[] = {
2189  {ISD::SMIN, MVT::v4i64, 1},
2190  {ISD::UMIN, MVT::v4i64, 1},
2191  {ISD::SMIN, MVT::v8i32, 1},
2192  {ISD::UMIN, MVT::v8i32, 1},
2193  {ISD::SMIN, MVT::v16i16, 1},
2194  {ISD::UMIN, MVT::v16i16, 1},
2195  {ISD::SMIN, MVT::v32i8, 1},
2196  {ISD::UMIN, MVT::v32i8, 1},
2197  };
2198 
2199  static const CostTblEntry AVX512CostTblNoPairWise[] = {
2200  {ISD::FMINNUM, MVT::v8f64, 1},
2201  {ISD::FMINNUM, MVT::v16f32, 2},
2202  {ISD::SMIN, MVT::v8i64, 1},
2203  {ISD::UMIN, MVT::v8i64, 1},
2204  {ISD::SMIN, MVT::v16i32, 1},
2205  {ISD::UMIN, MVT::v16i32, 1},
2206  };
2207 
2208  if (IsPairwise) {
2209  if (ST->hasAVX512())
2210  if (const auto *Entry = CostTableLookup(AVX512CostTblPairWise, ISD, MTy))
2211  return LT.first * Entry->Cost;
2212 
2213  if (ST->hasAVX2())
2214  if (const auto *Entry = CostTableLookup(AVX2CostTblPairWise, ISD, MTy))
2215  return LT.first * Entry->Cost;
2216 
2217  if (ST->hasAVX())
2218  if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
2219  return LT.first * Entry->Cost;
2220 
2221  if (ST->hasSSE42())
2222  if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
2223  return LT.first * Entry->Cost;
2224  } else {
2225  if (ST->hasAVX512())
2226  if (const auto *Entry =
2227  CostTableLookup(AVX512CostTblNoPairWise, ISD, MTy))
2228  return LT.first * Entry->Cost;
2229 
2230  if (ST->hasAVX2())
2231  if (const auto *Entry = CostTableLookup(AVX2CostTblNoPairWise, ISD, MTy))
2232  return LT.first * Entry->Cost;
2233 
2234  if (ST->hasAVX())
2235  if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
2236  return LT.first * Entry->Cost;
2237 
2238  if (ST->hasSSE42())
2239  if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
2240  return LT.first * Entry->Cost;
2241  }
2242 
2243  return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned);
2244 }
2245 
2246 /// Calculate the cost of materializing a 64-bit value. This helper
2247 /// method might only calculate a fraction of a larger immediate. Therefore it
2248 /// is valid to return a cost of ZERO.
2249 int X86TTIImpl::getIntImmCost(int64_t Val) {
2250  if (Val == 0)
2251  return TTI::TCC_Free;
2252 
2253  if (isInt<32>(Val))
2254  return TTI::TCC_Basic;
2255 
2256  return 2 * TTI::TCC_Basic;
2257 }
2258 
2259 int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
2260  assert(Ty->isIntegerTy());
2261 
2262  unsigned BitSize = Ty->getPrimitiveSizeInBits();
2263  if (BitSize == 0)
2264  return ~0U;
2265 
2266  // Never hoist constants larger than 128 bits, because this might lead to
2267  // incorrect code generation or assertions in codegen.
2268  // FIXME: Create a cost model for types larger than i128 once the codegen
2269  // issues have been fixed.
2270  if (BitSize > 128)
2271  return TTI::TCC_Free;
2272 
2273  if (Imm == 0)
2274  return TTI::TCC_Free;
2275 
2276  // Sign-extend all constants to a multiple of 64-bit.
2277  APInt ImmVal = Imm;
2278  if (BitSize % 64 != 0)
2279  ImmVal = Imm.sext(alignTo(BitSize, 64));
2280 
2281  // Split the constant into 64-bit chunks and calculate the cost for each
2282  // chunk.
2283  int Cost = 0;
2284  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
2285  APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
2286  int64_t Val = Tmp.getSExtValue();
2287  Cost += getIntImmCost(Val);
2288  }
2289  // We need at least one instruction to materialize the constant.
2290  return std::max(1, Cost);
2291 }
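A self-contained sketch of the chunking logic above, using plain 64-bit integers in place of APInt; the three cost buckets mirror getIntImmCost(int64_t): free for zero, one basic unit for a value that fits in a sign-extended 32-bit immediate, two otherwise.

#include <algorithm>
#include <cstdint>
#include <vector>

static int chunkCostSketch(int64_t Val) {
  if (Val == 0)
    return 0;                         // TCC_Free
  if (Val >= INT32_MIN && Val <= INT32_MAX)
    return 1;                         // TCC_Basic
  return 2;                           // 2 * TCC_Basic
}

// Price an immediate given as sign-extended 64-bit chunks, low part first.
static int immCostSketch(const std::vector<int64_t> &Chunks) {
  int Cost = 0;
  for (int64_t C : Chunks)
    Cost += chunkCostSketch(C);
  return std::max(1, Cost);           // at least one instruction to materialize it
}
// immCostSketch({0, 1}) == 1: a 128-bit constant whose low 64 bits are zero.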
2292 
2293 int X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
2294  Type *Ty) {
2295  assert(Ty->isIntegerTy());
2296 
2297  unsigned BitSize = Ty->getPrimitiveSizeInBits();
2298  // There is no cost model for constants with a bit size of 0. Return TCC_Free
2299  // here, so that constant hoisting will ignore this constant.
2300  if (BitSize == 0)
2301  return TTI::TCC_Free;
2302 
2303  unsigned ImmIdx = ~0U;
2304  switch (Opcode) {
2305  default:
2306  return TTI::TCC_Free;
2307  case Instruction::GetElementPtr:
2308  // Always hoist the base address of a GetElementPtr. This prevents the
2309  // creation of new constants for every base constant that gets constant
2310  // folded with the offset.
2311  if (Idx == 0)
2312  return 2 * TTI::TCC_Basic;
2313  return TTI::TCC_Free;
2314  case Instruction::Store:
2315  ImmIdx = 0;
2316  break;
2317  case Instruction::ICmp:
2318  // This is an imperfect hack to prevent constant hoisting of
2319  // compares that might be trying to check if a 64-bit value fits in
2320  // 32 bits. The backend can optimize these cases using a right shift by 32.
2321  // Ideally we would check the compare predicate here. There are also other
2322  // similar immediates the backend can use shifts for.
2323  if (Idx == 1 && Imm.getBitWidth() == 64) {
2324  uint64_t ImmVal = Imm.getZExtValue();
2325  if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
2326  return TTI::TCC_Free;
2327  }
2328  ImmIdx = 1;
2329  break;
2330  case Instruction::And:
2331  // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
2332  // by using a 32-bit operation with implicit zero extension. Detect such
2333  // immediates here as the normal path expects bit 31 to be sign extended.
2334  if (Idx == 1 && Imm.getBitWidth() == 64 && isUInt<32>(Imm.getZExtValue()))
2335  return TTI::TCC_Free;
2336  ImmIdx = 1;
2337  break;
2338  case Instruction::Add:
2339  case Instruction::Sub:
2340  // For add/sub, we can use the opposite instruction for INT32_MIN.
2341  if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.getZExtValue() == 0x80000000)
2342  return TTI::TCC_Free;
2343  ImmIdx = 1;
2344  break;
2345  case Instruction::Mul:
2346  case Instruction::UDiv:
2347  case Instruction::SDiv:
2348  case Instruction::URem:
2349  case Instruction::SRem:
2350  case Instruction::Or:
2351  case Instruction::Xor:
2352  ImmIdx = 1;
2353  break;
2354  // Always return TCC_Free for the shift value of a shift instruction.
2355  case Instruction::Shl:
2356  case Instruction::LShr:
2357  case Instruction::AShr:
2358  if (Idx == 1)
2359  return TTI::TCC_Free;
2360  break;
2361  case Instruction::Trunc:
2362  case Instruction::ZExt:
2363  case Instruction::SExt:
2364  case Instruction::IntToPtr:
2365  case Instruction::PtrToInt:
2366  case Instruction::BitCast:
2367  case Instruction::PHI:
2368  case Instruction::Call:
2369  case Instruction::Select:
2370  case Instruction::Ret:
2371  case Instruction::Load:
2372  break;
2373  }
2374 
2375  if (Idx == ImmIdx) {
2376  int NumConstants = divideCeil(BitSize, 64);
2377  int Cost = X86TTIImpl::getIntImmCost(Imm, Ty);
2378  return (Cost <= NumConstants * TTI::TCC_Basic)
2379  ? static_cast<int>(TTI::TCC_Free)
2380  : Cost;
2381  }
2382 
2383  return X86TTIImpl::getIntImmCost(Imm, Ty);
2384 }
2385 
2386 int X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
2387  Type *Ty) {
2388  assert(Ty->isIntegerTy());
2389 
2390  unsigned BitSize = Ty->getPrimitiveSizeInBits();
2391  // There is no cost model for constants with a bit size of 0. Return TCC_Free
2392  // here, so that constant hoisting will ignore this constant.
2393  if (BitSize == 0)
2394  return TTI::TCC_Free;
2395 
2396  switch (IID) {
2397  default:
2398  return TTI::TCC_Free;
2399  case Intrinsic::sadd_with_overflow:
2400  case Intrinsic::uadd_with_overflow:
2401  case Intrinsic::ssub_with_overflow:
2402  case Intrinsic::usub_with_overflow:
2403  case Intrinsic::smul_with_overflow:
2404  case Intrinsic::umul_with_overflow:
2405  if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
2406  return TTI::TCC_Free;
2407  break;
2408  case Intrinsic::experimental_stackmap:
2409  if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
2410  return TTI::TCC_Free;
2411  break;
2412  case Intrinsic::experimental_patchpoint_void:
2413  case Intrinsic::experimental_patchpoint_i64:
2414  if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
2415  return TTI::TCC_Free;
2416  break;
2417  }
2418  return X86TTIImpl::getIntImmCost(Imm, Ty);
2419 }
2420 
2421 unsigned X86TTIImpl::getUserCost(const User *U,
2422  ArrayRef<const Value *> Operands) {
2423  if (isa<StoreInst>(U)) {
2424  Value *Ptr = U->getOperand(1);
2425  // A store instruction with index and scale addressing costs 2 uops.
2426  // Check the preceding GEP to identify non-const indices.
2427  if (auto GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
2428  if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
2429  return TTI::TCC_Basic * 2;
2430  }
2431  return TTI::TCC_Basic;
2432  }
2433  return BaseT::getUserCost(U, Operands);
2434 }
2435 
2436 // Return an average cost of a Gather / Scatter instruction; may be improved later.
2437 int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
2438  unsigned Alignment, unsigned AddressSpace) {
2439 
2440  assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
2441  unsigned VF = SrcVTy->getVectorNumElements();
2442 
2443  // Try to reduce index size from 64 bit (default for GEP)
2444  // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
2445  // operation will use 16 x 64 indices, which do not fit in a zmm and need
2446  // to be split. Also check that the base pointer is the same for all lanes,
2447  // and that there's at most one variable index.
2448  auto getIndexSizeInBits = [](Value *Ptr, const DataLayout& DL) {
2449  unsigned IndexSize = DL.getPointerSizeInBits();
2450  const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
2451  if (IndexSize < 64 || !GEP)
2452  return IndexSize;
2453 
2454  unsigned NumOfVarIndices = 0;
2455  Value *Ptrs = GEP->getPointerOperand();
2456  if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
2457  return IndexSize;
2458  for (unsigned i = 1; i < GEP->getNumOperands(); ++i) {
2459  if (isa<Constant>(GEP->getOperand(i)))
2460  continue;
2461  Type *IndxTy = GEP->getOperand(i)->getType();
2462  if (IndxTy->isVectorTy())
2463  IndxTy = IndxTy->getVectorElementType();
2464  if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
2465  !isa<SExtInst>(GEP->getOperand(i))) ||
2466  ++NumOfVarIndices > 1)
2467  return IndexSize; // 64
2468  }
2469  return (unsigned)32;
2470  };
2471 
2472 
2473  // Trying to reduce IndexSize to 32 bits for vector 16.
2474  // By default the IndexSize is equal to pointer size.
2475  unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
2476  ? getIndexSizeInBits(Ptr, DL)
2477  : DL.getPointerSizeInBits();
2478 
2479  Type *IndexVTy = VectorType::get(IntegerType::get(SrcVTy->getContext(),
2480  IndexSize), VF);
2481  std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy);
2482  std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy);
2483  int SplitFactor = std::max(IdxsLT.first, SrcLT.first);
2484  if (SplitFactor > 1) {
2485  // Handle splitting of vector of pointers
2486  Type *SplitSrcTy = VectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
2487  return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment,
2488  AddressSpace);
2489  }
2490 
2491  // The gather / scatter cost is given by Intel architects. It is a rough
2492  // number since we are looking at one instruction at a time.
2493  const int GSOverhead = (Opcode == Instruction::Load)
2494  ? ST->getGatherOverhead()
2495  : ST->getScatterOverhead();
2496  return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
2497  Alignment, AddressSpace);
2498 }
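A rough worked example under assumed numbers: a gather of <16 x float> on an AVX-512 target whose index can be narrowed to 32 bits needs no splitting, so the cost is simply the subtarget's gather overhead plus sixteen scalar-load equivalents. The sketch below captures only that final, non-split step:

// GSOverhead stands for ST->getGatherOverhead() / ST->getScatterOverhead();
// ScalarMemOpCost for getMemoryOpCost on the element type. Both are assumed inputs.
static int gsVectorCostSketch(unsigned VF, int GSOverhead, int ScalarMemOpCost) {
  return GSOverhead + static_cast<int>(VF) * ScalarMemOpCost;
}
// gsVectorCostSketch(16, 2, 1) == 18 for the example above (2 is an assumed overhead).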
2499 
2500 /// Return the cost of fully scalarizing a gather / scatter operation.
2501 ///
2502 /// Opcode - Load or Store instruction.
2503 /// SrcVTy - The type of the data vector that should be gathered or scattered.
2504 /// VariableMask - The mask is non-constant at compile time.
2505 /// Alignment - Alignment for one element.
2506 /// AddressSpace - pointer[s] address space.
2507 ///
2508 int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
2509  bool VariableMask, unsigned Alignment,
2510  unsigned AddressSpace) {
2511  unsigned VF = SrcVTy->getVectorNumElements();
2512 
2513  int MaskUnpackCost = 0;
2514  if (VariableMask) {
2515  VectorType *MaskTy =
2516  VectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
2517  MaskUnpackCost = getScalarizationOverhead(MaskTy, false, true);
2518  int ScalarCompareCost =
2519  getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()),
2520  nullptr);
2521  int BranchCost = getCFInstrCost(Instruction::Br);
2522  MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
2523  }
2524 
2525  // The cost of the scalar loads/stores.
2526  int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
2527  Alignment, AddressSpace);
2528 
2529  int InsertExtractCost = 0;
2530  if (Opcode == Instruction::Load)
2531  for (unsigned i = 0; i < VF; ++i)
2532  // Add the cost of inserting each scalar load into the vector
2533  InsertExtractCost +=
2534  getVectorInstrCost(Instruction::InsertElement, SrcVTy, i);
2535  else
2536  for (unsigned i = 0; i < VF; ++i)
2537  // Add the cost of extracting each element out of the data vector
2538  InsertExtractCost +=
2539  getVectorInstrCost(Instruction::ExtractElement, SrcVTy, i);
2540 
2541  return MemoryOpCost + MaskUnpackCost + InsertExtractCost;
2542 }
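For comparison, the fully scalarized path above adds a mask-unpack term when the mask is not known at compile time. A minimal sketch of that sum, with every component cost passed in as an assumption:

static int gsScalarCostSketch(unsigned VF, bool VariableMask, int MaskExtractCost,
                              int ScalarCompareCost, int BranchCost,
                              int ScalarMemOpCost, int InsertExtractPerLane) {
  int MaskUnpackCost = 0;
  if (VariableMask)   // extract each mask bit, compare it, and branch on it
    MaskUnpackCost = MaskExtractCost +
                     static_cast<int>(VF) * (BranchCost + ScalarCompareCost);
  int MemoryOpCost = static_cast<int>(VF) * ScalarMemOpCost;
  int InsertExtractCost = static_cast<int>(VF) * InsertExtractPerLane;
  return MemoryOpCost + MaskUnpackCost + InsertExtractCost;
}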
2543 
2544 /// Calculate the cost of Gather / Scatter operation
2545 int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
2546  Value *Ptr, bool VariableMask,
2547  unsigned Alignment) {
2548  assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
2549  unsigned VF = SrcVTy->getVectorNumElements();
2550  PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
2551  if (!PtrTy && Ptr->getType()->isVectorTy())
2552  PtrTy = dyn_cast<PointerType>(Ptr->getType()->getVectorElementType());
2553  assert(PtrTy && "Unexpected type for Ptr argument");
2554  unsigned AddressSpace = PtrTy->getAddressSpace();
2555 
2556  bool Scalarize = false;
2557  if ((Opcode == Instruction::Load && !isLegalMaskedGather(SrcVTy)) ||
2558  (Opcode == Instruction::Store && !isLegalMaskedScatter(SrcVTy)))
2559  Scalarize = true;
2560  // Gather / Scatter for vector 2 is not profitable on KNL / SKX.
2561  // A vector-4 gather/scatter instruction does not exist on KNL.
2562  // We can extend it to 8 elements, but zeroing upper bits of
2563  // the mask vector will add more instructions. Right now we give the scalar
2564  // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction
2565  // is better in the VariableMask case.
2566  if (ST->hasAVX512() && (VF == 2 || (VF == 4 && !ST->hasVLX())))
2567  Scalarize = true;
2568 
2569  if (Scalarize)
2570  return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment,
2571  AddressSpace);
2572 
2573  return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
2574 }
2575 
2576 bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
2577  TargetTransformInfo::LSRCost &C2) {
2578  // X86 specific here are "instruction number 1st priority".
2579  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
2580  C1.NumIVMuls, C1.NumBaseAdds,
2581  C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
2582  std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
2583  C2.NumIVMuls, C2.NumBaseAdds,
2584  C2.ScaleCost, C2.ImmCost, C2.SetupCost);
2585 }
2586 
2587 bool X86TTIImpl::canMacroFuseCmp() {
2588  return ST->hasMacroFusion();
2589 }
2590 
2591 bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
2592  // The backend can't handle a single element vector.
2593  if (isa<VectorType>(DataTy) && DataTy->getVectorNumElements() == 1)
2594  return false;
2595  Type *ScalarTy = DataTy->getScalarType();
2596  int DataWidth = isa<PointerType>(ScalarTy) ?
2597  DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();
2598 
2599  return ((DataWidth == 32 || DataWidth == 64) && ST->hasAVX()) ||
2600  ((DataWidth == 8 || DataWidth == 16) && ST->hasBWI());
2601 }
2602 
2603 bool X86TTIImpl::isLegalMaskedStore(Type *DataType) {
2604  return isLegalMaskedLoad(DataType);
2605 }
2606 
2607 bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
2608  // This function is called now in two cases: from the Loop Vectorizer
2609  // and from the Scalarizer.
2610  // When the Loop Vectorizer asks about legality of the feature,
2611  // the vectorization factor is not calculated yet. The Loop Vectorizer
2612  // sends a scalar type and the decision is based on the width of the
2613  // scalar element.
2614  // Later on, the cost model will estimate usage of this intrinsic based on
2615  // the vector type.
2616  // The Scalarizer asks again about legality. It sends a vector type.
2617  // In this case we can reject non-power-of-2 vectors.
2618  // We also reject single element vectors as the type legalizer can't
2619  // scalarize it.
2620  if (isa<VectorType>(DataTy)) {
2621  unsigned NumElts = DataTy->getVectorNumElements();
2622  if (NumElts == 1 || !isPowerOf2_32(NumElts))
2623  return false;
2624  }
2625  Type *ScalarTy = DataTy->getScalarType();
2626  int DataWidth = isa<PointerType>(ScalarTy) ?
2627  DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();
2628 
2629  // Some CPUs have better gather performance than others.
2630  // TODO: Remove the explicit ST->hasAVX512()? That would mean we would only
2631  // enable gather with a -march.
2632  return (DataWidth == 32 || DataWidth == 64) &&
2633  (ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2()));
2634 }
2635 
2636 bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) {
2637  // AVX2 doesn't support scatter.
2638  if (!ST->hasAVX512())
2639  return false;
2640  return isLegalMaskedGather(DataType);
2641 }
2642 
2643 bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
2644  EVT VT = TLI->getValueType(DL, DataType);
2645  return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
2646 }
2647 
2648 bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
2649  return false;
2650 }
2651 
2652 bool X86TTIImpl::areInlineCompatible(const Function *Caller,
2653  const Function *Callee) const {
2654  const TargetMachine &TM = getTLI()->getTargetMachine();
2655 
2656  // Work this as a subsetting of subtarget features.
2657  const FeatureBitset &CallerBits =
2658  TM.getSubtargetImpl(*Caller)->getFeatureBits();
2659  const FeatureBitset &CalleeBits =
2660  TM.getSubtargetImpl(*Callee)->getFeatureBits();
2661 
2662  // FIXME: This is likely too limiting as it will include subtarget features
2663  // that we might not care about for inlining, but it is conservatively
2664  // correct.
2665  return (CallerBits & CalleeBits) == CalleeBits;
2666 }
2667 
2668 const X86TTIImpl::TTI::MemCmpExpansionOptions *
2669 X86TTIImpl::enableMemCmpExpansion(bool IsZeroCmp) const {
2670  // Only enable vector loads for equality comparison.
2671  // Right now the vector version is not as fast, see #33329.
2672  static const auto ThreeWayOptions = [this]() {
2673  TTI::MemCmpExpansionOptions Options;
2674  if (ST->is64Bit()) {
2675  Options.LoadSizes.push_back(8);
2676  }
2677  Options.LoadSizes.push_back(4);
2678  Options.LoadSizes.push_back(2);
2679  Options.LoadSizes.push_back(1);
2680  return Options;
2681  }();
2682  static const auto EqZeroOptions = [this]() {
2683  TTI::MemCmpExpansionOptions Options;
2684  // TODO: enable AVX512 when the DAG is ready.
2685  // if (ST->hasAVX512()) Options.LoadSizes.push_back(64);
2686  if (ST->hasAVX2()) Options.LoadSizes.push_back(32);
2687  if (ST->hasSSE2()) Options.LoadSizes.push_back(16);
2688  if (ST->is64Bit()) {
2689  Options.LoadSizes.push_back(8);
2690  }
2691  Options.LoadSizes.push_back(4);
2692  Options.LoadSizes.push_back(2);
2693  Options.LoadSizes.push_back(1);
2694  return Options;
2695  }();
2696  return IsZeroCmp ? &EqZeroOptions : &ThreeWayOptions;
2697 }
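For example, with the equality-only options above on a 64-bit AVX2 target the allowed load sizes are {32, 16, 8, 4, 2, 1}. The sketch below shows one plausible greedy way such a list could cover a 24-byte memcmp (16 + 8); it is only meant to illustrate what LoadSizes expresses, since the actual expansion is done by the generic MemCmpExpansion logic and may differ (for instance by overlapping loads).

#include <cstddef>
#include <vector>

// Greedily cover Len bytes using the allowed load sizes, largest first.
static std::vector<unsigned>
coverWithLoadsSketch(size_t Len, const std::vector<unsigned> &LoadSizes) {
  std::vector<unsigned> Picks;
  for (unsigned Size : LoadSizes)
    while (Len >= Size) {
      Picks.push_back(Size);
      Len -= Size;
    }
  return Picks;   // e.g. Len = 24 with {32,16,8,4,2,1} yields {16, 8}
}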
2698 
2699 bool X86TTIImpl::enableInterleavedAccessVectorization() {
2700  // TODO: We expect this to be beneficial regardless of arch,
2701  // but there are currently some unexplained performance artifacts on Atom.
2702  // As a temporary solution, disable on Atom.
2703  return !(ST->isAtom());
2704 }
2705 
2706 // Get estimation for interleaved load/store operations for AVX2.
2707 // \p Factor is the interleaved-access factor (stride) - number of
2708 // (interleaved) elements in the group.
2709 // \p Indices contains the indices for a strided load: when the
2710 // interleaved load has gaps they indicate which elements are used.
2711 // If Indices is empty (or if the number of indices is equal to the size
2712 // of the interleaved-access as given in \p Factor) the access has no gaps.
2713 //
2714 // As opposed to AVX-512, AVX2 does not have generic shuffles that allow
2715 // computing the cost using a generic formula as a function of generic
2716 // shuffles. We therefore use a lookup table instead, filled according to
2717 // the instruction sequences that codegen currently generates.
2718 int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
2719  unsigned Factor,
2720  ArrayRef<unsigned> Indices,
2721  unsigned Alignment,
2722  unsigned AddressSpace) {
2723 
2724  // We currently support only fully-interleaved groups, with no gaps.
2725  // TODO: Support also strided loads (interleaved-groups with gaps).
2726  if (Indices.size() && Indices.size() != Factor)
2727  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
2728  Alignment, AddressSpace);
2729 
2730  // VecTy for interleave memop is <VF*Factor x Elt>.
2731  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
2732  // VecTy = <12 x i32>.
2733  MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
2734 
2735  // This function can be called with VecTy=<6xi128>, Factor=3, in which case
2736  // the VF=2, while v2i128 is an unsupported MVT vector type
2737  // (see MachineValueType.h::getVectorVT()).
2738  if (!LegalVT.isVector())
2739  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
2740  Alignment, AddressSpace);
2741 
2742  unsigned VF = VecTy->getVectorNumElements() / Factor;
2743  Type *ScalarTy = VecTy->getVectorElementType();
2744 
2745  // Calculate the number of memory operations (NumOfMemOps) required
2746  // to load/store the VecTy.
2747  unsigned VecTySize = DL.getTypeStoreSize(VecTy);
2748  unsigned LegalVTSize = LegalVT.getStoreSize();
2749  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
2750 
2751  // Get the cost of one memory operation.
2752  Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
2753  LegalVT.getVectorNumElements());
2754  unsigned MemOpCost =
2755  getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
2756 
2757  VectorType *VT = VectorType::get(ScalarTy, VF);
2758  EVT ETy = TLI->getValueType(DL, VT);
2759  if (!ETy.isSimple())
2760  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
2761  Alignment, AddressSpace);
2762 
2763  // TODO: Complete for other data-types and strides.
2764  // Each combination of Stride, ElementTy and VF results in a different
2765  // sequence; The cost tables are therefore accessed with:
2766  // Factor (stride) and VectorType=VFxElemType.
2767  // The Cost accounts only for the shuffle sequence;
2768  // The cost of the loads/stores is accounted for separately.
2769  //
2770  static const CostTblEntry AVX2InterleavedLoadTbl[] = {
2771  { 2, MVT::v4i64, 6 }, //(load 8i64 and) deinterleave into 2 x 4i64
2772  { 2, MVT::v4f64, 6 }, //(load 8f64 and) deinterleave into 2 x 4f64
2773 
2774  { 3, MVT::v2i8, 10 }, //(load 6i8 and) deinterleave into 3 x 2i8
2775  { 3, MVT::v4i8, 4 }, //(load 12i8 and) deinterleave into 3 x 4i8
2776  { 3, MVT::v8i8, 9 }, //(load 24i8 and) deinterleave into 3 x 8i8
2777  { 3, MVT::v16i8, 11}, //(load 48i8 and) deinterleave into 3 x 16i8
2778  { 3, MVT::v32i8, 13}, //(load 96i8 and) deinterleave into 3 x 32i8
2779  { 3, MVT::v8f32, 17 }, //(load 24f32 and) deinterleave into 3 x 8f32
2780 
2781  { 4, MVT::v2i8, 12 }, //(load 8i8 and) deinterleave into 4 x 2i8
2782  { 4, MVT::v4i8, 4 }, //(load 16i8 and) deinterleave into 4 x 4i8
2783  { 4, MVT::v8i8, 20 }, //(load 32i8 and) deinterleave into 4 x 8i8
2784  { 4, MVT::v16i8, 39 }, //(load 64i8 and) deinterleave into 4 x 16i8
2785  { 4, MVT::v32i8, 80 }, //(load 128i8 and) deinterleave into 4 x 32i8
2786 
2787  { 8, MVT::v8f32, 40 } //(load 64f32 and) deinterleave into 8 x 8f32
2788  };
2789 
2790  static const CostTblEntry AVX2InterleavedStoreTbl[] = {
2791  { 2, MVT::v4i64, 6 }, //interleave 2 x 4i64 into 8i64 (and store)
2792  { 2, MVT::v4f64, 6 }, //interleave 2 x 4f64 into 8f64 (and store)
2793 
2794  { 3, MVT::v2i8, 7 }, //interleave 3 x 2i8 into 6i8 (and store)
2795  { 3, MVT::v4i8, 8 }, //interleave 3 x 4i8 into 12i8 (and store)
2796  { 3, MVT::v8i8, 11 }, //interleave 3 x 8i8 into 24i8 (and store)
2797  { 3, MVT::v16i8, 11 }, //interleave 3 x 16i8 into 48i8 (and store)
2798  { 3, MVT::v32i8, 13 }, //interleave 3 x 32i8 into 96i8 (and store)
2799 
2800  { 4, MVT::v2i8, 12 }, //interleave 4 x 2i8 into 8i8 (and store)
2801  { 4, MVT::v4i8, 9 }, //interleave 4 x 4i8 into 16i8 (and store)
2802  { 4, MVT::v8i8, 10 }, //interleave 4 x 8i8 into 32i8 (and store)
2803  { 4, MVT::v16i8, 10 }, //interleave 4 x 16i8 into 64i8 (and store)
2804  { 4, MVT::v32i8, 12 } //interleave 4 x 32i8 into 128i8 (and store)
2805  };
2806 
2807  if (Opcode == Instruction::Load) {
2808  if (const auto *Entry =
2809  CostTableLookup(AVX2InterleavedLoadTbl, Factor, ETy.getSimpleVT()))
2810  return NumOfMemOps * MemOpCost + Entry->Cost;
2811  } else {
2812  assert(Opcode == Instruction::Store &&
2813  "Expected Store Instruction at this point");
2814  if (const auto *Entry =
2815  CostTableLookup(AVX2InterleavedStoreTbl, Factor, ETy.getSimpleVT()))
2816  return NumOfMemOps * MemOpCost + Entry->Cost;
2817  }
2818 
2819  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
2820  Alignment, AddressSpace);
2821 }
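Worked example of the formula above, using the table values: a fully interleaved load with Factor = 3 over <24 x i8> (so VF = 8) legalizes to v16i8, giving NumOfMemOps = ceil(24 / 16) = 2; with an assumed per-memop cost of 1 and the {3, MVT::v8i8, 9} entry, the total is 2 * 1 + 9 = 11. A minimal sketch of just that arithmetic:

// NumOfMemOps is derived exactly as above; MemOpCost and ShuffleSeqCost are
// assumed inputs (the latter is the table entry for (Factor, VF x ElemTy)).
static int interleavedCostAVX2Sketch(unsigned VecTyStoreSize,
                                     unsigned LegalVTStoreSize,
                                     unsigned MemOpCost, unsigned ShuffleSeqCost) {
  unsigned NumOfMemOps =
      (VecTyStoreSize + LegalVTStoreSize - 1) / LegalVTStoreSize;
  return static_cast<int>(NumOfMemOps * MemOpCost + ShuffleSeqCost);
}
// interleavedCostAVX2Sketch(24, 16, 1, 9) == 11 for the example above.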
2822 
2823 // Get estimation for interleaved load/store operations and strided load.
2824 // \p Indices contains indices for strided load.
2825 // \p Factor - the factor of interleaving.
2826 // AVX-512 provides 3-src shuffles that significantly reduce the cost.
2827 int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
2828  unsigned Factor,
2829  ArrayRef<unsigned> Indices,
2830  unsigned Alignment,
2831  unsigned AddressSpace) {
2832 
2833  // VecTy for interleave memop is <VF*Factor x Elt>.
2834  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
2835  // VecTy = <12 x i32>.
2836 
2837  // Calculate the number of memory operations (NumOfMemOps) required
2838  // to load/store the VecTy.
2839  MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
2840  unsigned VecTySize = DL.getTypeStoreSize(VecTy);
2841  unsigned LegalVTSize = LegalVT.getStoreSize();
2842  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
2843 
2844  // Get the cost of one memory operation.
2845  Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
2846  LegalVT.getVectorNumElements());
2847  unsigned MemOpCost =
2848  getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
2849 
2850  unsigned VF = VecTy->getVectorNumElements() / Factor;
2851  MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);
2852 
2853  if (Opcode == Instruction::Load) {
2854  // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
2855  // contain the cost of the optimized shuffle sequence that the
2856  // X86InterleavedAccess pass will generate.
2857  // The cost of loads and stores is computed separately from the table.
2858 
2859  // X86InterleavedAccess supports only the following interleaved-access groups.
2860  static const CostTblEntry AVX512InterleavedLoadTbl[] = {
2861  {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8
2862  {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8
2863  {3, MVT::v64i8, 22}, //(load 192i8 and) deinterleave into 3 x 64i8
2864  };
2865 
2866  if (const auto *Entry =
2867  CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
2868  return NumOfMemOps * MemOpCost + Entry->Cost;
2869  // If an entry does not exist, fall back to the default implementation.
2870 
2871  // The kind of shuffle depends on the number of loaded values.
2872  // If we load the entire data in one register, we can use a 1-src shuffle.
2873  // Otherwise, we'll merge 2 sources in each operation.
2874  TTI::ShuffleKind ShuffleKind =
2875  (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
2876 
2877  unsigned ShuffleCost =
2878  getShuffleCost(ShuffleKind, SingleMemOpTy, 0, nullptr);
2879 
2880  unsigned NumOfLoadsInInterleaveGrp =
2881  Indices.size() ? Indices.size() : Factor;
2882  Type *ResultTy = VectorType::get(VecTy->getVectorElementType(),
2883  VecTy->getVectorNumElements() / Factor);
2884  unsigned NumOfResults =
2885  getTLI()->getTypeLegalizationCost(DL, ResultTy).first *
2886  NumOfLoadsInInterleaveGrp;
2887 
2888  // About half of the loads may be folded into shuffles when we have only
2889  // one result. If we have more than one result, we do not fold loads at all.
2890  unsigned NumOfUnfoldedLoads =
2891  NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
2892 
2893  // Get a number of shuffle operations per result.
2894  unsigned NumOfShufflesPerResult =
2895  std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
2896 
2897  // The SK_PermuteTwoSrc shuffle clobbers one of the source operands.
2898  // When we have more than one destination, we need additional instructions
2899  // to keep sources.
2900  unsigned NumOfMoves = 0;
2901  if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
2902  NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
2903 
2904  int Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
2905  NumOfUnfoldedLoads * MemOpCost + NumOfMoves;
2906 
2907  return Cost;
2908  }
2909 
2910  // Store.
2911  assert(Opcode == Instruction::Store &&
2912  "Expected Store Instruction at this point");
2913  // X86InterleavedAccess supports only the following interleaved-access groups.
2914  static const CostTblEntry AVX512InterleavedStoreTbl[] = {
2915  {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
2916  {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
2917  {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 192i8 (and store)
2918 
2919  {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store)
2920  {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
2921  {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
2922  {4, MVT::v64i8, 24} // interleave 4 x 64i8 into 256i8 (and store)
2923  };
2924 
2925  if (const auto *Entry =
2926  CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
2927  return NumOfMemOps * MemOpCost + Entry->Cost;
2928  // If an entry does not exist, fall back to the default implementation.
2929 
2930  // There are no strided stores at the moment, and a store can't be folded
2931  // into a shuffle.
2932  unsigned NumOfSources = Factor; // The number of values to be merged.
2933  unsigned ShuffleCost =
2934  getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, 0, nullptr);
2935  unsigned NumOfShufflesPerStore = NumOfSources - 1;
2936 
2937  // The SK_PermuteTwoSrc shuffle clobbers one of the source operands.
2938  // We need additional instructions to keep sources.
2939  unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
2940  int Cost = NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
2941  NumOfMoves;
2942  return Cost;
2943 }
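When no table entry matches the load path above, the fallback prices one shuffle sequence per result, the loads that cannot be folded into shuffles, and the extra moves needed to preserve shuffle sources. The standalone sketch below restates that fallback arithmetic; all inputs are assumptions.

static int avx512InterleavedLoadFallbackSketch(unsigned NumOfMemOps,
                                               unsigned MemOpCost,
                                               unsigned ShuffleCost,
                                               unsigned NumOfResults,
                                               bool TwoSrcShuffles) {
  unsigned NumOfShufflesPerResult = NumOfMemOps > 1 ? NumOfMemOps - 1 : 1;
  // Roughly half the loads fold into shuffles, but only for a single result.
  unsigned NumOfUnfoldedLoads = NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
  unsigned NumOfMoves = 0;
  if (NumOfResults > 1 && TwoSrcShuffles)
    NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
  return static_cast<int>(NumOfResults * NumOfShufflesPerResult * ShuffleCost +
                          NumOfUnfoldedLoads * MemOpCost + NumOfMoves);
}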
2944 
2945 int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
2946  unsigned Factor,
2947  ArrayRef<unsigned> Indices,
2948  unsigned Alignment,
2949  unsigned AddressSpace) {
2950  auto isSupportedOnAVX512 = [](Type *VecTy, bool HasBW) {
2951  Type *EltTy = VecTy->getVectorElementType();
2952  if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
2953  EltTy->isIntegerTy(32) || EltTy->isPointerTy())
2954  return true;
2955  if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8))
2956  return HasBW;
2957  return false;
2958  };
2959  if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI()))
2960  return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices,
2961  Alignment, AddressSpace);
2962  if (ST->hasAVX2())
2963  return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices,
2964  Alignment, AddressSpace);
2965 
2966  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
2967  Alignment, AddressSpace);
2968 }
bool hasAVX() const
Definition: X86Subtarget.h:554
Type * getVectorElementType() const
Definition: Type.h:371
constexpr bool isUInt< 32 >(uint64_t x)
Definition: MathExtras.h:349
X = FP_ROUND(Y, TRUNC) - Rounding &#39;Y&#39; from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:508
bool is64Bit() const
Is this x86_64? (disregarding specific ABI / programming model)
Definition: X86Subtarget.h:516
void push_back(const T &Elt)
Definition: SmallVector.h:218
A parsed version of the target data layout string in and methods for querying it. ...
Definition: DataLayout.h:111
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:562
unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info=TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value * > Args=ArrayRef< const Value * >())
Definition: BasicTTIImpl.h:507
static IntegerType * getInt1Ty(LLVMContext &C)
Definition: Type.cpp:173
llvm::Optional< unsigned > getCacheSize(TargetTransformInfo::CacheLevel Level) const
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1557
APInt sext(unsigned width) const
Sign extend to a new width.
Definition: APInt.cpp:834
GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2)
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:42
bool hasSSE41() const
Definition: X86Subtarget.h:552
Compute iterated dominance frontiers using a linear time algorithm.
Definition: AllocatorList.h:24
unsigned minRequiredElementSize(const Value *Val, bool &isSigned)
static MVT getVectorVT(MVT VT, unsigned NumElements)
Cost tables and simple lookup functions.
int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index)
bool isFCmpOrdCheaperThanFCmpZero(Type *Ty)
int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr, bool VariableMask, unsigned Alignment)
Calculate the cost of Gather / Scatter operation.
bool hasPOPCNT() const
Definition: X86Subtarget.h:562
bool hasAVX2() const
Definition: X86Subtarget.h:555
#define LLVM_FALLTHROUGH
Definition: Compiler.h:86
const Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
bool isVector() const
Return true if this is a vector value type.
unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract)
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:446
This file a TargetTransformInfo::Concept conforming object specific to the X86 target machine...
The main scalar evolution driver.
unsigned getRegisterBitWidth(bool Vector) const
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:253
unsigned getVectorNumElements() const
bool isLegalMaskedScatter(Type *DataType)
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1042
unsigned getPointerSizeInBits(unsigned AS=0) const
Layout pointer size, in bits FIXME: The defaults need to be removed once all of the backends/clients ...
Definition: DataLayout.h:360
Type Conversion Cost Table.
Definition: CostTable.h:45
uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew=0)
Returns the next integer (mod 2**64) that is greater than or equal to Value and is a multiple of Alig...
Definition: MathExtras.h:685
[US]{MIN/MAX} - Binary minimum or maximum or signed or unsigned integers.
Definition: ISDOpcodes.h:360
Hexagon Common GEP
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:230
int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, unsigned Alignment, unsigned AddressSpace)
unsigned getAtomicMemIntrinsicMaxElementSize() const
Cost Table Entry.
Definition: CostTable.h:25
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1503
unsigned getNumberOfRegisters(bool Vector)
int getGatherOverhead() const
Definition: X86Subtarget.h:614
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:130
unsigned getMaxInterleaveFactor(unsigned VF)
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:210
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
bool isUnalignedMem32Slow() const
Definition: X86Subtarget.h:613
unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, ArrayRef< Value * > Args, FastMathFlags FMF, unsigned VF=1)
Get intrinsic cost based on arguments.
Definition: BasicTTIImpl.h:899
bool isLegalMaskedStore(Type *DataType)
unsigned getAddressComputationCost(Type *Ty, ScalarEvolution *, const SCEV *)
int getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info=TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value *> Args=ArrayRef< const Value *>())
const FeatureBitset & getFeatureBits() const
Shift and rotation operations.
Definition: ISDOpcodes.h:382
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:197
unsigned getArithmeticReductionCost(unsigned Opcode, Type *Ty, bool IsPairwise)
Try to calculate arithmetic and shuffle op costs for reduction operations.
const TypeConversionCostTblEntry * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntry > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table; TypeTy must be comparable to CompareTy by ==.
Definition: CostTable.h:55
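A hedged sketch of the lookup pattern such conversion cost tables are typically used with; the table contents, the ExampleTbl name, and the cost value 3 are purely illustrative, not numbers taken from this file.
  static const TypeConversionCostTblEntry ExampleTbl[] = {
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 }, // illustrative cost
  };
  if (const auto *Entry =
          ConvertCostTableLookup(ExampleTbl, ISD::ZERO_EXTEND,
                                 MVT::v8i32, MVT::v8i16))
    return Entry->Cost; // hit: return the tabulated cost
  // miss: fall through to a default estimate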
unsigned getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isLSRCostLess(TargetTransformInfo::LSRCost &C1, TargetTransformInfo::LSRCost &C2)
bool hasVLX() const
Definition: X86Subtarget.h:648
unsigned getSizeInBits() const
unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, unsigned Alignment, unsigned AddressSpace)
Definition: BasicTTIImpl.h:783
unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, const Instruction *I)
Definition: BasicTTIImpl.h:705
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1569
unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:567
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:245
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:450
PopcntSupportKind
Flags indicating the kind of support for population count.
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:884
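Worked values (illustrative): sextOrTrunc sign-extends when the requested width is larger and truncates when it is smaller.
  llvm::APInt X(8, 200);             // 8-bit 0xC8 (-56 when read as signed)
  llvm::APInt Y = X.sextOrTrunc(32); // 32-bit 0xFFFFFFC8 (sign-extended)
  llvm::APInt Z = X.sextOrTrunc(4);  // 4-bit 0x8 (low bits kept)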
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:201
llvm::Optional< unsigned > getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory)...
Definition: APInt.h:33
Selects elements from the corresponding lane of either source operand.
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:203
int getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr)
Reverse the order of the vector.
Value * Callee
bool hasDQI() const
Definition: X86Subtarget.h:646
MVT getVectorElementType() const
Value * getOperand(unsigned i) const
Definition: User.h:170
Class to represent pointers.
Definition: DerivedTypes.h:467
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:494
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:304
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:147
An instruction for type-safe pointer arithmetic to access elements of arrays and structs.
Definition: Instructions.h:841
bool isSLM() const
Definition: X86Subtarget.h:698
bool hasSSSE3() const
Definition: X86Subtarget.h:551
If not nullptr, enable inline expansion of memcmp.
Container class for subtarget features.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:429
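Worked values (illustrative):
  llvm::isPowerOf2_32(64); // true
  llvm::isPowerOf2_32(48); // false
  llvm::isPowerOf2_32(0);  // false - zero does not count as a power of two > 0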
Machine Value Type.
The instances of the Type class are immutable: once they are created, they are never changed...
Definition: Type.h:46
Simple binary floating point operators.
Definition: ISDOpcodes.h:260
bool isLegalMaskedGather(Type *DataType)
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:149
unsigned getMinMaxReductionCost(Type *Ty, Type *CondTy, bool IsPairwise, bool)
Try to calculate op costs for min/max reduction operations.
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:224
Expected to fold away in lowering.
unsigned getUserCost(const User *U, ArrayRef< const Value * > Operands)
bool isLegalMaskedLoad(Type *DataType)
const TTI::MemCmpExpansionOptions * enableMemCmpExpansion(bool IsZeroCmp) const
unsigned getAddressSpace() const
Return the address space of the Pointer type.
Definition: DerivedTypes.h:495
Merge elements from two source vectors into one with any shuffle mask.
int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace)
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:281
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
bool hasXOP() const
Definition: X86Subtarget.h:578
int getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, unsigned Alignment, unsigned AddressSpace)
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:526
bool hasSSE42() const
Definition: X86Subtarget.h:553
Extended Value Type.
Definition: ValueTypes.h:34
int getArithmeticReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm)
unsigned getUserCost(const User *U, ArrayRef< const Value *> Operands)
const TargetMachine & getTargetMachine() const
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace, const Instruction *I=nullptr)
int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp)
const SCEVConstant * getConstantStrideStep(ScalarEvolution *SE, const SCEV *Ptr)
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth)
OperandValueProperties
Additional properties of an operand's values.
unsigned getCFInstrCost(unsigned Opcode)
Definition: BasicTTIImpl.h:700
bool isAtom() const
TODO: to be removed later and replaced with suitable properties.
Definition: X86Subtarget.h:697
unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp)
Definition: BasicTTIImpl.h:554
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:947
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:240
unsigned getNumOperands() const
Definition: User.h:192
constexpr bool isInt< 32 >(int64_t x)
Definition: MathExtras.h:309
unsigned getPreferVectorWidth() const
Definition: X86Subtarget.h:668
AddressSpace
Definition: NVPTXBaseInfo.h:22
bool hasVBMI() const
Definition: X86Subtarget.h:588
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:385
int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, const Instruction *I=nullptr)
int getMinMaxReductionCost(Type *Ty, Type *CondTy, bool IsPairwiseForm, bool IsUnsigned)
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
unsigned getVectorNumElements() const
Definition: DerivedTypes.h:462
Class to represent vector types.
Definition: DerivedTypes.h:393
Class for arbitrary precision integers.
Definition: APInt.h:70
int getScatterOverhead() const
Definition: X86Subtarget.h:615
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:440
unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:752
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
unsigned getLoadStoreVecRegBitWidth(unsigned AS) const
const CostTblEntry * CostTableLookup(ArrayRef< CostTblEntry > Tbl, int ISD, MVT Ty)
Find in cost table; TypeTy must be comparable to CompareTy by ==.
Definition: CostTable.h:32
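A sketch of the usual lookup idiom; ExampleTbl is a hypothetical table and the entry's cost of 2 is illustrative only, not a value defined in this file.
  static const CostTblEntry ExampleTbl[] = {
    { ISD::MUL, MVT::v8i32, 2 }, // illustrative cost
  };
  if (const auto *Entry = CostTableLookup(ExampleTbl, ISD::MUL, MVT::v8i32))
    return Entry->Cost;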
This class represents an analyzed expression in the program.
bool isGLM() const
Definition: X86Subtarget.h:699
int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, const Instruction *I=nullptr)
LLVM_NODISCARD std::enable_if<!is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
Definition: Casting.h:323
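Illustrative use (Ty is a hypothetical Type pointer): dyn_cast returns nullptr when the dynamic type does not match, so it doubles as a checked downcast.
  if (auto *VTy = llvm::dyn_cast<llvm::VectorType>(Ty)) {
    // Reached only when Ty is actually a vector type.
    unsigned NumElts = VTy->getNumElements();
    (void)NumElts;
  }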
unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index)
Definition: BasicTTIImpl.h:745
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:699
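Worked values (illustrative):
  llvm::divideCeil(7, 2); // == 4
  llvm::divideCeil(8, 2); // == 4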
InsertSubvector. Index indicates start offset.
unsigned Insns
TODO: Some of these could be merged.
bool hasCDI() const
Definition: X86Subtarget.h:642
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:185
const unsigned Kind
The cost of a typical 'add' instruction.
int getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, unsigned Alignment, unsigned AddressSpace)
bool hasSSE1() const
Definition: X86Subtarget.h:548
unsigned getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition: Type.cpp:115
LLVM Value Representation.
Definition: Value.h:73
uint64_t getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type...
Definition: DataLayout.h:411
bool hasDivRemOp(Type *DataType, bool IsSigned)
static VectorType * get(Type *ElementType, unsigned NumElements)
This static method is the primary way to construct a VectorType.
Definition: Type.cpp:593
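A small sketch, assuming Ctx is an existing LLVMContext (hypothetical variable):
  Type *I8 = Type::getInt8Ty(Ctx);
  VectorType *V16x8 = VectorType::get(I8, 16); // the IR type <16 x i8>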
bool hasMacroFusion() const
Definition: X86Subtarget.h:632
Broadcast element 0 to all other elements.
bool hasAVX512() const
Definition: X86Subtarget.h:556
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:59
Convenience struct for specifying and reasoning about fast-math flags.
Definition: Operator.h:160
bool hasBWI() const
Definition: X86Subtarget.h:647
OperandValueKind
Additional information about an operand's possible values.
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:415
This pass exposes codegen information to IR-level passes.
Conversion operators.
Definition: ISDOpcodes.h:437
bool hasFastGather() const
Definition: X86Subtarget.h:627
CacheLevel
The possible cache levels.
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:446
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:126
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:150
int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, ArrayRef< Type *> Tys, FastMathFlags FMF, unsigned ScalarizationCostPassed=UINT_MAX)
static IntegerType * getInt8Ty(LLVMContext &C)
Definition: Type.cpp:174
std::pair< int, MVT > getTypeLegalizationCost(const DataLayout &DL, Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
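A hedged sketch of how TTI cost hooks commonly use the result (TLI, ValTy, and PerLegalTypeCost are hypothetical names): the .first member is the number of legalization steps, roughly how many legal-width registers the type splits into, and .second is the legalized MVT, so per-legal-type costs are usually scaled by .first.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
  int Cost = LT.first * PerLegalTypeCost; // PerLegalTypeCost is illustrative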
bool hasSSE2() const
Definition: X86Subtarget.h:549
int getIntImmCost(int64_t)
Calculate the cost of materializing a 64-bit value.
This file describes how to lower LLVM code to machine code.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
Shuffle elements of a single source vector with any shuffle mask.