1 //===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 /// \file
10 /// This file implements a TargetTransformInfo analysis pass specific to the
11 /// X86 target machine. It uses the target's detailed information to provide
12 /// more precise answers to certain TTI queries, while letting the target
13 /// independent and default TTI implementations handle the rest.
14 ///
15 //===----------------------------------------------------------------------===//
16 /// A note about the Cost Model numbers used below: the numbers correspond to
17 /// some "generic" X86 CPU rather than to a concrete CPU model. Usually the
18 /// numbers correspond to the CPU where the feature first appeared. For
19 /// example, if we check Subtarget.hasSSE42() in the lookups below, the cost
20 /// is based on Nehalem, as that was the first CPU to support that feature
21 /// level and thus most likely has the worst-case cost.
22 /// Some examples of other technologies/CPUs:
23 /// SSE 3 - Pentium4 / Athlon64
24 /// SSE 4.1 - Penryn
25 /// SSE 4.2 - Nehalem
26 /// AVX - Sandy Bridge
27 /// AVX2 - Haswell
28 /// AVX-512 - Xeon Phi / Skylake
29 /// And some examples of instruction target dependent costs (latency):
30 ///                     divss    sqrtss   rsqrtss
31 ///   AMD K7            11-16    19       3
32 ///   Piledriver        9-24     13-15    5
33 ///   Jaguar            14       16       2
34 ///   Pentium II,III    18       30       2
35 ///   Nehalem           7-14     7-18     3
36 ///   Haswell           10-13    11       5
37 /// TODO: Develop and implement the target dependent cost model and
38 /// specialize cost numbers for different Cost Model Targets such as throughput,
39 /// code size, latency and uop count.
40 //===----------------------------------------------------------------------===//
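//
// Illustrative sketch (not part of the original source): these tables are
// normally reached through the generic TargetTransformInfo wrapper rather
// than by calling X86TTIImpl directly. Assuming a pass already holds a
// TargetTransformInfo reference TTI and an LLVMContext Ctx, a query might
// look like:
//
//   Type *V4F32 = VectorType::get(Type::getFloatTy(Ctx), 4);
//   int Cost = TTI.getArithmeticInstrCost(Instruction::FDiv, V4F32);
//   // An SSE2-only subtarget prices this at 39 (divps, Pentium 4 numbers);
//   // an AVX2/Haswell subtarget prices it at 7.
//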
41 
42 #include "X86TargetTransformInfo.h"
43 #include "llvm/Analysis/TargetTransformInfo.h"
44 #include "llvm/CodeGen/BasicTTIImpl.h"
45 #include "llvm/CodeGen/CostTable.h"
46 #include "llvm/CodeGen/TargetLowering.h"
47 #include "llvm/IR/IntrinsicInst.h"
48 #include "llvm/Support/Debug.h"
49 
50 using namespace llvm;
51 
52 #define DEBUG_TYPE "x86tti"
53 
54 //===----------------------------------------------------------------------===//
55 //
56 // X86 cost model.
57 //
58 //===----------------------------------------------------------------------===//
59 
60 TargetTransformInfo::PopcntSupportKind
61 X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
62   assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
63   // TODO: Currently the __builtin_popcount() implementation using SSE3
64   //   instructions is inefficient. Once the problem is fixed, we should
65   //   call ST->hasSSE3() instead of ST->hasPOPCNT().
66   return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
67 }
68 
69 llvm::Optional<unsigned> X86TTIImpl::getCacheSize(
70     TargetTransformInfo::CacheLevel Level) const {
71   switch (Level) {
72   case TargetTransformInfo::CacheLevel::L1D:
73  // - Penryn
74  // - Nehalem
75  // - Westmere
76  // - Sandy Bridge
77  // - Ivy Bridge
78  // - Haswell
79  // - Broadwell
80  // - Skylake
81  // - Kabylake
82     return 32 * 1024;  //  32 KByte
83   case TargetTransformInfo::CacheLevel::L2D:
84     //   - Penryn
85  // - Nehalem
86  // - Westmere
87  // - Sandy Bridge
88  // - Ivy Bridge
89  // - Haswell
90  // - Broadwell
91  // - Skylake
92  // - Kabylake
93  return 256 * 1024; // 256 KByte
94  }
95 
96  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
97 }
98 
99 llvm::Optional<unsigned> X86TTIImpl::getCacheAssociativity(
100     TargetTransformInfo::CacheLevel Level) const {
101   //   - Penryn
102  // - Nehalem
103  // - Westmere
104  // - Sandy Bridge
105  // - Ivy Bridge
106  // - Haswell
107  // - Broadwell
108  // - Skylake
109  // - Kabylake
110   switch (Level) {
111   case TargetTransformInfo::CacheLevel::L1D:
112     LLVM_FALLTHROUGH;
113   case TargetTransformInfo::CacheLevel::L2D:
114     return 8;
115  }
116 
117  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
118 }
119 
120 unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) {
121  if (Vector && !ST->hasSSE1())
122  return 0;
123 
124  if (ST->is64Bit()) {
125  if (Vector && ST->hasAVX512())
126  return 32;
127  return 16;
128  }
129  return 8;
130 }
131 
132 unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) const {
133  unsigned PreferVectorWidth = ST->getPreferVectorWidth();
134  if (Vector) {
135  if (ST->hasAVX512() && PreferVectorWidth >= 512)
136  return 512;
137  if (ST->hasAVX() && PreferVectorWidth >= 256)
138  return 256;
139  if (ST->hasSSE1() && PreferVectorWidth >= 128)
140  return 128;
141  return 0;
142  }
143 
144  if (ST->is64Bit())
145  return 64;
146 
147  return 32;
148 }
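// Illustrative note (not from the original source): because of the
// PreferVectorWidth check above, a subtarget that has AVX-512 but whose
// preferred vector width is capped at 256 (e.g. via clang's
// -mprefer-vector-width=256) reports 256-bit vector registers here, steering
// the vectorizers away from 512-bit code even though ZMM registers exist.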
149 
150 unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
151  return getRegisterBitWidth(true);
152 }
153 
154 unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
155   // If the loop will not be vectorized, don't interleave the loop.
156   // Let the regular unroller unroll the loop, which saves the overflow
157   // check and memory check cost.
158  if (VF == 1)
159  return 1;
160 
161  if (ST->isAtom())
162  return 1;
163 
164  // Sandybridge and Haswell have multiple execution ports and pipelined
165  // vector units.
166  if (ST->hasAVX())
167  return 4;
168 
169  return 2;
170 }
171 
172 int X86TTIImpl::getArithmeticInstrCost(
173     unsigned Opcode, Type *Ty,
174     TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
175     TTI::OperandValueProperties Opd1PropInfo,
176     TTI::OperandValueProperties Opd2PropInfo,
177     ArrayRef<const Value *> Args) {
178   // Legalize the type.
179  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
180 
181  int ISD = TLI->InstructionOpcodeToISD(Opcode);
182  assert(ISD && "Invalid opcode");
183 
184  static const CostTblEntry GLMCostTable[] = {
185  { ISD::FDIV, MVT::f32, 18 }, // divss
186  { ISD::FDIV, MVT::v4f32, 35 }, // divps
187  { ISD::FDIV, MVT::f64, 33 }, // divsd
188  { ISD::FDIV, MVT::v2f64, 65 }, // divpd
189  };
190 
191  if (ST->isGLM())
192  if (const auto *Entry = CostTableLookup(GLMCostTable, ISD,
193  LT.second))
194  return LT.first * Entry->Cost;
195 
196  static const CostTblEntry SLMCostTable[] = {
197  { ISD::MUL, MVT::v4i32, 11 }, // pmulld
198  { ISD::MUL, MVT::v8i16, 2 }, // pmullw
199  { ISD::MUL, MVT::v16i8, 14 }, // extend/pmullw/trunc sequence.
200  { ISD::FMUL, MVT::f64, 2 }, // mulsd
201  { ISD::FMUL, MVT::v2f64, 4 }, // mulpd
202  { ISD::FMUL, MVT::v4f32, 2 }, // mulps
203  { ISD::FDIV, MVT::f32, 17 }, // divss
204  { ISD::FDIV, MVT::v4f32, 39 }, // divps
205  { ISD::FDIV, MVT::f64, 32 }, // divsd
206  { ISD::FDIV, MVT::v2f64, 69 }, // divpd
207  { ISD::FADD, MVT::v2f64, 2 }, // addpd
208  { ISD::FSUB, MVT::v2f64, 2 }, // subpd
209     // v2i64/v4i64 mul is custom lowered as a series of long
210     // multiplies(3), shifts(3) and adds(2).
211     // slm muldq version throughput is 2 and addq throughput 4
212     // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
213     // 2X4 (addq throughput) = 17
214  { ISD::MUL, MVT::v2i64, 17 },
215     // slm addq/subq throughput is 4
216  { ISD::ADD, MVT::v2i64, 4 },
217  { ISD::SUB, MVT::v2i64, 4 },
218  };
219 
220  if (ST->isSLM()) {
221  if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) {
222       // Check if the operands can be shrunk into a smaller datatype.
223  bool Op1Signed = false;
224  unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
225  bool Op2Signed = false;
226  unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
227 
228  bool signedMode = Op1Signed | Op2Signed;
229  unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
230 
231  if (OpMinSize <= 7)
232  return LT.first * 3; // pmullw/sext
233  if (!signedMode && OpMinSize <= 8)
234  return LT.first * 3; // pmullw/zext
235  if (OpMinSize <= 15)
236  return LT.first * 5; // pmullw/pmulhw/pshuf
237  if (!signedMode && OpMinSize <= 16)
238  return LT.first * 5; // pmullw/pmulhw/pshuf
239  }
240 
241  if (const auto *Entry = CostTableLookup(SLMCostTable, ISD,
242  LT.second)) {
243  return LT.first * Entry->Cost;
244  }
245  }
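  // Worked example (illustrative, not from the original source): on SLM, a
  // v4i32 multiply whose operands are both zero-extended i8 values has
  // OpMinSize == 8 and signedMode == false, so the shrinking check above
  // returns LT.first * 3 (pmullw plus extends) instead of the 11-cycle
  // pmulld entry in SLMCostTable.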
246 
247   if (ISD == ISD::SDIV &&
248       Op2Info ==
249           TargetTransformInfo::OK_UniformConstantValue &&
250       Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
251     // On X86, vector signed division by a power-of-two constant is
252     // normally expanded to the sequence SRA + SRL + ADD + SRA.
253  // The OperandValue properties may not be the same as that of the previous
254  // operation; conservatively assume OP_None.
255     int Cost = 2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info,
256                                           Op2Info, TargetTransformInfo::OP_None,
257                                           TargetTransformInfo::OP_None);
258     Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
259                                    TargetTransformInfo::OP_None,
260                                    TargetTransformInfo::OP_None);
261     Cost += getArithmeticInstrCost(Instruction::Add, Ty, Op1Info, Op2Info,
262                                    TargetTransformInfo::OP_None,
263                                    TargetTransformInfo::OP_None);
264 
265  return Cost;
266  }
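  // Worked example (illustrative, not from the original source): for an SDIV
  // of <4 x i32> by a uniform power-of-two constant on a plain SSE2 target,
  // the uniform-splat shift tables price SRA and SRL at 1 each, so the
  // estimate above is roughly 2*1 + 1 + 1 = 4, assuming the generic fallback
  // prices the vector add at 1.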
267 
268  static const CostTblEntry AVX512BWUniformConstCostTable[] = {
269  { ISD::SHL, MVT::v64i8, 2 }, // psllw + pand.
270  { ISD::SRL, MVT::v64i8, 2 }, // psrlw + pand.
271  { ISD::SRA, MVT::v64i8, 4 }, // psrlw, pand, pxor, psubb.
272 
273  { ISD::SDIV, MVT::v32i16, 6 }, // vpmulhw sequence
274  { ISD::UDIV, MVT::v32i16, 6 }, // vpmulhuw sequence
275  };
276 
277   if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
278       ST->hasBWI()) {
279  if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD,
280  LT.second))
281  return LT.first * Entry->Cost;
282  }
283 
284  static const CostTblEntry AVX512UniformConstCostTable[] = {
285  { ISD::SRA, MVT::v2i64, 1 },
286  { ISD::SRA, MVT::v4i64, 1 },
287  { ISD::SRA, MVT::v8i64, 1 },
288 
289  { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
290  { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
291  };
292 
293   if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
294       ST->hasAVX512()) {
295  if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD,
296  LT.second))
297  return LT.first * Entry->Cost;
298  }
299 
300  static const CostTblEntry AVX2UniformConstCostTable[] = {
301  { ISD::SHL, MVT::v32i8, 2 }, // psllw + pand.
302  { ISD::SRL, MVT::v32i8, 2 }, // psrlw + pand.
303  { ISD::SRA, MVT::v32i8, 4 }, // psrlw, pand, pxor, psubb.
304 
305  { ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle.
306 
307  { ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence
308  { ISD::UDIV, MVT::v16i16, 6 }, // vpmulhuw sequence
309  { ISD::SDIV, MVT::v8i32, 15 }, // vpmuldq sequence
310  { ISD::UDIV, MVT::v8i32, 15 }, // vpmuludq sequence
311  };
312 
313   if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
314       ST->hasAVX2()) {
315  if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD,
316  LT.second))
317  return LT.first * Entry->Cost;
318  }
319 
320  static const CostTblEntry SSE2UniformConstCostTable[] = {
321  { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand.
322  { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand.
323  { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb.
324 
325  { ISD::SHL, MVT::v32i8, 4+2 }, // 2*(psllw + pand) + split.
326  { ISD::SRL, MVT::v32i8, 4+2 }, // 2*(psrlw + pand) + split.
327  { ISD::SRA, MVT::v32i8, 8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.
328 
329  { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
330  { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence
331  { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split.
332  { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence
333  { ISD::SDIV, MVT::v8i32, 38+2 }, // 2*pmuludq sequence + split.
334  { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence
335  { ISD::UDIV, MVT::v8i32, 30+2 }, // 2*pmuludq sequence + split.
336  { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence
337  };
338 
339   if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
340       ST->hasSSE2()) {
341  // pmuldq sequence.
342  if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
343  return LT.first * 32;
344  if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
345  return LT.first * 15;
346 
347  // XOP has faster vXi8 shifts.
348  if ((ISD != ISD::SHL && ISD != ISD::SRL && ISD != ISD::SRA) ||
349  !ST->hasXOP())
350  if (const auto *Entry =
351  CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
352  return LT.first * Entry->Cost;
353  }
354 
355  static const CostTblEntry AVX2UniformCostTable[] = {
356  // Uniform splats are cheaper for the following instructions.
357  { ISD::SHL, MVT::v16i16, 1 }, // psllw.
358  { ISD::SRL, MVT::v16i16, 1 }, // psrlw.
359  { ISD::SRA, MVT::v16i16, 1 }, // psraw.
360  };
361 
362   if (ST->hasAVX2() &&
363       ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
364        (Op2Info == TargetTransformInfo::OK_UniformValue))) {
365  if (const auto *Entry =
366  CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
367  return LT.first * Entry->Cost;
368  }
369 
370  static const CostTblEntry SSE2UniformCostTable[] = {
371  // Uniform splats are cheaper for the following instructions.
372  { ISD::SHL, MVT::v8i16, 1 }, // psllw.
373  { ISD::SHL, MVT::v4i32, 1 }, // pslld
374  { ISD::SHL, MVT::v2i64, 1 }, // psllq.
375 
376  { ISD::SRL, MVT::v8i16, 1 }, // psrlw.
377  { ISD::SRL, MVT::v4i32, 1 }, // psrld.
378  { ISD::SRL, MVT::v2i64, 1 }, // psrlq.
379 
380  { ISD::SRA, MVT::v8i16, 1 }, // psraw.
381  { ISD::SRA, MVT::v4i32, 1 }, // psrad.
382  };
383 
384   if (ST->hasSSE2() &&
385       ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
386        (Op2Info == TargetTransformInfo::OK_UniformValue))) {
387  if (const auto *Entry =
388  CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
389  return LT.first * Entry->Cost;
390  }
391 
392  static const CostTblEntry AVX512DQCostTable[] = {
393  { ISD::MUL, MVT::v2i64, 1 },
394  { ISD::MUL, MVT::v4i64, 1 },
395  { ISD::MUL, MVT::v8i64, 1 }
396  };
397 
398  // Look for AVX512DQ lowering tricks for custom cases.
399  if (ST->hasDQI())
400  if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
401  return LT.first * Entry->Cost;
402 
403  static const CostTblEntry AVX512BWCostTable[] = {
404  { ISD::SHL, MVT::v8i16, 1 }, // vpsllvw
405  { ISD::SRL, MVT::v8i16, 1 }, // vpsrlvw
406  { ISD::SRA, MVT::v8i16, 1 }, // vpsravw
407 
408  { ISD::SHL, MVT::v16i16, 1 }, // vpsllvw
409  { ISD::SRL, MVT::v16i16, 1 }, // vpsrlvw
410  { ISD::SRA, MVT::v16i16, 1 }, // vpsravw
411 
412  { ISD::SHL, MVT::v32i16, 1 }, // vpsllvw
413  { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw
414  { ISD::SRA, MVT::v32i16, 1 }, // vpsravw
415 
416  { ISD::SHL, MVT::v64i8, 11 }, // vpblendvb sequence.
417  { ISD::SRL, MVT::v64i8, 11 }, // vpblendvb sequence.
418  { ISD::SRA, MVT::v64i8, 24 }, // vpblendvb sequence.
419 
420  { ISD::MUL, MVT::v64i8, 11 }, // extend/pmullw/trunc sequence.
421  { ISD::MUL, MVT::v32i8, 4 }, // extend/pmullw/trunc sequence.
422  { ISD::MUL, MVT::v16i8, 4 }, // extend/pmullw/trunc sequence.
423  };
424 
425  // Look for AVX512BW lowering tricks for custom cases.
426  if (ST->hasBWI())
427  if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
428  return LT.first * Entry->Cost;
429 
430  static const CostTblEntry AVX512CostTable[] = {
431  { ISD::SHL, MVT::v16i32, 1 },
432  { ISD::SRL, MVT::v16i32, 1 },
433  { ISD::SRA, MVT::v16i32, 1 },
434 
435  { ISD::SHL, MVT::v8i64, 1 },
436  { ISD::SRL, MVT::v8i64, 1 },
437 
438  { ISD::SRA, MVT::v2i64, 1 },
439  { ISD::SRA, MVT::v4i64, 1 },
440  { ISD::SRA, MVT::v8i64, 1 },
441 
442  { ISD::MUL, MVT::v32i8, 13 }, // extend/pmullw/trunc sequence.
443  { ISD::MUL, MVT::v16i8, 5 }, // extend/pmullw/trunc sequence.
444  { ISD::MUL, MVT::v16i32, 1 }, // pmulld (Skylake from agner.org)
445  { ISD::MUL, MVT::v8i32, 1 }, // pmulld (Skylake from agner.org)
446  { ISD::MUL, MVT::v4i32, 1 }, // pmulld (Skylake from agner.org)
447  { ISD::MUL, MVT::v8i64, 8 }, // 3*pmuludq/3*shift/2*add
448 
449  { ISD::FADD, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
450  { ISD::FSUB, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
451  { ISD::FMUL, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
452 
453  { ISD::FADD, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
454  { ISD::FSUB, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
455  { ISD::FMUL, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
456  };
457 
458  if (ST->hasAVX512())
459  if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
460  return LT.first * Entry->Cost;
461 
462  static const CostTblEntry AVX2ShiftCostTable[] = {
463     // Shifts on v4i64/v8i32 on AVX2 are legal even though we declare them as
464     // custom so that we can detect the cases where the shift amount is a scalar.
465  { ISD::SHL, MVT::v4i32, 1 },
466  { ISD::SRL, MVT::v4i32, 1 },
467  { ISD::SRA, MVT::v4i32, 1 },
468  { ISD::SHL, MVT::v8i32, 1 },
469  { ISD::SRL, MVT::v8i32, 1 },
470  { ISD::SRA, MVT::v8i32, 1 },
471  { ISD::SHL, MVT::v2i64, 1 },
472  { ISD::SRL, MVT::v2i64, 1 },
473  { ISD::SHL, MVT::v4i64, 1 },
474  { ISD::SRL, MVT::v4i64, 1 },
475  };
476 
477  // Look for AVX2 lowering tricks.
478  if (ST->hasAVX2()) {
479     if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
480         (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
481          Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
482       // On AVX2, a packed v16i16 shift left by a constant build_vector
483       // is lowered into a vector multiply (vpmullw).
484       return getArithmeticInstrCost(Instruction::Mul, Ty, Op1Info, Op2Info,
485                                     TargetTransformInfo::OP_None,
486                                     TargetTransformInfo::OP_None);
487 
488  if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
489  return LT.first * Entry->Cost;
490  }
491 
492  static const CostTblEntry XOPShiftCostTable[] = {
493  // 128bit shifts take 1cy, but right shifts require negation beforehand.
494  { ISD::SHL, MVT::v16i8, 1 },
495  { ISD::SRL, MVT::v16i8, 2 },
496  { ISD::SRA, MVT::v16i8, 2 },
497  { ISD::SHL, MVT::v8i16, 1 },
498  { ISD::SRL, MVT::v8i16, 2 },
499  { ISD::SRA, MVT::v8i16, 2 },
500  { ISD::SHL, MVT::v4i32, 1 },
501  { ISD::SRL, MVT::v4i32, 2 },
502  { ISD::SRA, MVT::v4i32, 2 },
503  { ISD::SHL, MVT::v2i64, 1 },
504  { ISD::SRL, MVT::v2i64, 2 },
505  { ISD::SRA, MVT::v2i64, 2 },
506  // 256bit shifts require splitting if AVX2 didn't catch them above.
507  { ISD::SHL, MVT::v32i8, 2+2 },
508  { ISD::SRL, MVT::v32i8, 4+2 },
509  { ISD::SRA, MVT::v32i8, 4+2 },
510  { ISD::SHL, MVT::v16i16, 2+2 },
511  { ISD::SRL, MVT::v16i16, 4+2 },
512  { ISD::SRA, MVT::v16i16, 4+2 },
513  { ISD::SHL, MVT::v8i32, 2+2 },
514  { ISD::SRL, MVT::v8i32, 4+2 },
515  { ISD::SRA, MVT::v8i32, 4+2 },
516  { ISD::SHL, MVT::v4i64, 2+2 },
517  { ISD::SRL, MVT::v4i64, 4+2 },
518  { ISD::SRA, MVT::v4i64, 4+2 },
519  };
520 
521  // Look for XOP lowering tricks.
522  if (ST->hasXOP())
523  if (const auto *Entry = CostTableLookup(XOPShiftCostTable, ISD, LT.second))
524  return LT.first * Entry->Cost;
525 
526  static const CostTblEntry SSE2UniformShiftCostTable[] = {
527  // Uniform splats are cheaper for the following instructions.
528  { ISD::SHL, MVT::v16i16, 2+2 }, // 2*psllw + split.
529  { ISD::SHL, MVT::v8i32, 2+2 }, // 2*pslld + split.
530  { ISD::SHL, MVT::v4i64, 2+2 }, // 2*psllq + split.
531 
532  { ISD::SRL, MVT::v16i16, 2+2 }, // 2*psrlw + split.
533  { ISD::SRL, MVT::v8i32, 2+2 }, // 2*psrld + split.
534  { ISD::SRL, MVT::v4i64, 2+2 }, // 2*psrlq + split.
535 
536  { ISD::SRA, MVT::v16i16, 2+2 }, // 2*psraw + split.
537  { ISD::SRA, MVT::v8i32, 2+2 }, // 2*psrad + split.
538  { ISD::SRA, MVT::v2i64, 4 }, // 2*psrad + shuffle.
539  { ISD::SRA, MVT::v4i64, 8+2 }, // 2*(2*psrad + shuffle) + split.
540  };
541 
542   if (ST->hasSSE2() &&
543       ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
544        (Op2Info == TargetTransformInfo::OK_UniformValue))) {
545 
546  // Handle AVX2 uniform v4i64 ISD::SRA, it's not worth a table.
547  if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2())
548  return LT.first * 4; // 2*psrad + shuffle.
549 
550  if (const auto *Entry =
551  CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second))
552  return LT.first * Entry->Cost;
553  }
554 
555   if (ISD == ISD::SHL &&
556       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
557     MVT VT = LT.second;
558     // Vector shift left by a non-uniform constant can be lowered
559     // into a vector multiply.
560  if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
561  ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
562  ISD = ISD::MUL;
563  }
564 
565  static const CostTblEntry AVX2CostTable[] = {
566  { ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence.
567  { ISD::SHL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
568 
569  { ISD::SRL, MVT::v32i8, 11 }, // vpblendvb sequence.
570  { ISD::SRL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
571 
572  { ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence.
573  { ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence.
574  { ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence.
575  { ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence.
576 
577  { ISD::SUB, MVT::v32i8, 1 }, // psubb
578  { ISD::ADD, MVT::v32i8, 1 }, // paddb
579  { ISD::SUB, MVT::v16i16, 1 }, // psubw
580  { ISD::ADD, MVT::v16i16, 1 }, // paddw
581  { ISD::SUB, MVT::v8i32, 1 }, // psubd
582  { ISD::ADD, MVT::v8i32, 1 }, // paddd
583  { ISD::SUB, MVT::v4i64, 1 }, // psubq
584  { ISD::ADD, MVT::v4i64, 1 }, // paddq
585 
586  { ISD::MUL, MVT::v32i8, 17 }, // extend/pmullw/trunc sequence.
587  { ISD::MUL, MVT::v16i8, 7 }, // extend/pmullw/trunc sequence.
588  { ISD::MUL, MVT::v16i16, 1 }, // pmullw
589  { ISD::MUL, MVT::v8i32, 2 }, // pmulld (Haswell from agner.org)
590  { ISD::MUL, MVT::v4i64, 8 }, // 3*pmuludq/3*shift/2*add
591 
592  { ISD::FADD, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
593  { ISD::FADD, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
594  { ISD::FSUB, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
595  { ISD::FSUB, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
596  { ISD::FMUL, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
597  { ISD::FMUL, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
598 
599  { ISD::FDIV, MVT::f32, 7 }, // Haswell from http://www.agner.org/
600  { ISD::FDIV, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
601  { ISD::FDIV, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
602  { ISD::FDIV, MVT::f64, 14 }, // Haswell from http://www.agner.org/
603  { ISD::FDIV, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
604  { ISD::FDIV, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
605  };
606 
607  // Look for AVX2 lowering tricks for custom cases.
608  if (ST->hasAVX2())
609  if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
610  return LT.first * Entry->Cost;
611 
612  static const CostTblEntry AVX1CostTable[] = {
613  // We don't have to scalarize unsupported ops. We can issue two half-sized
614  // operations and we only need to extract the upper YMM half.
615  // Two ops + 1 extract + 1 insert = 4.
616  { ISD::MUL, MVT::v16i16, 4 },
617  { ISD::MUL, MVT::v8i32, 4 },
618  { ISD::SUB, MVT::v32i8, 4 },
619  { ISD::ADD, MVT::v32i8, 4 },
620  { ISD::SUB, MVT::v16i16, 4 },
621  { ISD::ADD, MVT::v16i16, 4 },
622  { ISD::SUB, MVT::v8i32, 4 },
623  { ISD::ADD, MVT::v8i32, 4 },
624  { ISD::SUB, MVT::v4i64, 4 },
625  { ISD::ADD, MVT::v4i64, 4 },
626 
627  // A v4i64 multiply is custom lowered as two split v2i64 vectors that then
628  // are lowered as a series of long multiplies(3), shifts(3) and adds(2)
629  // Because we believe v4i64 to be a legal type, we must also include the
630  // extract+insert in the cost table. Therefore, the cost here is 18
631  // instead of 8.
632  { ISD::MUL, MVT::v4i64, 18 },
633 
634  { ISD::MUL, MVT::v32i8, 26 }, // extend/pmullw/trunc sequence.
635 
636  { ISD::FDIV, MVT::f32, 14 }, // SNB from http://www.agner.org/
637  { ISD::FDIV, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
638  { ISD::FDIV, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
639  { ISD::FDIV, MVT::f64, 22 }, // SNB from http://www.agner.org/
640  { ISD::FDIV, MVT::v2f64, 22 }, // SNB from http://www.agner.org/
641  { ISD::FDIV, MVT::v4f64, 44 }, // SNB from http://www.agner.org/
642  };
643 
644  if (ST->hasAVX())
645  if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
646  return LT.first * Entry->Cost;
647 
648  static const CostTblEntry SSE42CostTable[] = {
649  { ISD::FADD, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
650  { ISD::FADD, MVT::f32, 1 }, // Nehalem from http://www.agner.org/
651  { ISD::FADD, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
652  { ISD::FADD, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
653 
654  { ISD::FSUB, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
655  { ISD::FSUB, MVT::f32 , 1 }, // Nehalem from http://www.agner.org/
656  { ISD::FSUB, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
657  { ISD::FSUB, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
658 
659  { ISD::FMUL, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
660  { ISD::FMUL, MVT::f32, 1 }, // Nehalem from http://www.agner.org/
661  { ISD::FMUL, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
662  { ISD::FMUL, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
663 
664  { ISD::FDIV, MVT::f32, 14 }, // Nehalem from http://www.agner.org/
665  { ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
666  { ISD::FDIV, MVT::f64, 22 }, // Nehalem from http://www.agner.org/
667  { ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/
668  };
669 
670  if (ST->hasSSE42())
671  if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
672  return LT.first * Entry->Cost;
673 
674  static const CostTblEntry SSE41CostTable[] = {
675  { ISD::SHL, MVT::v16i8, 11 }, // pblendvb sequence.
676  { ISD::SHL, MVT::v32i8, 2*11+2 }, // pblendvb sequence + split.
677  { ISD::SHL, MVT::v8i16, 14 }, // pblendvb sequence.
678  { ISD::SHL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
679  { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld
680  { ISD::SHL, MVT::v8i32, 2*4+2 }, // pslld/paddd/cvttps2dq/pmulld + split
681 
682  { ISD::SRL, MVT::v16i8, 12 }, // pblendvb sequence.
683  { ISD::SRL, MVT::v32i8, 2*12+2 }, // pblendvb sequence + split.
684  { ISD::SRL, MVT::v8i16, 14 }, // pblendvb sequence.
685  { ISD::SRL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
686  { ISD::SRL, MVT::v4i32, 11 }, // Shift each lane + blend.
687  { ISD::SRL, MVT::v8i32, 2*11+2 }, // Shift each lane + blend + split.
688 
689  { ISD::SRA, MVT::v16i8, 24 }, // pblendvb sequence.
690  { ISD::SRA, MVT::v32i8, 2*24+2 }, // pblendvb sequence + split.
691  { ISD::SRA, MVT::v8i16, 14 }, // pblendvb sequence.
692  { ISD::SRA, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
693  { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend.
694  { ISD::SRA, MVT::v8i32, 2*12+2 }, // Shift each lane + blend + split.
695 
696  { ISD::MUL, MVT::v4i32, 2 } // pmulld (Nehalem from agner.org)
697  };
698 
699  if (ST->hasSSE41())
700  if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
701  return LT.first * Entry->Cost;
702 
703  static const CostTblEntry SSE2CostTable[] = {
704  // We don't correctly identify costs of casts because they are marked as
705  // custom.
706  { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence.
707  { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence.
708  { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul.
709  { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence.
710  { ISD::SHL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split.
711 
712  { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence.
713  { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence.
714  { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend.
715  { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence.
716  { ISD::SRL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split.
717 
718  { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence.
719  { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence.
720  { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend.
721  { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence.
722  { ISD::SRA, MVT::v4i64, 2*12+2 }, // srl/xor/sub sequence+split.
723 
724  { ISD::MUL, MVT::v16i8, 12 }, // extend/pmullw/trunc sequence.
725  { ISD::MUL, MVT::v8i16, 1 }, // pmullw
726  { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle
727  { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add
728 
729  { ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/
730  { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/
731  { ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/
732  { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/
733  };
734 
735  if (ST->hasSSE2())
736  if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
737  return LT.first * Entry->Cost;
738 
739  static const CostTblEntry SSE1CostTable[] = {
740  { ISD::FDIV, MVT::f32, 17 }, // Pentium III from http://www.agner.org/
741  { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/
742  };
743 
744  if (ST->hasSSE1())
745  if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
746  return LT.first * Entry->Cost;
747 
748   // It is not a good idea to vectorize division. We have to scalarize it and
749   // in the process we will often end up having to spill regular
750   // registers. The overhead of division is going to dominate most kernels
751   // anyway, so try hard to prevent vectorization of division - it is
752   // generally a bad idea. Assume somewhat arbitrarily that we have to be able
753   // to hide "20 cycles" for each lane.
754  if ((ISD == ISD::SDIV || ISD == ISD::UDIV) && LT.second.isVector()) {
755  int ScalarCost = getArithmeticInstrCost(
756         Opcode, Ty->getScalarType(), Op1Info, Op2Info,
757         TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
758  return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
759  }
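  // Worked example (illustrative, not from the original source): an SDIV of
  // <4 x i32> by a non-constant divisor is priced as
  // 20 * LT.first * 4 * ScalarCost; assuming the generic implementation
  // prices a scalar i32 sdiv at 1, that is 20 * 1 * 4 * 1 = 80, which is
  // large enough to keep the vectorizers from vectorizing the division.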
760 
761  // Fallback to the default implementation.
762  return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info);
763 }
764 
765 int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
766                                Type *SubTp) {
767  // 64-bit packed float vectors (v2f32) are widened to type v4f32.
768  // 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
769  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
770 
771   // For Broadcasts we are splatting the first element from the first input
772   // register, so we only need to reference that input and all the output
773   // registers are the same.
774  if (Kind == TTI::SK_Broadcast)
775  LT.first = 1;
776 
777   // We are going to permute multiple sources and the result will be in
778   // multiple destinations. We provide an accurate cost only for splits where
779   // the element type remains the same.
780  if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
781  MVT LegalVT = LT.second;
782  if (LegalVT.isVector() &&
783       LegalVT.getVectorElementType().getSizeInBits() ==
784           Tp->getVectorElementType()->getPrimitiveSizeInBits() &&
785  LegalVT.getVectorNumElements() < Tp->getVectorNumElements()) {
786 
787  unsigned VecTySize = DL.getTypeStoreSize(Tp);
788  unsigned LegalVTSize = LegalVT.getStoreSize();
789  // Number of source vectors after legalization:
790  unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
791  // Number of destination vectors after legalization:
792  unsigned NumOfDests = LT.first;
793 
794  Type *SingleOpTy = VectorType::get(Tp->getVectorElementType(),
795  LegalVT.getVectorNumElements());
796 
797  unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
798  return NumOfShuffles *
799  getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr);
800  }
801 
802  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
803  }
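  // Worked example (illustrative, not from the original source): a
  // single-source permute of <32 x i8> on an SSE-only target legalizes to two
  // v16i8 registers, so NumOfSrcs == 2 and NumOfDests == 2 above, giving
  // (2 - 1) * 2 == 2 two-source v16i8 shuffles, each priced via the
  // SK_PermuteTwoSrc tables below.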
804 
805  // For 2-input shuffles, we must account for splitting the 2 inputs into many.
806  if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
807  // We assume that source and destination have the same vector type.
808  int NumOfDests = LT.first;
809  int NumOfShufflesPerDest = LT.first * 2 - 1;
810  LT.first = NumOfDests * NumOfShufflesPerDest;
811  }
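  // Worked example (illustrative, not from the original source): a two-source
  // shuffle of <32 x i8> on an SSE-only target has LT.first == 2, so
  // NumOfDests == 2 and NumOfShufflesPerDest == 3, and LT.first becomes 6
  // before the per-register shuffle cost from the tables below is applied.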
812 
813  static const CostTblEntry AVX512VBMIShuffleTbl[] = {
814  { TTI::SK_Reverse, MVT::v64i8, 1 }, // vpermb
815  { TTI::SK_Reverse, MVT::v32i8, 1 }, // vpermb
816 
817  { TTI::SK_PermuteSingleSrc, MVT::v64i8, 1 }, // vpermb
818  { TTI::SK_PermuteSingleSrc, MVT::v32i8, 1 }, // vpermb
819 
820  { TTI::SK_PermuteTwoSrc, MVT::v64i8, 1 }, // vpermt2b
821  { TTI::SK_PermuteTwoSrc, MVT::v32i8, 1 }, // vpermt2b
822  { TTI::SK_PermuteTwoSrc, MVT::v16i8, 1 } // vpermt2b
823  };
824 
825  if (ST->hasVBMI())
826  if (const auto *Entry =
827  CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
828  return LT.first * Entry->Cost;
829 
830  static const CostTblEntry AVX512BWShuffleTbl[] = {
831  { TTI::SK_Broadcast, MVT::v32i16, 1 }, // vpbroadcastw
832  { TTI::SK_Broadcast, MVT::v64i8, 1 }, // vpbroadcastb
833 
834  { TTI::SK_Reverse, MVT::v32i16, 1 }, // vpermw
835  { TTI::SK_Reverse, MVT::v16i16, 1 }, // vpermw
836  { TTI::SK_Reverse, MVT::v64i8, 2 }, // pshufb + vshufi64x2
837 
838  { TTI::SK_PermuteSingleSrc, MVT::v32i16, 1 }, // vpermw
839  { TTI::SK_PermuteSingleSrc, MVT::v16i16, 1 }, // vpermw
840  { TTI::SK_PermuteSingleSrc, MVT::v8i16, 1 }, // vpermw
841  { TTI::SK_PermuteSingleSrc, MVT::v64i8, 8 }, // extend to v32i16
842  { TTI::SK_PermuteSingleSrc, MVT::v32i8, 3 }, // vpermw + zext/trunc
843 
844  { TTI::SK_PermuteTwoSrc, MVT::v32i16, 1 }, // vpermt2w
845  { TTI::SK_PermuteTwoSrc, MVT::v16i16, 1 }, // vpermt2w
846  { TTI::SK_PermuteTwoSrc, MVT::v8i16, 1 }, // vpermt2w
847  { TTI::SK_PermuteTwoSrc, MVT::v32i8, 3 }, // zext + vpermt2w + trunc
848  { TTI::SK_PermuteTwoSrc, MVT::v64i8, 19 }, // 6 * v32i8 + 1
849  { TTI::SK_PermuteTwoSrc, MVT::v16i8, 3 } // zext + vpermt2w + trunc
850  };
851 
852  if (ST->hasBWI())
853  if (const auto *Entry =
854  CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
855  return LT.first * Entry->Cost;
856 
857  static const CostTblEntry AVX512ShuffleTbl[] = {
858  { TTI::SK_Broadcast, MVT::v8f64, 1 }, // vbroadcastpd
859  { TTI::SK_Broadcast, MVT::v16f32, 1 }, // vbroadcastps
860  { TTI::SK_Broadcast, MVT::v8i64, 1 }, // vpbroadcastq
861  { TTI::SK_Broadcast, MVT::v16i32, 1 }, // vpbroadcastd
862 
863  { TTI::SK_Reverse, MVT::v8f64, 1 }, // vpermpd
864  { TTI::SK_Reverse, MVT::v16f32, 1 }, // vpermps
865  { TTI::SK_Reverse, MVT::v8i64, 1 }, // vpermq
866  { TTI::SK_Reverse, MVT::v16i32, 1 }, // vpermd
867 
868  { TTI::SK_PermuteSingleSrc, MVT::v8f64, 1 }, // vpermpd
869  { TTI::SK_PermuteSingleSrc, MVT::v4f64, 1 }, // vpermpd
870  { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // vpermpd
871  { TTI::SK_PermuteSingleSrc, MVT::v16f32, 1 }, // vpermps
872  { TTI::SK_PermuteSingleSrc, MVT::v8f32, 1 }, // vpermps
873  { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // vpermps
874  { TTI::SK_PermuteSingleSrc, MVT::v8i64, 1 }, // vpermq
875  { TTI::SK_PermuteSingleSrc, MVT::v4i64, 1 }, // vpermq
876  { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // vpermq
877  { TTI::SK_PermuteSingleSrc, MVT::v16i32, 1 }, // vpermd
878  { TTI::SK_PermuteSingleSrc, MVT::v8i32, 1 }, // vpermd
879  { TTI::SK_PermuteSingleSrc, MVT::v4i32, 1 }, // vpermd
880  { TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 }, // pshufb
881 
882  { TTI::SK_PermuteTwoSrc, MVT::v8f64, 1 }, // vpermt2pd
883  { TTI::SK_PermuteTwoSrc, MVT::v16f32, 1 }, // vpermt2ps
884  { TTI::SK_PermuteTwoSrc, MVT::v8i64, 1 }, // vpermt2q
885  { TTI::SK_PermuteTwoSrc, MVT::v16i32, 1 }, // vpermt2d
886  { TTI::SK_PermuteTwoSrc, MVT::v4f64, 1 }, // vpermt2pd
887  { TTI::SK_PermuteTwoSrc, MVT::v8f32, 1 }, // vpermt2ps
888  { TTI::SK_PermuteTwoSrc, MVT::v4i64, 1 }, // vpermt2q
889  { TTI::SK_PermuteTwoSrc, MVT::v8i32, 1 }, // vpermt2d
890  { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // vpermt2pd
891  { TTI::SK_PermuteTwoSrc, MVT::v4f32, 1 }, // vpermt2ps
892  { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // vpermt2q
893  { TTI::SK_PermuteTwoSrc, MVT::v4i32, 1 } // vpermt2d
894  };
895 
896  if (ST->hasAVX512())
897  if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
898  return LT.first * Entry->Cost;
899 
900  static const CostTblEntry AVX2ShuffleTbl[] = {
901  { TTI::SK_Broadcast, MVT::v4f64, 1 }, // vbroadcastpd
902  { TTI::SK_Broadcast, MVT::v8f32, 1 }, // vbroadcastps
903  { TTI::SK_Broadcast, MVT::v4i64, 1 }, // vpbroadcastq
904  { TTI::SK_Broadcast, MVT::v8i32, 1 }, // vpbroadcastd
905  { TTI::SK_Broadcast, MVT::v16i16, 1 }, // vpbroadcastw
906  { TTI::SK_Broadcast, MVT::v32i8, 1 }, // vpbroadcastb
907 
908  { TTI::SK_Reverse, MVT::v4f64, 1 }, // vpermpd
909  { TTI::SK_Reverse, MVT::v8f32, 1 }, // vpermps
910  { TTI::SK_Reverse, MVT::v4i64, 1 }, // vpermq
911  { TTI::SK_Reverse, MVT::v8i32, 1 }, // vpermd
912  { TTI::SK_Reverse, MVT::v16i16, 2 }, // vperm2i128 + pshufb
913  { TTI::SK_Reverse, MVT::v32i8, 2 }, // vperm2i128 + pshufb
914 
915  { TTI::SK_Select, MVT::v16i16, 1 }, // vpblendvb
916  { TTI::SK_Select, MVT::v32i8, 1 }, // vpblendvb
917 
918  { TTI::SK_PermuteSingleSrc, MVT::v4f64, 1 }, // vpermpd
919  { TTI::SK_PermuteSingleSrc, MVT::v8f32, 1 }, // vpermps
920  { TTI::SK_PermuteSingleSrc, MVT::v4i64, 1 }, // vpermq
921  { TTI::SK_PermuteSingleSrc, MVT::v8i32, 1 }, // vpermd
922  { TTI::SK_PermuteSingleSrc, MVT::v16i16, 4 }, // vperm2i128 + 2*vpshufb
923  // + vpblendvb
924  { TTI::SK_PermuteSingleSrc, MVT::v32i8, 4 }, // vperm2i128 + 2*vpshufb
925  // + vpblendvb
926 
927  { TTI::SK_PermuteTwoSrc, MVT::v4f64, 3 }, // 2*vpermpd + vblendpd
928  { TTI::SK_PermuteTwoSrc, MVT::v8f32, 3 }, // 2*vpermps + vblendps
929  { TTI::SK_PermuteTwoSrc, MVT::v4i64, 3 }, // 2*vpermq + vpblendd
930  { TTI::SK_PermuteTwoSrc, MVT::v8i32, 3 }, // 2*vpermd + vpblendd
931  { TTI::SK_PermuteTwoSrc, MVT::v16i16, 7 }, // 2*vperm2i128 + 4*vpshufb
932  // + vpblendvb
933  { TTI::SK_PermuteTwoSrc, MVT::v32i8, 7 }, // 2*vperm2i128 + 4*vpshufb
934  // + vpblendvb
935  };
936 
937  if (ST->hasAVX2())
938  if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
939  return LT.first * Entry->Cost;
940 
941  static const CostTblEntry XOPShuffleTbl[] = {
942  { TTI::SK_PermuteSingleSrc, MVT::v4f64, 2 }, // vperm2f128 + vpermil2pd
943  { TTI::SK_PermuteSingleSrc, MVT::v8f32, 2 }, // vperm2f128 + vpermil2ps
944  { TTI::SK_PermuteSingleSrc, MVT::v4i64, 2 }, // vperm2f128 + vpermil2pd
945  { TTI::SK_PermuteSingleSrc, MVT::v8i32, 2 }, // vperm2f128 + vpermil2ps
946  { TTI::SK_PermuteSingleSrc, MVT::v16i16, 4 }, // vextractf128 + 2*vpperm
947  // + vinsertf128
948  { TTI::SK_PermuteSingleSrc, MVT::v32i8, 4 }, // vextractf128 + 2*vpperm
949  // + vinsertf128
950 
951  { TTI::SK_PermuteTwoSrc, MVT::v16i16, 9 }, // 2*vextractf128 + 6*vpperm
952  // + vinsertf128
953  { TTI::SK_PermuteTwoSrc, MVT::v8i16, 1 }, // vpperm
954  { TTI::SK_PermuteTwoSrc, MVT::v32i8, 9 }, // 2*vextractf128 + 6*vpperm
955  // + vinsertf128
956  { TTI::SK_PermuteTwoSrc, MVT::v16i8, 1 }, // vpperm
957  };
958 
959  if (ST->hasXOP())
960  if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
961  return LT.first * Entry->Cost;
962 
963  static const CostTblEntry AVX1ShuffleTbl[] = {
964  { TTI::SK_Broadcast, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd
965  { TTI::SK_Broadcast, MVT::v8f32, 2 }, // vperm2f128 + vpermilps
966  { TTI::SK_Broadcast, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd
967  { TTI::SK_Broadcast, MVT::v8i32, 2 }, // vperm2f128 + vpermilps
968  { TTI::SK_Broadcast, MVT::v16i16, 3 }, // vpshuflw + vpshufd + vinsertf128
969  { TTI::SK_Broadcast, MVT::v32i8, 2 }, // vpshufb + vinsertf128
970 
971  { TTI::SK_Reverse, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd
972  { TTI::SK_Reverse, MVT::v8f32, 2 }, // vperm2f128 + vpermilps
973  { TTI::SK_Reverse, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd
974  { TTI::SK_Reverse, MVT::v8i32, 2 }, // vperm2f128 + vpermilps
975  { TTI::SK_Reverse, MVT::v16i16, 4 }, // vextractf128 + 2*pshufb
976  // + vinsertf128
977  { TTI::SK_Reverse, MVT::v32i8, 4 }, // vextractf128 + 2*pshufb
978  // + vinsertf128
979 
980  { TTI::SK_Select, MVT::v4i64, 1 }, // vblendpd
981  { TTI::SK_Select, MVT::v4f64, 1 }, // vblendpd
982  { TTI::SK_Select, MVT::v8i32, 1 }, // vblendps
983  { TTI::SK_Select, MVT::v8f32, 1 }, // vblendps
984  { TTI::SK_Select, MVT::v16i16, 3 }, // vpand + vpandn + vpor
985  { TTI::SK_Select, MVT::v32i8, 3 }, // vpand + vpandn + vpor
986 
987  { TTI::SK_PermuteSingleSrc, MVT::v4f64, 2 }, // vperm2f128 + vshufpd
988  { TTI::SK_PermuteSingleSrc, MVT::v4i64, 2 }, // vperm2f128 + vshufpd
989  { TTI::SK_PermuteSingleSrc, MVT::v8f32, 4 }, // 2*vperm2f128 + 2*vshufps
990  { TTI::SK_PermuteSingleSrc, MVT::v8i32, 4 }, // 2*vperm2f128 + 2*vshufps
991  { TTI::SK_PermuteSingleSrc, MVT::v16i16, 8 }, // vextractf128 + 4*pshufb
992  // + 2*por + vinsertf128
993  { TTI::SK_PermuteSingleSrc, MVT::v32i8, 8 }, // vextractf128 + 4*pshufb
994  // + 2*por + vinsertf128
995 
996  { TTI::SK_PermuteTwoSrc, MVT::v4f64, 3 }, // 2*vperm2f128 + vshufpd
997  { TTI::SK_PermuteTwoSrc, MVT::v4i64, 3 }, // 2*vperm2f128 + vshufpd
998  { TTI::SK_PermuteTwoSrc, MVT::v8f32, 4 }, // 2*vperm2f128 + 2*vshufps
999  { TTI::SK_PermuteTwoSrc, MVT::v8i32, 4 }, // 2*vperm2f128 + 2*vshufps
1000  { TTI::SK_PermuteTwoSrc, MVT::v16i16, 15 }, // 2*vextractf128 + 8*pshufb
1001  // + 4*por + vinsertf128
1002  { TTI::SK_PermuteTwoSrc, MVT::v32i8, 15 }, // 2*vextractf128 + 8*pshufb
1003  // + 4*por + vinsertf128
1004  };
1005 
1006  if (ST->hasAVX())
1007  if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
1008  return LT.first * Entry->Cost;
1009 
1010  static const CostTblEntry SSE41ShuffleTbl[] = {
1011  { TTI::SK_Select, MVT::v2i64, 1 }, // pblendw
1012  { TTI::SK_Select, MVT::v2f64, 1 }, // movsd
1013  { TTI::SK_Select, MVT::v4i32, 1 }, // pblendw
1014  { TTI::SK_Select, MVT::v4f32, 1 }, // blendps
1015  { TTI::SK_Select, MVT::v8i16, 1 }, // pblendw
1016  { TTI::SK_Select, MVT::v16i8, 1 } // pblendvb
1017  };
1018 
1019  if (ST->hasSSE41())
1020  if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
1021  return LT.first * Entry->Cost;
1022 
1023  static const CostTblEntry SSSE3ShuffleTbl[] = {
1024  { TTI::SK_Broadcast, MVT::v8i16, 1 }, // pshufb
1025  { TTI::SK_Broadcast, MVT::v16i8, 1 }, // pshufb
1026 
1027  { TTI::SK_Reverse, MVT::v8i16, 1 }, // pshufb
1028  { TTI::SK_Reverse, MVT::v16i8, 1 }, // pshufb
1029 
1030  { TTI::SK_Select, MVT::v8i16, 3 }, // 2*pshufb + por
1031  { TTI::SK_Select, MVT::v16i8, 3 }, // 2*pshufb + por
1032 
1033  { TTI::SK_PermuteSingleSrc, MVT::v8i16, 1 }, // pshufb
1034  { TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 }, // pshufb
1035 
1036  { TTI::SK_PermuteTwoSrc, MVT::v8i16, 3 }, // 2*pshufb + por
1037  { TTI::SK_PermuteTwoSrc, MVT::v16i8, 3 }, // 2*pshufb + por
1038  };
1039 
1040  if (ST->hasSSSE3())
1041  if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
1042  return LT.first * Entry->Cost;
1043 
1044  static const CostTblEntry SSE2ShuffleTbl[] = {
1045  { TTI::SK_Broadcast, MVT::v2f64, 1 }, // shufpd
1046  { TTI::SK_Broadcast, MVT::v2i64, 1 }, // pshufd
1047  { TTI::SK_Broadcast, MVT::v4i32, 1 }, // pshufd
1048  { TTI::SK_Broadcast, MVT::v8i16, 2 }, // pshuflw + pshufd
1049  { TTI::SK_Broadcast, MVT::v16i8, 3 }, // unpck + pshuflw + pshufd
1050 
1051  { TTI::SK_Reverse, MVT::v2f64, 1 }, // shufpd
1052  { TTI::SK_Reverse, MVT::v2i64, 1 }, // pshufd
1053  { TTI::SK_Reverse, MVT::v4i32, 1 }, // pshufd
1054  { TTI::SK_Reverse, MVT::v8i16, 3 }, // pshuflw + pshufhw + pshufd
1055  { TTI::SK_Reverse, MVT::v16i8, 9 }, // 2*pshuflw + 2*pshufhw
1056  // + 2*pshufd + 2*unpck + packus
1057 
1058  { TTI::SK_Select, MVT::v2i64, 1 }, // movsd
1059  { TTI::SK_Select, MVT::v2f64, 1 }, // movsd
1060  { TTI::SK_Select, MVT::v4i32, 2 }, // 2*shufps
1061  { TTI::SK_Select, MVT::v8i16, 3 }, // pand + pandn + por
1062  { TTI::SK_Select, MVT::v16i8, 3 }, // pand + pandn + por
1063 
1064  { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // shufpd
1065  { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // pshufd
1066  { TTI::SK_PermuteSingleSrc, MVT::v4i32, 1 }, // pshufd
1067  { TTI::SK_PermuteSingleSrc, MVT::v8i16, 5 }, // 2*pshuflw + 2*pshufhw
1068  // + pshufd/unpck
1069  { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
1070  // + 2*pshufd + 2*unpck + 2*packus
1071 
1072  { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd
1073  { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd
1074  { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd}
1075  { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute
1076  { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
1077  };
1078 
1079  if (ST->hasSSE2())
1080  if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
1081  return LT.first * Entry->Cost;
1082 
1083  static const CostTblEntry SSE1ShuffleTbl[] = {
1084  { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
1085  { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
1086  { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps
1087  { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
1088  { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
1089  };
1090 
1091  if (ST->hasSSE1())
1092  if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
1093  return LT.first * Entry->Cost;
1094 
1095  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
1096 }
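// Illustrative usage (not part of the original source), assuming a
// TargetTransformInfo reference TTI and an LLVMContext Ctx are in scope:
//
//   Type *V8F32 = VectorType::get(Type::getFloatTy(Ctx), 8);
//   int C = TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, V8F32);
//   // An AVX2 subtarget hits AVX2ShuffleTbl and reports 1 (vbroadcastps);
//   // plain AVX reports 2 (vperm2f128 + vpermilps).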
1097 
1098 int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
1099  const Instruction *I) {
1100  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1101  assert(ISD && "Invalid opcode");
1102 
1103  // FIXME: Need a better design of the cost table to handle non-simple types of
1104  // potential massive combinations (elem_num x src_type x dst_type).
1105 
1106  static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
1113 
1120 
1127 
1134  };
1135 
1136  // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
1137  // 256-bit wide vectors.
1138 
1139  static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
1143 
1148 
1149  // v16i1 -> v16i32 - load + broadcast
1160 
1171 
1195 
1204  };
1205 
1206  static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
1223 
1230 
1233 
1235  };
1236 
1237  static const TypeConversionCostTblEntry AVXConversionTbl[] = {
1254 
1262 
1275 
1289  // The generic code to compute the scalar overhead is currently broken.
1290  // Workaround this limitation by estimating the scalarization overhead
1291  // here. We have roughly 10 instructions per scalar element.
1292  // Multiply that by the vector width.
1293  // FIXME: remove that when PR19268 is fixed.
1298 
1301  // This node is expanded into scalarized operations but BasicTTI is overly
1302  // optimistic estimating its cost. It computes 3 per element (one
1303  // vector-extract, one scalar conversion and one vector-insert). The
1304  // problem is that the inserts form a read-modify-write chain so latency
1305  // should be factored in too. Inflating the cost per element by 1.
1308 
1311  };
1312 
1313  static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
1320 
1339 
1347 
1348  };
1349 
1350  static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
1351  // These are somewhat magic numbers justified by looking at the output of
1352  // Intel's IACA, running some kernels and making sure when we take
1353  // legalization into account the throughput will be overestimated.
1355  { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
1362 
1363  { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
1371 
1373 
1398 
1408  };
1409 
1410  std::pair<int, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
1411  std::pair<int, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst);
1412 
1413  if (ST->hasSSE2() && !ST->hasAVX()) {
1414  if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
1415  LTDest.second, LTSrc.second))
1416  return LTSrc.first * Entry->Cost;
1417  }
1418 
1419  EVT SrcTy = TLI->getValueType(DL, Src);
1420  EVT DstTy = TLI->getValueType(DL, Dst);
1421 
1422  // The function getSimpleVT only handles simple value types.
1423  if (!SrcTy.isSimple() || !DstTy.isSimple())
1424  return BaseT::getCastInstrCost(Opcode, Dst, Src);
1425 
1426  if (ST->hasDQI())
1427  if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD,
1428  DstTy.getSimpleVT(),
1429  SrcTy.getSimpleVT()))
1430  return Entry->Cost;
1431 
1432  if (ST->hasAVX512())
1433  if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD,
1434  DstTy.getSimpleVT(),
1435  SrcTy.getSimpleVT()))
1436  return Entry->Cost;
1437 
1438  if (ST->hasAVX2()) {
1439  if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
1440  DstTy.getSimpleVT(),
1441  SrcTy.getSimpleVT()))
1442  return Entry->Cost;
1443  }
1444 
1445  if (ST->hasAVX()) {
1446  if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
1447  DstTy.getSimpleVT(),
1448  SrcTy.getSimpleVT()))
1449  return Entry->Cost;
1450  }
1451 
1452  if (ST->hasSSE41()) {
1453  if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
1454  DstTy.getSimpleVT(),
1455  SrcTy.getSimpleVT()))
1456  return Entry->Cost;
1457  }
1458 
1459  if (ST->hasSSE2()) {
1460  if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
1461  DstTy.getSimpleVT(),
1462  SrcTy.getSimpleVT()))
1463  return Entry->Cost;
1464  }
1465 
1466  return BaseT::getCastInstrCost(Opcode, Dst, Src, I);
1467 }
1468 
1469 int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
1470  const Instruction *I) {
1471  // Legalize the type.
1472  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
1473 
1474  MVT MTy = LT.second;
1475 
1476  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1477  assert(ISD && "Invalid opcode");
1478 
1479  static const CostTblEntry SSE2CostTbl[] = {
1480  { ISD::SETCC, MVT::v2i64, 8 },
1481  { ISD::SETCC, MVT::v4i32, 1 },
1482  { ISD::SETCC, MVT::v8i16, 1 },
1483  { ISD::SETCC, MVT::v16i8, 1 },
1484  };
1485 
1486  static const CostTblEntry SSE42CostTbl[] = {
1487  { ISD::SETCC, MVT::v2f64, 1 },
1488  { ISD::SETCC, MVT::v4f32, 1 },
1489  { ISD::SETCC, MVT::v2i64, 1 },
1490  };
1491 
1492  static const CostTblEntry AVX1CostTbl[] = {
1493  { ISD::SETCC, MVT::v4f64, 1 },
1494  { ISD::SETCC, MVT::v8f32, 1 },
1495  // AVX1 does not support 8-wide integer compare.
1496  { ISD::SETCC, MVT::v4i64, 4 },
1497  { ISD::SETCC, MVT::v8i32, 4 },
1498  { ISD::SETCC, MVT::v16i16, 4 },
1499  { ISD::SETCC, MVT::v32i8, 4 },
1500  };
1501 
1502  static const CostTblEntry AVX2CostTbl[] = {
1503  { ISD::SETCC, MVT::v4i64, 1 },
1504  { ISD::SETCC, MVT::v8i32, 1 },
1505  { ISD::SETCC, MVT::v16i16, 1 },
1506  { ISD::SETCC, MVT::v32i8, 1 },
1507  };
1508 
1509  static const CostTblEntry AVX512CostTbl[] = {
1510  { ISD::SETCC, MVT::v8i64, 1 },
1511  { ISD::SETCC, MVT::v16i32, 1 },
1512  { ISD::SETCC, MVT::v8f64, 1 },
1513  { ISD::SETCC, MVT::v16f32, 1 },
1514  };
1515 
1516  static const CostTblEntry AVX512BWCostTbl[] = {
1517  { ISD::SETCC, MVT::v32i16, 1 },
1518  { ISD::SETCC, MVT::v64i8, 1 },
1519  };
1520 
1521  if (ST->hasBWI())
1522  if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
1523  return LT.first * Entry->Cost;
1524 
1525  if (ST->hasAVX512())
1526  if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
1527  return LT.first * Entry->Cost;
1528 
1529  if (ST->hasAVX2())
1530  if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
1531  return LT.first * Entry->Cost;
1532 
1533  if (ST->hasAVX())
1534  if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
1535  return LT.first * Entry->Cost;
1536 
1537  if (ST->hasSSE42())
1538  if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
1539  return LT.first * Entry->Cost;
1540 
1541  if (ST->hasSSE2())
1542  if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
1543  return LT.first * Entry->Cost;
1544 
1545  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
1546 }
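// Illustrative note (not part of the original source): for a compare of
// <4 x float> the legalized type is v4f32, so an SSE4.2 subtarget hits the
// SETCC v4f32 entry in SSE42CostTbl and reports a cost of 1, while a
// pre-SSE4.2 subtarget falls through to the generic implementation.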
1547 
1549 
1550 int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
1551                                       ArrayRef<Type *> Tys, FastMathFlags FMF,
1552                                       unsigned ScalarizationCostPassed) {
1553  // Costs should match the codegen from:
1554  // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
1555  // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
1556  // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
1557  // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
1558  // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
1559  static const CostTblEntry AVX512CDCostTbl[] = {
1560  { ISD::CTLZ, MVT::v8i64, 1 },
1561  { ISD::CTLZ, MVT::v16i32, 1 },
1562  { ISD::CTLZ, MVT::v32i16, 8 },
1563  { ISD::CTLZ, MVT::v64i8, 20 },
1564  { ISD::CTLZ, MVT::v4i64, 1 },
1565  { ISD::CTLZ, MVT::v8i32, 1 },
1566  { ISD::CTLZ, MVT::v16i16, 4 },
1567  { ISD::CTLZ, MVT::v32i8, 10 },
1568  { ISD::CTLZ, MVT::v2i64, 1 },
1569  { ISD::CTLZ, MVT::v4i32, 1 },
1570  { ISD::CTLZ, MVT::v8i16, 4 },
1571  { ISD::CTLZ, MVT::v16i8, 4 },
1572  };
1573  static const CostTblEntry AVX512BWCostTbl[] = {
1574  { ISD::BITREVERSE, MVT::v8i64, 5 },
1575  { ISD::BITREVERSE, MVT::v16i32, 5 },
1576  { ISD::BITREVERSE, MVT::v32i16, 5 },
1577  { ISD::BITREVERSE, MVT::v64i8, 5 },
1578  { ISD::CTLZ, MVT::v8i64, 23 },
1579  { ISD::CTLZ, MVT::v16i32, 22 },
1580  { ISD::CTLZ, MVT::v32i16, 18 },
1581  { ISD::CTLZ, MVT::v64i8, 17 },
1582  { ISD::CTPOP, MVT::v8i64, 7 },
1583  { ISD::CTPOP, MVT::v16i32, 11 },
1584  { ISD::CTPOP, MVT::v32i16, 9 },
1585  { ISD::CTPOP, MVT::v64i8, 6 },
1586  { ISD::CTTZ, MVT::v8i64, 10 },
1587  { ISD::CTTZ, MVT::v16i32, 14 },
1588  { ISD::CTTZ, MVT::v32i16, 12 },
1589  { ISD::CTTZ, MVT::v64i8, 9 },
1590  };
1591  static const CostTblEntry AVX512CostTbl[] = {
1592  { ISD::BITREVERSE, MVT::v8i64, 36 },
1593  { ISD::BITREVERSE, MVT::v16i32, 24 },
1594  { ISD::CTLZ, MVT::v8i64, 29 },
1595  { ISD::CTLZ, MVT::v16i32, 35 },
1596  { ISD::CTPOP, MVT::v8i64, 16 },
1597  { ISD::CTPOP, MVT::v16i32, 24 },
1598  { ISD::CTTZ, MVT::v8i64, 20 },
1599  { ISD::CTTZ, MVT::v16i32, 28 },
1600  };
1601  static const CostTblEntry XOPCostTbl[] = {
1602  { ISD::BITREVERSE, MVT::v4i64, 4 },
1603  { ISD::BITREVERSE, MVT::v8i32, 4 },
1604  { ISD::BITREVERSE, MVT::v16i16, 4 },
1605  { ISD::BITREVERSE, MVT::v32i8, 4 },
1606  { ISD::BITREVERSE, MVT::v2i64, 1 },
1607  { ISD::BITREVERSE, MVT::v4i32, 1 },
1608  { ISD::BITREVERSE, MVT::v8i16, 1 },
1609  { ISD::BITREVERSE, MVT::v16i8, 1 },
1610  { ISD::BITREVERSE, MVT::i64, 3 },
1611  { ISD::BITREVERSE, MVT::i32, 3 },
1612  { ISD::BITREVERSE, MVT::i16, 3 },
1613  { ISD::BITREVERSE, MVT::i8, 3 }
1614  };
1615  static const CostTblEntry AVX2CostTbl[] = {
1616  { ISD::BITREVERSE, MVT::v4i64, 5 },
1617  { ISD::BITREVERSE, MVT::v8i32, 5 },
1618  { ISD::BITREVERSE, MVT::v16i16, 5 },
1619  { ISD::BITREVERSE, MVT::v32i8, 5 },
1620  { ISD::BSWAP, MVT::v4i64, 1 },
1621  { ISD::BSWAP, MVT::v8i32, 1 },
1622  { ISD::BSWAP, MVT::v16i16, 1 },
1623  { ISD::CTLZ, MVT::v4i64, 23 },
1624  { ISD::CTLZ, MVT::v8i32, 18 },
1625  { ISD::CTLZ, MVT::v16i16, 14 },
1626  { ISD::CTLZ, MVT::v32i8, 9 },
1627  { ISD::CTPOP, MVT::v4i64, 7 },
1628  { ISD::CTPOP, MVT::v8i32, 11 },
1629  { ISD::CTPOP, MVT::v16i16, 9 },
1630  { ISD::CTPOP, MVT::v32i8, 6 },
1631  { ISD::CTTZ, MVT::v4i64, 10 },
1632  { ISD::CTTZ, MVT::v8i32, 14 },
1633  { ISD::CTTZ, MVT::v16i16, 12 },
1634  { ISD::CTTZ, MVT::v32i8, 9 },
1635  { ISD::FSQRT, MVT::f32, 7 }, // Haswell from http://www.agner.org/
1636  { ISD::FSQRT, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
1637  { ISD::FSQRT, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
1638  { ISD::FSQRT, MVT::f64, 14 }, // Haswell from http://www.agner.org/
1639  { ISD::FSQRT, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
1640  { ISD::FSQRT, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
1641  };
1642  static const CostTblEntry AVX1CostTbl[] = {
1643  { ISD::BITREVERSE, MVT::v4i64, 12 }, // 2 x 128-bit Op + extract/insert
1644  { ISD::BITREVERSE, MVT::v8i32, 12 }, // 2 x 128-bit Op + extract/insert
1645  { ISD::BITREVERSE, MVT::v16i16, 12 }, // 2 x 128-bit Op + extract/insert
1646  { ISD::BITREVERSE, MVT::v32i8, 12 }, // 2 x 128-bit Op + extract/insert
1647  { ISD::BSWAP, MVT::v4i64, 4 },
1648  { ISD::BSWAP, MVT::v8i32, 4 },
1649  { ISD::BSWAP, MVT::v16i16, 4 },
1650  { ISD::CTLZ, MVT::v4i64, 48 }, // 2 x 128-bit Op + extract/insert
1651  { ISD::CTLZ, MVT::v8i32, 38 }, // 2 x 128-bit Op + extract/insert
1652  { ISD::CTLZ, MVT::v16i16, 30 }, // 2 x 128-bit Op + extract/insert
1653  { ISD::CTLZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
1654  { ISD::CTPOP, MVT::v4i64, 16 }, // 2 x 128-bit Op + extract/insert
1655  { ISD::CTPOP, MVT::v8i32, 24 }, // 2 x 128-bit Op + extract/insert
1656  { ISD::CTPOP, MVT::v16i16, 20 }, // 2 x 128-bit Op + extract/insert
1657  { ISD::CTPOP, MVT::v32i8, 14 }, // 2 x 128-bit Op + extract/insert
1658  { ISD::CTTZ, MVT::v4i64, 22 }, // 2 x 128-bit Op + extract/insert
1659  { ISD::CTTZ, MVT::v8i32, 30 }, // 2 x 128-bit Op + extract/insert
1660  { ISD::CTTZ, MVT::v16i16, 26 }, // 2 x 128-bit Op + extract/insert
1661  { ISD::CTTZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
1662  { ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/
1663  { ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
1664  { ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
1665  { ISD::FSQRT, MVT::f64, 21 }, // SNB from http://www.agner.org/
1666  { ISD::FSQRT, MVT::v2f64, 21 }, // SNB from http://www.agner.org/
1667  { ISD::FSQRT, MVT::v4f64, 43 }, // SNB from http://www.agner.org/
1668  };
1669  static const CostTblEntry GLMCostTbl[] = {
1670  { ISD::FSQRT, MVT::f32, 19 }, // sqrtss
1671  { ISD::FSQRT, MVT::v4f32, 37 }, // sqrtps
1672  { ISD::FSQRT, MVT::f64, 34 }, // sqrtsd
1673  { ISD::FSQRT, MVT::v2f64, 67 }, // sqrtpd
1674  };
1675  static const CostTblEntry SLMCostTbl[] = {
1676  { ISD::FSQRT, MVT::f32, 20 }, // sqrtss
1677  { ISD::FSQRT, MVT::v4f32, 40 }, // sqrtps
1678  { ISD::FSQRT, MVT::f64, 35 }, // sqrtsd
1679  { ISD::FSQRT, MVT::v2f64, 70 }, // sqrtpd
1680  };
1681  static const CostTblEntry SSE42CostTbl[] = {
1682  { ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/
1683  { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
1684  };
1685  static const CostTblEntry SSSE3CostTbl[] = {
1686  { ISD::BITREVERSE, MVT::v2i64, 5 },
1687  { ISD::BITREVERSE, MVT::v4i32, 5 },
1688  { ISD::BITREVERSE, MVT::v8i16, 5 },
1689  { ISD::BITREVERSE, MVT::v16i8, 5 },
1690  { ISD::BSWAP, MVT::v2i64, 1 },
1691  { ISD::BSWAP, MVT::v4i32, 1 },
1692  { ISD::BSWAP, MVT::v8i16, 1 },
1693  { ISD::CTLZ, MVT::v2i64, 23 },
1694  { ISD::CTLZ, MVT::v4i32, 18 },
1695  { ISD::CTLZ, MVT::v8i16, 14 },
1696  { ISD::CTLZ, MVT::v16i8, 9 },
1697  { ISD::CTPOP, MVT::v2i64, 7 },
1698  { ISD::CTPOP, MVT::v4i32, 11 },
1699  { ISD::CTPOP, MVT::v8i16, 9 },
1700  { ISD::CTPOP, MVT::v16i8, 6 },
1701  { ISD::CTTZ, MVT::v2i64, 10 },
1702  { ISD::CTTZ, MVT::v4i32, 14 },
1703  { ISD::CTTZ, MVT::v8i16, 12 },
1704  { ISD::CTTZ, MVT::v16i8, 9 }
1705  };
1706  static const CostTblEntry SSE2CostTbl[] = {
1707  { ISD::BITREVERSE, MVT::v2i64, 29 },
1708  { ISD::BITREVERSE, MVT::v4i32, 27 },
1709  { ISD::BITREVERSE, MVT::v8i16, 27 },
1710  { ISD::BITREVERSE, MVT::v16i8, 20 },
1711  { ISD::BSWAP, MVT::v2i64, 7 },
1712  { ISD::BSWAP, MVT::v4i32, 7 },
1713  { ISD::BSWAP, MVT::v8i16, 7 },
1714  { ISD::CTLZ, MVT::v2i64, 25 },
1715  { ISD::CTLZ, MVT::v4i32, 26 },
1716  { ISD::CTLZ, MVT::v8i16, 20 },
1717  { ISD::CTLZ, MVT::v16i8, 17 },
1718  { ISD::CTPOP, MVT::v2i64, 12 },
1719  { ISD::CTPOP, MVT::v4i32, 15 },
1720  { ISD::CTPOP, MVT::v8i16, 13 },
1721  { ISD::CTPOP, MVT::v16i8, 10 },
1722  { ISD::CTTZ, MVT::v2i64, 14 },
1723  { ISD::CTTZ, MVT::v4i32, 18 },
1724  { ISD::CTTZ, MVT::v8i16, 16 },
1725  { ISD::CTTZ, MVT::v16i8, 13 },
1726  { ISD::FSQRT, MVT::f64, 32 }, // Nehalem from http://www.agner.org/
1727  { ISD::FSQRT, MVT::v2f64, 32 }, // Nehalem from http://www.agner.org/
1728  };
1729  static const CostTblEntry SSE1CostTbl[] = {
1730  { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/
1731  { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
1732  };
1733  static const CostTblEntry X64CostTbl[] = { // 64-bit targets
1734  { ISD::BITREVERSE, MVT::i64, 14 }
1735  };
1736  static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
1737  { ISD::BITREVERSE, MVT::i32, 14 },
1738  { ISD::BITREVERSE, MVT::i16, 14 },
1739  { ISD::BITREVERSE, MVT::i8, 11 }
1740  };
1741 
1742  unsigned ISD = ISD::DELETED_NODE;
1743  switch (IID) {
1744  default:
1745  break;
1746  case Intrinsic::bitreverse:
1747  ISD = ISD::BITREVERSE;
1748  break;
1749  case Intrinsic::bswap:
1750  ISD = ISD::BSWAP;
1751  break;
1752  case Intrinsic::ctlz:
1753  ISD = ISD::CTLZ;
1754  break;
1755  case Intrinsic::ctpop:
1756  ISD = ISD::CTPOP;
1757  break;
1758  case Intrinsic::cttz:
1759  ISD = ISD::CTTZ;
1760  break;
1761  case Intrinsic::sqrt:
1762  ISD = ISD::FSQRT;
1763  break;
1764  }
1765 
1766  // Legalize the type.
1767  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
1768  MVT MTy = LT.second;
1769 
1770  // Attempt to lookup cost.
1771  if (ST->isGLM())
1772  if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
1773  return LT.first * Entry->Cost;
1774 
1775  if (ST->isSLM())
1776  if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
1777  return LT.first * Entry->Cost;
1778 
1779  if (ST->hasCDI())
1780  if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
1781  return LT.first * Entry->Cost;
1782 
1783  if (ST->hasBWI())
1784  if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
1785  return LT.first * Entry->Cost;
1786 
1787  if (ST->hasAVX512())
1788  if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
1789  return LT.first * Entry->Cost;
1790 
1791  if (ST->hasXOP())
1792  if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
1793  return LT.first * Entry->Cost;
1794 
1795  if (ST->hasAVX2())
1796  if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
1797  return LT.first * Entry->Cost;
1798 
1799  if (ST->hasAVX())
1800  if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
1801  return LT.first * Entry->Cost;
1802 
1803  if (ST->hasSSE42())
1804  if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
1805  return LT.first * Entry->Cost;
1806 
1807  if (ST->hasSSSE3())
1808  if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
1809  return LT.first * Entry->Cost;
1810 
1811  if (ST->hasSSE2())
1812  if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
1813  return LT.first * Entry->Cost;
1814 
1815  if (ST->hasSSE1())
1816  if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
1817  return LT.first * Entry->Cost;
1818 
1819  if (ST->is64Bit())
1820  if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
1821  return LT.first * Entry->Cost;
1822 
1823  if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
1824  return LT.first * Entry->Cost;
1825 
1826  return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF, ScalarizationCostPassed);
1827 }
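// Worked example (illustrative): for a call such as
//   %r = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
// IID == Intrinsic::sqrt and RetTy == <4 x float>, which legalizes to
// MVT::v4f32 with LT.first == 1. On an SSE4.2-only subtarget the lookup hits
// SSE42CostTbl and returns 1 * 18 = 18; on a Haswell-class AVX2 subtarget it
// hits AVX2CostTbl first and returns 7.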
1828 
1829 int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
1830  ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) {
1831  return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF, VF);
1832 }
1833 
1834 int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
1835  assert(Val->isVectorTy() && "This must be a vector type");
1836 
1837  Type *ScalarType = Val->getScalarType();
1838 
1839  if (Index != -1U) {
1840  // Legalize the type.
1841  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
1842 
1843  // This type is legalized to a scalar type.
1844  if (!LT.second.isVector())
1845  return 0;
1846 
1847  // The type may be split. Normalize the index to the new type.
1848  unsigned Width = LT.second.getVectorNumElements();
1849  Index = Index % Width;
1850 
1851  // Floating point scalars are already located in index #0.
1852  if (ScalarType->isFloatingPointTy() && Index == 0)
1853  return 0;
1854  }
1855 
1856  // Add to the base cost if we know that the extracted element of a vector is
1857  // destined to be moved to and used in the integer register file.
1858  int RegisterFileMoveCost = 0;
1859  if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy())
1860  RegisterFileMoveCost = 1;
1861 
1862  return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
1863 }
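// Worked example (illustrative): extracting element 4 from <8 x float> on an
// SSE-only subtarget legalizes the vector into two v4f32 registers, so
// Width == 4 and the index is normalized to 4 % 4 == 0; a floating point
// scalar at index 0 is free, so the function returns 0. Extracting a pointer
// element instead adds RegisterFileMoveCost == 1 on top of the base cost.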
1864 
1865 int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
1866  unsigned AddressSpace, const Instruction *I) {
1867  // Handle non-power-of-two vectors such as <3 x float>
1868  if (VectorType *VTy = dyn_cast<VectorType>(Src)) {
1869  unsigned NumElem = VTy->getVectorNumElements();
1870 
1871  // Handle a few common cases:
1872  // <3 x float>
1873  if (NumElem == 3 && VTy->getScalarSizeInBits() == 32)
1874  // Cost = 64 bit store + extract + 32 bit store.
1875  return 3;
1876 
1877  // <3 x double>
1878  if (NumElem == 3 && VTy->getScalarSizeInBits() == 64)
1879  // Cost = 128 bit store + unpack + 64 bit store.
1880  return 3;
1881 
1882  // Assume that all other non-power-of-two numbers are scalarized.
1883  if (!isPowerOf2_32(NumElem)) {
1884  int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment,
1885  AddressSpace);
1886  int SplitCost = getScalarizationOverhead(Src, Opcode == Instruction::Load,
1887  Opcode == Instruction::Store);
1888  return NumElem * Cost + SplitCost;
1889  }
1890  }
1891 
1892  // Legalize the type.
1893  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
1894  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
1895  "Invalid Opcode");
1896 
1897  // Each load/store unit costs 1.
1898  int Cost = LT.first * 1;
1899 
1900  // This isn't exactly right. We're using slow unaligned 32-byte accesses as a
1901  // proxy for a double-pumped AVX memory interface such as on Sandybridge.
1902  if (LT.second.getStoreSize() == 32 && ST->isUnalignedMem32Slow())
1903  Cost *= 2;
1904 
1905  return Cost;
1906 }
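// Worked examples (illustrative): a store of <3 x float> matches the special
// case above and returns 3 (64-bit store + extract + 32-bit store). A load of
// <8 x float> on an AVX subtarget legalizes to one 32-byte access
// (LT.first == 1), which costs 1 when unaligned 32-byte memory is fast and 2
// when isUnalignedMem32Slow() is true.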
1907 
1908 int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
1909  unsigned Alignment,
1910  unsigned AddressSpace) {
1911  VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy);
1912  if (!SrcVTy)
1913  // For a scalar, take the regular cost without the mask.
1914  return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace);
1915 
1916  unsigned NumElem = SrcVTy->getVectorNumElements();
1917  VectorType *MaskTy =
1918  VectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
1919  if ((Opcode == Instruction::Load && !isLegalMaskedLoad(SrcVTy)) ||
1920  (Opcode == Instruction::Store && !isLegalMaskedStore(SrcVTy)) ||
1921  !isPowerOf2_32(NumElem)) {
1922  // Scalarization
1923  int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true);
1924  int ScalarCompareCost = getCmpSelInstrCost(
1925  Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr);
1926  int BranchCost = getCFInstrCost(Instruction::Br);
1927  int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
1928 
1929  int ValueSplitCost = getScalarizationOverhead(
1930  SrcVTy, Opcode == Instruction::Load, Opcode == Instruction::Store);
1931  int MemopCost =
1932  NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
1933  Alignment, AddressSpace);
1934  return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
1935  }
1936 
1937  // Legalize the type.
1938  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy);
1939  auto VT = TLI->getValueType(DL, SrcVTy);
1940  int Cost = 0;
1941  if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
1942  LT.second.getVectorNumElements() == NumElem)
1943  // Promotion requires expand/truncate for data and a shuffle for mask.
1944  Cost += getShuffleCost(TTI::SK_Select, SrcVTy, 0, nullptr) +
1945  getShuffleCost(TTI::SK_Select, MaskTy, 0, nullptr);
1946 
1947  else if (LT.second.getVectorNumElements() > NumElem) {
1948  VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(),
1949  LT.second.getVectorNumElements());
1950  // Expanding requires filling the mask with zeroes.
1951  Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy);
1952  }
1953  if (!ST->hasAVX512())
1954  return Cost + LT.first*4; // Each maskmov costs 4
1955 
1956  // AVX-512 masked load/store is cheaper.
1957  return Cost+LT.first;
1958 }
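// Worked example (illustrative): a legal masked load of <8 x float> on an
// AVX2 subtarget legalizes to MVT::v8f32 with LT.first == 1 and needs no
// promotion or expansion, so the function returns 1 * 4 == 4 (one maskmov).
// On an AVX-512 subtarget the same query returns LT.first == 1, reflecting
// the cheaper native masked load.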
1959 
1960 int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
1961  const SCEV *Ptr) {
1962  // Address computations in vectorized code with non-consecutive addresses will
1963  // likely result in more instructions compared to scalar code where the
1964  // computation can more often be merged into the index mode. The resulting
1965  // extra micro-ops can significantly decrease throughput.
1966  unsigned NumVectorInstToHideOverhead = 10;
1967 
1968  // Cost modeling of Strided Access Computation is hidden by the indexing
1969  // modes of X86 regardless of the stride value. We don't believe that there
1970  // is a difference between constant strided access in general and a constant
1971  // stride whose value is less than or equal to 64.
1972  // Even in the case of (loop invariant) stride whose value is not known at
1973  // compile time, the address computation will not incur more than one extra
1974  // ADD instruction.
1975  if (Ty->isVectorTy() && SE) {
1976  if (!BaseT::isStridedAccess(Ptr))
1977  return NumVectorInstToHideOverhead;
1978  if (!BaseT::getConstantStrideStep(SE, Ptr))
1979  return 1;
1980  }
1981 
1982  return BaseT::getAddressComputationCost(Ty, SE, Ptr);
1983 }
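// Worked example (illustrative): for a vectorized access whose pointer SCEV
// is not strided at all, the function returns NumVectorInstToHideOverhead
// (10); for a strided access whose (loop invariant) stride is not a
// compile-time constant it returns 1, since at most one extra ADD is needed;
// a constant-stride access falls through to the base implementation.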
1984 
1985 int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
1986  bool IsPairwise) {
1987 
1988  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
1989 
1990  MVT MTy = LT.second;
1991 
1992  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1993  assert(ISD && "Invalid opcode");
1994 
1995  // We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
1996  // and use it as the cost.
1997 
1998  static const CostTblEntry SSE42CostTblPairWise[] = {
1999  { ISD::FADD, MVT::v2f64, 2 },
2000  { ISD::FADD, MVT::v4f32, 4 },
2001  { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
2002  { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5".
2003  { ISD::ADD, MVT::v8i16, 5 },
2004  };
2005 
2006  static const CostTblEntry AVX1CostTblPairWise[] = {
2007  { ISD::FADD, MVT::v4f32, 4 },
2008  { ISD::FADD, MVT::v4f64, 5 },
2009  { ISD::FADD, MVT::v8f32, 7 },
2010  { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
2011  { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5".
2012  { ISD::ADD, MVT::v4i64, 5 }, // The data reported by the IACA tool is "4.8".
2013  { ISD::ADD, MVT::v8i16, 5 },
2014  { ISD::ADD, MVT::v8i32, 5 },
2015  };
2016 
2017  static const CostTblEntry SSE42CostTblNoPairWise[] = {
2018  { ISD::FADD, MVT::v2f64, 2 },
2019  { ISD::FADD, MVT::v4f32, 4 },
2020  { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
2021  { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
2022  { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
2023  };
2024 
2025  static const CostTblEntry AVX1CostTblNoPairWise[] = {
2026  { ISD::FADD, MVT::v4f32, 3 },
2027  { ISD::FADD, MVT::v4f64, 3 },
2028  { ISD::FADD, MVT::v8f32, 4 },
2029  { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
2030  { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "2.8".
2031  { ISD::ADD, MVT::v4i64, 3 },
2032  { ISD::ADD, MVT::v8i16, 4 },
2033  { ISD::ADD, MVT::v8i32, 5 },
2034  };
2035 
2036  if (IsPairwise) {
2037  if (ST->hasAVX())
2038  if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
2039  return LT.first * Entry->Cost;
2040 
2041  if (ST->hasSSE42())
2042  if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
2043  return LT.first * Entry->Cost;
2044  } else {
2045  if (ST->hasAVX())
2046  if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
2047  return LT.first * Entry->Cost;
2048 
2049  if (ST->hasSSE42())
2050  if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
2051  return LT.first * Entry->Cost;
2052  }
2053 
2054  return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise);
2055 }
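// Worked example (illustrative): a non-pairwise fadd reduction of
// <8 x float> on an AVX1 subtarget legalizes to MVT::v8f32 with
// LT.first == 1 and hits AVX1CostTblNoPairWise, giving 1 * 4 == 4; the
// pairwise form of the same reduction costs 7 via AVX1CostTblPairWise.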
2056 
2057 int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy,
2058  bool IsPairwise, bool IsUnsigned) {
2059  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
2060 
2061  MVT MTy = LT.second;
2062 
2063  int ISD;
2064  if (ValTy->isIntOrIntVectorTy()) {
2065  ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN;
2066  } else {
2067  assert(ValTy->isFPOrFPVectorTy() &&
2068  "Expected floating point or integer vector type.");
2069  ISD = ISD::FMINNUM;
2070  }
2071 
2072  // We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
2073  // and use it as the cost.
2074 
2075  static const CostTblEntry SSE42CostTblPairWise[] = {
2076  {ISD::FMINNUM, MVT::v2f64, 3},
2077  {ISD::FMINNUM, MVT::v4f32, 2},
2078  {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8"
2079  {ISD::UMIN, MVT::v2i64, 8}, // The data reported by the IACA is "8.6"
2080  {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5"
2081  {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8"
2082  {ISD::SMIN, MVT::v8i16, 2},
2083  {ISD::UMIN, MVT::v8i16, 2},
2084  };
2085 
2086  static const CostTblEntry AVX1CostTblPairWise[] = {
2087  {ISD::FMINNUM, MVT::v4f32, 1},
2088  {ISD::FMINNUM, MVT::v4f64, 1},
2089  {ISD::FMINNUM, MVT::v8f32, 2},
2090  {ISD::SMIN, MVT::v2i64, 3},
2091  {ISD::UMIN, MVT::v2i64, 3},
2092  {ISD::SMIN, MVT::v4i32, 1},
2093  {ISD::UMIN, MVT::v4i32, 1},
2094  {ISD::SMIN, MVT::v8i16, 1},
2095  {ISD::UMIN, MVT::v8i16, 1},
2096  {ISD::SMIN, MVT::v8i32, 3},
2097  {ISD::UMIN, MVT::v8i32, 3},
2098  };
2099 
2100  static const CostTblEntry AVX2CostTblPairWise[] = {
2101  {ISD::SMIN, MVT::v4i64, 2},
2102  {ISD::UMIN, MVT::v4i64, 2},
2103  {ISD::SMIN, MVT::v8i32, 1},
2104  {ISD::UMIN, MVT::v8i32, 1},
2105  {ISD::SMIN, MVT::v16i16, 1},
2106  {ISD::UMIN, MVT::v16i16, 1},
2107  {ISD::SMIN, MVT::v32i8, 2},
2108  {ISD::UMIN, MVT::v32i8, 2},
2109  };
2110 
2111  static const CostTblEntry AVX512CostTblPairWise[] = {
2112  {ISD::FMINNUM, MVT::v8f64, 1},
2113  {ISD::FMINNUM, MVT::v16f32, 2},
2114  {ISD::SMIN, MVT::v8i64, 2},
2115  {ISD::UMIN, MVT::v8i64, 2},
2116  {ISD::SMIN, MVT::v16i32, 1},
2117  {ISD::UMIN, MVT::v16i32, 1},
2118  };
2119 
2120  static const CostTblEntry SSE42CostTblNoPairWise[] = {
2121  {ISD::FMINNUM, MVT::v2f64, 3},
2122  {ISD::FMINNUM, MVT::v4f32, 3},
2123  {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8"
2124  {ISD::UMIN, MVT::v2i64, 9}, // The data reported by the IACA is "8.6"
2125  {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5"
2126  {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8"
2127  {ISD::SMIN, MVT::v8i16, 1}, // The data reported by the IACA is "1.5"
2128  {ISD::UMIN, MVT::v8i16, 2}, // The data reported by the IACA is "1.8"
2129  };
2130 
2131  static const CostTblEntry AVX1CostTblNoPairWise[] = {
2132  {ISD::FMINNUM, MVT::v4f32, 1},
2133  {ISD::FMINNUM, MVT::v4f64, 1},
2134  {ISD::FMINNUM, MVT::v8f32, 1},
2135  {ISD::SMIN, MVT::v2i64, 3},
2136  {ISD::UMIN, MVT::v2i64, 3},
2137  {ISD::SMIN, MVT::v4i32, 1},
2138  {ISD::UMIN, MVT::v4i32, 1},
2139  {ISD::SMIN, MVT::v8i16, 1},
2140  {ISD::UMIN, MVT::v8i16, 1},
2141  {ISD::SMIN, MVT::v8i32, 2},
2142  {ISD::UMIN, MVT::v8i32, 2},
2143  };
2144 
2145  static const CostTblEntry AVX2CostTblNoPairWise[] = {
2146  {ISD::SMIN, MVT::v4i64, 1},
2147  {ISD::UMIN, MVT::v4i64, 1},
2148  {ISD::SMIN, MVT::v8i32, 1},
2149  {ISD::UMIN, MVT::v8i32, 1},
2150  {ISD::SMIN, MVT::v16i16, 1},
2151  {ISD::UMIN, MVT::v16i16, 1},
2152  {ISD::SMIN, MVT::v32i8, 1},
2153  {ISD::UMIN, MVT::v32i8, 1},
2154  };
2155 
2156  static const CostTblEntry AVX512CostTblNoPairWise[] = {
2157  {ISD::FMINNUM, MVT::v8f64, 1},
2158  {ISD::FMINNUM, MVT::v16f32, 2},
2159  {ISD::SMIN, MVT::v8i64, 1},
2160  {ISD::UMIN, MVT::v8i64, 1},
2161  {ISD::SMIN, MVT::v16i32, 1},
2162  {ISD::UMIN, MVT::v16i32, 1},
2163  };
2164 
2165  if (IsPairwise) {
2166  if (ST->hasAVX512())
2167  if (const auto *Entry = CostTableLookup(AVX512CostTblPairWise, ISD, MTy))
2168  return LT.first * Entry->Cost;
2169 
2170  if (ST->hasAVX2())
2171  if (const auto *Entry = CostTableLookup(AVX2CostTblPairWise, ISD, MTy))
2172  return LT.first * Entry->Cost;
2173 
2174  if (ST->hasAVX())
2175  if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
2176  return LT.first * Entry->Cost;
2177 
2178  if (ST->hasSSE42())
2179  if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
2180  return LT.first * Entry->Cost;
2181  } else {
2182  if (ST->hasAVX512())
2183  if (const auto *Entry =
2184  CostTableLookup(AVX512CostTblNoPairWise, ISD, MTy))
2185  return LT.first * Entry->Cost;
2186 
2187  if (ST->hasAVX2())
2188  if (const auto *Entry = CostTableLookup(AVX2CostTblNoPairWise, ISD, MTy))
2189  return LT.first * Entry->Cost;
2190 
2191  if (ST->hasAVX())
2192  if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
2193  return LT.first * Entry->Cost;
2194 
2195  if (ST->hasSSE42())
2196  if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
2197  return LT.first * Entry->Cost;
2198  }
2199 
2200  return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned);
2201 }
2202 
2203 /// Calculate the cost of materializing a 64-bit value. This helper
2204 /// method might only calculate a fraction of a larger immediate. Therefore it
2205 /// is valid to return a cost of ZERO.
2206 int X86TTIImpl::getIntImmCost(int64_t Val) {
2207  if (Val == 0)
2208  return TTI::TCC_Free;
2209 
2210  if (isInt<32>(Val))
2211  return TTI::TCC_Basic;
2212 
2213  return 2 * TTI::TCC_Basic;
2214 }
2215 
2216 int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
2217  assert(Ty->isIntegerTy());
2218 
2219  unsigned BitSize = Ty->getPrimitiveSizeInBits();
2220  if (BitSize == 0)
2221  return ~0U;
2222 
2223  // Never hoist constants larger than 128 bits, because this might lead to
2224  // incorrect code generation or assertions in codegen.
2225  // FIXME: Create a cost model for types larger than i128 once the codegen
2226  // issues have been fixed.
2227  if (BitSize > 128)
2228  return TTI::TCC_Free;
2229 
2230  if (Imm == 0)
2231  return TTI::TCC_Free;
2232 
2233  // Sign-extend all constants to a multiple of 64-bit.
2234  APInt ImmVal = Imm;
2235  if (BitSize & 0x3f)
2236  ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
2237 
2238  // Split the constant into 64-bit chunks and calculate the cost for each
2239  // chunk.
2240  int Cost = 0;
2241  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
2242  APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
2243  int64_t Val = Tmp.getSExtValue();
2244  Cost += getIntImmCost(Val);
2245  }
2246  // We need at least one instruction to materialize the constant.
2247  return std::max(1, Cost);
2248 }
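// Worked example (illustrative): an i128 immediate with the value
// (1 << 80) + 7 is split into two 64-bit chunks, 7 and 65536; both fit in a
// signed 32-bit immediate, so each chunk costs TCC_Basic and the total is 2.
// An i64 immediate that does not fit in 32 bits costs 2 * TCC_Basic, since it
// needs a full 64-bit materialization such as movabsq.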
2249 
2250 int X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
2251  Type *Ty) {
2252  assert(Ty->isIntegerTy());
2253 
2254  unsigned BitSize = Ty->getPrimitiveSizeInBits();
2255  // There is no cost model for constants with a bit size of 0. Return TCC_Free
2256  // here, so that constant hoisting will ignore this constant.
2257  if (BitSize == 0)
2258  return TTI::TCC_Free;
2259 
2260  unsigned ImmIdx = ~0U;
2261  switch (Opcode) {
2262  default:
2263  return TTI::TCC_Free;
2264  case Instruction::GetElementPtr:
2265  // Always hoist the base address of a GetElementPtr. This prevents the
2266  // creation of new constants for every base constant that gets constant
2267  // folded with the offset.
2268  if (Idx == 0)
2269  return 2 * TTI::TCC_Basic;
2270  return TTI::TCC_Free;
2271  case Instruction::Store:
2272  ImmIdx = 0;
2273  break;
2274  case Instruction::ICmp:
2275  // This is an imperfect hack to prevent constant hoisting of
2276  // compares that might be trying to check if a 64-bit value fits in
2277  // 32 bits. The backend can optimize these cases using a right shift by 32.
2278  // Ideally we would check the compare predicate here. There are also other
2279  // similar immediates the backend can use shifts for.
2280  if (Idx == 1 && Imm.getBitWidth() == 64) {
2281  uint64_t ImmVal = Imm.getZExtValue();
2282  if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
2283  return TTI::TCC_Free;
2284  }
2285  ImmIdx = 1;
2286  break;
2287  case Instruction::And:
2288  // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
2289  // by using a 32-bit operation with implicit zero extension. Detect such
2290  // immediates here as the normal path expects bit 31 to be sign extended.
2291  if (Idx == 1 && Imm.getBitWidth() == 64 && isUInt<32>(Imm.getZExtValue()))
2292  return TTI::TCC_Free;
2293  LLVM_FALLTHROUGH;
2294  case Instruction::Add:
2295  case Instruction::Sub:
2296  case Instruction::Mul:
2297  case Instruction::UDiv:
2298  case Instruction::SDiv:
2299  case Instruction::URem:
2300  case Instruction::SRem:
2301  case Instruction::Or:
2302  case Instruction::Xor:
2303  ImmIdx = 1;
2304  break;
2305  // Always return TCC_Free for the shift value of a shift instruction.
2306  case Instruction::Shl:
2307  case Instruction::LShr:
2308  case Instruction::AShr:
2309  if (Idx == 1)
2310  return TTI::TCC_Free;
2311  break;
2312  case Instruction::Trunc:
2313  case Instruction::ZExt:
2314  case Instruction::SExt:
2315  case Instruction::IntToPtr:
2316  case Instruction::PtrToInt:
2317  case Instruction::BitCast:
2318  case Instruction::PHI:
2319  case Instruction::Call:
2320  case Instruction::Select:
2321  case Instruction::Ret:
2322  case Instruction::Load:
2323  break;
2324  }
2325 
2326  if (Idx == ImmIdx) {
2327  int NumConstants = (BitSize + 63) / 64;
2328  int Cost = X86TTIImpl::getIntImmCost(Imm, Ty);
2329  return (Cost <= NumConstants * TTI::TCC_Basic)
2330  ? static_cast<int>(TTI::TCC_Free)
2331  : Cost;
2332  }
2333 
2334  return X86TTIImpl::getIntImmCost(Imm, Ty);
2335 }
2336 
2337 int X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
2338  Type *Ty) {
2339  assert(Ty->isIntegerTy());
2340 
2341  unsigned BitSize = Ty->getPrimitiveSizeInBits();
2342  // There is no cost model for constants with a bit size of 0. Return TCC_Free
2343  // here, so that constant hoisting will ignore this constant.
2344  if (BitSize == 0)
2345  return TTI::TCC_Free;
2346 
2347  switch (IID) {
2348  default:
2349  return TTI::TCC_Free;
2350  case Intrinsic::sadd_with_overflow:
2351  case Intrinsic::uadd_with_overflow:
2352  case Intrinsic::ssub_with_overflow:
2353  case Intrinsic::usub_with_overflow:
2354  case Intrinsic::smul_with_overflow:
2355  case Intrinsic::umul_with_overflow:
2356  if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
2357  return TTI::TCC_Free;
2358  break;
2359  case Intrinsic::experimental_stackmap:
2360  if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
2361  return TTI::TCC_Free;
2362  break;
2363  case Intrinsic::experimental_patchpoint_void:
2364  case Intrinsic::experimental_patchpoint_i64:
2365  if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
2366  return TTI::TCC_Free;
2367  break;
2368  }
2369  return X86TTIImpl::getIntImmCost(Imm, Ty);
2370 }
2371 
2372 unsigned X86TTIImpl::getUserCost(const User *U,
2373  ArrayRef<const Value *> Operands) {
2374  if (isa<StoreInst>(U)) {
2375  Value *Ptr = U->getOperand(1);
2376  // A store instruction with index and scale addressing costs 2 uops.
2377  // Check the preceding GEP to identify non-const indices.
2378  if (auto GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
2379  if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
2380  return TTI::TCC_Basic * 2;
2381  }
2382  return TTI::TCC_Basic;
2383  }
2384  return BaseT::getUserCost(U, Operands);
2385 }
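// Worked example (illustrative): given IR such as
//   %p = getelementptr inbounds i32, i32* %base, i64 %i
//   store i32 %v, i32* %p
// the store's address GEP has a non-constant index, so the store is costed at
// TTI::TCC_Basic * 2; with only constant indices (or no GEP at all) it stays
// at TTI::TCC_Basic.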
2386 
2387 // Return an average cost of a Gather / Scatter instruction; may be improved later.
2388 int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
2389  unsigned Alignment, unsigned AddressSpace) {
2390 
2391  assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
2392  unsigned VF = SrcVTy->getVectorNumElements();
2393 
2394  // Try to reduce index size from 64 bit (default for GEP)
2395  // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
2396  // operation will use 16 x 64 indices, which do not fit in a zmm register and need
2397  // to be split. Also check that the base pointer is the same for all lanes,
2398  // and that there's at most one variable index.
2399  auto getIndexSizeInBits = [](Value *Ptr, const DataLayout& DL) {
2400  unsigned IndexSize = DL.getPointerSizeInBits();
2401  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
2402  if (IndexSize < 64 || !GEP)
2403  return IndexSize;
2404 
2405  unsigned NumOfVarIndices = 0;
2406  Value *Ptrs = GEP->getPointerOperand();
2407  if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
2408  return IndexSize;
2409  for (unsigned i = 1; i < GEP->getNumOperands(); ++i) {
2410  if (isa<Constant>(GEP->getOperand(i)))
2411  continue;
2412  Type *IndxTy = GEP->getOperand(i)->getType();
2413  if (IndxTy->isVectorTy())
2414  IndxTy = IndxTy->getVectorElementType();
2415  if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
2416  !isa<SExtInst>(GEP->getOperand(i))) ||
2417  ++NumOfVarIndices > 1)
2418  return IndexSize; // 64
2419  }
2420  return (unsigned)32;
2421  };
2422 
2423 
2424  // Try to reduce IndexSize to 32 bits for a 16-element vector.
2425  // By default the IndexSize is equal to the pointer size.
2426  unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
2427  ? getIndexSizeInBits(Ptr, DL)
2428  : DL.getPointerSizeInBits();
2429
2430  Type *IndexVTy = VectorType::get(IntegerType::get(SrcVTy->getContext(),
2431  IndexSize), VF);
2432  std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy);
2433  std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy);
2434  int SplitFactor = std::max(IdxsLT.first, SrcLT.first);
2435  if (SplitFactor > 1) {
2436  // Handle splitting of vector of pointers
2437  Type *SplitSrcTy = VectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
2438  return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment,
2439  AddressSpace);
2440  }
2441 
2442  // The gather / scatter cost is given by Intel architects. It is a rough
2443  // number since we are looking at one instruction at a time.
2444  const int GSOverhead = (Opcode == Instruction::Load)
2445  ? ST->getGatherOverhead()
2446  : ST->getScatterOverhead();
2447  return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
2448  Alignment, AddressSpace);
2449 }
2450 
2451 /// Return the cost of full scalarization of gather / scatter operation.
2452 ///
2453 /// Opcode - Load or Store instruction.
2454 /// SrcVTy - The type of the data vector that should be gathered or scattered.
2455 /// VariableMask - The mask is non-constant at compile time.
2456 /// Alignment - Alignment for one element.
2457 /// AddressSpace - pointer[s] address space.
2458 ///
2459 int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
2460  bool VariableMask, unsigned Alignment,
2461  unsigned AddressSpace) {
2462  unsigned VF = SrcVTy->getVectorNumElements();
2463 
2464  int MaskUnpackCost = 0;
2465  if (VariableMask) {
2466  VectorType *MaskTy =
2467  VectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
2468  MaskUnpackCost = getScalarizationOverhead(MaskTy, false, true);
2469  int ScalarCompareCost =
2470  getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()),
2471  nullptr);
2472  int BranchCost = getCFInstrCost(Instruction::Br);
2473  MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
2474  }
2475 
2476  // The cost of the scalar loads/stores.
2477  int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
2478  Alignment, AddressSpace);
2479 
2480  int InsertExtractCost = 0;
2481  if (Opcode == Instruction::Load)
2482  for (unsigned i = 0; i < VF; ++i)
2483  // Add the cost of inserting each scalar load into the vector
2484  InsertExtractCost +=
2485  getVectorInstrCost(Instruction::InsertElement, SrcVTy, i);
2486  else
2487  for (unsigned i = 0; i < VF; ++i)
2488  // Add the cost of extracting each element out of the data vector
2489  InsertExtractCost +=
2490  getVectorInstrCost(Instruction::ExtractElement, SrcVTy, i);
2491 
2492  return MemoryOpCost + MaskUnpackCost + InsertExtractCost;
2493 }
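// Worked sketch (illustrative): scalarizing a gather of <4 x double> with a
// variable mask costs roughly
//   4 * scalar-load + 4 * (Br + ICmp) + mask/data scalarization overhead
//   + 4 inserts to rebuild the result vector,
// which is exactly the sum MemoryOpCost + MaskUnpackCost + InsertExtractCost
// computed above; the concrete numbers depend on the base TTI implementation.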
2494 
2495 /// Calculate the cost of Gather / Scatter operation
2496 int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
2497  Value *Ptr, bool VariableMask,
2498  unsigned Alignment) {
2499  assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
2500  unsigned VF = SrcVTy->getVectorNumElements();
2501  PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
2502  if (!PtrTy && Ptr->getType()->isVectorTy())
2503  PtrTy = dyn_cast<PointerType>(Ptr->getType()->getVectorElementType());
2504  assert(PtrTy && "Unexpected type for Ptr argument");
2505  unsigned AddressSpace = PtrTy->getAddressSpace();
2506 
2507  bool Scalarize = false;
2508  if ((Opcode == Instruction::Load && !isLegalMaskedGather(SrcVTy)) ||
2509  (Opcode == Instruction::Store && !isLegalMaskedScatter(SrcVTy)))
2510  Scalarize = true;
2511  // Gather / Scatter for vector 2 is not profitable on KNL / SKX.
2512  // A vector-4 variant of the gather/scatter instruction does not exist on KNL.
2513  // We can extend it to 8 elements, but zeroing upper bits of
2514  // the mask vector will add more instructions. Right now we give the scalar
2515  // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction
2516  // is better in the VariableMask case.
2517  if (ST->hasAVX512() && (VF == 2 || (VF == 4 && !ST->hasVLX())))
2518  Scalarize = true;
2519 
2520  if (Scalarize)
2521  return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment,
2522  AddressSpace);
2523 
2524  return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
2525 }
2526 
2527 bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
2528  TargetTransformInfo::LSRCost &C2) {
2529  // X86 specific here: the number of instructions gets first priority.
2530  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
2531  C1.NumIVMuls, C1.NumBaseAdds,
2532  C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
2533  std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
2534  C2.NumIVMuls, C2.NumBaseAdds,
2535  C2.ScaleCost, C2.ImmCost, C2.SetupCost);
2536 }
2537 
2538 bool X86TTIImpl::canMacroFuseCmp() {
2539  return ST->hasMacroFusion();
2540 }
2541 
2542 bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
2543  // The backend can't handle a single element vector.
2544  if (isa<VectorType>(DataTy) && DataTy->getVectorNumElements() == 1)
2545  return false;
2546  Type *ScalarTy = DataTy->getScalarType();
2547  int DataWidth = isa<PointerType>(ScalarTy) ?
2548  DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();
2549
2550  return ((DataWidth == 32 || DataWidth == 64) && ST->hasAVX()) ||
2551  ((DataWidth == 8 || DataWidth == 16) && ST->hasBWI());
2552 }
2553 
2554 bool X86TTIImpl::isLegalMaskedStore(Type *DataType) {
2555  return isLegalMaskedLoad(DataType);
2556 }
2557 
2558 bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
2559  // This function is called now in two cases: from the Loop Vectorizer
2560  // and from the Scalarizer.
2561  // When the Loop Vectorizer asks about legality of the feature,
2562  // the vectorization factor is not calculated yet. The Loop Vectorizer
2563  // sends a scalar type and the decision is based on the width of the
2564  // scalar element.
2565  // Later on, the cost model will estimate usage of this intrinsic based on
2566  // the vector type.
2567  // The Scalarizer asks again about legality. It sends a vector type.
2568  // In this case we can reject non-power-of-2 vectors.
2569  // We also reject single element vectors as the type legalizer can't
2570  // scalarize it.
2571  if (isa<VectorType>(DataTy)) {
2572  unsigned NumElts = DataTy->getVectorNumElements();
2573  if (NumElts == 1 || !isPowerOf2_32(NumElts))
2574  return false;
2575  }
2576  Type *ScalarTy = DataTy->getScalarType();
2577  int DataWidth = isa<PointerType>(ScalarTy) ?
2578  DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();
2579
2580  // Some CPUs have better gather performance than others.
2581  // TODO: Remove the explicit ST->hasAVX512()? That would mean we would only
2582  // enable gather with a -march.
2583  return (DataWidth == 32 || DataWidth == 64) &&
2584  (ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2()));
2585 }
2586 
2587 bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) {
2588  // AVX2 doesn't support scatter.
2589  if (!ST->hasAVX512())
2590  return false;
2591  return isLegalMaskedGather(DataType);
2592 }
2593 
2594 bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
2595  EVT VT = TLI->getValueType(DL, DataType);
2596  return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
2597 }
2598 
2599 bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
2600  return false;
2601 }
2602 
2603 bool X86TTIImpl::areInlineCompatible(const Function *Caller,
2604  const Function *Callee) const {
2605  const TargetMachine &TM = getTLI()->getTargetMachine();
2606 
2607  // Treat this as a subset check over subtarget features.
2608  const FeatureBitset &CallerBits =
2609  TM.getSubtargetImpl(*Caller)->getFeatureBits();
2610  const FeatureBitset &CalleeBits =
2611  TM.getSubtargetImpl(*Callee)->getFeatureBits();
2612 
2613  // FIXME: This is likely too limiting as it will include subtarget features
2614  // that we might not care about for inlining, but it is conservatively
2615  // correct.
2616  return (CallerBits & CalleeBits) == CalleeBits;
2617 }
2618 
2619 const X86TTIImpl::TTI::MemCmpExpansionOptions *
2620 X86TTIImpl::enableMemCmpExpansion(bool IsZeroCmp) const {
2621  // Only enable vector loads for equality comparison.
2622  // Right now the vector version is not as fast, see #33329.
2623  static const auto ThreeWayOptions = [this]() {
2624  TTI::MemCmpExpansionOptions Options;
2625  if (ST->is64Bit()) {
2626  Options.LoadSizes.push_back(8);
2627  }
2628  Options.LoadSizes.push_back(4);
2629  Options.LoadSizes.push_back(2);
2630  Options.LoadSizes.push_back(1);
2631  return Options;
2632  }();
2633  static const auto EqZeroOptions = [this]() {
2634  TTI::MemCmpExpansionOptions Options;
2635  // TODO: enable AVX512 when the DAG is ready.
2636  // if (ST->hasAVX512()) Options.LoadSizes.push_back(64);
2637  if (ST->hasAVX2()) Options.LoadSizes.push_back(32);
2638  if (ST->hasSSE2()) Options.LoadSizes.push_back(16);
2639  if (ST->is64Bit()) {
2640  Options.LoadSizes.push_back(8);
2641  }
2642  Options.LoadSizes.push_back(4);
2643  Options.LoadSizes.push_back(2);
2644  Options.LoadSizes.push_back(1);
2645  return Options;
2646  }();
2647  return IsZeroCmp ? &EqZeroOptions : &ThreeWayOptions;
2648 }
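// For example (illustrative), on an x86-64 subtarget with AVX2 the
// equality-only option list is {32, 16, 8, 4, 2, 1} and the three-way list is
// {8, 4, 2, 1}; the memcmp expansion pass (outside this file) picks from
// these block sizes when it turns a small memcmp into explicit loads and
// compares.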
2649 
2650 bool X86TTIImpl::enableInterleavedAccessVectorization() {
2651  // TODO: We expect this to be beneficial regardless of arch,
2652  // but there are currently some unexplained performance artifacts on Atom.
2653  // As a temporary solution, disable on Atom.
2654  return !(ST->isAtom());
2655 }
2656 
2657 // Get estimation for interleaved load/store operations for AVX2.
2658 // \p Factor is the interleaved-access factor (stride) - number of
2659 // (interleaved) elements in the group.
2660 // \p Indices contains the indices for a strided load: when the
2661 // interleaved load has gaps they indicate which elements are used.
2662 // If Indices is empty (or if the number of indices is equal to the size
2663 // of the interleaved-access as given in \p Factor) the access has no gaps.
2664 //
2665 // As opposed to AVX-512, AVX2 does not have generic shuffles that allow
2666 // computing the cost using a generic formula as a function of generic
2667 // shuffles. We therefore use a lookup table instead, filled according to
2668 // the instruction sequences that codegen currently generates.
2669 int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
2670  unsigned Factor,
2671  ArrayRef<unsigned> Indices,
2672  unsigned Alignment,
2673  unsigned AddressSpace) {
2674 
2675  // We currently support only fully-interleaved groups, with no gaps.
2676  // TODO: Support also strided loads (interleaved-groups with gaps).
2677  if (Indices.size() && Indices.size() != Factor)
2678  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
2679  Alignment, AddressSpace);
2680 
2681  // VecTy for interleave memop is <VF*Factor x Elt>.
2682  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
2683  // VecTy = <12 x i32>.
2684  MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
2685 
2686  // This function can be called with VecTy=<6xi128>, Factor=3, in which case
2687  // the VF=2, while v2i128 is an unsupported MVT vector type
2688  // (see MachineValueType.h::getVectorVT()).
2689  if (!LegalVT.isVector())
2690  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
2691  Alignment, AddressSpace);
2692 
2693  unsigned VF = VecTy->getVectorNumElements() / Factor;
2694  Type *ScalarTy = VecTy->getVectorElementType();
2695 
2696  // Calculate the number of memory operations (NumOfMemOps), required
2697  // to load/store the VecTy.
2698  unsigned VecTySize = DL.getTypeStoreSize(VecTy);
2699  unsigned LegalVTSize = LegalVT.getStoreSize();
2700  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
2701 
2702  // Get the cost of one memory operation.
2703  Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
2704  LegalVT.getVectorNumElements());
2705  unsigned MemOpCost =
2706  getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
2707 
2708  VectorType *VT = VectorType::get(ScalarTy, VF);
2709  EVT ETy = TLI->getValueType(DL, VT);
2710  if (!ETy.isSimple())
2711  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
2712  Alignment, AddressSpace);
2713 
2714  // TODO: Complete for other data-types and strides.
2715  // Each combination of Stride, ElementTy and VF results in a different
2716  // sequence; The cost tables are therefore accessed with:
2717  // Factor (stride) and VectorType=VFxElemType.
2718  // The Cost accounts only for the shuffle sequence;
2719  // The cost of the loads/stores is accounted for separately.
2720  //
2721  static const CostTblEntry AVX2InterleavedLoadTbl[] = {
2722  { 2, MVT::v4i64, 6 }, //(load 8i64 and) deinterleave into 2 x 4i64
2723  { 2, MVT::v4f64, 6 }, //(load 8f64 and) deinterleave into 2 x 4f64
2724 
2725  { 3, MVT::v2i8, 10 }, //(load 6i8 and) deinterleave into 3 x 2i8
2726  { 3, MVT::v4i8, 4 }, //(load 12i8 and) deinterleave into 3 x 4i8
2727  { 3, MVT::v8i8, 9 }, //(load 24i8 and) deinterleave into 3 x 8i8
2728  { 3, MVT::v16i8, 11}, //(load 48i8 and) deinterleave into 3 x 16i8
2729  { 3, MVT::v32i8, 13}, //(load 96i8 and) deinterleave into 3 x 32i8
2730  { 3, MVT::v8f32, 17 }, //(load 24f32 and)deinterleave into 3 x 8f32
2731 
2732  { 4, MVT::v2i8, 12 }, //(load 8i8 and) deinterleave into 4 x 2i8
2733  { 4, MVT::v4i8, 4 }, //(load 16i8 and) deinterleave into 4 x 4i8
2734  { 4, MVT::v8i8, 20 }, //(load 32i8 and) deinterleave into 4 x 8i8
2735  { 4, MVT::v16i8, 39 }, //(load 64i8 and) deinterleave into 4 x 16i8
2736  { 4, MVT::v32i8, 80 }, //(load 128i8 and) deinterleave into 4 x 32i8
2737 
2738  { 8, MVT::v8f32, 40 } //(load 64f32 and)deinterleave into 8 x 8f32
2739  };
2740 
2741  static const CostTblEntry AVX2InterleavedStoreTbl[] = {
2742  { 2, MVT::v4i64, 6 }, //interleave into 2 x 4i64 into 8i64 (and store)
2743  { 2, MVT::v4f64, 6 }, //interleave into 2 x 4f64 into 8f64 (and store)
2744 
2745  { 3, MVT::v2i8, 7 }, //interleave 3 x 2i8 into 6i8 (and store)
2746  { 3, MVT::v4i8, 8 }, //interleave 3 x 4i8 into 12i8 (and store)
2747  { 3, MVT::v8i8, 11 }, //interleave 3 x 8i8 into 24i8 (and store)
2748  { 3, MVT::v16i8, 11 }, //interleave 3 x 16i8 into 48i8 (and store)
2749  { 3, MVT::v32i8, 13 }, //interleave 3 x 32i8 into 96i8 (and store)
2750 
2751  { 4, MVT::v2i8, 12 }, //interleave 4 x 2i8 into 8i8 (and store)
2752  { 4, MVT::v4i8, 9 }, //interleave 4 x 4i8 into 16i8 (and store)
2753  { 4, MVT::v8i8, 10 }, //interleave 4 x 8i8 into 32i8 (and store)
2754  { 4, MVT::v16i8, 10 }, //interleave 4 x 16i8 into 64i8 (and store)
2755  { 4, MVT::v32i8, 12 } //interleave 4 x 32i8 into 128i8 (and store)
2756  };
2757 
2758  if (Opcode == Instruction::Load) {
2759  if (const auto *Entry =
2760  CostTableLookup(AVX2InterleavedLoadTbl, Factor, ETy.getSimpleVT()))
2761  return NumOfMemOps * MemOpCost + Entry->Cost;
2762  } else {
2763  assert(Opcode == Instruction::Store &&
2764  "Expected Store Instruction at this point");
2765  if (const auto *Entry =
2766  CostTableLookup(AVX2InterleavedStoreTbl, Factor, ETy.getSimpleVT()))
2767  return NumOfMemOps * MemOpCost + Entry->Cost;
2768  }
2769 
2770  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
2771  Alignment, AddressSpace);
2772 }
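// Worked example (illustrative): a stride-2 interleaved load of <8 x i64>
// (VF == 4) on AVX2 legalizes to two 32-byte v4i64 loads, so NumOfMemOps == 2
// and, assuming fast unaligned 32-byte accesses, MemOpCost == 1. The shuffle
// sequence comes from AVX2InterleavedLoadTbl entry {2, MVT::v4i64, 6}, giving
// a total of 2 * 1 + 6 == 8.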
2773 
2774 // Get estimation for interleaved load/store operations and strided load.
2775 // \p Indices contains indices for strided load.
2776 // \p Factor - the factor of interleaving.
2777 // AVX-512 provides 3-src shuffles that significantly reduce the cost.
2778 int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
2779  unsigned Factor,
2780  ArrayRef<unsigned> Indices,
2781  unsigned Alignment,
2782  unsigned AddressSpace) {
2783 
2784  // VecTy for interleave memop is <VF*Factor x Elt>.
2785  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
2786  // VecTy = <12 x i32>.
2787 
2788  // Calculate the number of memory operations (NumOfMemOps), required
2789  // to load/store the VecTy.
2790  MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
2791  unsigned VecTySize = DL.getTypeStoreSize(VecTy);
2792  unsigned LegalVTSize = LegalVT.getStoreSize();
2793  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
2794 
2795  // Get the cost of one memory operation.
2796  Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
2797  LegalVT.getVectorNumElements());
2798  unsigned MemOpCost =
2799  getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
2800 
2801  unsigned VF = VecTy->getVectorNumElements() / Factor;
2802  MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);
2803 
2804  if (Opcode == Instruction::Load) {
2805  // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
2806  // contain the cost of the optimized shuffle sequence that the
2807  // X86InterleavedAccess pass will generate.
2808  // The cost of loads and stores is computed separately from the table.
2809 
2810  // X86InterleavedAccess supports only the following interleaved-access groups.
2811  static const CostTblEntry AVX512InterleavedLoadTbl[] = {
2812  {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8
2813  {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8
2814  {3, MVT::v64i8, 22}, //(load 192i8 and) deinterleave into 3 x 64i8
2815  };
2816 
2817  if (const auto *Entry =
2818  CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
2819  return NumOfMemOps * MemOpCost + Entry->Cost;
2820  // If an entry does not exist, fall back to the default implementation.
2821 
2822  // The kind of shuffle depends on the number of loaded values.
2823  // If we load the entire data in one register, we can use a 1-src shuffle.
2824  // Otherwise, we'll merge 2 sources in each operation.
2825  TTI::ShuffleKind ShuffleKind =
2826  (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
2827 
2828  unsigned ShuffleCost =
2829  getShuffleCost(ShuffleKind, SingleMemOpTy, 0, nullptr);
2830 
2831  unsigned NumOfLoadsInInterleaveGrp =
2832  Indices.size() ? Indices.size() : Factor;
2833  Type *ResultTy = VectorType::get(VecTy->getVectorElementType(),
2834  VecTy->getVectorNumElements() / Factor);
2835  unsigned NumOfResults =
2836  getTLI()->getTypeLegalizationCost(DL, ResultTy).first *
2837  NumOfLoadsInInterleaveGrp;
2838 
2839  // About half of the loads may be folded into shuffles when we have only
2840  // one result. If we have more than one result, we do not fold loads at all.
2841  unsigned NumOfUnfoldedLoads =
2842  NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
2843 
2844  // Get the number of shuffle operations per result.
2845  unsigned NumOfShufflesPerResult =
2846  std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
2847 
2848  // The SK_PermuteTwoSrc shuffle clobbers one of its source operands.
2849  // When we have more than one destination, we need additional instructions
2850  // to keep sources.
2851  unsigned NumOfMoves = 0;
2852  if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
2853  NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
2854 
2855  int Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
2856  NumOfUnfoldedLoads * MemOpCost + NumOfMoves;
2857 
2858  return Cost;
2859  }
2860 
2861  // Store.
2862  assert(Opcode == Instruction::Store &&
2863  "Expected Store Instruction at this point");
2864  // X86InterleavedAccess supports only the following interleaved-access groups.
2865  static const CostTblEntry AVX512InterleavedStoreTbl[] = {
2866  {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
2867  {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
2868  {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 192i8 (and store)
2869 
2870  {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store)
2871  {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
2872  {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
2873  {4, MVT::v64i8, 24} // interleave 4 x 64i8 into 256i8 (and store)
2874  };
2875 
2876  if (const auto *Entry =
2877  CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
2878  return NumOfMemOps * MemOpCost + Entry->Cost;
2879  // If an entry does not exist, fall back to the default implementation.
2880 
2881  // There are no strided stores at the moment, and a store can't be folded into
2882  // a shuffle.
2883  unsigned NumOfSources = Factor; // The number of values to be merged.
2884  unsigned ShuffleCost =
2885  getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, 0, nullptr);
2886  unsigned NumOfShufflesPerStore = NumOfSources - 1;
2887 
2888  // The SK_PermuteTwoSrc shuffle clobbers one of its source operands.
2889  // We need additional instructions to keep sources.
2890  unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
2891  int Cost = NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
2892  NumOfMoves;
2893  return Cost;
2894 }
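// Worked example (illustrative): a stride-3 interleaved load of <48 x i8>
// (VF == 16) on an AVX-512BW subtarget widens to a single 64-byte memory
// operation (NumOfMemOps == 1, MemOpCost == 1 assumed), and the shuffle
// sequence cost comes from AVX512InterleavedLoadTbl entry {3, MVT::v16i8, 12},
// for a total of roughly 1 * 1 + 12 == 13.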
2895 
2896 int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
2897  unsigned Factor,
2898  ArrayRef<unsigned> Indices,
2899  unsigned Alignment,
2900  unsigned AddressSpace) {
2901  auto isSupportedOnAVX512 = [](Type *VecTy, bool HasBW) {
2902  Type *EltTy = VecTy->getVectorElementType();
2903  if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
2904  EltTy->isIntegerTy(32) || EltTy->isPointerTy())
2905  return true;
2906  if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8))
2907  return HasBW;
2908  return false;
2909  };
2910  if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI()))
2911  return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices,
2912  Alignment, AddressSpace);
2913  if (ST->hasAVX2())
2914  return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices,
2915  Alignment, AddressSpace);
2916 
2917  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
2918  Alignment, AddressSpace);
2919 }
bool hasAVX() const
Definition: X86Subtarget.h:544
Type * getVectorElementType() const
Definition: Type.h:368
constexpr bool isUInt< 32 >(uint64_t x)
Definition: MathExtras.h:349
X = FP_ROUND(Y, TRUNC) - Rounding &#39;Y&#39; from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:506
bool is64Bit() const
Is this x86_64? (disregarding specific ABI / programming model)
Definition: X86Subtarget.h:508
void push_back(const T &Elt)
Definition: SmallVector.h:213
A parsed version of the target data layout string in and methods for querying it. ...
Definition: DataLayout.h:111
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:563
unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info=TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value * > Args=ArrayRef< const Value * >())
Definition: BasicTTIImpl.h:507
static IntegerType * getInt1Ty(LLVMContext &C)
Definition: Type.cpp:173
llvm::Optional< unsigned > getCacheSize(TargetTransformInfo::CacheLevel Level) const
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1547
APInt sext(unsigned width) const
Sign extend to a new width.
Definition: APInt.cpp:840
GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2)
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:42
bool hasSSE41() const
Definition: X86Subtarget.h:542
Compute iterated dominance frontiers using a linear time algorithm.
Definition: AllocatorList.h:24
unsigned minRequiredElementSize(const Value *Val, bool &isSigned)
static MVT getVectorVT(MVT VT, unsigned NumElements)
Cost tables and simple lookup functions.
int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index)
bool isFCmpOrdCheaperThanFCmpZero(Type *Ty)
int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr, bool VariableMask, unsigned Alignment)
Calculate the cost of Gather / Scatter operation.
bool hasPOPCNT() const
Definition: X86Subtarget.h:552
bool hasAVX2() const
Definition: X86Subtarget.h:545
const Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
bool isVector() const
Return true if this is a vector value type.
unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract)
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:446
This file a TargetTransformInfo::Concept conforming object specific to the X86 target machine...
The main scalar evolution driver.
unsigned getRegisterBitWidth(bool Vector) const
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:253
unsigned getVectorNumElements() const
bool isLegalMaskedScatter(Type *DataType)
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:908
unsigned getPointerSizeInBits(unsigned AS=0) const
Layout pointer size, in bits FIXME: The defaults need to be removed once all of the backends/clients ...
Definition: DataLayout.h:360
Type Conversion Cost Table.
Definition: CostTable.h:45
[US]{MIN/MAX} - Binary minimum or maximum or signed or unsigned integers.
Definition: ISDOpcodes.h:360
Hexagon Common GEP
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:227
int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, unsigned Alignment, unsigned AddressSpace)
unsigned getAtomicMemIntrinsicMaxElementSize() const
Cost Table Entry.
Definition: CostTable.h:25
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1493
unsigned getNumberOfRegisters(bool Vector)
int getGatherOverhead() const
Definition: X86Subtarget.h:604
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:130
unsigned getMaxInterleaveFactor(unsigned VF)
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:210
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
bool isUnalignedMem32Slow() const
Definition: X86Subtarget.h:603
unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, ArrayRef< Value * > Args, FastMathFlags FMF, unsigned VF=1)
Get intrinsic cost based on arguments.
Definition: BasicTTIImpl.h:899
bool isLegalMaskedStore(Type *DataType)
unsigned getAddressComputationCost(Type *Ty, ScalarEvolution *, const SCEV *)
int getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info=TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value *> Args=ArrayRef< const Value *>())
const FeatureBitset & getFeatureBits() const
Shift and rotation operations.
Definition: ISDOpcodes.h:380
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:197
unsigned getArithmeticReductionCost(unsigned Opcode, Type *Ty, bool IsPairwise)
Try to calculate arithmetic and shuffle op costs for reduction operations.
const TypeConversionCostTblEntry * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntry > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table, TypeTy must be comparable to CompareTy by ==. ...
Definition: CostTable.h:55
unsigned getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isLSRCostLess(TargetTransformInfo::LSRCost &C1, TargetTransformInfo::LSRCost &C2)
bool hasVLX() const
Definition: X86Subtarget.h:638
unsigned getSizeInBits() const
unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, unsigned Alignment, unsigned AddressSpace)
Definition: BasicTTIImpl.h:783
unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, const Instruction *I)
Definition: BasicTTIImpl.h:705
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1559
unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:567
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:245
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:448
PopcntSupportKind
Flags indicating the kind of support for population count.
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:890
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:201
llvm::Optional< unsigned > getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory)...
Definition: APInt.h:33
Selects elements from the corresponding lane of either source operand.
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:203
int getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr)
Reverse the order of the vector.
amdgpu Simplify well known AMD library false Value * Callee
bool hasDQI() const
Definition: X86Subtarget.h:636
MVT getVectorElementType() const
Value * getOperand(unsigned i) const
Definition: User.h:170
Class to represent pointers.
Definition: DerivedTypes.h:467
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:492
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:301
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:147
an instruction for type-safe pointer arithmetic to access elements of arrays and structs ...
Definition: Instructions.h:837
bool isSLM() const
Definition: X86Subtarget.h:685
bool hasSSSE3() const
Definition: X86Subtarget.h:541
If not nullptr, enable inline expansion of memcmp.
Container class for subtarget features.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:429
Machine Value Type.
The instances of the Type class are immutable: once they are created, they are never changed...
Definition: Type.h:46
Simple binary floating point operators.
Definition: ISDOpcodes.h:260
bool isLegalMaskedGather(Type *DataType)
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:149
unsigned getMinMaxReductionCost(Type *Ty, Type *CondTy, bool IsPairwise, bool)
Try to calculate op costs for min/max reduction operations.
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:221
Expected to fold away in lowering.
unsigned getUserCost(const User *U, ArrayRef< const Value * > Operands)
bool isLegalMaskedLoad(Type *DataType)
const TTI::MemCmpExpansionOptions * enableMemCmpExpansion(bool IsZeroCmp) const
unsigned getAddressSpace() const
Return the address space of the Pointer type.
Definition: DerivedTypes.h:495
Merge elements from two source vectors into one with any shuffle mask.
int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace)
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:281
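A small sketch of mapping an IR type to a machine value type; SrcTy and FallbackCost are hypothetical names used only for illustration:

  // With HandleUnknown=true an unsupported type yields MVT::Other
  // instead of asserting.
  MVT VT = MVT::getVT(SrcTy, /*HandleUnknown=*/true);
  if (VT == MVT::Other)
    return FallbackCost; // fall back when the type has no simple MVT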
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
bool hasXOP() const
Definition: X86Subtarget.h:568
int getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, unsigned Alignment, unsigned AddressSpace)
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:524
bool hasSSE42() const
Definition: X86Subtarget.h:543
Extended Value Type.
Definition: ValueTypes.h:34
int getArithmeticReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm)
unsigned getUserCost(const User *U, ArrayRef< const Value *> Operands)
const TargetMachine & getTargetMachine() const
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace, const Instruction *I=nullptr)
int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp)
const SCEVConstant * getConstantStrideStep(ScalarEvolution *SE, const SCEV *Ptr)
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth)
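A client-side sketch, not code from this file, of how an IR pass might consult this hook before favoring a ctpop-based sequence; TTI is an assumed const TargetTransformInfo reference:

  if (TTI.getPopcntSupport(32) == TargetTransformInfo::PSK_FastHardware) {
    // Hardware POPCNT is cheap here; prefer an llvm.ctpop.i32-based lowering.
  }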
OperandValueProperties
Additional properties of an operand's values.
unsigned getCFInstrCost(unsigned Opcode)
Definition: BasicTTIImpl.h:700
bool isAtom() const
TODO: to be removed later and replaced with suitable properties.
Definition: X86Subtarget.h:684
unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp)
Definition: BasicTTIImpl.h:554
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:940
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:240
unsigned getNumOperands() const
Definition: User.h:192
constexpr bool isInt< 32 >(int64_t x)
Definition: MathExtras.h:309
unsigned getPreferVectorWidth() const
Definition: X86Subtarget.h:655
AddressSpace
Definition: NVPTXBaseInfo.h:22
bool hasVBMI() const
Definition: X86Subtarget.h:578
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:383
int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, const Instruction *I=nullptr)
int getMinMaxReductionCost(Type *Ty, Type *CondTy, bool IsPairwiseForm, bool IsUnsigned)
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInfo.
unsigned getVectorNumElements() const
Definition: DerivedTypes.h:462
Class to represent vector types.
Definition: DerivedTypes.h:393
Class for arbitrary precision integers.
Definition: APInt.h:69
int getScatterOverhead() const
Definition: X86Subtarget.h:605
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:438
unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:752
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
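A minimal sketch of the common pattern in the cost hooks; TLI is assumed to be the TargetLoweringBase pointer the TTI implementations already hold (via getTLI()):

  // Translate the IR opcode (e.g. Instruction::Shl) into the matching ISD
  // node so it can be matched against the MVT-keyed cost tables.
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");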
unsigned getLoadStoreVecRegBitWidth(unsigned AS) const
const CostTblEntry * CostTableLookup(ArrayRef< CostTblEntry > Tbl, int ISD, MVT Ty)
Find in cost table, TypeTy must be comparable to CompareTy by ==.
Definition: CostTable.h:32
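For illustration, a sketch of the usual lookup against a scalar cost table; the entries are hypothetical, and LT stands for an assumed getTypeLegalizationCost() result for the queried type:

  static const CostTblEntry ExampleCostTbl[] = {
    // { ISD opcode, MVT, cost } -- illustrative entries, not from this file
    { ISD::MUL, MVT::v8i16, 1 },
    { ISD::SHL, MVT::v4i32, 1 },
  };

  if (const auto *Entry = CostTableLookup(ExampleCostTbl, ISD, LT.second))
    return LT.first * Entry->Cost; // scale by the number of legalized pieces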
This class represents an analyzed expression in the program.
bool isGLM() const
Definition: X86Subtarget.h:686
int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, const Instruction *I=nullptr)
LLVM_NODISCARD std::enable_if<!is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
Definition: Casting.h:323
unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index)
Definition: BasicTTIImpl.h:745
InsertSubvector. Index indicates start offset.
unsigned Insns
TODO: Some of these could be merged.
bool hasCDI() const
Definition: X86Subtarget.h:632
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:185
const unsigned Kind
The cost of a typical 'add' instruction.
int getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, unsigned Alignment, unsigned AddressSpace)
bool hasSSE1() const
Definition: X86Subtarget.h:538
unsigned getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition: Type.cpp:115
LLVM Value Representation.
Definition: Value.h:73
uint64_t getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type...
Definition: DataLayout.h:411
bool hasDivRemOp(Type *DataType, bool IsSigned)
static VectorType * get(Type *ElementType, unsigned NumElements)
This static method is the primary way to construct an VectorType.
Definition: Type.cpp:593
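A hedged sketch of building the vector types that several of the cost helpers above reason about; Ctx is an assumed LLVMContext reference:

  Type *I8Ty = IntegerType::get(Ctx, 8);            // i8
  VectorType *V16I8 = VectorType::get(I8Ty, 16);    // <16 x i8>
  unsigned NumElts = V16I8->getVectorNumElements(); // 16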
#define LLVM_FALLTHROUGH
LLVM_FALLTHROUGH - Mark fallthrough cases in switch statements.
Definition: Compiler.h:235
bool hasMacroFusion() const
Definition: X86Subtarget.h:622
Broadcast element 0 to all other elements.
bool hasAVX512() const
Definition: X86Subtarget.h:546
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:59
Convenience struct for specifying and reasoning about fast-math flags.
Definition: Operator.h:160
bool hasBWI() const
Definition: X86Subtarget.h:637
OperandValueKind
Additional information about an operand's possible values.
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:413
This pass exposes codegen information to IR-level passes.
Conversion operators.
Definition: ISDOpcodes.h:435
bool hasFastGather() const
Definition: X86Subtarget.h:617
CacheLevel
The possible cache levels.
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:444
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:126
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:150
int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, ArrayRef< Type *> Tys, FastMathFlags FMF, unsigned ScalarizationCostPassed=UINT_MAX)
static IntegerType * getInt8Ty(LLVMContext &C)
Definition: Type.cpp:174
std::pair< int, MVT > getTypeLegalizationCost(const DataLayout &DL, Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
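A short sketch of the usual idiom; DL is the DataLayout, Ty the queried IR type, and Entry a cost-table hit as in the lookup sketches above:

  // LT.first is the legalization split factor, LT.second the MVT the type
  // legalizes to; per-operation costs are scaled by the split factor.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  return LT.first * Entry->Cost;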
bool hasSSE2() const
Definition: X86Subtarget.h:539
int getIntImmCost(int64_t)
Calculate the cost of materializing a 64-bit value.
This file describes how to lower LLVM code to machine code.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
Shuffle elements of single source vector with any shuffle mask.