1 //===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 /// \file
10 /// This file implements a TargetTransformInfo analysis pass specific to the
11 /// X86 target machine. It uses the target's detailed information to provide
12 /// more precise answers to certain TTI queries, while letting the target
13 /// independent and default TTI implementations handle the rest.
14 ///
15 //===----------------------------------------------------------------------===//
16 /// About the cost model numbers used below, note the following: the numbers
17 /// correspond to some "generic" X86 CPU rather than to a concrete CPU model.
18 /// Usually the numbers correspond to the CPU where the feature first
19 /// appeared. For example, if we check Subtarget.hasSSE42() in the lookups
20 /// below, the cost is based on Nehalem, as that was the first CPU to support
21 /// that feature level and thus most likely has the worst-case cost.
22 /// Some examples of other technologies/CPUs:
23 /// SSE 3 - Pentium4 / Athlon64
24 /// SSE 4.1 - Penryn
25 /// SSE 4.2 - Nehalem
26 /// AVX - Sandy Bridge
27 /// AVX2 - Haswell
28 /// AVX-512 - Xeon Phi / Skylake
29 /// And some examples of instruction target dependent costs (latency)
30 /// divss sqrtss rsqrtss
31 /// AMD K7 11-16 19 3
32 /// Piledriver 9-24 13-15 5
33 /// Jaguar 14 16 2
34 /// Pentium II,III 18 30 2
35 /// Nehalem 7-14 7-18 3
36 /// Haswell 10-13 11 5
37 /// TODO: Develop and implement the target dependent cost model and
38 /// specialize cost numbers for different Cost Model Targets such as throughput,
39 /// code size, latency and uop count.
40 //===----------------------------------------------------------------------===//
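// Illustrative note (a sketch of how the tables below are consumed, assuming a
// plain SSE2-only target; the query below is hypothetical, not from this file):
// a pass asks TTI for the cost of an instruction, the type is legalized, and
// the answer is the per-type table entry scaled by the legalization split
// count LT.first.
//
//   // cost of 'fdiv <4 x float>': SSE2CostTable gives 39 for ISD::FDIV on
//   // MVT::v4f32; v4f32 is legal, so LT.first == 1 and the result is 39.
//   // For <8 x float> the type splits into two v4f32, LT.first == 2, and the
//   // reported cost doubles to 78.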
41 
42 #include "X86TargetTransformInfo.h"
43 #include "llvm/Analysis/TargetTransformInfo.h"
44 #include "llvm/CodeGen/BasicTTIImpl.h"
45 #include "llvm/CodeGen/CostTable.h"
46 #include "llvm/CodeGen/TargetLowering.h"
47 #include "llvm/IR/IntrinsicInst.h"
48 #include "llvm/Support/Debug.h"
49 
50 using namespace llvm;
51 
52 #define DEBUG_TYPE "x86tti"
53 
54 //===----------------------------------------------------------------------===//
55 //
56 // X86 cost model.
57 //
58 //===----------------------------------------------------------------------===//
59 
60 TargetTransformInfo::PopcntSupportKind
61 X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
62  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
63  // TODO: Currently the __builtin_popcount() implementation using SSE3
64  // instructions is inefficient. Once the problem is fixed, we should
65  // call ST->hasSSE3() instead of ST->hasPOPCNT().
66   return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
67 }
68 
69 llvm::Optional<unsigned> X86TTIImpl::getCacheSize(
70     TargetTransformInfo::CacheLevel Level) const {
71   switch (Level) {
72   case TargetTransformInfo::CacheLevel::L1D:
73  // - Penryn
74  // - Nehalem
75  // - Westmere
76  // - Sandy Bridge
77  // - Ivy Bridge
78  // - Haswell
79  // - Broadwell
80  // - Skylake
81  // - Kabylake
82     return 32 * 1024; // 32 KByte
83   case TargetTransformInfo::CacheLevel::L2D:
84  // - Penryn
85  // - Nehalem
86  // - Westmere
87  // - Sandy Bridge
88  // - Ivy Bridge
89  // - Haswell
90  // - Broadwell
91  // - Skylake
92  // - Kabylake
93  return 256 * 1024; // 256 KByte
94  }
95 
96  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
97 }
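// Illustrative usage (an assumed caller, not part of this file): a transform
// holding a TargetTransformInfo &TTI could bound a tiling footprint with
//   if (auto Size = TTI.getCacheSize(TargetTransformInfo::CacheLevel::L1D))
//     Footprint = std::min(Footprint, *Size); // 32 KiB on the CPUs listed
// The llvm::Optional return lets a target decline to describe a level; this
// implementation answers for both L1D and L2D.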
98 
99 llvm::Optional<unsigned> X86TTIImpl::getCacheAssociativity(
100     TargetTransformInfo::CacheLevel Level) const {
101   // - Penryn
102  // - Nehalem
103  // - Westmere
104  // - Sandy Bridge
105  // - Ivy Bridge
106  // - Haswell
107  // - Broadwell
108  // - Skylake
109  // - Kabylake
110   switch (Level) {
111   case TargetTransformInfo::CacheLevel::L1D:
112     LLVM_FALLTHROUGH;
113   case TargetTransformInfo::CacheLevel::L2D:
114     return 8;
115  }
116 
117  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
118 }
119 
120 unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) {
121  if (Vector && !ST->hasSSE1())
122  return 0;
123 
124  if (ST->is64Bit()) {
125  if (Vector && ST->hasAVX512())
126  return 32;
127  return 16;
128  }
129  return 8;
130 }
131 
132 unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) const {
133  unsigned PreferVectorWidth = ST->getPreferVectorWidth();
134  if (Vector) {
135  if (ST->hasAVX512() && PreferVectorWidth >= 512)
136  return 512;
137  if (ST->hasAVX() && PreferVectorWidth >= 256)
138  return 256;
139  if (ST->hasSSE1() && PreferVectorWidth >= 128)
140  return 128;
141  return 0;
142  }
143 
144  if (ST->is64Bit())
145  return 64;
146 
147  return 32;
148 }
149 
150 unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
151  return getRegisterBitWidth(true);
152 }
153 
154 unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
155  // If the loop will not be vectorized, don't interleave the loop.
156  // Let the regular unroller handle it instead, which saves the overflow
157  // check and memory check cost.
158  if (VF == 1)
159  return 1;
160 
161  if (ST->isAtom())
162  return 1;
163 
164  // Sandybridge and Haswell have multiple execution ports and pipelined
165  // vector units.
166  if (ST->hasAVX())
167  return 4;
168 
169  return 2;
170 }
171 
172 int X86TTIImpl::getArithmeticInstrCost(
173     unsigned Opcode, Type *Ty,
174     TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
175     TTI::OperandValueProperties Opd1PropInfo,
176     TTI::OperandValueProperties Opd2PropInfo,
177     ArrayRef<const Value *> Args) {
178   // Legalize the type.
179  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
180 
181  int ISD = TLI->InstructionOpcodeToISD(Opcode);
182  assert(ISD && "Invalid opcode");
183 
184  static const CostTblEntry SLMCostTable[] = {
185  { ISD::MUL, MVT::v4i32, 11 }, // pmulld
186  { ISD::MUL, MVT::v8i16, 2 }, // pmullw
187  { ISD::MUL, MVT::v16i8, 14 }, // extend/pmullw/trunc sequence.
188  { ISD::FMUL, MVT::f64, 2 }, // mulsd
189  { ISD::FMUL, MVT::v2f64, 4 }, // mulpd
190  { ISD::FMUL, MVT::v4f32, 2 }, // mulps
191  { ISD::FDIV, MVT::f32, 17 }, // divss
192  { ISD::FDIV, MVT::v4f32, 39 }, // divps
193  { ISD::FDIV, MVT::f64, 32 }, // divsd
194  { ISD::FDIV, MVT::v2f64, 69 }, // divpd
195  { ISD::FADD, MVT::v2f64, 2 }, // addpd
196  { ISD::FSUB, MVT::v2f64, 2 }, // subpd
197  // v2i64/v4i64 mul is custom lowered as a series of long
198  // multiplies(3), shifts(3) and adds(2).
199  // slm muldq throughput is 2 and addq throughput is 4,
200  // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
201  // 2X4 (addq throughput) = 17
202  { ISD::MUL, MVT::v2i64, 17 },
203  // slm addq\subq throughput is 4
204  { ISD::ADD, MVT::v2i64, 4 },
205  { ISD::SUB, MVT::v2i64, 4 },
206  };
207 
208  if (ST->isSLM()) {
209  if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) {
210  // Check if the operands can be shrunk into a smaller datatype.
211  bool Op1Signed = false;
212  unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
213  bool Op2Signed = false;
214  unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
215 
216  bool signedMode = Op1Signed | Op2Signed;
217  unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
218 
219  if (OpMinSize <= 7)
220  return LT.first * 3; // pmullw/sext
221  if (!signedMode && OpMinSize <= 8)
222  return LT.first * 3; // pmullw/zext
223  if (OpMinSize <= 15)
224  return LT.first * 5; // pmullw/pmulhw/pshuf
225  if (!signedMode && OpMinSize <= 16)
226  return LT.first * 5; // pmullw/pmulhw/pshuf
227  }
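// Illustrative example (assumed IR, not taken from the source): on an SLM
// target, for
//   %a = zext <4 x i8> %x to <4 x i32>
//   %b = zext <4 x i8> %y to <4 x i32>
//   %m = mul <4 x i32> %a, %b
// both operands need only 8 unsigned bits, so the check above returns
// LT.first * 3 (a pmullw/zext style sequence) instead of the 11-cycle pmulld
// entry in SLMCostTable.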
228  if (const auto *Entry = CostTableLookup(SLMCostTable, ISD,
229  LT.second)) {
230  return LT.first * Entry->Cost;
231  }
232  }
233 
234   if (ISD == ISD::SDIV &&
235       Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
236       Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
237     // On X86, vector signed division by a power-of-two constant is
238     // normally expanded to the sequence SRA + SRL + ADD + SRA.
239     // The OperandValue properties may not be the same as those of the
240     // previous operation; conservatively assume OP_None.
241     int Cost = 2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info,
242                                           Op2Info, TargetTransformInfo::OP_None,
243                                           TargetTransformInfo::OP_None);
244     Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
245                                    TargetTransformInfo::OP_None,
246                                    TargetTransformInfo::OP_None);
247     Cost += getArithmeticInstrCost(Instruction::Add, Ty, Op1Info, Op2Info,
248                                    TargetTransformInfo::OP_None,
249                                    TargetTransformInfo::OP_None);
250 
251  return Cost;
252  }
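// Illustrative expansion (a sketch; the sequence is produced by the backend,
// not by this function): for a uniform power-of-two divisor such as
//   %r = sdiv <4 x i32> %x, <i32 8, i32 8, i32 8, i32 8>
// each lane is lowered roughly as
//   t0 = ashr x, 31 ; t1 = lshr t0, 29 ; t2 = add x, t1 ; r = ashr t2, 3
// which is why the cost above is modelled as 2 * AShr + LShr + Add.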
253 
254  static const CostTblEntry AVX512BWUniformConstCostTable[] = {
255  { ISD::SHL, MVT::v64i8, 2 }, // psllw + pand.
256  { ISD::SRL, MVT::v64i8, 2 }, // psrlw + pand.
257  { ISD::SRA, MVT::v64i8, 4 }, // psrlw, pand, pxor, psubb.
258 
259  { ISD::SDIV, MVT::v32i16, 6 }, // vpmulhw sequence
260  { ISD::UDIV, MVT::v32i16, 6 }, // vpmulhuw sequence
261  };
262 
263   if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
264       ST->hasBWI()) {
265  if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD,
266  LT.second))
267  return LT.first * Entry->Cost;
268  }
269 
270  static const CostTblEntry AVX512UniformConstCostTable[] = {
271  { ISD::SRA, MVT::v2i64, 1 },
272  { ISD::SRA, MVT::v4i64, 1 },
273  { ISD::SRA, MVT::v8i64, 1 },
274 
275  { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
276  { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
277  };
278 
279   if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
280       ST->hasAVX512()) {
281  if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD,
282  LT.second))
283  return LT.first * Entry->Cost;
284  }
285 
286  static const CostTblEntry AVX2UniformConstCostTable[] = {
287  { ISD::SHL, MVT::v32i8, 2 }, // psllw + pand.
288  { ISD::SRL, MVT::v32i8, 2 }, // psrlw + pand.
289  { ISD::SRA, MVT::v32i8, 4 }, // psrlw, pand, pxor, psubb.
290 
291  { ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle.
292 
293  { ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence
294  { ISD::UDIV, MVT::v16i16, 6 }, // vpmulhuw sequence
295  { ISD::SDIV, MVT::v8i32, 15 }, // vpmuldq sequence
296  { ISD::UDIV, MVT::v8i32, 15 }, // vpmuludq sequence
297  };
298 
299   if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
300       ST->hasAVX2()) {
301  if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD,
302  LT.second))
303  return LT.first * Entry->Cost;
304  }
305 
306  static const CostTblEntry SSE2UniformConstCostTable[] = {
307  { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand.
308  { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand.
309  { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb.
310 
311  { ISD::SHL, MVT::v32i8, 4+2 }, // 2*(psllw + pand) + split.
312  { ISD::SRL, MVT::v32i8, 4+2 }, // 2*(psrlw + pand) + split.
313  { ISD::SRA, MVT::v32i8, 8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.
314 
315  { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
316  { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence
317  { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split.
318  { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence
319  { ISD::SDIV, MVT::v8i32, 38+2 }, // 2*pmuludq sequence + split.
320  { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence
321  { ISD::UDIV, MVT::v8i32, 30+2 }, // 2*pmuludq sequence + split.
322  { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence
323  };
324 
325   if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
326       ST->hasSSE2()) {
327  // pmuldq sequence.
328  if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
329  return LT.first * 32;
330  if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
331  return LT.first * 15;
332 
333  // XOP has faster vXi8 shifts.
334  if ((ISD != ISD::SHL && ISD != ISD::SRL && ISD != ISD::SRA) ||
335  !ST->hasXOP())
336  if (const auto *Entry =
337  CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
338  return LT.first * Entry->Cost;
339  }
340 
341  static const CostTblEntry AVX2UniformCostTable[] = {
342  // Uniform splats are cheaper for the following instructions.
343  { ISD::SHL, MVT::v16i16, 1 }, // psllw.
344  { ISD::SRL, MVT::v16i16, 1 }, // psrlw.
345  { ISD::SRA, MVT::v16i16, 1 }, // psraw.
346  };
347 
348   if (ST->hasAVX2() &&
349       ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
350  (Op2Info == TargetTransformInfo::OK_UniformValue))) {
351  if (const auto *Entry =
352  CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
353  return LT.first * Entry->Cost;
354  }
355 
356  static const CostTblEntry SSE2UniformCostTable[] = {
357  // Uniform splats are cheaper for the following instructions.
358  { ISD::SHL, MVT::v8i16, 1 }, // psllw.
359  { ISD::SHL, MVT::v4i32, 1 }, // pslld
360  { ISD::SHL, MVT::v2i64, 1 }, // psllq.
361 
362  { ISD::SRL, MVT::v8i16, 1 }, // psrlw.
363  { ISD::SRL, MVT::v4i32, 1 }, // psrld.
364  { ISD::SRL, MVT::v2i64, 1 }, // psrlq.
365 
366  { ISD::SRA, MVT::v8i16, 1 }, // psraw.
367  { ISD::SRA, MVT::v4i32, 1 }, // psrad.
368  };
369 
370   if (ST->hasSSE2() &&
371       ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
372  (Op2Info == TargetTransformInfo::OK_UniformValue))) {
373  if (const auto *Entry =
374  CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
375  return LT.first * Entry->Cost;
376  }
377 
378  static const CostTblEntry AVX512DQCostTable[] = {
379  { ISD::MUL, MVT::v2i64, 1 },
380  { ISD::MUL, MVT::v4i64, 1 },
381  { ISD::MUL, MVT::v8i64, 1 }
382  };
383 
384  // Look for AVX512DQ lowering tricks for custom cases.
385  if (ST->hasDQI())
386  if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
387  return LT.first * Entry->Cost;
388 
389  static const CostTblEntry AVX512BWCostTable[] = {
390  { ISD::SHL, MVT::v8i16, 1 }, // vpsllvw
391  { ISD::SRL, MVT::v8i16, 1 }, // vpsrlvw
392  { ISD::SRA, MVT::v8i16, 1 }, // vpsravw
393 
394  { ISD::SHL, MVT::v16i16, 1 }, // vpsllvw
395  { ISD::SRL, MVT::v16i16, 1 }, // vpsrlvw
396  { ISD::SRA, MVT::v16i16, 1 }, // vpsravw
397 
398  { ISD::SHL, MVT::v32i16, 1 }, // vpsllvw
399  { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw
400  { ISD::SRA, MVT::v32i16, 1 }, // vpsravw
401 
402  { ISD::SHL, MVT::v64i8, 11 }, // vpblendvb sequence.
403  { ISD::SRL, MVT::v64i8, 11 }, // vpblendvb sequence.
404  { ISD::SRA, MVT::v64i8, 24 }, // vpblendvb sequence.
405 
406  { ISD::MUL, MVT::v64i8, 11 }, // extend/pmullw/trunc sequence.
407  { ISD::MUL, MVT::v32i8, 4 }, // extend/pmullw/trunc sequence.
408  { ISD::MUL, MVT::v16i8, 4 }, // extend/pmullw/trunc sequence.
409 
410  // Vectorizing division is a bad idea. See the SSE2 table for more comments.
411  { ISD::SDIV, MVT::v64i8, 64*20 },
412  { ISD::SDIV, MVT::v32i16, 32*20 },
413  { ISD::UDIV, MVT::v64i8, 64*20 },
414  { ISD::UDIV, MVT::v32i16, 32*20 }
415  };
416 
417  // Look for AVX512BW lowering tricks for custom cases.
418  if (ST->hasBWI())
419  if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
420  return LT.first * Entry->Cost;
421 
422  static const CostTblEntry AVX512CostTable[] = {
423  { ISD::SHL, MVT::v16i32, 1 },
424  { ISD::SRL, MVT::v16i32, 1 },
425  { ISD::SRA, MVT::v16i32, 1 },
426 
427  { ISD::SHL, MVT::v8i64, 1 },
428  { ISD::SRL, MVT::v8i64, 1 },
429 
430  { ISD::SRA, MVT::v2i64, 1 },
431  { ISD::SRA, MVT::v4i64, 1 },
432  { ISD::SRA, MVT::v8i64, 1 },
433 
434  { ISD::MUL, MVT::v32i8, 13 }, // extend/pmullw/trunc sequence.
435  { ISD::MUL, MVT::v16i8, 5 }, // extend/pmullw/trunc sequence.
436  { ISD::MUL, MVT::v16i32, 1 }, // pmulld (Skylake from agner.org)
437  { ISD::MUL, MVT::v8i32, 1 }, // pmulld (Skylake from agner.org)
438  { ISD::MUL, MVT::v4i32, 1 }, // pmulld (Skylake from agner.org)
439  { ISD::MUL, MVT::v8i64, 8 }, // 3*pmuludq/3*shift/2*add
440 
441  // Vectorizing division is a bad idea. See the SSE2 table for more comments.
442  { ISD::SDIV, MVT::v16i32, 16*20 },
443  { ISD::SDIV, MVT::v8i64, 8*20 },
444  { ISD::UDIV, MVT::v16i32, 16*20 },
445  { ISD::UDIV, MVT::v8i64, 8*20 }
446  };
447 
448  if (ST->hasAVX512())
449  if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
450  return LT.first * Entry->Cost;
451 
452  static const CostTblEntry AVX2ShiftCostTable[] = {
453   // Shifts on v4i64/v8i32 are legal on AVX2, even though we declare them as
454   // custom in order to detect the cases where the shift amount is a scalar.
455  { ISD::SHL, MVT::v4i32, 1 },
456  { ISD::SRL, MVT::v4i32, 1 },
457  { ISD::SRA, MVT::v4i32, 1 },
458  { ISD::SHL, MVT::v8i32, 1 },
459  { ISD::SRL, MVT::v8i32, 1 },
460  { ISD::SRA, MVT::v8i32, 1 },
461  { ISD::SHL, MVT::v2i64, 1 },
462  { ISD::SRL, MVT::v2i64, 1 },
463  { ISD::SHL, MVT::v4i64, 1 },
464  { ISD::SRL, MVT::v4i64, 1 },
465  };
466 
467  // Look for AVX2 lowering tricks.
468  if (ST->hasAVX2()) {
469     if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
470         (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
471          Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
472  // On AVX2, a packed v16i16 shift left by a constant build_vector
473  // is lowered into a vector multiply (vpmullw).
474  return LT.first;
475 
476  if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
477  return LT.first * Entry->Cost;
478  }
479 
480  static const CostTblEntry XOPShiftCostTable[] = {
481  // 128bit shifts take 1cy, but right shifts require negation beforehand.
482  { ISD::SHL, MVT::v16i8, 1 },
483  { ISD::SRL, MVT::v16i8, 2 },
484  { ISD::SRA, MVT::v16i8, 2 },
485  { ISD::SHL, MVT::v8i16, 1 },
486  { ISD::SRL, MVT::v8i16, 2 },
487  { ISD::SRA, MVT::v8i16, 2 },
488  { ISD::SHL, MVT::v4i32, 1 },
489  { ISD::SRL, MVT::v4i32, 2 },
490  { ISD::SRA, MVT::v4i32, 2 },
491  { ISD::SHL, MVT::v2i64, 1 },
492  { ISD::SRL, MVT::v2i64, 2 },
493  { ISD::SRA, MVT::v2i64, 2 },
494  // 256bit shifts require splitting if AVX2 didn't catch them above.
495  { ISD::SHL, MVT::v32i8, 2+2 },
496  { ISD::SRL, MVT::v32i8, 4+2 },
497  { ISD::SRA, MVT::v32i8, 4+2 },
498  { ISD::SHL, MVT::v16i16, 2+2 },
499  { ISD::SRL, MVT::v16i16, 4+2 },
500  { ISD::SRA, MVT::v16i16, 4+2 },
501  { ISD::SHL, MVT::v8i32, 2+2 },
502  { ISD::SRL, MVT::v8i32, 4+2 },
503  { ISD::SRA, MVT::v8i32, 4+2 },
504  { ISD::SHL, MVT::v4i64, 2+2 },
505  { ISD::SRL, MVT::v4i64, 4+2 },
506  { ISD::SRA, MVT::v4i64, 4+2 },
507  };
508 
509  // Look for XOP lowering tricks.
510  if (ST->hasXOP())
511  if (const auto *Entry = CostTableLookup(XOPShiftCostTable, ISD, LT.second))
512  return LT.first * Entry->Cost;
513 
514  static const CostTblEntry SSE2UniformShiftCostTable[] = {
515  // Uniform splats are cheaper for the following instructions.
516  { ISD::SHL, MVT::v16i16, 2+2 }, // 2*psllw + split.
517  { ISD::SHL, MVT::v8i32, 2+2 }, // 2*pslld + split.
518  { ISD::SHL, MVT::v4i64, 2+2 }, // 2*psllq + split.
519 
520  { ISD::SRL, MVT::v16i16, 2+2 }, // 2*psrlw + split.
521  { ISD::SRL, MVT::v8i32, 2+2 }, // 2*psrld + split.
522  { ISD::SRL, MVT::v4i64, 2+2 }, // 2*psrlq + split.
523 
524  { ISD::SRA, MVT::v16i16, 2+2 }, // 2*psraw + split.
525  { ISD::SRA, MVT::v8i32, 2+2 }, // 2*psrad + split.
526  { ISD::SRA, MVT::v2i64, 4 }, // 2*psrad + shuffle.
527  { ISD::SRA, MVT::v4i64, 8+2 }, // 2*(2*psrad + shuffle) + split.
528  };
529 
530   if (ST->hasSSE2() &&
531       ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
532  (Op2Info == TargetTransformInfo::OK_UniformValue))) {
533 
534  // Handle AVX2 uniform v4i64 ISD::SRA, it's not worth a table.
535  if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2())
536  return LT.first * 4; // 2*psrad + shuffle.
537 
538  if (const auto *Entry =
539  CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second))
540  return LT.first * Entry->Cost;
541  }
542 
543   if (ISD == ISD::SHL &&
544       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
545     MVT VT = LT.second;
546     // A vector shift left by a non-uniform constant can be lowered into a
547     // vector multiply.
548  if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
549  ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
550  ISD = ISD::MUL;
551  }
552 
553  static const CostTblEntry AVX2CostTable[] = {
554  { ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence.
555  { ISD::SHL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
556 
557  { ISD::SRL, MVT::v32i8, 11 }, // vpblendvb sequence.
558  { ISD::SRL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
559 
560  { ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence.
561  { ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence.
562  { ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence.
563  { ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence.
564 
565  { ISD::SUB, MVT::v32i8, 1 }, // psubb
566  { ISD::ADD, MVT::v32i8, 1 }, // paddb
567  { ISD::SUB, MVT::v16i16, 1 }, // psubw
568  { ISD::ADD, MVT::v16i16, 1 }, // paddw
569  { ISD::SUB, MVT::v8i32, 1 }, // psubd
570  { ISD::ADD, MVT::v8i32, 1 }, // paddd
571  { ISD::SUB, MVT::v4i64, 1 }, // psubq
572  { ISD::ADD, MVT::v4i64, 1 }, // paddq
573 
574  { ISD::MUL, MVT::v32i8, 17 }, // extend/pmullw/trunc sequence.
575  { ISD::MUL, MVT::v16i8, 7 }, // extend/pmullw/trunc sequence.
576  { ISD::MUL, MVT::v16i16, 1 }, // pmullw
577  { ISD::MUL, MVT::v8i32, 2 }, // pmulld (Haswell from agner.org)
578  { ISD::MUL, MVT::v4i64, 8 }, // 3*pmuludq/3*shift/2*add
579 
580  { ISD::FDIV, MVT::f32, 7 }, // Haswell from http://www.agner.org/
581  { ISD::FDIV, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
582  { ISD::FDIV, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
583  { ISD::FDIV, MVT::f64, 14 }, // Haswell from http://www.agner.org/
584  { ISD::FDIV, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
585  { ISD::FDIV, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
586  };
587 
588  // Look for AVX2 lowering tricks for custom cases.
589  if (ST->hasAVX2())
590  if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
591  return LT.first * Entry->Cost;
592 
593  static const CostTblEntry AVX1CostTable[] = {
594  // We don't have to scalarize unsupported ops. We can issue two half-sized
595  // operations and we only need to extract the upper YMM half.
596  // Two ops + 1 extract + 1 insert = 4.
597  { ISD::MUL, MVT::v16i16, 4 },
598  { ISD::MUL, MVT::v8i32, 4 },
599  { ISD::SUB, MVT::v32i8, 4 },
600  { ISD::ADD, MVT::v32i8, 4 },
601  { ISD::SUB, MVT::v16i16, 4 },
602  { ISD::ADD, MVT::v16i16, 4 },
603  { ISD::SUB, MVT::v8i32, 4 },
604  { ISD::ADD, MVT::v8i32, 4 },
605  { ISD::SUB, MVT::v4i64, 4 },
606  { ISD::ADD, MVT::v4i64, 4 },
607 
608  // A v4i64 multiply is custom lowered as two split v2i64 vectors that then
609  // are lowered as a series of long multiplies(3), shifts(3) and adds(2)
610  // Because we believe v4i64 to be a legal type, we must also include the
611  // extract+insert in the cost table. Therefore, the cost here is 18
612  // instead of 8.
613  { ISD::MUL, MVT::v4i64, 18 },
614 
615  { ISD::MUL, MVT::v32i8, 26 }, // extend/pmullw/trunc sequence.
616 
617  { ISD::FDIV, MVT::f32, 14 }, // SNB from http://www.agner.org/
618  { ISD::FDIV, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
619  { ISD::FDIV, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
620  { ISD::FDIV, MVT::f64, 22 }, // SNB from http://www.agner.org/
621  { ISD::FDIV, MVT::v2f64, 22 }, // SNB from http://www.agner.org/
622  { ISD::FDIV, MVT::v4f64, 44 }, // SNB from http://www.agner.org/
623 
624  // Vectorizing division is a bad idea. See the SSE2 table for more comments.
625  { ISD::SDIV, MVT::v32i8, 32*20 },
626  { ISD::SDIV, MVT::v16i16, 16*20 },
627  { ISD::SDIV, MVT::v8i32, 8*20 },
628  { ISD::SDIV, MVT::v4i64, 4*20 },
629  { ISD::UDIV, MVT::v32i8, 32*20 },
630  { ISD::UDIV, MVT::v16i16, 16*20 },
631  { ISD::UDIV, MVT::v8i32, 8*20 },
632  { ISD::UDIV, MVT::v4i64, 4*20 },
633  };
634 
635  if (ST->hasAVX())
636  if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
637  return LT.first * Entry->Cost;
638 
639  static const CostTblEntry SSE42CostTable[] = {
640  { ISD::FDIV, MVT::f32, 14 }, // Nehalem from http://www.agner.org/
641  { ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
642  { ISD::FDIV, MVT::f64, 22 }, // Nehalem from http://www.agner.org/
643  { ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/
644  };
645 
646  if (ST->hasSSE42())
647  if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
648  return LT.first * Entry->Cost;
649 
650  static const CostTblEntry SSE41CostTable[] = {
651  { ISD::SHL, MVT::v16i8, 11 }, // pblendvb sequence.
652  { ISD::SHL, MVT::v32i8, 2*11+2 }, // pblendvb sequence + split.
653  { ISD::SHL, MVT::v8i16, 14 }, // pblendvb sequence.
654  { ISD::SHL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
655  { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld
656  { ISD::SHL, MVT::v8i32, 2*4+2 }, // pslld/paddd/cvttps2dq/pmulld + split
657 
658  { ISD::SRL, MVT::v16i8, 12 }, // pblendvb sequence.
659  { ISD::SRL, MVT::v32i8, 2*12+2 }, // pblendvb sequence + split.
660  { ISD::SRL, MVT::v8i16, 14 }, // pblendvb sequence.
661  { ISD::SRL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
662  { ISD::SRL, MVT::v4i32, 11 }, // Shift each lane + blend.
663  { ISD::SRL, MVT::v8i32, 2*11+2 }, // Shift each lane + blend + split.
664 
665  { ISD::SRA, MVT::v16i8, 24 }, // pblendvb sequence.
666  { ISD::SRA, MVT::v32i8, 2*24+2 }, // pblendvb sequence + split.
667  { ISD::SRA, MVT::v8i16, 14 }, // pblendvb sequence.
668  { ISD::SRA, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
669  { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend.
670  { ISD::SRA, MVT::v8i32, 2*12+2 }, // Shift each lane + blend + split.
671 
672  { ISD::MUL, MVT::v4i32, 2 } // pmulld (Nehalem from agner.org)
673  };
674 
675  if (ST->hasSSE41())
676  if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
677  return LT.first * Entry->Cost;
678 
679  static const CostTblEntry SSE2CostTable[] = {
680  // We don't correctly identify costs of casts because they are marked as
681  // custom.
682  { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence.
683  { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence.
684  { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul.
685  { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence.
686  { ISD::SHL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split.
687 
688  { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence.
689  { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence.
690  { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend.
691  { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence.
692  { ISD::SRL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split.
693 
694  { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence.
695  { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence.
696  { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend.
697  { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence.
698  { ISD::SRA, MVT::v4i64, 2*12+2 }, // srl/xor/sub sequence+split.
699 
700  { ISD::MUL, MVT::v16i8, 12 }, // extend/pmullw/trunc sequence.
701  { ISD::MUL, MVT::v8i16, 1 }, // pmullw
702  { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle
703  { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add
704 
705  { ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/
706  { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/
707  { ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/
708  { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/
709 
710   // It is not a good idea to vectorize division. We have to scalarize it and
711   // in the process we will often end up having to spill regular
712   // registers. The overhead of division is going to dominate most kernels
713   // anyway, so try hard to prevent vectorization of division - it is
714   // generally a bad idea. Assume somewhat arbitrarily that we have to be able
715   // to hide "20 cycles" for each lane (a worked example follows this table).
716  { ISD::SDIV, MVT::v16i8, 16*20 },
717  { ISD::SDIV, MVT::v8i16, 8*20 },
718  { ISD::SDIV, MVT::v4i32, 4*20 },
719  { ISD::SDIV, MVT::v2i64, 2*20 },
720  { ISD::UDIV, MVT::v16i8, 16*20 },
721  { ISD::UDIV, MVT::v8i16, 8*20 },
722  { ISD::UDIV, MVT::v4i32, 4*20 },
723  { ISD::UDIV, MVT::v2i64, 2*20 },
724  };
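// Worked example of the heuristic above (illustrative): a <4 x i32> sdiv hits
// the { ISD::SDIV, MVT::v4i32, 4*20 } entry, i.e. cost 80 for one legal
// vector, and a <8 x i32> sdiv doubles that again through LT.first. The point
// is simply to make vector division look far worse than the scalar form.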
725 
726  if (ST->hasSSE2())
727  if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
728  return LT.first * Entry->Cost;
729 
730  static const CostTblEntry SSE1CostTable[] = {
731  { ISD::FDIV, MVT::f32, 17 }, // Pentium III from http://www.agner.org/
732  { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/
733  };
734 
735  if (ST->hasSSE1())
736  if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
737  return LT.first * Entry->Cost;
738 
739  // Fallback to the default implementation.
740  return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info);
741 }
742 
743 int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
744                                Type *SubTp) {
745  // 64-bit packed float vectors (v2f32) are widened to type v4f32.
746  // 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
747  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
748 
749   // For Broadcasts we are splatting the first element from the first input
750   // register, so we only need to reference that one input; all of the
751   // output registers are the same.
752  if (Kind == TTI::SK_Broadcast)
753  LT.first = 1;
754 
755   // We are going to permute multiple sources and the result will be in
756   // multiple destinations. We provide an accurate cost only for splits where
757   // the element type remains the same.
758  if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
759  MVT LegalVT = LT.second;
760  if (LegalVT.isVector() &&
761       LegalVT.getVectorElementType().getSizeInBits() ==
762           Tp->getVectorElementType()->getPrimitiveSizeInBits() &&
763       LegalVT.getVectorNumElements() < Tp->getVectorNumElements()) {
764 
765  unsigned VecTySize = DL.getTypeStoreSize(Tp);
766  unsigned LegalVTSize = LegalVT.getStoreSize();
767  // Number of source vectors after legalization:
768  unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
769  // Number of destination vectors after legalization:
770  unsigned NumOfDests = LT.first;
771 
772  Type *SingleOpTy = VectorType::get(Tp->getVectorElementType(),
773  LegalVT.getVectorNumElements());
774 
775  unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
776  return NumOfShuffles *
777  getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr);
778  }
779 
780  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
781  }
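// Worked example (illustrative): a single-source shuffle of <16 x i32> on an
// AVX1-only target legalizes into two v8i32 halves (LT.first == 2, so
// NumOfSrcs == NumOfDests == 2), and the block above prices it as
// (2 - 1) * 2 = 2 two-source v8i32 shuffles instead of the generic fallback.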
782 
783  // For 2-input shuffles, we must account for splitting the 2 inputs into many.
784  if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
785  // We assume that source and destination have the same vector type.
786  int NumOfDests = LT.first;
787  int NumOfShufflesPerDest = LT.first * 2 - 1;
788  LT.first = NumOfDests * NumOfShufflesPerDest;
789  }
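// Worked example (illustrative): if a two-source shuffle legalizes into two
// registers (LT.first == 2), the formula above charges
// 2 destinations * (2 * 2 - 1) = 6 legal-width two-source shuffles before the
// per-ISA tables below are consulted.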
790 
791  static const CostTblEntry AVX512VBMIShuffleTbl[] = {
792  { TTI::SK_Reverse, MVT::v64i8, 1 }, // vpermb
793  { TTI::SK_Reverse, MVT::v32i8, 1 }, // vpermb
794 
795  { TTI::SK_PermuteSingleSrc, MVT::v64i8, 1 }, // vpermb
796  { TTI::SK_PermuteSingleSrc, MVT::v32i8, 1 }, // vpermb
797 
798  { TTI::SK_PermuteTwoSrc, MVT::v64i8, 1 }, // vpermt2b
799  { TTI::SK_PermuteTwoSrc, MVT::v32i8, 1 }, // vpermt2b
800  { TTI::SK_PermuteTwoSrc, MVT::v16i8, 1 } // vpermt2b
801  };
802 
803  if (ST->hasVBMI())
804  if (const auto *Entry =
805  CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
806  return LT.first * Entry->Cost;
807 
808  static const CostTblEntry AVX512BWShuffleTbl[] = {
809  { TTI::SK_Broadcast, MVT::v32i16, 1 }, // vpbroadcastw
810  { TTI::SK_Broadcast, MVT::v64i8, 1 }, // vpbroadcastb
811 
812  { TTI::SK_Reverse, MVT::v32i16, 1 }, // vpermw
813  { TTI::SK_Reverse, MVT::v16i16, 1 }, // vpermw
814  { TTI::SK_Reverse, MVT::v64i8, 2 }, // pshufb + vshufi64x2
815 
816  { TTI::SK_PermuteSingleSrc, MVT::v32i16, 1 }, // vpermw
817  { TTI::SK_PermuteSingleSrc, MVT::v16i16, 1 }, // vpermw
818  { TTI::SK_PermuteSingleSrc, MVT::v8i16, 1 }, // vpermw
819  { TTI::SK_PermuteSingleSrc, MVT::v64i8, 8 }, // extend to v32i16
820  { TTI::SK_PermuteSingleSrc, MVT::v32i8, 3 }, // vpermw + zext/trunc
821 
822  { TTI::SK_PermuteTwoSrc, MVT::v32i16, 1 }, // vpermt2w
823  { TTI::SK_PermuteTwoSrc, MVT::v16i16, 1 }, // vpermt2w
824  { TTI::SK_PermuteTwoSrc, MVT::v8i16, 1 }, // vpermt2w
825  { TTI::SK_PermuteTwoSrc, MVT::v32i8, 3 }, // zext + vpermt2w + trunc
826  { TTI::SK_PermuteTwoSrc, MVT::v64i8, 19 }, // 6 * v32i8 + 1
827  { TTI::SK_PermuteTwoSrc, MVT::v16i8, 3 } // zext + vpermt2w + trunc
828  };
829 
830  if (ST->hasBWI())
831  if (const auto *Entry =
832  CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
833  return LT.first * Entry->Cost;
834 
835  static const CostTblEntry AVX512ShuffleTbl[] = {
836  { TTI::SK_Broadcast, MVT::v8f64, 1 }, // vbroadcastpd
837  { TTI::SK_Broadcast, MVT::v16f32, 1 }, // vbroadcastps
838  { TTI::SK_Broadcast, MVT::v8i64, 1 }, // vpbroadcastq
839  { TTI::SK_Broadcast, MVT::v16i32, 1 }, // vpbroadcastd
840 
841  { TTI::SK_Reverse, MVT::v8f64, 1 }, // vpermpd
842  { TTI::SK_Reverse, MVT::v16f32, 1 }, // vpermps
843  { TTI::SK_Reverse, MVT::v8i64, 1 }, // vpermq
844  { TTI::SK_Reverse, MVT::v16i32, 1 }, // vpermd
845 
846  { TTI::SK_PermuteSingleSrc, MVT::v8f64, 1 }, // vpermpd
847  { TTI::SK_PermuteSingleSrc, MVT::v4f64, 1 }, // vpermpd
848  { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // vpermpd
849  { TTI::SK_PermuteSingleSrc, MVT::v16f32, 1 }, // vpermps
850  { TTI::SK_PermuteSingleSrc, MVT::v8f32, 1 }, // vpermps
851  { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // vpermps
852  { TTI::SK_PermuteSingleSrc, MVT::v8i64, 1 }, // vpermq
853  { TTI::SK_PermuteSingleSrc, MVT::v4i64, 1 }, // vpermq
854  { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // vpermq
855  { TTI::SK_PermuteSingleSrc, MVT::v16i32, 1 }, // vpermd
856  { TTI::SK_PermuteSingleSrc, MVT::v8i32, 1 }, // vpermd
857  { TTI::SK_PermuteSingleSrc, MVT::v4i32, 1 }, // vpermd
858  { TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 }, // pshufb
859 
860  { TTI::SK_PermuteTwoSrc, MVT::v8f64, 1 }, // vpermt2pd
861  { TTI::SK_PermuteTwoSrc, MVT::v16f32, 1 }, // vpermt2ps
862  { TTI::SK_PermuteTwoSrc, MVT::v8i64, 1 }, // vpermt2q
863  { TTI::SK_PermuteTwoSrc, MVT::v16i32, 1 }, // vpermt2d
864  { TTI::SK_PermuteTwoSrc, MVT::v4f64, 1 }, // vpermt2pd
865  { TTI::SK_PermuteTwoSrc, MVT::v8f32, 1 }, // vpermt2ps
866  { TTI::SK_PermuteTwoSrc, MVT::v4i64, 1 }, // vpermt2q
867  { TTI::SK_PermuteTwoSrc, MVT::v8i32, 1 }, // vpermt2d
868  { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // vpermt2pd
869  { TTI::SK_PermuteTwoSrc, MVT::v4f32, 1 }, // vpermt2ps
870  { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // vpermt2q
871  { TTI::SK_PermuteTwoSrc, MVT::v4i32, 1 } // vpermt2d
872  };
873 
874  if (ST->hasAVX512())
875  if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
876  return LT.first * Entry->Cost;
877 
878  static const CostTblEntry AVX2ShuffleTbl[] = {
879  { TTI::SK_Broadcast, MVT::v4f64, 1 }, // vbroadcastpd
880  { TTI::SK_Broadcast, MVT::v8f32, 1 }, // vbroadcastps
881  { TTI::SK_Broadcast, MVT::v4i64, 1 }, // vpbroadcastq
882  { TTI::SK_Broadcast, MVT::v8i32, 1 }, // vpbroadcastd
883  { TTI::SK_Broadcast, MVT::v16i16, 1 }, // vpbroadcastw
884  { TTI::SK_Broadcast, MVT::v32i8, 1 }, // vpbroadcastb
885 
886  { TTI::SK_Reverse, MVT::v4f64, 1 }, // vpermpd
887  { TTI::SK_Reverse, MVT::v8f32, 1 }, // vpermps
888  { TTI::SK_Reverse, MVT::v4i64, 1 }, // vpermq
889  { TTI::SK_Reverse, MVT::v8i32, 1 }, // vpermd
890  { TTI::SK_Reverse, MVT::v16i16, 2 }, // vperm2i128 + pshufb
891  { TTI::SK_Reverse, MVT::v32i8, 2 }, // vperm2i128 + pshufb
892 
893  { TTI::SK_Alternate, MVT::v16i16, 1 }, // vpblendw
894  { TTI::SK_Alternate, MVT::v32i8, 1 }, // vpblendvb
895 
896  { TTI::SK_PermuteSingleSrc, MVT::v4f64, 1 }, // vpermpd
897  { TTI::SK_PermuteSingleSrc, MVT::v8f32, 1 }, // vpermps
898  { TTI::SK_PermuteSingleSrc, MVT::v4i64, 1 }, // vpermq
899  { TTI::SK_PermuteSingleSrc, MVT::v8i32, 1 }, // vpermd
900  { TTI::SK_PermuteSingleSrc, MVT::v16i16, 4 }, // vperm2i128 + 2*vpshufb
901  // + vpblendvb
902  { TTI::SK_PermuteSingleSrc, MVT::v32i8, 4 }, // vperm2i128 + 2*vpshufb
903  // + vpblendvb
904 
905  { TTI::SK_PermuteTwoSrc, MVT::v4f64, 3 }, // 2*vpermpd + vblendpd
906  { TTI::SK_PermuteTwoSrc, MVT::v8f32, 3 }, // 2*vpermps + vblendps
907  { TTI::SK_PermuteTwoSrc, MVT::v4i64, 3 }, // 2*vpermq + vpblendd
908  { TTI::SK_PermuteTwoSrc, MVT::v8i32, 3 }, // 2*vpermd + vpblendd
909  { TTI::SK_PermuteTwoSrc, MVT::v16i16, 7 }, // 2*vperm2i128 + 4*vpshufb
910  // + vpblendvb
911  { TTI::SK_PermuteTwoSrc, MVT::v32i8, 7 }, // 2*vperm2i128 + 4*vpshufb
912  // + vpblendvb
913  };
914 
915  if (ST->hasAVX2())
916  if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
917  return LT.first * Entry->Cost;
918 
919  static const CostTblEntry XOPShuffleTbl[] = {
920  { TTI::SK_PermuteSingleSrc, MVT::v4f64, 2 }, // vperm2f128 + vpermil2pd
921  { TTI::SK_PermuteSingleSrc, MVT::v8f32, 2 }, // vperm2f128 + vpermil2ps
922  { TTI::SK_PermuteSingleSrc, MVT::v4i64, 2 }, // vperm2f128 + vpermil2pd
923  { TTI::SK_PermuteSingleSrc, MVT::v8i32, 2 }, // vperm2f128 + vpermil2ps
924  { TTI::SK_PermuteSingleSrc, MVT::v16i16, 4 }, // vextractf128 + 2*vpperm
925  // + vinsertf128
926  { TTI::SK_PermuteSingleSrc, MVT::v32i8, 4 }, // vextractf128 + 2*vpperm
927  // + vinsertf128
928 
929  { TTI::SK_PermuteTwoSrc, MVT::v16i16, 9 }, // 2*vextractf128 + 6*vpperm
930  // + vinsertf128
931  { TTI::SK_PermuteTwoSrc, MVT::v8i16, 1 }, // vpperm
932  { TTI::SK_PermuteTwoSrc, MVT::v32i8, 9 }, // 2*vextractf128 + 6*vpperm
933  // + vinsertf128
934  { TTI::SK_PermuteTwoSrc, MVT::v16i8, 1 }, // vpperm
935  };
936 
937  if (ST->hasXOP())
938  if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
939  return LT.first * Entry->Cost;
940 
941  static const CostTblEntry AVX1ShuffleTbl[] = {
942  { TTI::SK_Broadcast, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd
943  { TTI::SK_Broadcast, MVT::v8f32, 2 }, // vperm2f128 + vpermilps
944  { TTI::SK_Broadcast, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd
945  { TTI::SK_Broadcast, MVT::v8i32, 2 }, // vperm2f128 + vpermilps
946  { TTI::SK_Broadcast, MVT::v16i16, 3 }, // vpshuflw + vpshufd + vinsertf128
947  { TTI::SK_Broadcast, MVT::v32i8, 2 }, // vpshufb + vinsertf128
948 
949  { TTI::SK_Reverse, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd
950  { TTI::SK_Reverse, MVT::v8f32, 2 }, // vperm2f128 + vpermilps
951  { TTI::SK_Reverse, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd
952  { TTI::SK_Reverse, MVT::v8i32, 2 }, // vperm2f128 + vpermilps
953  { TTI::SK_Reverse, MVT::v16i16, 4 }, // vextractf128 + 2*pshufb
954  // + vinsertf128
955  { TTI::SK_Reverse, MVT::v32i8, 4 }, // vextractf128 + 2*pshufb
956  // + vinsertf128
957 
958  { TTI::SK_Alternate, MVT::v4i64, 1 }, // vblendpd
959  { TTI::SK_Alternate, MVT::v4f64, 1 }, // vblendpd
960  { TTI::SK_Alternate, MVT::v8i32, 1 }, // vblendps
961  { TTI::SK_Alternate, MVT::v8f32, 1 }, // vblendps
962  { TTI::SK_Alternate, MVT::v16i16, 3 }, // vpand + vpandn + vpor
963  { TTI::SK_Alternate, MVT::v32i8, 3 }, // vpand + vpandn + vpor
964 
965  { TTI::SK_PermuteSingleSrc, MVT::v4f64, 3 }, // 2*vperm2f128 + vshufpd
966  { TTI::SK_PermuteSingleSrc, MVT::v4i64, 3 }, // 2*vperm2f128 + vshufpd
967  { TTI::SK_PermuteSingleSrc, MVT::v8f32, 4 }, // 2*vperm2f128 + 2*vshufps
968  { TTI::SK_PermuteSingleSrc, MVT::v8i32, 4 }, // 2*vperm2f128 + 2*vshufps
969  { TTI::SK_PermuteSingleSrc, MVT::v16i16, 8 }, // vextractf128 + 4*pshufb
970  // + 2*por + vinsertf128
971  { TTI::SK_PermuteSingleSrc, MVT::v32i8, 8 }, // vextractf128 + 4*pshufb
972  // + 2*por + vinsertf128
973 
974  { TTI::SK_PermuteTwoSrc, MVT::v4f64, 4 }, // 2*vperm2f128 + 2*vshufpd
975  { TTI::SK_PermuteTwoSrc, MVT::v8f32, 4 }, // 2*vperm2f128 + 2*vshufps
976  { TTI::SK_PermuteTwoSrc, MVT::v4i64, 4 }, // 2*vperm2f128 + 2*vshufpd
977  { TTI::SK_PermuteTwoSrc, MVT::v8i32, 4 }, // 2*vperm2f128 + 2*vshufps
978  { TTI::SK_PermuteTwoSrc, MVT::v16i16, 15 }, // 2*vextractf128 + 8*pshufb
979  // + 4*por + vinsertf128
980  { TTI::SK_PermuteTwoSrc, MVT::v32i8, 15 }, // 2*vextractf128 + 8*pshufb
981  // + 4*por + vinsertf128
982  };
983 
984  if (ST->hasAVX())
985  if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
986  return LT.first * Entry->Cost;
987 
988  static const CostTblEntry SSE41ShuffleTbl[] = {
989  { TTI::SK_Alternate, MVT::v2i64, 1 }, // pblendw
990  { TTI::SK_Alternate, MVT::v2f64, 1 }, // movsd
991  { TTI::SK_Alternate, MVT::v4i32, 1 }, // pblendw
992  { TTI::SK_Alternate, MVT::v4f32, 1 }, // blendps
993  { TTI::SK_Alternate, MVT::v8i16, 1 }, // pblendw
994  { TTI::SK_Alternate, MVT::v16i8, 1 } // pblendvb
995  };
996 
997  if (ST->hasSSE41())
998  if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
999  return LT.first * Entry->Cost;
1000 
1001  static const CostTblEntry SSSE3ShuffleTbl[] = {
1002  { TTI::SK_Broadcast, MVT::v8i16, 1 }, // pshufb
1003  { TTI::SK_Broadcast, MVT::v16i8, 1 }, // pshufb
1004 
1005  { TTI::SK_Reverse, MVT::v8i16, 1 }, // pshufb
1006  { TTI::SK_Reverse, MVT::v16i8, 1 }, // pshufb
1007 
1008  { TTI::SK_Alternate, MVT::v8i16, 3 }, // 2*pshufb + por
1009  { TTI::SK_Alternate, MVT::v16i8, 3 }, // 2*pshufb + por
1010 
1011  { TTI::SK_PermuteSingleSrc, MVT::v8i16, 1 }, // pshufb
1012  { TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 }, // pshufb
1013 
1014  { TTI::SK_PermuteTwoSrc, MVT::v8i16, 3 }, // 2*pshufb + por
1015  { TTI::SK_PermuteTwoSrc, MVT::v16i8, 3 }, // 2*pshufb + por
1016  };
1017 
1018  if (ST->hasSSSE3())
1019  if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
1020  return LT.first * Entry->Cost;
1021 
1022  static const CostTblEntry SSE2ShuffleTbl[] = {
1023  { TTI::SK_Broadcast, MVT::v2f64, 1 }, // shufpd
1024  { TTI::SK_Broadcast, MVT::v2i64, 1 }, // pshufd
1025  { TTI::SK_Broadcast, MVT::v4i32, 1 }, // pshufd
1026  { TTI::SK_Broadcast, MVT::v8i16, 2 }, // pshuflw + pshufd
1027  { TTI::SK_Broadcast, MVT::v16i8, 3 }, // unpck + pshuflw + pshufd
1028 
1029  { TTI::SK_Reverse, MVT::v2f64, 1 }, // shufpd
1030  { TTI::SK_Reverse, MVT::v2i64, 1 }, // pshufd
1031  { TTI::SK_Reverse, MVT::v4i32, 1 }, // pshufd
1032  { TTI::SK_Reverse, MVT::v8i16, 3 }, // pshuflw + pshufhw + pshufd
1033  { TTI::SK_Reverse, MVT::v16i8, 9 }, // 2*pshuflw + 2*pshufhw
1034  // + 2*pshufd + 2*unpck + packus
1035 
1036  { TTI::SK_Alternate, MVT::v2i64, 1 }, // movsd
1037  { TTI::SK_Alternate, MVT::v2f64, 1 }, // movsd
1038  { TTI::SK_Alternate, MVT::v4i32, 2 }, // 2*shufps
1039  { TTI::SK_Alternate, MVT::v8i16, 3 }, // pand + pandn + por
1040  { TTI::SK_Alternate, MVT::v16i8, 3 }, // pand + pandn + por
1041 
1042  { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // shufpd
1043  { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // pshufd
1044  { TTI::SK_PermuteSingleSrc, MVT::v4i32, 1 }, // pshufd
1045  { TTI::SK_PermuteSingleSrc, MVT::v8i16, 5 }, // 2*pshuflw + 2*pshufhw
1046  // + pshufd/unpck
1047  { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
1048  // + 2*pshufd + 2*unpck + 2*packus
1049 
1050  { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd
1051  { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd
1052  { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd}
1053  { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute
1054  { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
1055  };
1056 
1057  if (ST->hasSSE2())
1058  if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
1059  return LT.first * Entry->Cost;
1060 
1061  static const CostTblEntry SSE1ShuffleTbl[] = {
1062  { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
1063  { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
1064  { TTI::SK_Alternate, MVT::v4f32, 2 }, // 2*shufps
1065  { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
1066  { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
1067  };
1068 
1069  if (ST->hasSSE1())
1070  if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
1071  return LT.first * Entry->Cost;
1072 
1073  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
1074 }
1075 
1076 int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
1077  const Instruction *I) {
1078  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1079  assert(ISD && "Invalid opcode");
1080 
1081   // FIXME: Need a better design of the cost table to handle non-simple types
1082   // and the potentially massive number of combinations (elem_num x src_type x dst_type).
1083 
1084  static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
1091 
1098 
1105 
1112  };
1113 
1114  // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
1115  // 256-bit wide vectors.
1116 
1117  static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
1121 
1126 
1127  // v16i1 -> v16i32 - load + broadcast
1138 
1149 
1173 
1182  };
1183 
1184  static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
1201 
1208 
1211 
1213  };
1214 
1215  static const TypeConversionCostTblEntry AVXConversionTbl[] = {
1232 
1240 
1253 
1267   // The generic code to compute the scalar overhead is currently broken.
1268   // Work around this limitation by estimating the scalarization overhead
1269   // here. We have roughly 10 instructions per scalar element.
1270   // Multiply that by the vector width.
1271   // FIXME: remove this workaround when PR19268 is fixed.
1276 
1279  // This node is expanded into scalarized operations but BasicTTI is overly
1280  // optimistic estimating its cost. It computes 3 per element (one
1281  // vector-extract, one scalar conversion and one vector-insert). The
1282  // problem is that the inserts form a read-modify-write chain so latency
1283  // should be factored in too. Inflating the cost per element by 1.
1286 
1289  };
1290 
1291  static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
1298 
1317 
1325 
1326  };
1327 
1328  static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
1329  // These are somewhat magic numbers justified by looking at the output of
1330  // Intel's IACA, running some kernels and making sure when we take
1331  // legalization into account the throughput will be overestimated.
1333  { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
1340 
1341  { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
1349 
1351 
1376 
1386  };
1387 
1388  std::pair<int, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
1389  std::pair<int, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst);
1390 
1391  if (ST->hasSSE2() && !ST->hasAVX()) {
1392  if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
1393  LTDest.second, LTSrc.second))
1394  return LTSrc.first * Entry->Cost;
1395  }
1396 
1397  EVT SrcTy = TLI->getValueType(DL, Src);
1398  EVT DstTy = TLI->getValueType(DL, Dst);
1399 
1400  // The function getSimpleVT only handles simple value types.
1401  if (!SrcTy.isSimple() || !DstTy.isSimple())
1402  return BaseT::getCastInstrCost(Opcode, Dst, Src);
1403 
1404  if (ST->hasDQI())
1405  if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD,
1406  DstTy.getSimpleVT(),
1407  SrcTy.getSimpleVT()))
1408  return Entry->Cost;
1409 
1410  if (ST->hasAVX512())
1411  if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD,
1412  DstTy.getSimpleVT(),
1413  SrcTy.getSimpleVT()))
1414  return Entry->Cost;
1415 
1416  if (ST->hasAVX2()) {
1417  if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
1418  DstTy.getSimpleVT(),
1419  SrcTy.getSimpleVT()))
1420  return Entry->Cost;
1421  }
1422 
1423  if (ST->hasAVX()) {
1424  if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
1425  DstTy.getSimpleVT(),
1426  SrcTy.getSimpleVT()))
1427  return Entry->Cost;
1428  }
1429 
1430  if (ST->hasSSE41()) {
1431  if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
1432  DstTy.getSimpleVT(),
1433  SrcTy.getSimpleVT()))
1434  return Entry->Cost;
1435  }
1436 
1437  if (ST->hasSSE2()) {
1438  if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
1439  DstTy.getSimpleVT(),
1440  SrcTy.getSimpleVT()))
1441  return Entry->Cost;
1442  }
1443 
1444  return BaseT::getCastInstrCost(Opcode, Dst, Src, I);
1445 }
1446 
1447 int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
1448  const Instruction *I) {
1449  // Legalize the type.
1450  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
1451 
1452  MVT MTy = LT.second;
1453 
1454  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1455  assert(ISD && "Invalid opcode");
1456 
1457  static const CostTblEntry SSE2CostTbl[] = {
1458  { ISD::SETCC, MVT::v2i64, 8 },
1459  { ISD::SETCC, MVT::v4i32, 1 },
1460  { ISD::SETCC, MVT::v8i16, 1 },
1461  { ISD::SETCC, MVT::v16i8, 1 },
1462  };
1463 
1464  static const CostTblEntry SSE42CostTbl[] = {
1465  { ISD::SETCC, MVT::v2f64, 1 },
1466  { ISD::SETCC, MVT::v4f32, 1 },
1467  { ISD::SETCC, MVT::v2i64, 1 },
1468  };
1469 
1470  static const CostTblEntry AVX1CostTbl[] = {
1471  { ISD::SETCC, MVT::v4f64, 1 },
1472  { ISD::SETCC, MVT::v8f32, 1 },
1473  // AVX1 does not support 8-wide integer compare.
1474  { ISD::SETCC, MVT::v4i64, 4 },
1475  { ISD::SETCC, MVT::v8i32, 4 },
1476  { ISD::SETCC, MVT::v16i16, 4 },
1477  { ISD::SETCC, MVT::v32i8, 4 },
1478  };
1479 
1480  static const CostTblEntry AVX2CostTbl[] = {
1481  { ISD::SETCC, MVT::v4i64, 1 },
1482  { ISD::SETCC, MVT::v8i32, 1 },
1483  { ISD::SETCC, MVT::v16i16, 1 },
1484  { ISD::SETCC, MVT::v32i8, 1 },
1485  };
1486 
1487  static const CostTblEntry AVX512CostTbl[] = {
1488  { ISD::SETCC, MVT::v8i64, 1 },
1489  { ISD::SETCC, MVT::v16i32, 1 },
1490  { ISD::SETCC, MVT::v8f64, 1 },
1491  { ISD::SETCC, MVT::v16f32, 1 },
1492  };
1493 
1494  if (ST->hasAVX512())
1495  if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
1496  return LT.first * Entry->Cost;
1497 
1498  if (ST->hasAVX2())
1499  if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
1500  return LT.first * Entry->Cost;
1501 
1502  if (ST->hasAVX())
1503  if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
1504  return LT.first * Entry->Cost;
1505 
1506  if (ST->hasSSE42())
1507  if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
1508  return LT.first * Entry->Cost;
1509 
1510  if (ST->hasSSE2())
1511  if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
1512  return LT.first * Entry->Cost;
1513 
1514  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
1515 }
1516 
1518 
1519 int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
1520                                       ArrayRef<Type *> Tys, FastMathFlags FMF,
1521                                       unsigned ScalarizationCostPassed) {
1522  // Costs should match the codegen from:
1523  // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
1524  // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
1525  // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
1526  // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
1527  // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
1528  static const CostTblEntry AVX512CDCostTbl[] = {
1529  { ISD::CTLZ, MVT::v8i64, 1 },
1530  { ISD::CTLZ, MVT::v16i32, 1 },
1531  { ISD::CTLZ, MVT::v32i16, 8 },
1532  { ISD::CTLZ, MVT::v64i8, 20 },
1533  { ISD::CTLZ, MVT::v4i64, 1 },
1534  { ISD::CTLZ, MVT::v8i32, 1 },
1535  { ISD::CTLZ, MVT::v16i16, 4 },
1536  { ISD::CTLZ, MVT::v32i8, 10 },
1537  { ISD::CTLZ, MVT::v2i64, 1 },
1538  { ISD::CTLZ, MVT::v4i32, 1 },
1539  { ISD::CTLZ, MVT::v8i16, 4 },
1540  { ISD::CTLZ, MVT::v16i8, 4 },
1541  };
1542  static const CostTblEntry AVX512BWCostTbl[] = {
1543  { ISD::BITREVERSE, MVT::v8i64, 5 },
1544  { ISD::BITREVERSE, MVT::v16i32, 5 },
1545  { ISD::BITREVERSE, MVT::v32i16, 5 },
1546  { ISD::BITREVERSE, MVT::v64i8, 5 },
1547  { ISD::CTLZ, MVT::v8i64, 23 },
1548  { ISD::CTLZ, MVT::v16i32, 22 },
1549  { ISD::CTLZ, MVT::v32i16, 18 },
1550  { ISD::CTLZ, MVT::v64i8, 17 },
1551  { ISD::CTPOP, MVT::v8i64, 7 },
1552  { ISD::CTPOP, MVT::v16i32, 11 },
1553  { ISD::CTPOP, MVT::v32i16, 9 },
1554  { ISD::CTPOP, MVT::v64i8, 6 },
1555  { ISD::CTTZ, MVT::v8i64, 10 },
1556  { ISD::CTTZ, MVT::v16i32, 14 },
1557  { ISD::CTTZ, MVT::v32i16, 12 },
1558  { ISD::CTTZ, MVT::v64i8, 9 },
1559  };
1560  static const CostTblEntry AVX512CostTbl[] = {
1561  { ISD::BITREVERSE, MVT::v8i64, 36 },
1562  { ISD::BITREVERSE, MVT::v16i32, 24 },
1563  { ISD::CTLZ, MVT::v8i64, 29 },
1564  { ISD::CTLZ, MVT::v16i32, 35 },
1565  { ISD::CTPOP, MVT::v8i64, 16 },
1566  { ISD::CTPOP, MVT::v16i32, 24 },
1567  { ISD::CTTZ, MVT::v8i64, 20 },
1568  { ISD::CTTZ, MVT::v16i32, 28 },
1569  };
1570  static const CostTblEntry XOPCostTbl[] = {
1571  { ISD::BITREVERSE, MVT::v4i64, 4 },
1572  { ISD::BITREVERSE, MVT::v8i32, 4 },
1573  { ISD::BITREVERSE, MVT::v16i16, 4 },
1574  { ISD::BITREVERSE, MVT::v32i8, 4 },
1575  { ISD::BITREVERSE, MVT::v2i64, 1 },
1576  { ISD::BITREVERSE, MVT::v4i32, 1 },
1577  { ISD::BITREVERSE, MVT::v8i16, 1 },
1578  { ISD::BITREVERSE, MVT::v16i8, 1 },
1579  { ISD::BITREVERSE, MVT::i64, 3 },
1580  { ISD::BITREVERSE, MVT::i32, 3 },
1581  { ISD::BITREVERSE, MVT::i16, 3 },
1582  { ISD::BITREVERSE, MVT::i8, 3 }
1583  };
1584  static const CostTblEntry AVX2CostTbl[] = {
1585  { ISD::BITREVERSE, MVT::v4i64, 5 },
1586  { ISD::BITREVERSE, MVT::v8i32, 5 },
1587  { ISD::BITREVERSE, MVT::v16i16, 5 },
1588  { ISD::BITREVERSE, MVT::v32i8, 5 },
1589  { ISD::BSWAP, MVT::v4i64, 1 },
1590  { ISD::BSWAP, MVT::v8i32, 1 },
1591  { ISD::BSWAP, MVT::v16i16, 1 },
1592  { ISD::CTLZ, MVT::v4i64, 23 },
1593  { ISD::CTLZ, MVT::v8i32, 18 },
1594  { ISD::CTLZ, MVT::v16i16, 14 },
1595  { ISD::CTLZ, MVT::v32i8, 9 },
1596  { ISD::CTPOP, MVT::v4i64, 7 },
1597  { ISD::CTPOP, MVT::v8i32, 11 },
1598  { ISD::CTPOP, MVT::v16i16, 9 },
1599  { ISD::CTPOP, MVT::v32i8, 6 },
1600  { ISD::CTTZ, MVT::v4i64, 10 },
1601  { ISD::CTTZ, MVT::v8i32, 14 },
1602  { ISD::CTTZ, MVT::v16i16, 12 },
1603  { ISD::CTTZ, MVT::v32i8, 9 },
1604  { ISD::FSQRT, MVT::f32, 7 }, // Haswell from http://www.agner.org/
1605  { ISD::FSQRT, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
1606  { ISD::FSQRT, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
1607  { ISD::FSQRT, MVT::f64, 14 }, // Haswell from http://www.agner.org/
1608  { ISD::FSQRT, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
1609  { ISD::FSQRT, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
1610  };
1611  static const CostTblEntry AVX1CostTbl[] = {
1612  { ISD::BITREVERSE, MVT::v4i64, 12 }, // 2 x 128-bit Op + extract/insert
1613  { ISD::BITREVERSE, MVT::v8i32, 12 }, // 2 x 128-bit Op + extract/insert
1614  { ISD::BITREVERSE, MVT::v16i16, 12 }, // 2 x 128-bit Op + extract/insert
1615  { ISD::BITREVERSE, MVT::v32i8, 12 }, // 2 x 128-bit Op + extract/insert
1616  { ISD::BSWAP, MVT::v4i64, 4 },
1617  { ISD::BSWAP, MVT::v8i32, 4 },
1618  { ISD::BSWAP, MVT::v16i16, 4 },
1619  { ISD::CTLZ, MVT::v4i64, 48 }, // 2 x 128-bit Op + extract/insert
1620  { ISD::CTLZ, MVT::v8i32, 38 }, // 2 x 128-bit Op + extract/insert
1621  { ISD::CTLZ, MVT::v16i16, 30 }, // 2 x 128-bit Op + extract/insert
1622  { ISD::CTLZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
1623  { ISD::CTPOP, MVT::v4i64, 16 }, // 2 x 128-bit Op + extract/insert
1624  { ISD::CTPOP, MVT::v8i32, 24 }, // 2 x 128-bit Op + extract/insert
1625  { ISD::CTPOP, MVT::v16i16, 20 }, // 2 x 128-bit Op + extract/insert
1626  { ISD::CTPOP, MVT::v32i8, 14 }, // 2 x 128-bit Op + extract/insert
1627  { ISD::CTTZ, MVT::v4i64, 22 }, // 2 x 128-bit Op + extract/insert
1628  { ISD::CTTZ, MVT::v8i32, 30 }, // 2 x 128-bit Op + extract/insert
1629  { ISD::CTTZ, MVT::v16i16, 26 }, // 2 x 128-bit Op + extract/insert
1630  { ISD::CTTZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
1631  { ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/
1632  { ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
1633  { ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
1634  { ISD::FSQRT, MVT::f64, 21 }, // SNB from http://www.agner.org/
1635  { ISD::FSQRT, MVT::v2f64, 21 }, // SNB from http://www.agner.org/
1636  { ISD::FSQRT, MVT::v4f64, 43 }, // SNB from http://www.agner.org/
1637  };
1638  static const CostTblEntry SSE42CostTbl[] = {
1639  { ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/
1640  { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
1641  };
1642  static const CostTblEntry SSSE3CostTbl[] = {
1643  { ISD::BITREVERSE, MVT::v2i64, 5 },
1644  { ISD::BITREVERSE, MVT::v4i32, 5 },
1645  { ISD::BITREVERSE, MVT::v8i16, 5 },
1646  { ISD::BITREVERSE, MVT::v16i8, 5 },
1647  { ISD::BSWAP, MVT::v2i64, 1 },
1648  { ISD::BSWAP, MVT::v4i32, 1 },
1649  { ISD::BSWAP, MVT::v8i16, 1 },
1650  { ISD::CTLZ, MVT::v2i64, 23 },
1651  { ISD::CTLZ, MVT::v4i32, 18 },
1652  { ISD::CTLZ, MVT::v8i16, 14 },
1653  { ISD::CTLZ, MVT::v16i8, 9 },
1654  { ISD::CTPOP, MVT::v2i64, 7 },
1655  { ISD::CTPOP, MVT::v4i32, 11 },
1656  { ISD::CTPOP, MVT::v8i16, 9 },
1657  { ISD::CTPOP, MVT::v16i8, 6 },
1658  { ISD::CTTZ, MVT::v2i64, 10 },
1659  { ISD::CTTZ, MVT::v4i32, 14 },
1660  { ISD::CTTZ, MVT::v8i16, 12 },
1661  { ISD::CTTZ, MVT::v16i8, 9 }
1662  };
1663  static const CostTblEntry SSE2CostTbl[] = {
1664  { ISD::BITREVERSE, MVT::v2i64, 29 },
1665  { ISD::BITREVERSE, MVT::v4i32, 27 },
1666  { ISD::BITREVERSE, MVT::v8i16, 27 },
1667  { ISD::BITREVERSE, MVT::v16i8, 20 },
1668  { ISD::BSWAP, MVT::v2i64, 7 },
1669  { ISD::BSWAP, MVT::v4i32, 7 },
1670  { ISD::BSWAP, MVT::v8i16, 7 },
1671  { ISD::CTLZ, MVT::v2i64, 25 },
1672  { ISD::CTLZ, MVT::v4i32, 26 },
1673  { ISD::CTLZ, MVT::v8i16, 20 },
1674  { ISD::CTLZ, MVT::v16i8, 17 },
1675  { ISD::CTPOP, MVT::v2i64, 12 },
1676  { ISD::CTPOP, MVT::v4i32, 15 },
1677  { ISD::CTPOP, MVT::v8i16, 13 },
1678  { ISD::CTPOP, MVT::v16i8, 10 },
1679  { ISD::CTTZ, MVT::v2i64, 14 },
1680  { ISD::CTTZ, MVT::v4i32, 18 },
1681  { ISD::CTTZ, MVT::v8i16, 16 },
1682  { ISD::CTTZ, MVT::v16i8, 13 },
1683  { ISD::FSQRT, MVT::f64, 32 }, // Nehalem from http://www.agner.org/
1684  { ISD::FSQRT, MVT::v2f64, 32 }, // Nehalem from http://www.agner.org/
1685  };
1686  static const CostTblEntry SSE1CostTbl[] = {
1687  { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/
1688  { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
1689  };
1690  static const CostTblEntry X64CostTbl[] = { // 64-bit targets
1691  { ISD::BITREVERSE, MVT::i64, 14 }
1692  };
1693  static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
1694  { ISD::BITREVERSE, MVT::i32, 14 },
1695  { ISD::BITREVERSE, MVT::i16, 14 },
1696  { ISD::BITREVERSE, MVT::i8, 11 }
1697  };
1698 
1699  unsigned ISD = ISD::DELETED_NODE;
1700  switch (IID) {
1701  default:
1702  break;
1703  case Intrinsic::bitreverse:
1704  ISD = ISD::BITREVERSE;
1705  break;
1706  case Intrinsic::bswap:
1707  ISD = ISD::BSWAP;
1708  break;
1709  case Intrinsic::ctlz:
1710  ISD = ISD::CTLZ;
1711  break;
1712  case Intrinsic::ctpop:
1713  ISD = ISD::CTPOP;
1714  break;
1715  case Intrinsic::cttz:
1716  ISD = ISD::CTTZ;
1717  break;
1718  case Intrinsic::sqrt:
1719  ISD = ISD::FSQRT;
1720  break;
1721  }
1722 
1723  // Legalize the type.
1724  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
1725  MVT MTy = LT.second;
1726 
1727  // Attempt to lookup cost.
1728  if (ST->hasCDI())
1729  if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
1730  return LT.first * Entry->Cost;
1731 
1732  if (ST->hasBWI())
1733  if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
1734  return LT.first * Entry->Cost;
1735 
1736  if (ST->hasAVX512())
1737  if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
1738  return LT.first * Entry->Cost;
1739 
1740  if (ST->hasXOP())
1741  if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
1742  return LT.first * Entry->Cost;
1743 
1744  if (ST->hasAVX2())
1745  if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
1746  return LT.first * Entry->Cost;
1747 
1748  if (ST->hasAVX())
1749  if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
1750  return LT.first * Entry->Cost;
1751 
1752  if (ST->hasSSE42())
1753  if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
1754  return LT.first * Entry->Cost;
1755 
1756  if (ST->hasSSSE3())
1757  if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
1758  return LT.first * Entry->Cost;
1759 
1760  if (ST->hasSSE2())
1761  if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
1762  return LT.first * Entry->Cost;
1763 
1764  if (ST->hasSSE1())
1765  if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
1766  return LT.first * Entry->Cost;
1767 
1768  if (ST->is64Bit())
1769  if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
1770  return LT.first * Entry->Cost;
1771 
1772  if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
1773  return LT.first * Entry->Cost;
1774 
1775  return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF, ScalarizationCostPassed);
1776 }
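 // Worked example (illustrative, not part of the original source): on an
 // AVX-only subtarget (no AVX2), a ctpop of <4 x i64> legalizes to
 // LT = (1, v4i64), and the AVX1 table above gives a cost of 1 * 16 = 16.
 // A ctpop of <8 x i64> is split into two v4i64 halves, so LT.first == 2
 // and the returned estimate is 2 * 16 = 32.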
1777 
1778 int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
1779  ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) {
1780  return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF, VF);
1781 }
1782 
1783 int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
1784  assert(Val->isVectorTy() && "This must be a vector type");
1785 
1786  Type *ScalarType = Val->getScalarType();
1787 
1788  if (Index != -1U) {
1789  // Legalize the type.
1790  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
1791 
1792  // This type is legalized to a scalar type.
1793  if (!LT.second.isVector())
1794  return 0;
1795 
1796  // The type may be split. Normalize the index to the new type.
1797  unsigned Width = LT.second.getVectorNumElements();
1798  Index = Index % Width;
1799 
1800  // Floating point scalars are already located in index #0.
1801  if (ScalarType->isFloatingPointTy() && Index == 0)
1802  return 0;
1803  }
1804 
1805  // Add to the base cost if we know that the extracted element of a vector is
1806  // destined to be moved to and used in the integer register file.
1807  int RegisterFileMoveCost = 0;
1808  if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy())
1809  RegisterFileMoveCost = 1;
1810 
1811  return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
1812 }
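 // Worked example (illustrative): assuming an SSE-only subtarget, <8 x float>
 // legalizes to two v4f32 registers, so Width == 4. Extracting element 4
 // normalizes to index 4 % 4 == 0 and, being a floating point scalar, costs 0;
 // extracting element 5 normalizes to index 1 and falls through to the base
 // implementation. Extracting a pointer element pays RegisterFileMoveCost of 1
 // on top of the base cost.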
1813 
1814 int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
1815  unsigned AddressSpace, const Instruction *I) {
1816  // Handle non-power-of-two vectors such as <3 x float>
1817  if (VectorType *VTy = dyn_cast<VectorType>(Src)) {
1818  unsigned NumElem = VTy->getVectorNumElements();
1819 
1820  // Handle a few common cases:
1821  // <3 x float>
1822  if (NumElem == 3 && VTy->getScalarSizeInBits() == 32)
1823  // Cost = 64 bit store + extract + 32 bit store.
1824  return 3;
1825 
1826  // <3 x double>
1827  if (NumElem == 3 && VTy->getScalarSizeInBits() == 64)
1828  // Cost = 128 bit store + unpack + 64 bit store.
1829  return 3;
1830 
1831  // Assume that all other non-power-of-two numbers are scalarized.
1832  if (!isPowerOf2_32(NumElem)) {
1833  int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment,
1834  AddressSpace);
1835  int SplitCost = getScalarizationOverhead(Src, Opcode == Instruction::Load,
1836  Opcode == Instruction::Store);
1837  return NumElem * Cost + SplitCost;
1838  }
1839  }
1840 
1841  // Legalize the type.
1842  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
1843  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
1844  "Invalid Opcode");
1845 
1846  // Each load/store unit costs 1.
1847  int Cost = LT.first * 1;
1848 
1849  // This isn't exactly right. We're using slow unaligned 32-byte accesses as a
1850  // proxy for a double-pumped AVX memory interface such as on Sandybridge.
1851  if (LT.second.getStoreSize() == 32 && ST->isUnalignedMem32Slow())
1852  Cost *= 2;
1853 
1854  return Cost;
1855 }
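 // Worked example (illustrative): a store of <3 x float> hits the special case
 // above and costs 3. A load of <8 x float> on AVX legalizes to one 256-bit
 // access (LT.first == 1) and costs 1, doubled to 2 when the subtarget reports
 // slow unaligned 32-byte accesses (the Sandy Bridge proxy above).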
1856 
1857 int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
1858  unsigned Alignment,
1859  unsigned AddressSpace) {
1860  VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy);
1861  if (!SrcVTy)
1862  // To calculate the scalar cost, take the regular cost without the mask.
1863  return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace);
1864 
1865  unsigned NumElem = SrcVTy->getVectorNumElements();
1866  VectorType *MaskTy =
1867  VectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
1868  if ((Opcode == Instruction::Load && !isLegalMaskedLoad(SrcVTy)) ||
1869  (Opcode == Instruction::Store && !isLegalMaskedStore(SrcVTy)) ||
1870  !isPowerOf2_32(NumElem)) {
1871  // Scalarization
1872  int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true);
1873  int ScalarCompareCost = getCmpSelInstrCost(
1874  Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr);
1875  int BranchCost = getCFInstrCost(Instruction::Br);
1876  int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
1877 
1878  int ValueSplitCost = getScalarizationOverhead(
1879  SrcVTy, Opcode == Instruction::Load, Opcode == Instruction::Store);
1880  int MemopCost =
1881  NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
1882  Alignment, AddressSpace);
1883  return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
1884  }
1885 
1886  // Legalize the type.
1887  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy);
1888  auto VT = TLI->getValueType(DL, SrcVTy);
1889  int Cost = 0;
1890  if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
1891  LT.second.getVectorNumElements() == NumElem)
1892  // Promotion requires expand/truncate for data and a shuffle for mask.
1893  Cost += getShuffleCost(TTI::SK_Alternate, SrcVTy, 0, nullptr) +
1894  getShuffleCost(TTI::SK_Alternate, MaskTy, 0, nullptr);
1895 
1896  else if (LT.second.getVectorNumElements() > NumElem) {
1897  VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(),
1898  LT.second.getVectorNumElements());
1899  // Expanding requires filling the mask with zeroes.
1900  Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy);
1901  }
1902  if (!ST->hasAVX512())
1903  return Cost + LT.first*4; // Each maskmov costs 4
1904 
1905  // AVX-512 masked load/store is cheaper
1906  return Cost+LT.first;
1907 }
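 // Worked example (illustrative): a legal masked load of <8 x float> on AVX
 // (no AVX-512), where the legalized type matches and no promotion or
 // expansion shuffle is needed, skips the scalarization path and returns
 // Cost + LT.first * 4 == 4 (one vmaskmov); with AVX-512 the same operation
 // returns Cost + LT.first == 1.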
1908 
1909 int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
1910  const SCEV *Ptr) {
1911  // Address computations in vectorized code with non-consecutive addresses will
1912  // likely result in more instructions compared to scalar code where the
1913  // computation can more often be merged into the index mode. The resulting
1914  // extra micro-ops can significantly decrease throughput.
1915  unsigned NumVectorInstToHideOverhead = 10;
1916 
1917  // Cost modeling of Strided Access Computation is hidden by the indexing
1918  // modes of X86 regardless of the stride value. We don't believe that there
1919  // is a difference between constant strided access in general and a constant
1920  // stride value which is less than or equal to 64.
1921  // Even in the case of (loop invariant) stride whose value is not known at
1922  // compile time, the address computation will not incur more than one extra
1923  // ADD instruction.
1924  if (Ty->isVectorTy() && SE) {
1925  if (!BaseT::isStridedAccess(Ptr))
1926  return NumVectorInstToHideOverhead;
1927  if (!BaseT::getConstantStrideStep(SE, Ptr))
1928  return 1;
1929  }
1930 
1931  return BaseT::getAddressComputationCost(Ty, SE, Ptr);
1932 }
1933 
1934 int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
1935  bool IsPairwise) {
1936 
1937  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
1938 
1939  MVT MTy = LT.second;
1940 
1941  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1942  assert(ISD && "Invalid opcode");
1943 
1944  // We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
1945  // and use that measurement as the cost.
1946 
1947  static const CostTblEntry SSE42CostTblPairWise[] = {
1948  { ISD::FADD, MVT::v2f64, 2 },
1949  { ISD::FADD, MVT::v4f32, 4 },
1950  { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
1951  { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5".
1952  { ISD::ADD, MVT::v8i16, 5 },
1953  };
1954 
1955  static const CostTblEntry AVX1CostTblPairWise[] = {
1956  { ISD::FADD, MVT::v4f32, 4 },
1957  { ISD::FADD, MVT::v4f64, 5 },
1958  { ISD::FADD, MVT::v8f32, 7 },
1959  { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
1960  { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5".
1961  { ISD::ADD, MVT::v4i64, 5 }, // The data reported by the IACA tool is "4.8".
1962  { ISD::ADD, MVT::v8i16, 5 },
1963  { ISD::ADD, MVT::v8i32, 5 },
1964  };
1965 
1966  static const CostTblEntry SSE42CostTblNoPairWise[] = {
1967  { ISD::FADD, MVT::v2f64, 2 },
1968  { ISD::FADD, MVT::v4f32, 4 },
1969  { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
1970  { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
1971  { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
1972  };
1973 
1974  static const CostTblEntry AVX1CostTblNoPairWise[] = {
1975  { ISD::FADD, MVT::v4f32, 3 },
1976  { ISD::FADD, MVT::v4f64, 3 },
1977  { ISD::FADD, MVT::v8f32, 4 },
1978  { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
1979  { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "2.8".
1980  { ISD::ADD, MVT::v4i64, 3 },
1981  { ISD::ADD, MVT::v8i16, 4 },
1982  { ISD::ADD, MVT::v8i32, 5 },
1983  };
1984 
1985  if (IsPairwise) {
1986  if (ST->hasAVX())
1987  if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
1988  return LT.first * Entry->Cost;
1989 
1990  if (ST->hasSSE42())
1991  if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
1992  return LT.first * Entry->Cost;
1993  } else {
1994  if (ST->hasAVX())
1995  if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
1996  return LT.first * Entry->Cost;
1997 
1998  if (ST->hasSSE42())
1999  if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
2000  return LT.first * Entry->Cost;
2001  }
2002 
2003  return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise);
2004 }
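 // Worked example (illustrative): a non-pairwise fadd reduction of <8 x float>
 // on AVX legalizes to LT = (1, v8f32) and the AVX1 no-pairwise table gives 4;
 // a <16 x float> reduction splits into two v8f32 parts, so the estimate is
 // 2 * 4 = 8.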
2005 
2006 int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy,
2007  bool IsPairwise, bool IsUnsigned) {
2008  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
2009 
2010  MVT MTy = LT.second;
2011 
2012  int ISD;
2013  if (ValTy->isIntOrIntVectorTy()) {
2014  ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN;
2015  } else {
2016  assert(ValTy->isFPOrFPVectorTy() &&
2017  "Expected floating point or integer vector type.");
2018  ISD = ISD::FMINNUM;
2019  }
2020 
2021  // We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
2022  // and use that measurement as the cost.
2023 
2024  static const CostTblEntry SSE42CostTblPairWise[] = {
2025  {ISD::FMINNUM, MVT::v2f64, 3},
2026  {ISD::FMINNUM, MVT::v4f32, 2},
2027  {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8"
2028  {ISD::UMIN, MVT::v2i64, 8}, // The data reported by the IACA is "8.6"
2029  {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5"
2030  {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8"
2031  {ISD::SMIN, MVT::v8i16, 2},
2032  {ISD::UMIN, MVT::v8i16, 2},
2033  };
2034 
2035  static const CostTblEntry AVX1CostTblPairWise[] = {
2036  {ISD::FMINNUM, MVT::v4f32, 1},
2037  {ISD::FMINNUM, MVT::v4f64, 1},
2038  {ISD::FMINNUM, MVT::v8f32, 2},
2039  {ISD::SMIN, MVT::v2i64, 3},
2040  {ISD::UMIN, MVT::v2i64, 3},
2041  {ISD::SMIN, MVT::v4i32, 1},
2042  {ISD::UMIN, MVT::v4i32, 1},
2043  {ISD::SMIN, MVT::v8i16, 1},
2044  {ISD::UMIN, MVT::v8i16, 1},
2045  {ISD::SMIN, MVT::v8i32, 3},
2046  {ISD::UMIN, MVT::v8i32, 3},
2047  };
2048 
2049  static const CostTblEntry AVX2CostTblPairWise[] = {
2050  {ISD::SMIN, MVT::v4i64, 2},
2051  {ISD::UMIN, MVT::v4i64, 2},
2052  {ISD::SMIN, MVT::v8i32, 1},
2053  {ISD::UMIN, MVT::v8i32, 1},
2054  {ISD::SMIN, MVT::v16i16, 1},
2055  {ISD::UMIN, MVT::v16i16, 1},
2056  {ISD::SMIN, MVT::v32i8, 2},
2057  {ISD::UMIN, MVT::v32i8, 2},
2058  };
2059 
2060  static const CostTblEntry AVX512CostTblPairWise[] = {
2061  {ISD::FMINNUM, MVT::v8f64, 1},
2062  {ISD::FMINNUM, MVT::v16f32, 2},
2063  {ISD::SMIN, MVT::v8i64, 2},
2064  {ISD::UMIN, MVT::v8i64, 2},
2065  {ISD::SMIN, MVT::v16i32, 1},
2066  {ISD::UMIN, MVT::v16i32, 1},
2067  };
2068 
2069  static const CostTblEntry SSE42CostTblNoPairWise[] = {
2070  {ISD::FMINNUM, MVT::v2f64, 3},
2071  {ISD::FMINNUM, MVT::v4f32, 3},
2072  {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8"
2073  {ISD::UMIN, MVT::v2i64, 9}, // The data reported by the IACA is "8.6"
2074  {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5"
2075  {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8"
2076  {ISD::SMIN, MVT::v8i16, 1}, // The data reported by the IACA is "1.5"
2077  {ISD::UMIN, MVT::v8i16, 2}, // The data reported by the IACA is "1.8"
2078  };
2079 
2080  static const CostTblEntry AVX1CostTblNoPairWise[] = {
2081  {ISD::FMINNUM, MVT::v4f32, 1},
2082  {ISD::FMINNUM, MVT::v4f64, 1},
2083  {ISD::FMINNUM, MVT::v8f32, 1},
2084  {ISD::SMIN, MVT::v2i64, 3},
2085  {ISD::UMIN, MVT::v2i64, 3},
2086  {ISD::SMIN, MVT::v4i32, 1},
2087  {ISD::UMIN, MVT::v4i32, 1},
2088  {ISD::SMIN, MVT::v8i16, 1},
2089  {ISD::UMIN, MVT::v8i16, 1},
2090  {ISD::SMIN, MVT::v8i32, 2},
2091  {ISD::UMIN, MVT::v8i32, 2},
2092  };
2093 
2094  static const CostTblEntry AVX2CostTblNoPairWise[] = {
2095  {ISD::SMIN, MVT::v4i64, 1},
2096  {ISD::UMIN, MVT::v4i64, 1},
2097  {ISD::SMIN, MVT::v8i32, 1},
2098  {ISD::UMIN, MVT::v8i32, 1},
2099  {ISD::SMIN, MVT::v16i16, 1},
2100  {ISD::UMIN, MVT::v16i16, 1},
2101  {ISD::SMIN, MVT::v32i8, 1},
2102  {ISD::UMIN, MVT::v32i8, 1},
2103  };
2104 
2105  static const CostTblEntry AVX512CostTblNoPairWise[] = {
2106  {ISD::FMINNUM, MVT::v8f64, 1},
2107  {ISD::FMINNUM, MVT::v16f32, 2},
2108  {ISD::SMIN, MVT::v8i64, 1},
2109  {ISD::UMIN, MVT::v8i64, 1},
2110  {ISD::SMIN, MVT::v16i32, 1},
2111  {ISD::UMIN, MVT::v16i32, 1},
2112  };
2113 
2114  if (IsPairwise) {
2115  if (ST->hasAVX512())
2116  if (const auto *Entry = CostTableLookup(AVX512CostTblPairWise, ISD, MTy))
2117  return LT.first * Entry->Cost;
2118 
2119  if (ST->hasAVX2())
2120  if (const auto *Entry = CostTableLookup(AVX2CostTblPairWise, ISD, MTy))
2121  return LT.first * Entry->Cost;
2122 
2123  if (ST->hasAVX())
2124  if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
2125  return LT.first * Entry->Cost;
2126 
2127  if (ST->hasSSE42())
2128  if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
2129  return LT.first * Entry->Cost;
2130  } else {
2131  if (ST->hasAVX512())
2132  if (const auto *Entry =
2133  CostTableLookup(AVX512CostTblNoPairWise, ISD, MTy))
2134  return LT.first * Entry->Cost;
2135 
2136  if (ST->hasAVX2())
2137  if (const auto *Entry = CostTableLookup(AVX2CostTblNoPairWise, ISD, MTy))
2138  return LT.first * Entry->Cost;
2139 
2140  if (ST->hasAVX())
2141  if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
2142  return LT.first * Entry->Cost;
2143 
2144  if (ST->hasSSE42())
2145  if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
2146  return LT.first * Entry->Cost;
2147  }
2148 
2149  return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned);
2150 }
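 // Worked example (illustrative): a non-pairwise smin reduction of <8 x i32>
 // on AVX2 legalizes to LT = (1, v8i32) and the AVX2 no-pairwise table gives 1;
 // on an AVX-only subtarget the AVX1 table gives 2 instead.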
2151 
2152 /// \brief Calculate the cost of materializing a 64-bit value. This helper
2153 /// method might only calculate a fraction of a larger immediate. Therefore it
2154 /// is valid to return a cost of ZERO.
2155 int X86TTIImpl::getIntImmCost(int64_t Val) {
2156  if (Val == 0)
2157  return TTI::TCC_Free;
2158 
2159  if (isInt<32>(Val))
2160  return TTI::TCC_Basic;
2161 
2162  return 2 * TTI::TCC_Basic;
2163 }
2164 
2165 int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
2166  assert(Ty->isIntegerTy());
2167 
2168  unsigned BitSize = Ty->getPrimitiveSizeInBits();
2169  if (BitSize == 0)
2170  return ~0U;
2171 
2172  // Never hoist constants larger than 128 bits, because this might lead to
2173  // incorrect code generation or assertions in codegen.
2174  // FIXME: Create a cost model for types larger than i128 once the codegen
2175  // issues have been fixed.
2176  if (BitSize > 128)
2177  return TTI::TCC_Free;
2178 
2179  if (Imm == 0)
2180  return TTI::TCC_Free;
2181 
2182  // Sign-extend all constants to a multiple of 64-bit.
2183  APInt ImmVal = Imm;
2184  if (BitSize & 0x3f)
2185  ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
2186 
2187  // Split the constant into 64-bit chunks and calculate the cost for each
2188  // chunk.
2189  int Cost = 0;
2190  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
2191  APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
2192  int64_t Val = Tmp.getSExtValue();
2193  Cost += getIntImmCost(Val);
2194  }
2195  // We need at least one instruction to materialize the constant.
2196  return std::max(1, Cost);
2197 }
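 // Worked example (illustrative): an i64 constant of 42 fits in 32 bits and
 // costs TCC_Basic (1); 0x100000000 does not, so it costs 2. An i128 constant
 // is split into two 64-bit chunks, e.g. an all-ones i128 costs 1 + 1 = 2,
 // since each chunk (-1) fits in a signed 32-bit immediate.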
2198 
2199 int X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
2200  Type *Ty) {
2201  assert(Ty->isIntegerTy());
2202 
2203  unsigned BitSize = Ty->getPrimitiveSizeInBits();
2204  // There is no cost model for constants with a bit size of 0. Return TCC_Free
2205  // here, so that constant hoisting will ignore this constant.
2206  if (BitSize == 0)
2207  return TTI::TCC_Free;
2208 
2209  unsigned ImmIdx = ~0U;
2210  switch (Opcode) {
2211  default:
2212  return TTI::TCC_Free;
2213  case Instruction::GetElementPtr:
2214  // Always hoist the base address of a GetElementPtr. This prevents the
2215  // creation of new constants for every base constant that gets constant
2216  // folded with the offset.
2217  if (Idx == 0)
2218  return 2 * TTI::TCC_Basic;
2219  return TTI::TCC_Free;
2220  case Instruction::Store:
2221  ImmIdx = 0;
2222  break;
2223  case Instruction::ICmp:
2224  // This is an imperfect hack to prevent constant hoisting of
2225  // compares that might be trying to check if a 64-bit value fits in
2226  // 32-bits. The backend can optimize these cases using a right shift by 32.
2227  // Ideally we would check the compare predicate here. There are also other
2228  // similar immediates the backend can use shifts for.
2229  if (Idx == 1 && Imm.getBitWidth() == 64) {
2230  uint64_t ImmVal = Imm.getZExtValue();
2231  if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
2232  return TTI::TCC_Free;
2233  }
2234  ImmIdx = 1;
2235  break;
2236  case Instruction::And:
2237  // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
2238  // by using a 32-bit operation with implicit zero extension. Detect such
2239  // immediates here as the normal path expects bit 31 to be sign extended.
2240  if (Idx == 1 && Imm.getBitWidth() == 64 && isUInt<32>(Imm.getZExtValue()))
2241  return TTI::TCC_Free;
2242  LLVM_FALLTHROUGH;
2243  case Instruction::Add:
2244  case Instruction::Sub:
2245  case Instruction::Mul:
2246  case Instruction::UDiv:
2247  case Instruction::SDiv:
2248  case Instruction::URem:
2249  case Instruction::SRem:
2250  case Instruction::Or:
2251  case Instruction::Xor:
2252  ImmIdx = 1;
2253  break;
2254  // Always return TCC_Free for the shift value of a shift instruction.
2255  case Instruction::Shl:
2256  case Instruction::LShr:
2257  case Instruction::AShr:
2258  if (Idx == 1)
2259  return TTI::TCC_Free;
2260  break;
2261  case Instruction::Trunc:
2262  case Instruction::ZExt:
2263  case Instruction::SExt:
2264  case Instruction::IntToPtr:
2265  case Instruction::PtrToInt:
2266  case Instruction::BitCast:
2267  case Instruction::PHI:
2268  case Instruction::Call:
2269  case Instruction::Select:
2270  case Instruction::Ret:
2271  case Instruction::Load:
2272  break;
2273  }
2274 
2275  if (Idx == ImmIdx) {
2276  int NumConstants = (BitSize + 63) / 64;
2277  int Cost = X86TTIImpl::getIntImmCost(Imm, Ty);
2278  return (Cost <= NumConstants * TTI::TCC_Basic)
2279  ? static_cast<int>(TTI::TCC_Free)
2280  : Cost;
2281  }
2282 
2283  return X86TTIImpl::getIntImmCost(Imm, Ty);
2284 }
2285 
2286 int X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
2287  Type *Ty) {
2288  assert(Ty->isIntegerTy());
2289 
2290  unsigned BitSize = Ty->getPrimitiveSizeInBits();
2291  // There is no cost model for constants with a bit size of 0. Return TCC_Free
2292  // here, so that constant hoisting will ignore this constant.
2293  if (BitSize == 0)
2294  return TTI::TCC_Free;
2295 
2296  switch (IID) {
2297  default:
2298  return TTI::TCC_Free;
2299  case Intrinsic::sadd_with_overflow:
2300  case Intrinsic::uadd_with_overflow:
2301  case Intrinsic::ssub_with_overflow:
2302  case Intrinsic::usub_with_overflow:
2303  case Intrinsic::smul_with_overflow:
2304  case Intrinsic::umul_with_overflow:
2305  if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
2306  return TTI::TCC_Free;
2307  break;
2308  case Intrinsic::experimental_stackmap:
2309  if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
2310  return TTI::TCC_Free;
2311  break;
2312  case Intrinsic::experimental_patchpoint_void:
2313  case Intrinsic::experimental_patchpoint_i64:
2314  if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
2315  return TTI::TCC_Free;
2316  break;
2317  }
2318  return X86TTIImpl::getIntImmCost(Imm, Ty);
2319 }
2320 
2321 unsigned X86TTIImpl::getUserCost(const User *U,
2322  ArrayRef<const Value *> Operands) {
2323  if (isa<StoreInst>(U)) {
2324  Value *Ptr = U->getOperand(1);
2325  // Store instruction with index and scale costs 2 Uops.
2326  // Check the preceding GEP to identify non-const indices.
2327  if (auto GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
2328  if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
2329  return TTI::TCC_Basic * 2;
2330  }
2331  return TTI::TCC_Basic;
2332  }
2333  return BaseT::getUserCost(U, Operands);
2334 }
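 // Worked example (illustrative, IR names are hypothetical): a plain store
 // costs TCC_Basic (1); a store whose address comes from a GEP with a
 // non-constant index, e.g.
 //   %p = getelementptr i32, i32* %base, i64 %i
 //   store i32 %v, i32* %p
 // is modeled as 2 * TCC_Basic, since an indexed-and-scaled store takes 2 uops.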
2335 
2336 // Return an average cost of a Gather / Scatter instruction; this may be improved later.
2337 int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
2338  unsigned Alignment, unsigned AddressSpace) {
2339 
2340  assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
2341  unsigned VF = SrcVTy->getVectorNumElements();
2342 
2343  // Try to reduce index size from 64 bit (default for GEP)
2344  // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
2345  // operation will use 16 x 64 indices, which do not fit in a zmm register and
2346  // need to be split. Also check that the base pointer is the same for all lanes,
2347  // and that there's at most one variable index.
2348  auto getIndexSizeInBits = [](Value *Ptr, const DataLayout& DL) {
2349  unsigned IndexSize = DL.getPointerSizeInBits();
2350  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
2351  if (IndexSize < 64 || !GEP)
2352  return IndexSize;
2353 
2354  unsigned NumOfVarIndices = 0;
2355  Value *Ptrs = GEP->getPointerOperand();
2356  if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
2357  return IndexSize;
2358  for (unsigned i = 1; i < GEP->getNumOperands(); ++i) {
2359  if (isa<Constant>(GEP->getOperand(i)))
2360  continue;
2361  Type *IndxTy = GEP->getOperand(i)->getType();
2362  if (IndxTy->isVectorTy())
2363  IndxTy = IndxTy->getVectorElementType();
2364  if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
2365  !isa<SExtInst>(GEP->getOperand(i))) ||
2366  ++NumOfVarIndices > 1)
2367  return IndexSize; // 64
2368  }
2369  return (unsigned)32;
2370  };
2371 
2372 
2373  // Trying to reduce IndexSize to 32 bits for vector 16.
2374  // By default the IndexSize is equal to pointer size.
2375  unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
2376  ? getIndexSizeInBits(Ptr, DL)
2377  : DL.getPointerSizeInBits();
2378 
2379  Type *IndexVTy = VectorType::get(IntegerType::get(SrcVTy->getContext(),
2380  IndexSize), VF);
2381  std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy);
2382  std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy);
2383  int SplitFactor = std::max(IdxsLT.first, SrcLT.first);
2384  if (SplitFactor > 1) {
2385  // Handle splitting of vector of pointers
2386  Type *SplitSrcTy = VectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
2387  return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment,
2388  AddressSpace);
2389  }
2390 
2391  // The gather / scatter cost is given by Intel architects. It is a rough
2392  // number since we are looking at one instruction at a time.
2393  const int GSOverhead = (Opcode == Instruction::Load)
2394  ? ST->getGatherOverhead()
2395  : ST->getScatterOverhead();
2396  return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
2397  Alignment, AddressSpace);
2398 }
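 // Worked example (illustrative): for a gather of <16 x float> on AVX-512 with
 // a common base pointer and indices reducible to 32 bits, neither the index
 // nor the data vector needs splitting, so the estimate is whatever
 // ST->getGatherOverhead() reports plus 16 times the scalar load cost from
 // getMemoryOpCost above.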
2399 
2400 /// Return the cost of full scalarization of gather / scatter operation.
2401 ///
2402 /// Opcode - Load or Store instruction.
2403 /// SrcVTy - The type of the data vector that should be gathered or scattered.
2404 /// VariableMask - The mask is non-constant at compile time.
2405 /// Alignment - Alignment for one element.
2406 /// AddressSpace - pointer[s] address space.
2407 ///
2408 int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
2409  bool VariableMask, unsigned Alignment,
2410  unsigned AddressSpace) {
2411  unsigned VF = SrcVTy->getVectorNumElements();
2412 
2413  int MaskUnpackCost = 0;
2414  if (VariableMask) {
2415  VectorType *MaskTy =
2416  VectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
2417  MaskUnpackCost = getScalarizationOverhead(MaskTy, false, true);
2418  int ScalarCompareCost =
2419  getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()),
2420  nullptr);
2421  int BranchCost = getCFInstrCost(Instruction::Br);
2422  MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
2423  }
2424 
2425  // The cost of the scalar loads/stores.
2426  int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
2427  Alignment, AddressSpace);
2428 
2429  int InsertExtractCost = 0;
2430  if (Opcode == Instruction::Load)
2431  for (unsigned i = 0; i < VF; ++i)
2432  // Add the cost of inserting each scalar load into the vector
2433  InsertExtractCost +=
2434  getVectorInstrCost(Instruction::InsertElement, SrcVTy, i);
2435  else
2436  for (unsigned i = 0; i < VF; ++i)
2437  // Add the cost of extracting each element out of the data vector
2438  InsertExtractCost +=
2439  getVectorInstrCost(Instruction::ExtractElement, SrcVTy, i);
2440 
2441  return MemoryOpCost + MaskUnpackCost + InsertExtractCost;
2442 }
2443 
2444 /// Calculate the cost of Gather / Scatter operation
2445 int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
2446  Value *Ptr, bool VariableMask,
2447  unsigned Alignment) {
2448  assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
2449  unsigned VF = SrcVTy->getVectorNumElements();
2450  PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
2451  if (!PtrTy && Ptr->getType()->isVectorTy())
2452  PtrTy = dyn_cast<PointerType>(Ptr->getType()->getVectorElementType());
2453  assert(PtrTy && "Unexpected type for Ptr argument");
2454  unsigned AddressSpace = PtrTy->getAddressSpace();
2455 
2456  bool Scalarize = false;
2457  if ((Opcode == Instruction::Load && !isLegalMaskedGather(SrcVTy)) ||
2458  (Opcode == Instruction::Store && !isLegalMaskedScatter(SrcVTy)))
2459  Scalarize = true;
2460  // Gather / Scatter for vector 2 is not profitable on KNL / SKX
2461  // Vector-4 of gather/scatter instruction does not exist on KNL.
2462  // We can extend it to 8 elements, but zeroing upper bits of
2463  // the mask vector will add more instructions. Right now we give the scalar
2464  // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction
2465  // is better in the VariableMask case.
2466  if (ST->hasAVX512() && (VF == 2 || (VF == 4 && !ST->hasVLX())))
2467  Scalarize = true;
2468 
2469  if (Scalarize)
2470  return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment,
2471  AddressSpace);
2472 
2473  return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
2474 }
2475 
2476 bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
2477  TargetTransformInfo::LSRCost &C2) {
2478  // X86 specific here are "instruction number 1st priority".
2479  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
2480  C1.NumIVMuls, C1.NumBaseAdds,
2481  C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
2482  std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
2483  C2.NumIVMuls, C2.NumBaseAdds,
2484  C2.ScaleCost, C2.ImmCost, C2.SetupCost);
2485 }
2486 
2487 bool X86TTIImpl::canMacroFuseCmp() {
2488  return ST->hasMacroFusion();
2489 }
2490 
2491 bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
2492  // The backend can't handle a single element vector.
2493  if (isa<VectorType>(DataTy) && DataTy->getVectorNumElements() == 1)
2494  return false;
2495  Type *ScalarTy = DataTy->getScalarType();
2496  int DataWidth = isa<PointerType>(ScalarTy) ?
2497  DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();
2498 
2499  return ((DataWidth == 32 || DataWidth == 64) && ST->hasAVX()) ||
2500  ((DataWidth == 8 || DataWidth == 16) && ST->hasBWI());
2501 }
2502 
2503 bool X86TTIImpl::isLegalMaskedStore(Type *DataType) {
2504  return isLegalMaskedLoad(DataType);
2505 }
2506 
2507 bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
2508  // This function is called now in two cases: from the Loop Vectorizer
2509  // and from the Scalarizer.
2510  // When the Loop Vectorizer asks about legality of the feature,
2511  // the vectorization factor is not calculated yet. The Loop Vectorizer
2512  // sends a scalar type and the decision is based on the width of the
2513  // scalar element.
2514  // Later on, the cost model will estimate usage of this intrinsic based on
2515  // the vector type.
2516  // The Scalarizer asks again about legality. It sends a vector type.
2517  // In this case we can reject non-power-of-2 vectors.
2518  // We also reject single element vectors as the type legalizer can't
2519  // scalarize it.
2520  if (isa<VectorType>(DataTy)) {
2521  unsigned NumElts = DataTy->getVectorNumElements();
2522  if (NumElts == 1 || !isPowerOf2_32(NumElts))
2523  return false;
2524  }
2525  Type *ScalarTy = DataTy->getScalarType();
2526  int DataWidth = isa<PointerType>(ScalarTy) ?
2527  DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();
2528 
2529  // Some CPUs have better gather performance than others.
2530  // TODO: Remove the explicit ST->hasAVX512()? That would mean we would only
2531  // enable gather with a -march.
2532  return (DataWidth == 32 || DataWidth == 64) &&
2533  (ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2()));
2534 }
2535 
2536 bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) {
2537  // AVX2 doesn't support scatter
2538  if (!ST->hasAVX512())
2539  return false;
2540  return isLegalMaskedGather(DataType);
2541 }
2542 
2543 bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
2544  EVT VT = TLI->getValueType(DL, DataType);
2545  return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
2546 }
2547 
2548 bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
2549  return false;
2550 }
2551 
2552 bool X86TTIImpl::areInlineCompatible(const Function *Caller,
2553  const Function *Callee) const {
2554  const TargetMachine &TM = getTLI()->getTargetMachine();
2555 
2556  // Work this as a subsetting of subtarget features.
2557  const FeatureBitset &CallerBits =
2558  TM.getSubtargetImpl(*Caller)->getFeatureBits();
2559  const FeatureBitset &CalleeBits =
2560  TM.getSubtargetImpl(*Callee)->getFeatureBits();
2561 
2562  // FIXME: This is likely too limiting as it will include subtarget features
2563  // that we might not care about for inlining, but it is conservatively
2564  // correct.
2565  return (CallerBits & CalleeBits) == CalleeBits;
2566 }
2567 
2568 const X86TTIImpl::TTI::MemCmpExpansionOptions *
2569 X86TTIImpl::enableMemCmpExpansion(bool IsZeroCmp) const {
2570  // Only enable vector loads for equality comparison.
2571  // Right now the vector version is not as fast, see #33329.
2572  static const auto ThreeWayOptions = [this]() {
2573  TTI::MemCmpExpansionOptions Options;
2574  if (ST->is64Bit()) {
2575  Options.LoadSizes.push_back(8);
2576  }
2577  Options.LoadSizes.push_back(4);
2578  Options.LoadSizes.push_back(2);
2579  Options.LoadSizes.push_back(1);
2580  return Options;
2581  }();
2582  static const auto EqZeroOptions = [this]() {
2583  TTI::MemCmpExpansionOptions Options;
2584  // TODO: enable AVX512 when the DAG is ready.
2585  // if (ST->hasAVX512()) Options.LoadSizes.push_back(64);
2586  if (ST->hasAVX2()) Options.LoadSizes.push_back(32);
2587  if (ST->hasSSE2()) Options.LoadSizes.push_back(16);
2588  if (ST->is64Bit()) {
2589  Options.LoadSizes.push_back(8);
2590  }
2591  Options.LoadSizes.push_back(4);
2592  Options.LoadSizes.push_back(2);
2593  Options.LoadSizes.push_back(1);
2594  return Options;
2595  }();
2596  return IsZeroCmp ? &EqZeroOptions : &ThreeWayOptions;
2597 }
2598 
2599 bool X86TTIImpl::enableInterleavedAccessVectorization() {
2600  // TODO: We expect this to be beneficial regardless of arch,
2601  // but there are currently some unexplained performance artifacts on Atom.
2602  // As a temporary solution, disable on Atom.
2603  return !(ST->isAtom());
2604 }
2605 
2606 // Get estimation for interleaved load/store operations for AVX2.
2607 // \p Factor is the interleaved-access factor (stride) - number of
2608 // (interleaved) elements in the group.
2609 // \p Indices contains the indices for a strided load: when the
2610 // interleaved load has gaps they indicate which elements are used.
2611 // If Indices is empty (or if the number of indices is equal to the size
2612 // of the interleaved-access as given in \p Factor) the access has no gaps.
2613 //
2614 // As opposed to AVX-512, AVX2 does not have generic shuffles that allow
2615 // computing the cost using a generic formula as a function of generic
2616 // shuffles. We therefore use a lookup table instead, filled according to
2617 // the instruction sequences that codegen currently generates.
2618 int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
2619  unsigned Factor,
2620  ArrayRef<unsigned> Indices,
2621  unsigned Alignment,
2622  unsigned AddressSpace) {
2623 
2624  // We currently support only fully-interleaved groups, with no gaps.
2625  // TODO: Support also strided loads (interleaved-groups with gaps).
2626  if (Indices.size() && Indices.size() != Factor)
2627  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
2628  Alignment, AddressSpace);
2629 
2630  // VecTy for interleave memop is <VF*Factor x Elt>.
2631  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
2632  // VecTy = <12 x i32>.
2633  MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
2634 
2635  // This function can be called with VecTy=<6xi128>, Factor=3, in which case
2636  // the VF=2, while v2i128 is an unsupported MVT vector type
2637  // (see MachineValueType.h::getVectorVT()).
2638  if (!LegalVT.isVector())
2639  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
2640  Alignment, AddressSpace);
2641 
2642  unsigned VF = VecTy->getVectorNumElements() / Factor;
2643  Type *ScalarTy = VecTy->getVectorElementType();
2644 
2645  // Calculate the number of memory operations (NumOfMemOps), required
2646  // for load/store the VecTy.
2647  unsigned VecTySize = DL.getTypeStoreSize(VecTy);
2648  unsigned LegalVTSize = LegalVT.getStoreSize();
2649  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
2650 
2651  // Get the cost of one memory operation.
2652  Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
2653  LegalVT.getVectorNumElements());
2654  unsigned MemOpCost =
2655  getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
2656 
2657  VectorType *VT = VectorType::get(ScalarTy, VF);
2658  EVT ETy = TLI->getValueType(DL, VT);
2659  if (!ETy.isSimple())
2660  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
2661  Alignment, AddressSpace);
2662 
2663  // TODO: Complete for other data-types and strides.
2664  // Each combination of Stride, ElementTy and VF results in a different
2665  // sequence; The cost tables are therefore accessed with:
2666  // Factor (stride) and VectorType=VFxElemType.
2667  // The Cost accounts only for the shuffle sequence;
2668  // The cost of the loads/stores is accounted for separately.
2669  //
2670  static const CostTblEntry AVX2InterleavedLoadTbl[] = {
2671  { 2, MVT::v4i64, 6 }, //(load 8i64 and) deinterleave into 2 x 4i64
2672  { 2, MVT::v4f64, 6 }, //(load 8f64 and) deinterleave into 2 x 4f64
2673 
2674  { 3, MVT::v2i8, 10 }, //(load 6i8 and) deinterleave into 3 x 2i8
2675  { 3, MVT::v4i8, 4 }, //(load 12i8 and) deinterleave into 3 x 4i8
2676  { 3, MVT::v8i8, 9 }, //(load 24i8 and) deinterleave into 3 x 8i8
2677  { 3, MVT::v16i8, 11}, //(load 48i8 and) deinterleave into 3 x 16i8
2678  { 3, MVT::v32i8, 13}, //(load 96i8 and) deinterleave into 3 x 32i8
2679  { 3, MVT::v8f32, 17 }, //(load 24f32 and)deinterleave into 3 x 8f32
2680 
2681  { 4, MVT::v2i8, 12 }, //(load 8i8 and) deinterleave into 4 x 2i8
2682  { 4, MVT::v4i8, 4 }, //(load 16i8 and) deinterleave into 4 x 4i8
2683  { 4, MVT::v8i8, 20 }, //(load 32i8 and) deinterleave into 4 x 8i8
2684  { 4, MVT::v16i8, 39 }, //(load 64i8 and) deinterleave into 4 x 16i8
2685  { 4, MVT::v32i8, 80 }, //(load 128i8 and) deinterleave into 4 x 32i8
2686 
2687  { 8, MVT::v8f32, 40 } //(load 64f32 and)deinterleave into 8 x 8f32
2688  };
2689 
2690  static const CostTblEntry AVX2InterleavedStoreTbl[] = {
2691  { 2, MVT::v4i64, 6 }, //interleave into 2 x 4i64 into 8i64 (and store)
2692  { 2, MVT::v4f64, 6 }, //interleave into 2 x 4f64 into 8f64 (and store)
2693 
2694  { 3, MVT::v2i8, 7 }, //interleave 3 x 2i8 into 6i8 (and store)
2695  { 3, MVT::v4i8, 8 }, //interleave 3 x 4i8 into 12i8 (and store)
2696  { 3, MVT::v8i8, 11 }, //interleave 3 x 8i8 into 24i8 (and store)
2697  { 3, MVT::v16i8, 11 }, //interleave 3 x 16i8 into 48i8 (and store)
2698  { 3, MVT::v32i8, 13 }, //interleave 3 x 32i8 into 96i8 (and store)
2699 
2700  { 4, MVT::v2i8, 12 }, //interleave 4 x 2i8 into 8i8 (and store)
2701  { 4, MVT::v4i8, 9 }, //interleave 4 x 4i8 into 16i8 (and store)
2702  { 4, MVT::v8i8, 10 }, //interleave 4 x 8i8 into 32i8 (and store)
2703  { 4, MVT::v16i8, 10 }, //interleave 4 x 16i8 into 64i8 (and store)
2704  { 4, MVT::v32i8, 12 } //interleave 4 x 32i8 into 128i8 (and store)
2705  };
2706 
2707  if (Opcode == Instruction::Load) {
2708  if (const auto *Entry =
2709  CostTableLookup(AVX2InterleavedLoadTbl, Factor, ETy.getSimpleVT()))
2710  return NumOfMemOps * MemOpCost + Entry->Cost;
2711  } else {
2712  assert(Opcode == Instruction::Store &&
2713  "Expected Store Instruction at this point");
2714  if (const auto *Entry =
2715  CostTableLookup(AVX2InterleavedStoreTbl, Factor, ETy.getSimpleVT()))
2716  return NumOfMemOps * MemOpCost + Entry->Cost;
2717  }
2718 
2719  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
2720  Alignment, AddressSpace);
2721 }
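 // Worked example (illustrative): an interleaved load group with Factor == 3
 // over <24 x float> (VF == 8) legalizes to LegalVT = v8f32, so NumOfMemOps ==
 // 96 / 32 == 3 and ETy == v8f32. The load table above gives a shuffle cost of
 // 17, so the estimate is 3 * MemOpCost + 17 (20 if each 256-bit load costs 1).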
2722 
2723 // Get estimation for interleaved load/store operations and strided load.
2724 // \p Indices contains indices for strided load.
2725 // \p Factor - the factor of interleaving.
2726 // AVX-512 provides 3-src shuffles that significantly reduces the cost.
2727 int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
2728  unsigned Factor,
2729  ArrayRef<unsigned> Indices,
2730  unsigned Alignment,
2731  unsigned AddressSpace) {
2732 
2733  // VecTy for interleave memop is <VF*Factor x Elt>.
2734  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
2735  // VecTy = <12 x i32>.
2736 
2737  // Calculate the number of memory operations (NumOfMemOps), required
2738  // for load/store the VecTy.
2739  MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
2740  unsigned VecTySize = DL.getTypeStoreSize(VecTy);
2741  unsigned LegalVTSize = LegalVT.getStoreSize();
2742  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
2743 
2744  // Get the cost of one memory operation.
2745  Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
2746  LegalVT.getVectorNumElements());
2747  unsigned MemOpCost =
2748  getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
2749 
2750  unsigned VF = VecTy->getVectorNumElements() / Factor;
2751  MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);
2752 
2753  if (Opcode == Instruction::Load) {
2754  // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
2755  // contain the cost of the optimized shuffle sequence that the
2756  // X86InterleavedAccess pass will generate.
2757  // The cost of loads and stores are computed separately from the table.
2758 
2759  // X86InterleavedAccess supports only the following interleaved-access groups.
2760  static const CostTblEntry AVX512InterleavedLoadTbl[] = {
2761  {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8
2762  {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8
2763  {3, MVT::v64i8, 22}, //(load 192i8 and) deinterleave into 3 x 64i8
2764  };
2765 
2766  if (const auto *Entry =
2767  CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
2768  return NumOfMemOps * MemOpCost + Entry->Cost;
2769  // If an entry does not exist, fall back to the default implementation.
2770 
2771  // Kind of shuffle depends on number of loaded values.
2772  // If we load the entire data in one register, we can use a 1-src shuffle.
2773  // Otherwise, we'll merge 2 sources in each operation.
2774  TTI::ShuffleKind ShuffleKind =
2775  (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
2776 
2777  unsigned ShuffleCost =
2778  getShuffleCost(ShuffleKind, SingleMemOpTy, 0, nullptr);
2779 
2780  unsigned NumOfLoadsInInterleaveGrp =
2781  Indices.size() ? Indices.size() : Factor;
2782  Type *ResultTy = VectorType::get(VecTy->getVectorElementType(),
2783  VecTy->getVectorNumElements() / Factor);
2784  unsigned NumOfResults =
2785  getTLI()->getTypeLegalizationCost(DL, ResultTy).first *
2786  NumOfLoadsInInterleaveGrp;
2787 
2788  // About half of the loads may be folded into shuffles when we have only
2789  // one result. If we have more than one result, we do not fold loads at all.
2790  unsigned NumOfUnfoldedLoads =
2791  NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
2792 
2793  // Get a number of shuffle operations per result.
2794  unsigned NumOfShufflesPerResult =
2795  std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
2796 
2797  // The SK_PermuteTwoSrc shuffle clobbers one of the source operands.
2798  // When we have more than one destination, we need additional instructions
2799  // to keep sources.
2800  unsigned NumOfMoves = 0;
2801  if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
2802  NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
2803 
2804  int Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
2805  NumOfUnfoldedLoads * MemOpCost + NumOfMoves;
2806 
2807  return Cost;
2808  }
2809 
2810  // Store.
2811  assert(Opcode == Instruction::Store &&
2812  "Expected Store Instruction at this point");
2813  // X86InterleavedAccess supports only the following interleaved-access groups.
2814  static const CostTblEntry AVX512InterleavedStoreTbl[] = {
2815  {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
2816  {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
2817  {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 96i8 (and store)
2818 
2819  {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store)
2820  {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
2821  {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
2822  {4, MVT::v64i8, 24} // interleave 4 x 64i8 into 256i8 (and store)
2823  };
2824 
2825  if (const auto *Entry =
2826  CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
2827  return NumOfMemOps * MemOpCost + Entry->Cost;
2828  // If an entry does not exist, fall back to the default implementation.
2829 
2830  // There are no strided stores at the moment, and a store can't be folded into
2831  // a shuffle.
2832  unsigned NumOfSources = Factor; // The number of values to be merged.
2833  unsigned ShuffleCost =
2834  getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, 0, nullptr);
2835  unsigned NumOfShufflesPerStore = NumOfSources - 1;
2836 
2837  // The SK_PermuteTwoSrc shuffle clobbers one of the source operands.
2838  // We need additional instructions to keep sources.
2839  unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
2840  int Cost = NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
2841  NumOfMoves;
2842  return Cost;
2843 }
2844 
2845 int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
2846  unsigned Factor,
2847  ArrayRef<unsigned> Indices,
2848  unsigned Alignment,
2849  unsigned AddressSpace) {
2850  auto isSupportedOnAVX512 = [](Type *VecTy, bool HasBW) {
2851  Type *EltTy = VecTy->getVectorElementType();
2852  if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
2853  EltTy->isIntegerTy(32) || EltTy->isPointerTy())
2854  return true;
2855  if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8))
2856  return HasBW;
2857  return false;
2858  };
2859  if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI()))
2860  return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices,
2861  Alignment, AddressSpace);
2862  if (ST->hasAVX2())
2863  return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices,
2864  Alignment, AddressSpace);
2865 
2866  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
2867  Alignment, AddressSpace);
2868 }
bool hasAVX() const
Definition: X86Subtarget.h:524
Type * getVectorElementType() const
Definition: Type.h:368
constexpr bool isUInt< 32 >(uint64_t x)
Definition: MathExtras.h:341
X = FP_ROUND(Y, TRUNC) - Rounding &#39;Y&#39; from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:513
bool is64Bit() const
Is this x86_64? (disregarding specific ABI / programming model)
Definition: X86Subtarget.h:488
void push_back(const T &Elt)
Definition: SmallVector.h:212
A parsed version of the target data layout string in and methods for querying it. ...
Definition: DataLayout.h:111
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:570
unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info=TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value * > Args=ArrayRef< const Value * >())
Definition: BasicTTIImpl.h:477
static IntegerType * getInt1Ty(LLVMContext &C)
Definition: Type.cpp:173
llvm::Optional< unsigned > getCacheSize(TargetTransformInfo::CacheLevel Level) const
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1542
APInt sext(unsigned width) const
Sign extend to a new width.
Definition: APInt.cpp:842
GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2)
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:42
bool hasSSE41() const
Definition: X86Subtarget.h:522
Compute iterated dominance frontiers using a linear time algorithm.
Definition: AllocatorList.h:24
unsigned minRequiredElementSize(const Value *Val, bool &isSigned)
static MVT getVectorVT(MVT VT, unsigned NumElements)
Cost tables and simple lookup functions.
int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index)
bool isFCmpOrdCheaperThanFCmpZero(Type *Ty)
int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr, bool VariableMask, unsigned Alignment)
Calculate the cost of Gather / Scatter operation.
bool hasPOPCNT() const
Definition: X86Subtarget.h:533
bool hasAVX2() const
Definition: X86Subtarget.h:525
const Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
bool isVector() const
Return true if this is a vector value type.
unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract)
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:416
This file a TargetTransformInfo::Concept conforming object specific to the X86 target machine...
The main scalar evolution driver.
unsigned getRegisterBitWidth(bool Vector) const
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:253
unsigned getVectorNumElements() const
bool isLegalMaskedScatter(Type *DataType)
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:814
unsigned getPointerSizeInBits(unsigned AS=0) const
Layout pointer size, in bits FIXME: The defaults need to be removed once all of the backends/clients ...
Definition: DataLayout.h:354
Type Conversion Cost Table.
Definition: CostTable.h:45
[US]{MIN/MAX} - Binary minimum or maximum or signed or unsigned integers.
Definition: ISDOpcodes.h:360
Hexagon Common GEP
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:227
int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, unsigned Alignment, unsigned AddressSpace)
unsigned getAtomicMemIntrinsicMaxElementSize() const
Cost Table Entry.
Definition: CostTable.h:25
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1488
unsigned getNumberOfRegisters(bool Vector)
int getGatherOverhead() const
Definition: X86Subtarget.h:581
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:130
unsigned getMaxInterleaveFactor(unsigned VF)
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:210
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
bool isUnalignedMem32Slow() const
Definition: X86Subtarget.h:580
unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, ArrayRef< Value * > Args, FastMathFlags FMF, unsigned VF=1)
Get intrinsic cost based on arguments.
Definition: BasicTTIImpl.h:865
bool isLegalMaskedStore(Type *DataType)
unsigned getAddressComputationCost(Type *Ty, ScalarEvolution *, const SCEV *)
int getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info=TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value *> Args=ArrayRef< const Value *>())
const FeatureBitset & getFeatureBits() const
getFeatureBits - Return the feature bits.
Shift and rotation operations.
Definition: ISDOpcodes.h:380
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:197
unsigned getArithmeticReductionCost(unsigned Opcode, Type *Ty, bool IsPairwise)
Try to calculate arithmetic and shuffle op costs for reduction operations.
const TypeConversionCostTblEntry * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntry > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table, TypeTy must be comparable to CompareTy by ==. ...
Definition: CostTable.h:55
unsigned getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isLSRCostLess(TargetTransformInfo::LSRCost &C1, TargetTransformInfo::LSRCost &C2)
bool hasVLX() const
Definition: X86Subtarget.h:615
unsigned getSizeInBits() const
unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, unsigned Alignment, unsigned AddressSpace)
Definition: BasicTTIImpl.h:749
unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, const Instruction *I)
Definition: BasicTTIImpl.h:671
Choose alternate elements from vector.
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1554
unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:533
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:245
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:456
PopcntSupportKind
Flags indicating the kind of support for population count.
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:892
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:201
llvm::Optional< unsigned > getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory)...
Definition: APInt.h:33
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:203
int getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr)
Reverse the order of the vector.
amdgpu Simplify well known AMD library false Value * Callee
bool hasDQI() const
Definition: X86Subtarget.h:613
MVT getVectorElementType() const
Value * getOperand(unsigned i) const
Definition: User.h:154
Class to represent pointers.
Definition: DerivedTypes.h:467
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:499
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return &#39;this&#39;.
Definition: Type.h:301
bool isFloatTy() const
Return true if this is &#39;float&#39;, a 32-bit IEEE fp type.
Definition: Type.h:147
an instruction for type-safe pointer arithmetic to access elements of arrays and structs ...
Definition: Instructions.h:837
bool isSLM() const
Definition: X86Subtarget.h:658
bool hasSSSE3() const
Definition: X86Subtarget.h:521
If not nullptr, enable inline expansion of memcmp.
* if(!EatIfPresent(lltok::kw_thread_local)) return false
ParseOptionalThreadLocal := /*empty.
Container class for subtarget features.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:421
Machine Value Type.
The instances of the Type class are immutable: once they are created, they are never changed...
Definition: Type.h:46
Simple binary floating point operators.
Definition: ISDOpcodes.h:260
bool isLegalMaskedGather(Type *DataType)
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:149
unsigned getMinMaxReductionCost(Type *Ty, Type *CondTy, bool IsPairwise, bool)
Try to calculate op costs for min/max reduction operations.
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:221
Expected to fold away in lowering.
unsigned getUserCost(const User *U, ArrayRef< const Value * > Operands)
bool isLegalMaskedLoad(Type *DataType)
const TTI::MemCmpExpansionOptions * enableMemCmpExpansion(bool IsZeroCmp) const
unsigned getAddressSpace() const
Return the address space of the Pointer type.
Definition: DerivedTypes.h:495
Merge elements from two source vectors into one with any shuffle mask.
int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace)
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:284
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
bool hasXOP() const
Definition: X86Subtarget.h:549
int getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, unsigned Alignment, unsigned AddressSpace)
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:531
bool hasSSE42() const
Definition: X86Subtarget.h:523
Extended Value Type.
Definition: ValueTypes.h:34
int getArithmeticReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm)
const TargetMachine & getTargetMachine() const
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace, const Instruction *I=nullptr)
int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp)
const SCEVConstant * getConstantStrideStep(ScalarEvolution *SE, const SCEV *Ptr)
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth)
OperandValueProperties
Additional properties of an operand's values.
unsigned getCFInstrCost(unsigned Opcode)
Definition: BasicTTIImpl.h:666
bool isAtom() const
TODO: to be removed later and replaced with suitable properties.
Definition: X86Subtarget.h:657
unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp)
Definition: BasicTTIImpl.h:524
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:935
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:240
unsigned getNumOperands() const
Definition: User.h:176
constexpr bool isInt< 32 >(int64_t x)
Definition: MathExtras.h:301
unsigned getPreferVectorWidth() const
Definition: X86Subtarget.h:628
bool hasVBMI() const
Definition: X86Subtarget.h:559
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:383
int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, const Instruction *I=nullptr)
int getMinMaxReductionCost(Type *Ty, Type *CondTy, bool IsPairwiseForm, bool IsUnsigned)
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
unsigned getVectorNumElements() const
Definition: DerivedTypes.h:462
Class to represent vector types.
Definition: DerivedTypes.h:393
Class for arbitrary precision integers.
Definition: APInt.h:69
int getScatterOverhead() const
Definition: X86Subtarget.h:582
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:446
unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:718
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
unsigned getLoadStoreVecRegBitWidth(unsigned AS) const
const CostTblEntry * CostTableLookup(ArrayRef< CostTblEntry > Tbl, int ISD, MVT Ty)
Find an entry in the cost table; TypeTy must be comparable to CompareTy by == (a usage sketch follows this index).
Definition: CostTable.h:32
This class represents an analyzed expression in the program.
int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, const Instruction *I=nullptr)
LLVM_NODISCARD std::enable_if<!is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
Definition: Casting.h:323
unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index)
Definition: BasicTTIImpl.h:711
InsertSubvector. Index indicates start offset.
bool hasCDI() const
Definition: X86Subtarget.h:609
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:185
const unsigned Kind
The cost of a typical 'add' instruction.
int getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, unsigned Alignment, unsigned AddressSpace)
bool hasSSE1() const
Definition: X86Subtarget.h:518
unsigned getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition: Type.cpp:115
LLVM Value Representation.
Definition: Value.h:73
uint64_t getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type...
Definition: DataLayout.h:405
bool hasDivRemOp(Type *DataType, bool IsSigned)
static VectorType * get(Type *ElementType, unsigned NumElements)
This static method is the primary way to construct a VectorType.
Definition: Type.cpp:593
#define LLVM_FALLTHROUGH
LLVM_FALLTHROUGH - Mark fallthrough cases in switch statements.
Definition: Compiler.h:235
bool hasMacroFusion() const
Definition: X86Subtarget.h:599
Broadcast element 0 to all other elements.
bool hasAVX512() const
Definition: X86Subtarget.h:526
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:59
Convenience struct for specifying and reasoning about fast-math flags.
Definition: Operator.h:160
bool hasBWI() const
Definition: X86Subtarget.h:614
OperandValueKind
Additional information about an operand's possible values.
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:413
This pass exposes codegen information to IR-level passes.
Conversion operators.
Definition: ISDOpcodes.h:443
bool hasFastGather() const
Definition: X86Subtarget.h:594
CacheLevel
The possible cache levels.
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:452
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:126
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:150
int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, ArrayRef< Type *> Tys, FastMathFlags FMF, unsigned ScalarizationCostPassed=UINT_MAX)
static IntegerType * getInt8Ty(LLVMContext &C)
Definition: Type.cpp:174
std::pair< int, MVT > getTypeLegalizationCost(const DataLayout &DL, Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
bool hasSSE2() const
Definition: X86Subtarget.h:519
int getIntImmCost(int64_t)
Calculate the cost of materializing a 64-bit value.
This file describes how to lower LLVM code to machine code.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
Shuffle elements of a single source vector with any shuffle mask.
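
Most of the per-instruction cost hooks indexed above (getCastInstrCost, getCmpSelInstrCost, getShuffleCost, getMemoryOpCost and the interleaved/masked variants) share one query pattern: InstructionOpcodeToISD maps the IR opcode to an ISD node, getTypeLegalizationCost reports how many legal pieces the type splits into and which MVT each piece legalizes to, and CostTableLookup consults a static table keyed on (ISD, MVT) for the active feature level. The sketch below is a minimal illustration of that flow only, not code from this file; the function getExampleArithmeticCost, the table ExampleCostTbl and its cost numbers are hypothetical.

#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Type.h"
#include <cassert>
#include <utility>

using namespace llvm;

// Hypothetical cost table in the same shape as the per-feature-level tables
// in this file; the opcodes, types and numbers are illustrative only.
static const CostTblEntry ExampleCostTbl[] = {
  { ISD::MUL, MVT::v8i16, 1 },
  { ISD::MUL, MVT::v4i32, 6 },
};

// Sketch of the canonical query flow: IR opcode -> ISD node, legalize the
// type, look up (ISD, legal MVT) in the table, scale by the split factor.
static int getExampleArithmeticCost(const TargetLoweringBase *TLI,
                                    const DataLayout &DL, unsigned Opcode,
                                    Type *Ty) {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // LT.first = number of legal pieces the type splits into,
  // LT.second = the MVT each piece legalizes to.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

  if (const CostTblEntry *Entry = CostTableLookup(ExampleCostTbl, ISD, LT.second))
    return LT.first * Entry->Cost;

  // No table entry: charge one typical add per legalized piece.
  return LT.first;
}

When no table entry matches, the real hooks typically defer to the generic BasicTTIImplBase implementations (BasicTTIImpl.h), which approximate or scalarize the operation; the fallback above simply charges one typical add per legalized piece to keep the sketch self-contained.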