//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//
/// A note about the Cost Model numbers used below: the numbers correspond to
/// some "generic" X86 CPU rather than to a concrete CPU model. Usually the
/// numbers correspond to the CPU where the feature first appeared. For
/// example, if we do Subtarget.hasSSE42() in the lookups below, the cost is
/// based on Nehalem, as that was the first CPU to support that feature level
/// and thus most likely has the worst case cost.
/// Some examples of other technologies/CPUs:
///   SSE 3   - Pentium4 / Athlon64
///   SSE 4.1 - Penryn
///   SSE 4.2 - Nehalem
///   AVX     - Sandy Bridge
///   AVX2    - Haswell
///   AVX-512 - Xeon Phi / Skylake
/// And some examples of instruction target dependent costs (latency):
///                   divss     sqrtss     rsqrtss
///   AMD K7          11-16     19         3
///   Piledriver      9-24      13-15      5
///   Jaguar          14        16         2
///   Pentium II,III  18        30         2
///   Nehalem         7-14      7-18       3
///   Haswell         10-13     11         5
/// TODO: Develop and implement the target dependent cost model and
/// specialize cost numbers for different Cost Model Targets such as throughput,
/// code size, latency and uop count.
//===----------------------------------------------------------------------===//
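
// Illustrative sketch of how these generic numbers reach clients (an
// assumption for illustration: a legacy pass that has declared a dependency
// on TargetTransformInfoWrapperPass; this usage is not part of this file):
//
//   const TargetTransformInfo &TTI =
//       getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
//   // On an SSE4.1-level target a <4 x i32> multiply hits the
//   // SSE41CostTable below and costs 1 (pmulld):
//   int Cost = TTI.getArithmeticInstrCost(
//       Instruction::Mul, VectorType::get(Type::getInt32Ty(Ctx), 4));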

#include "X86TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"

using namespace llvm;

#define DEBUG_TYPE "x86tti"

//===----------------------------------------------------------------------===//
//
// X86 cost model.
//
//===----------------------------------------------------------------------===//

TargetTransformInfo::PopcntSupportKind
X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  // TODO: Currently the __builtin_popcount() implementation using SSE3
  //   instructions is inefficient. Once the problem is fixed, we should
  //   call ST->hasSSE3() instead of ST->hasPOPCNT().
  return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
}

llvm::Optional<unsigned> X86TTIImpl::getCacheSize(
    TargetTransformInfo::CacheLevel Level) const {
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 32 * 1024;  //  32 KByte
  case TargetTransformInfo::CacheLevel::L2D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 256 * 1024; // 256 KByte
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

llvm::Optional<unsigned> X86TTIImpl::getCacheAssociativity(
    TargetTransformInfo::CacheLevel Level) const {
  //   - Penryn
  //   - Nehalem
  //   - Westmere
  //   - Sandy Bridge
  //   - Ivy Bridge
  //   - Haswell
  //   - Broadwell
  //   - Skylake
  //   - Kabylake
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    return 8;
  case TargetTransformInfo::CacheLevel::L2D:
    return 8;
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) {
  if (Vector && !ST->hasSSE1())
    return 0;

  if (ST->is64Bit()) {
    if (Vector && ST->hasAVX512())
      return 32;
    return 16;
  }
  return 8;
}

unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) const {
  if (Vector) {
    if (ST->hasAVX512())
      return 512;
    if (ST->hasAVX())
      return 256;
    if (ST->hasSSE1())
      return 128;
    return 0;
  }

  if (ST->is64Bit())
    return 64;

  return 32;
}

unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
  return getRegisterBitWidth(true);
}

unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // If the loop will not be vectorized, don't interleave the loop.
  // Let the regular unroller handle the loop instead, which saves the
  // overflow check and memory check cost.
  if (VF == 1)
    return 1;

  if (ST->isAtom())
    return 1;

  // Sandy Bridge and Haswell have multiple execution ports and pipelined
  // vector units.
  if (ST->hasAVX())
    return 4;

  return 2;
}
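
// Example: a loop vectorized at VF == 4 on an AVX-capable core may be
// interleaved with up to 4 concurrent copies of the vector loop body to keep
// the multiple vector execution ports busy; on Atom, interleaving is disabled
// entirely.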

int X86TTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty,
    TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
    TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo,
    ArrayRef<const Value *> Args) {
  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  static const CostTblEntry SLMCostTable[] = {
    { ISD::MUL,  MVT::v4i32, 11 }, // pmulld
    { ISD::MUL,  MVT::v8i16,  2 }, // pmullw
    { ISD::MUL,  MVT::v16i8, 14 }, // extend/pmullw/trunc sequence.
    { ISD::FMUL, MVT::f64,    2 }, // mulsd
    { ISD::FMUL, MVT::v2f64,  4 }, // mulpd
    { ISD::FMUL, MVT::v4f32,  2 }, // mulps
    { ISD::FDIV, MVT::f32,   17 }, // divss
    { ISD::FDIV, MVT::v4f32, 39 }, // divps
    { ISD::FDIV, MVT::f64,   32 }, // divsd
    { ISD::FDIV, MVT::v2f64, 69 }, // divpd
    { ISD::FADD, MVT::v2f64,  2 }, // addpd
    { ISD::FSUB, MVT::v2f64,  2 }, // subpd
    // v2i64/v4i64 mul is custom lowered as a series of long:
    // multiplies(3), shifts(3) and adds(2)
    // slm muldq version throughput is 2 and addq throughput 4
    // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
    // 2X4 (addq throughput) = 17
    { ISD::MUL,  MVT::v2i64, 17 },
    // slm addq\subq throughput is 4
    { ISD::ADD,  MVT::v2i64,  4 },
    { ISD::SUB,  MVT::v2i64,  4 },
  };

  if (ST->isSLM()) {
    if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) {
      // Check if the operands can be shrunk into a smaller datatype.
      bool Op1Signed = false;
      unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
      bool Op2Signed = false;
      unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);

      bool signedMode = Op1Signed | Op2Signed;
      unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);

      if (OpMinSize <= 7)
        return LT.first * 3; // pmullw/sext
      if (!signedMode && OpMinSize <= 8)
        return LT.first * 3; // pmullw/zext
      if (OpMinSize <= 15)
        return LT.first * 5; // pmullw/pmulhw/pshuf
      if (!signedMode && OpMinSize <= 16)
        return LT.first * 5; // pmullw/pmulhw/pshuf
    }
    if (const auto *Entry = CostTableLookup(SLMCostTable, ISD,
                                            LT.second)) {
      return LT.first * Entry->Cost;
    }
  }
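
  // Example for the SLM shrinking path above: a v4i32 multiply whose operands
  // are both zero-extended from i8 has OpMinSize == 8 and signedMode == false,
  // so it costs LT.first * 3: the product fits in 16-bit lanes, making
  // pmullw plus an extend cheaper than SLM's slow pmulld.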

  if (ISD == ISD::SDIV &&
      Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
    // On X86, vector signed division by constants power-of-two are
    // normally expanded to the sequence SRA + SRL + ADD + SRA.
    // The OperandValue properties may not be the same as that of the
    // previous operation; conservatively assume OP_None.
    int Cost = 2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info,
                                          Op2Info,
                                          TargetTransformInfo::OP_None,
                                          TargetTransformInfo::OP_None);
    Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
                                   TargetTransformInfo::OP_None,
                                   TargetTransformInfo::OP_None);
    Cost += getArithmeticInstrCost(Instruction::Add, Ty, Op1Info, Op2Info,
                                   TargetTransformInfo::OP_None,
                                   TargetTransformInfo::OP_None);

    return Cost;
  }
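
  // Worked example for the expansion above (sdiv by 4 on <4 x i32>):
  //   %sgn = ashr <4 x i32> %x, <i32 31, ...>   ; sign of each lane
  //   %bia = lshr <4 x i32> %sgn, <i32 30, ...> ; 2 rounding bits
  //   %sum = add  <4 x i32> %x, %bia            ; bias negative lanes
  //   %res = ashr <4 x i32> %sum, <i32 2, ...>  ; the actual divide
  // which matches the 2*AShr + LShr + Add costs summed here.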

  static const CostTblEntry AVX512BWUniformConstCostTable[] = {
    { ISD::SHL,  MVT::v64i8,  2 }, // psllw + pand.
    { ISD::SRL,  MVT::v64i8,  2 }, // psrlw + pand.
    { ISD::SRA,  MVT::v64i8,  4 }, // psrlw, pand, pxor, psubb.

    { ISD::SDIV, MVT::v32i16, 6 }, // vpmulhw sequence
    { ISD::UDIV, MVT::v32i16, 6 }, // vpmulhuw sequence
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasBWI()) {
    if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512UniformConstCostTable[] = {
    { ISD::SRA,  MVT::v2i64,   1 },
    { ISD::SRA,  MVT::v4i64,   1 },
    { ISD::SRA,  MVT::v8i64,   1 },

    { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
    { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasAVX512()) {
    if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX2UniformConstCostTable[] = {
    { ISD::SHL,  MVT::v32i8,   2 }, // psllw + pand.
    { ISD::SRL,  MVT::v32i8,   2 }, // psrlw + pand.
    { ISD::SRA,  MVT::v32i8,   4 }, // psrlw, pand, pxor, psubb.

    { ISD::SRA,  MVT::v4i64,   4 }, // 2 x psrad + shuffle.

    { ISD::SDIV, MVT::v16i16,  6 }, // vpmulhw sequence
    { ISD::UDIV, MVT::v16i16,  6 }, // vpmulhuw sequence
    { ISD::SDIV, MVT::v8i32,  15 }, // vpmuldq sequence
    { ISD::UDIV, MVT::v8i32,  15 }, // vpmuludq sequence
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasAVX2()) {
    if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2UniformConstCostTable[] = {
    { ISD::SHL,  MVT::v16i8,     2 }, // psllw + pand.
    { ISD::SRL,  MVT::v16i8,     2 }, // psrlw + pand.
    { ISD::SRA,  MVT::v16i8,     4 }, // psrlw, pand, pxor, psubb.

    { ISD::SHL,  MVT::v32i8,   4+2 }, // 2*(psllw + pand) + split.
    { ISD::SRL,  MVT::v32i8,   4+2 }, // 2*(psrlw + pand) + split.
    { ISD::SRA,  MVT::v32i8,   8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.

    { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
    { ISD::SDIV, MVT::v8i16,     6 }, // pmulhw sequence
    { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split.
    { ISD::UDIV, MVT::v8i16,     6 }, // pmulhuw sequence
    { ISD::SDIV, MVT::v8i32,  38+2 }, // 2*pmuludq sequence + split.
    { ISD::SDIV, MVT::v4i32,    19 }, // pmuludq sequence
    { ISD::UDIV, MVT::v8i32,  30+2 }, // 2*pmuludq sequence + split.
    { ISD::UDIV, MVT::v4i32,    15 }, // pmuludq sequence
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasSSE2()) {
    // pmuldq sequence.
    if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
      return LT.first * 32;
    if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
      return LT.first * 15;

    // XOP has faster vXi8 shifts.
    if ((ISD != ISD::SHL && ISD != ISD::SRL && ISD != ISD::SRA) ||
        !ST->hasXOP())
      if (const auto *Entry =
              CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
        return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i16, 1 }, // psllw.
    { ISD::SRL, MVT::v16i16, 1 }, // psrlw.
    { ISD::SRA, MVT::v16i16, 1 }, // psraw.
  };

  if (ST->hasAVX2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {
    if (const auto *Entry =
            CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v8i16, 1 }, // psllw.
    { ISD::SHL, MVT::v4i32, 1 }, // pslld
    { ISD::SHL, MVT::v2i64, 1 }, // psllq.

    { ISD::SRL, MVT::v8i16, 1 }, // psrlw.
    { ISD::SRL, MVT::v4i32, 1 }, // psrld.
    { ISD::SRL, MVT::v2i64, 1 }, // psrlq.

    { ISD::SRA, MVT::v8i16, 1 }, // psraw.
    { ISD::SRA, MVT::v4i32, 1 }, // psrad.
  };

  if (ST->hasSSE2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {
    if (const auto *Entry =
            CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512DQCostTable[] = {
    { ISD::MUL, MVT::v2i64, 1 },
    { ISD::MUL, MVT::v4i64, 1 },
    { ISD::MUL, MVT::v8i64, 1 }
  };

  // Look for AVX512DQ lowering tricks for custom cases.
  if (ST->hasDQI())
    if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512BWCostTable[] = {
    { ISD::SHL,  MVT::v8i16,      1 }, // vpsllvw
    { ISD::SRL,  MVT::v8i16,      1 }, // vpsrlvw
    { ISD::SRA,  MVT::v8i16,      1 }, // vpsravw

    { ISD::SHL,  MVT::v16i16,     1 }, // vpsllvw
    { ISD::SRL,  MVT::v16i16,     1 }, // vpsrlvw
    { ISD::SRA,  MVT::v16i16,     1 }, // vpsravw

    { ISD::SHL,  MVT::v32i16,     1 }, // vpsllvw
    { ISD::SRL,  MVT::v32i16,     1 }, // vpsrlvw
    { ISD::SRA,  MVT::v32i16,     1 }, // vpsravw

    { ISD::SHL,  MVT::v64i8,     11 }, // vpblendvb sequence.
    { ISD::SRL,  MVT::v64i8,     11 }, // vpblendvb sequence.
    { ISD::SRA,  MVT::v64i8,     24 }, // vpblendvb sequence.

    { ISD::MUL,  MVT::v64i8,     11 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,  MVT::v32i8,      4 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,  MVT::v16i8,      4 }, // extend/pmullw/trunc sequence.

    // Vectorizing division is a bad idea. See the SSE2 table for more comments.
    { ISD::SDIV, MVT::v64i8,  64*20 },
    { ISD::SDIV, MVT::v32i16, 32*20 },
    { ISD::UDIV, MVT::v64i8,  64*20 },
    { ISD::UDIV, MVT::v32i16, 32*20 }
  };

  // Look for AVX512BW lowering tricks for custom cases.
  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512CostTable[] = {
    { ISD::SHL,  MVT::v16i32,     1 },
    { ISD::SRL,  MVT::v16i32,     1 },
    { ISD::SRA,  MVT::v16i32,     1 },

    { ISD::SHL,  MVT::v8i64,      1 },
    { ISD::SRL,  MVT::v8i64,      1 },

    { ISD::SRA,  MVT::v2i64,      1 },
    { ISD::SRA,  MVT::v4i64,      1 },
    { ISD::SRA,  MVT::v8i64,      1 },

    { ISD::MUL,  MVT::v32i8,     13 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,  MVT::v16i8,      5 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,  MVT::v16i32,     1 }, // pmulld
    { ISD::MUL,  MVT::v8i64,      8 }, // 3*pmuludq/3*shift/2*add

    // Vectorizing division is a bad idea. See the SSE2 table for more comments.
    { ISD::SDIV, MVT::v16i32, 16*20 },
    { ISD::SDIV, MVT::v8i64,   8*20 },
    { ISD::UDIV, MVT::v16i32, 16*20 },
    { ISD::UDIV, MVT::v8i64,   8*20 }
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX2ShiftCostTable[] = {
    // Shifts on v4i64/v8i32 on AVX2 are legal even though we declare them as
    // custom in order to detect cases where the shift amount is a scalar.
    { ISD::SHL, MVT::v4i32, 1 },
    { ISD::SRL, MVT::v4i32, 1 },
    { ISD::SRA, MVT::v4i32, 1 },
    { ISD::SHL, MVT::v8i32, 1 },
    { ISD::SRL, MVT::v8i32, 1 },
    { ISD::SRA, MVT::v8i32, 1 },
    { ISD::SHL, MVT::v2i64, 1 },
    { ISD::SRL, MVT::v2i64, 1 },
    { ISD::SHL, MVT::v4i64, 1 },
    { ISD::SRL, MVT::v4i64, 1 },
  };

  // Look for AVX2 lowering tricks.
  if (ST->hasAVX2()) {
    if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
      // On AVX2, a packed v16i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return LT.first;

    if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry XOPShiftCostTable[] = {
    // 128bit shifts take 1cy, but right shifts require negation beforehand.
    { ISD::SHL, MVT::v16i8,    1 },
    { ISD::SRL, MVT::v16i8,    2 },
    { ISD::SRA, MVT::v16i8,    2 },
    { ISD::SHL, MVT::v8i16,    1 },
    { ISD::SRL, MVT::v8i16,    2 },
    { ISD::SRA, MVT::v8i16,    2 },
    { ISD::SHL, MVT::v4i32,    1 },
    { ISD::SRL, MVT::v4i32,    2 },
    { ISD::SRA, MVT::v4i32,    2 },
    { ISD::SHL, MVT::v2i64,    1 },
    { ISD::SRL, MVT::v2i64,    2 },
    { ISD::SRA, MVT::v2i64,    2 },
    // 256bit shifts require splitting if AVX2 didn't catch them above.
    { ISD::SHL, MVT::v32i8,  2+2 },
    { ISD::SRL, MVT::v32i8,  4+2 },
    { ISD::SRA, MVT::v32i8,  4+2 },
    { ISD::SHL, MVT::v16i16, 2+2 },
    { ISD::SRL, MVT::v16i16, 4+2 },
    { ISD::SRA, MVT::v16i16, 4+2 },
    { ISD::SHL, MVT::v8i32,  2+2 },
    { ISD::SRL, MVT::v8i32,  4+2 },
    { ISD::SRA, MVT::v8i32,  4+2 },
    { ISD::SHL, MVT::v4i64,  2+2 },
    { ISD::SRL, MVT::v4i64,  4+2 },
    { ISD::SRA, MVT::v4i64,  4+2 },
  };

  // Look for XOP lowering tricks.
  if (ST->hasXOP())
    if (const auto *Entry = CostTableLookup(XOPShiftCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE2UniformShiftCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i16, 2+2 }, // 2*psllw + split.
    { ISD::SHL, MVT::v8i32,  2+2 }, // 2*pslld + split.
    { ISD::SHL, MVT::v4i64,  2+2 }, // 2*psllq + split.

    { ISD::SRL, MVT::v16i16, 2+2 }, // 2*psrlw + split.
    { ISD::SRL, MVT::v8i32,  2+2 }, // 2*psrld + split.
    { ISD::SRL, MVT::v4i64,  2+2 }, // 2*psrlq + split.

    { ISD::SRA, MVT::v16i16, 2+2 }, // 2*psraw + split.
    { ISD::SRA, MVT::v8i32,  2+2 }, // 2*psrad + split.
    { ISD::SRA, MVT::v2i64,    4 }, // 2*psrad + shuffle.
    { ISD::SRA, MVT::v4i64,  8+2 }, // 2*(2*psrad + shuffle) + split.
  };

  if (ST->hasSSE2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {

    // Handle AVX2 uniform v4i64 ISD::SRA, it's not worth a table.
    if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2())
      return LT.first * 4; // 2*psrad + shuffle.

    if (const auto *Entry =
            CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  if (ISD == ISD::SHL &&
      Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
    MVT VT = LT.second;
    // Vector shift left by non-uniform constant can be lowered
    // into vector multiply.
    if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
        ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
      ISD = ISD::MUL;
  }
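
  // Example: shl <4 x i32> %x, <i32 1, i32 2, i32 3, i32 4> can be lowered as
  // a multiply by <i32 2, i32 4, i32 8, i32 16>, so it is costed below as
  // ISD::MUL (a single pmulld on SSE4.1 targets).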

  static const CostTblEntry AVX2CostTable[] = {
    { ISD::SHL,  MVT::v32i8,  11 }, // vpblendvb sequence.
    { ISD::SHL,  MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.

    { ISD::SRL,  MVT::v32i8,  11 }, // vpblendvb sequence.
    { ISD::SRL,  MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.

    { ISD::SRA,  MVT::v32i8,  24 }, // vpblendvb sequence.
    { ISD::SRA,  MVT::v16i16, 10 }, // extend/vpsravd/pack sequence.
    { ISD::SRA,  MVT::v2i64,   4 }, // srl/xor/sub sequence.
    { ISD::SRA,  MVT::v4i64,   4 }, // srl/xor/sub sequence.

    { ISD::SUB,  MVT::v32i8,   1 }, // psubb
    { ISD::ADD,  MVT::v32i8,   1 }, // paddb
    { ISD::SUB,  MVT::v16i16,  1 }, // psubw
    { ISD::ADD,  MVT::v16i16,  1 }, // paddw
    { ISD::SUB,  MVT::v8i32,   1 }, // psubd
    { ISD::ADD,  MVT::v8i32,   1 }, // paddd
    { ISD::SUB,  MVT::v4i64,   1 }, // psubq
    { ISD::ADD,  MVT::v4i64,   1 }, // paddq

    { ISD::MUL,  MVT::v32i8,  17 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,  MVT::v16i8,   7 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,  MVT::v16i16,  1 }, // pmullw
    { ISD::MUL,  MVT::v8i32,   1 }, // pmulld
    { ISD::MUL,  MVT::v4i64,   8 }, // 3*pmuludq/3*shift/2*add

    { ISD::FDIV, MVT::f32,     7 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,   7 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32,  14 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::f64,    14 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,  14 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64,  28 }, // Haswell from http://www.agner.org/
  };

  // Look for AVX2 lowering tricks for custom cases.
  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX1CostTable[] = {
    // We don't have to scalarize unsupported ops. We can issue two half-sized
    // operations and we only need to extract the upper YMM half.
    // Two ops + 1 extract + 1 insert = 4.
    { ISD::MUL,  MVT::v16i16,     4 },
    { ISD::MUL,  MVT::v8i32,      4 },
    { ISD::SUB,  MVT::v32i8,      4 },
    { ISD::ADD,  MVT::v32i8,      4 },
    { ISD::SUB,  MVT::v16i16,     4 },
    { ISD::ADD,  MVT::v16i16,     4 },
    { ISD::SUB,  MVT::v8i32,      4 },
    { ISD::ADD,  MVT::v8i32,      4 },
    { ISD::SUB,  MVT::v4i64,      4 },
    { ISD::ADD,  MVT::v4i64,      4 },

    // A v4i64 multiply is custom lowered as two split v2i64 vectors that then
    // are lowered as a series of long multiplies(3), shifts(3) and adds(2)
    // Because we believe v4i64 to be a legal type, we must also include the
    // extract+insert in the cost table. Therefore, the cost here is 18
    // instead of 8.
    { ISD::MUL,  MVT::v4i64,     18 },

    { ISD::MUL,  MVT::v32i8,     26 }, // extend/pmullw/trunc sequence.

    { ISD::FDIV, MVT::f32,       14 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,     14 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32,     28 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::f64,       22 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,     22 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64,     44 }, // SNB from http://www.agner.org/

    // Vectorizing division is a bad idea. See the SSE2 table for more comments.
    { ISD::SDIV, MVT::v32i8,  32*20 },
    { ISD::SDIV, MVT::v16i16, 16*20 },
    { ISD::SDIV, MVT::v8i32,   8*20 },
    { ISD::SDIV, MVT::v4i64,   4*20 },
    { ISD::UDIV, MVT::v32i8,  32*20 },
    { ISD::UDIV, MVT::v16i16, 16*20 },
    { ISD::UDIV, MVT::v8i32,   8*20 },
    { ISD::UDIV, MVT::v4i64,   4*20 },
  };

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE42CostTable[] = {
    { ISD::FDIV, MVT::f32,   14 }, // Nehalem from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
    { ISD::FDIV, MVT::f64,   22 }, // Nehalem from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/
  };

  if (ST->hasSSE42())
    if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE41CostTable[] = {
    { ISD::SHL, MVT::v16i8,     11 }, // pblendvb sequence.
    { ISD::SHL, MVT::v32i8,  2*11+2 }, // pblendvb sequence + split.
    { ISD::SHL, MVT::v8i16,     14 }, // pblendvb sequence.
    { ISD::SHL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
    { ISD::SHL, MVT::v4i32,      4 }, // pslld/paddd/cvttps2dq/pmulld
    { ISD::SHL, MVT::v8i32,   2*4+2 }, // pslld/paddd/cvttps2dq/pmulld + split

    { ISD::SRL, MVT::v16i8,     12 }, // pblendvb sequence.
    { ISD::SRL, MVT::v32i8,  2*12+2 }, // pblendvb sequence + split.
    { ISD::SRL, MVT::v8i16,     14 }, // pblendvb sequence.
    { ISD::SRL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
    { ISD::SRL, MVT::v4i32,     11 }, // Shift each lane + blend.
    { ISD::SRL, MVT::v8i32,  2*11+2 }, // Shift each lane + blend + split.

    { ISD::SRA, MVT::v16i8,     24 }, // pblendvb sequence.
    { ISD::SRA, MVT::v32i8,  2*24+2 }, // pblendvb sequence + split.
    { ISD::SRA, MVT::v8i16,     14 }, // pblendvb sequence.
    { ISD::SRA, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
    { ISD::SRA, MVT::v4i32,     12 }, // Shift each lane + blend.
    { ISD::SRA, MVT::v8i32,  2*12+2 }, // Shift each lane + blend + split.

    { ISD::MUL, MVT::v4i32,      1 }  // pmulld
  };

  if (ST->hasSSE41())
    if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE2CostTable[] = {
    // We don't correctly identify costs of casts because they are marked as
    // custom.
    { ISD::SHL,  MVT::v16i8,     26 }, // cmpgtb sequence.
    { ISD::SHL,  MVT::v8i16,     32 }, // cmpgtb sequence.
    { ISD::SHL,  MVT::v4i32,    2*5 }, // We optimized this using mul.
    { ISD::SHL,  MVT::v2i64,      4 }, // splat+shuffle sequence.
    { ISD::SHL,  MVT::v4i64,  2*4+2 }, // splat+shuffle sequence + split.

    { ISD::SRL,  MVT::v16i8,     26 }, // cmpgtb sequence.
    { ISD::SRL,  MVT::v8i16,     32 }, // cmpgtb sequence.
    { ISD::SRL,  MVT::v4i32,     16 }, // Shift each lane + blend.
    { ISD::SRL,  MVT::v2i64,      4 }, // splat+shuffle sequence.
    { ISD::SRL,  MVT::v4i64,  2*4+2 }, // splat+shuffle sequence + split.

    { ISD::SRA,  MVT::v16i8,     54 }, // unpacked cmpgtb sequence.
    { ISD::SRA,  MVT::v8i16,     32 }, // cmpgtb sequence.
    { ISD::SRA,  MVT::v4i32,     16 }, // Shift each lane + blend.
    { ISD::SRA,  MVT::v2i64,     12 }, // srl/xor/sub sequence.
    { ISD::SRA,  MVT::v4i64, 2*12+2 }, // srl/xor/sub sequence+split.

    { ISD::MUL,  MVT::v16i8,     12 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,  MVT::v8i16,      1 }, // pmullw
    { ISD::MUL,  MVT::v4i32,      6 }, // 3*pmuludq/4*shuffle
    { ISD::MUL,  MVT::v2i64,      8 }, // 3*pmuludq/3*shift/2*add

    { ISD::FDIV, MVT::f32,       23 }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,     39 }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::f64,       38 }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,     69 }, // Pentium IV from http://www.agner.org/

    // It is not a good idea to vectorize division. We have to scalarize it and
    // in the process we will often end up having to spill regular
    // registers. The overhead of division is going to dominate most kernels
    // anyway, so try hard to prevent vectorization of division - it is
    // generally a bad idea. Assume somewhat arbitrarily that we have to be able
    // to hide "20 cycles" for each lane.
    { ISD::SDIV, MVT::v16i8,  16*20 },
    { ISD::SDIV, MVT::v8i16,   8*20 },
    { ISD::SDIV, MVT::v4i32,   4*20 },
    { ISD::SDIV, MVT::v2i64,   2*20 },
    { ISD::UDIV, MVT::v16i8,  16*20 },
    { ISD::UDIV, MVT::v8i16,   8*20 },
    { ISD::UDIV, MVT::v4i32,   4*20 },
    { ISD::UDIV, MVT::v2i64,   2*20 },
  };
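
  // Example: under the "20 cycles per lane" assumption above, an sdiv on
  // <4 x i32> is costed at 4*20 == 80 (further scaled by LT.first), which is
  // large enough that the vectorizers will essentially always keep integer
  // division scalar.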

  if (ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE1CostTable[] = {
    { ISD::FDIV, MVT::f32,   17 }, // Pentium III from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/
  };

  if (ST->hasSSE1())
    if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  // Fallback to the default implementation.
  return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info);
}

int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                               Type *SubTp) {
  // 64-bit packed float vectors (v2f32) are widened to type v4f32.
  // 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);

  // For Broadcasts we are splatting the first element from the first input
  // register, so only need to reference that input and all the output
  // registers are the same.
  if (Kind == TTI::SK_Broadcast)
    LT.first = 1;

  // We are going to permute multiple sources and the result will be in
  // multiple destinations. Provide an accurate cost only for splits where
  // the element type remains the same.
  if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
    MVT LegalVT = LT.second;
    if (LegalVT.getVectorElementType().getSizeInBits() ==
            Tp->getVectorElementType()->getPrimitiveSizeInBits() &&
        LegalVT.getVectorNumElements() < Tp->getVectorNumElements()) {

      unsigned VecTySize = DL.getTypeStoreSize(Tp);
      unsigned LegalVTSize = LegalVT.getStoreSize();
      // Number of source vectors after legalization:
      unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
      // Number of destination vectors after legalization:
      unsigned NumOfDests = LT.first;

      Type *SingleOpTy = VectorType::get(Tp->getVectorElementType(),
                                         LegalVT.getVectorNumElements());

      unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
      return NumOfShuffles *
             getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr);
    }

    return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
  }

  // For 2-input shuffles, we must account for splitting the 2 inputs into many.
  if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
    // We assume that source and destination have the same vector type.
    int NumOfDests = LT.first;
    int NumOfShufflesPerDest = LT.first * 2 - 1;
    LT.first = NumOfDests * NumOfShufflesPerDest;
  }
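
  // Example of the splitting logic above: a single-source shuffle of
  // <32 x i8> on an SSE2 target legalizes to two v16i8 halves (LT.first == 2),
  // so NumOfSrcs == 2 and NumOfDests == 2, and the cost is (2 - 1) * 2 == 2
  // two-input v16i8 shuffles. For a two-source <32 x i8> shuffle, each of the
  // 2 destinations needs 2*2 - 1 == 3 two-input shuffles, giving 6 in total.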

  static const CostTblEntry AVX512VBMIShuffleTbl[] = {
    { TTI::SK_Reverse,          MVT::v64i8,  1 }, // vpermb
    { TTI::SK_Reverse,          MVT::v32i8,  1 }, // vpermb

    { TTI::SK_PermuteSingleSrc, MVT::v64i8,  1 }, // vpermb
    { TTI::SK_PermuteSingleSrc, MVT::v32i8,  1 }, // vpermb

    { TTI::SK_PermuteTwoSrc,    MVT::v64i8,  1 }, // vpermt2b
    { TTI::SK_PermuteTwoSrc,    MVT::v32i8,  1 }, // vpermt2b
    { TTI::SK_PermuteTwoSrc,    MVT::v16i8,  1 }  // vpermt2b
  };

  if (ST->hasVBMI())
    if (const auto *Entry =
            CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512BWShuffleTbl[] = {
    { TTI::SK_Broadcast,        MVT::v32i16, 1 }, // vpbroadcastw
    { TTI::SK_Broadcast,        MVT::v64i8,  1 }, // vpbroadcastb

    { TTI::SK_Reverse,          MVT::v32i16, 1 }, // vpermw
    { TTI::SK_Reverse,          MVT::v16i16, 1 }, // vpermw
    { TTI::SK_Reverse,          MVT::v64i8,  2 }, // pshufb + vshufi64x2

    { TTI::SK_PermuteSingleSrc, MVT::v32i16, 1 }, // vpermw
    { TTI::SK_PermuteSingleSrc, MVT::v16i16, 1 }, // vpermw
    { TTI::SK_PermuteSingleSrc, MVT::v8i16,  1 }, // vpermw
    { TTI::SK_PermuteSingleSrc, MVT::v64i8,  8 }, // extend to v32i16
    { TTI::SK_PermuteSingleSrc, MVT::v32i8,  3 }, // vpermw + zext/trunc

    { TTI::SK_PermuteTwoSrc,    MVT::v32i16, 1 }, // vpermt2w
    { TTI::SK_PermuteTwoSrc,    MVT::v16i16, 1 }, // vpermt2w
    { TTI::SK_PermuteTwoSrc,    MVT::v8i16,  1 }, // vpermt2w
    { TTI::SK_PermuteTwoSrc,    MVT::v32i8,  3 }, // zext + vpermt2w + trunc
    { TTI::SK_PermuteTwoSrc,    MVT::v64i8, 19 }, // 6 * v32i8 + 1
    { TTI::SK_PermuteTwoSrc,    MVT::v16i8,  3 }  // zext + vpermt2w + trunc
  };

  if (ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512ShuffleTbl[] = {
    { TTI::SK_Broadcast,        MVT::v8f64,  1 }, // vbroadcastpd
    { TTI::SK_Broadcast,        MVT::v16f32, 1 }, // vbroadcastps
    { TTI::SK_Broadcast,        MVT::v8i64,  1 }, // vpbroadcastq
    { TTI::SK_Broadcast,        MVT::v16i32, 1 }, // vpbroadcastd

    { TTI::SK_Reverse,          MVT::v8f64,  1 }, // vpermpd
    { TTI::SK_Reverse,          MVT::v16f32, 1 }, // vpermps
    { TTI::SK_Reverse,          MVT::v8i64,  1 }, // vpermq
    { TTI::SK_Reverse,          MVT::v16i32, 1 }, // vpermd

    { TTI::SK_PermuteSingleSrc, MVT::v8f64,  1 }, // vpermpd
    { TTI::SK_PermuteSingleSrc, MVT::v4f64,  1 }, // vpermpd
    { TTI::SK_PermuteSingleSrc, MVT::v2f64,  1 }, // vpermpd
    { TTI::SK_PermuteSingleSrc, MVT::v16f32, 1 }, // vpermps
    { TTI::SK_PermuteSingleSrc, MVT::v8f32,  1 }, // vpermps
    { TTI::SK_PermuteSingleSrc, MVT::v4f32,  1 }, // vpermps
    { TTI::SK_PermuteSingleSrc, MVT::v8i64,  1 }, // vpermq
    { TTI::SK_PermuteSingleSrc, MVT::v4i64,  1 }, // vpermq
    { TTI::SK_PermuteSingleSrc, MVT::v2i64,  1 }, // vpermq
    { TTI::SK_PermuteSingleSrc, MVT::v16i32, 1 }, // vpermd
    { TTI::SK_PermuteSingleSrc, MVT::v8i32,  1 }, // vpermd
    { TTI::SK_PermuteSingleSrc, MVT::v4i32,  1 }, // vpermd
    { TTI::SK_PermuteSingleSrc, MVT::v16i8,  1 }, // pshufb

    { TTI::SK_PermuteTwoSrc,    MVT::v8f64,  1 }, // vpermt2pd
    { TTI::SK_PermuteTwoSrc,    MVT::v16f32, 1 }, // vpermt2ps
    { TTI::SK_PermuteTwoSrc,    MVT::v8i64,  1 }, // vpermt2q
    { TTI::SK_PermuteTwoSrc,    MVT::v16i32, 1 }, // vpermt2d
    { TTI::SK_PermuteTwoSrc,    MVT::v4f64,  1 }, // vpermt2pd
    { TTI::SK_PermuteTwoSrc,    MVT::v8f32,  1 }, // vpermt2ps
    { TTI::SK_PermuteTwoSrc,    MVT::v4i64,  1 }, // vpermt2q
    { TTI::SK_PermuteTwoSrc,    MVT::v8i32,  1 }, // vpermt2d
    { TTI::SK_PermuteTwoSrc,    MVT::v2f64,  1 }, // vpermt2pd
    { TTI::SK_PermuteTwoSrc,    MVT::v4f32,  1 }, // vpermt2ps
    { TTI::SK_PermuteTwoSrc,    MVT::v2i64,  1 }, // vpermt2q
    { TTI::SK_PermuteTwoSrc,    MVT::v4i32,  1 }  // vpermt2d
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX2ShuffleTbl[] = {
    { TTI::SK_Broadcast,        MVT::v4f64,  1 }, // vbroadcastpd
    { TTI::SK_Broadcast,        MVT::v8f32,  1 }, // vbroadcastps
    { TTI::SK_Broadcast,        MVT::v4i64,  1 }, // vpbroadcastq
    { TTI::SK_Broadcast,        MVT::v8i32,  1 }, // vpbroadcastd
    { TTI::SK_Broadcast,        MVT::v16i16, 1 }, // vpbroadcastw
    { TTI::SK_Broadcast,        MVT::v32i8,  1 }, // vpbroadcastb

    { TTI::SK_Reverse,          MVT::v4f64,  1 }, // vpermpd
    { TTI::SK_Reverse,          MVT::v8f32,  1 }, // vpermps
    { TTI::SK_Reverse,          MVT::v4i64,  1 }, // vpermq
    { TTI::SK_Reverse,          MVT::v8i32,  1 }, // vpermd
    { TTI::SK_Reverse,          MVT::v16i16, 2 }, // vperm2i128 + pshufb
    { TTI::SK_Reverse,          MVT::v32i8,  2 }, // vperm2i128 + pshufb

    { TTI::SK_Alternate,        MVT::v16i16, 1 }, // vpblendw
    { TTI::SK_Alternate,        MVT::v32i8,  1 }, // vpblendvb

    { TTI::SK_PermuteSingleSrc, MVT::v4f64,  1 }, // vpermpd
    { TTI::SK_PermuteSingleSrc, MVT::v8f32,  1 }, // vpermps
    { TTI::SK_PermuteSingleSrc, MVT::v4i64,  1 }, // vpermq
    { TTI::SK_PermuteSingleSrc, MVT::v8i32,  1 }, // vpermd
    { TTI::SK_PermuteSingleSrc, MVT::v16i16, 4 }, // vperm2i128 + 2*vpshufb
                                                  // + vpblendvb
    { TTI::SK_PermuteSingleSrc, MVT::v32i8,  4 }, // vperm2i128 + 2*vpshufb
                                                  // + vpblendvb

    { TTI::SK_PermuteTwoSrc,    MVT::v4f64,  3 }, // 2*vpermpd + vblendpd
    { TTI::SK_PermuteTwoSrc,    MVT::v8f32,  3 }, // 2*vpermps + vblendps
    { TTI::SK_PermuteTwoSrc,    MVT::v4i64,  3 }, // 2*vpermq + vpblendd
    { TTI::SK_PermuteTwoSrc,    MVT::v8i32,  3 }, // 2*vpermd + vpblendd
    { TTI::SK_PermuteTwoSrc,    MVT::v16i16, 7 }, // 2*vperm2i128 + 4*vpshufb
                                                  // + vpblendvb
    { TTI::SK_PermuteTwoSrc,    MVT::v32i8,  7 }, // 2*vperm2i128 + 4*vpshufb
                                                  // + vpblendvb
  };

  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry XOPShuffleTbl[] = {
    { TTI::SK_PermuteSingleSrc, MVT::v4f64,  2 }, // vperm2f128 + vpermil2pd
    { TTI::SK_PermuteSingleSrc, MVT::v8f32,  2 }, // vperm2f128 + vpermil2ps
    { TTI::SK_PermuteSingleSrc, MVT::v4i64,  2 }, // vperm2f128 + vpermil2pd
    { TTI::SK_PermuteSingleSrc, MVT::v8i32,  2 }, // vperm2f128 + vpermil2ps
    { TTI::SK_PermuteSingleSrc, MVT::v16i16, 4 }, // vextractf128 + 2*vpperm
                                                  // + vinsertf128
    { TTI::SK_PermuteSingleSrc, MVT::v32i8,  4 }, // vextractf128 + 2*vpperm
                                                  // + vinsertf128

    { TTI::SK_PermuteTwoSrc,    MVT::v16i16, 9 }, // 2*vextractf128 + 6*vpperm
                                                  // + vinsertf128
    { TTI::SK_PermuteTwoSrc,    MVT::v8i16,  1 }, // vpperm
    { TTI::SK_PermuteTwoSrc,    MVT::v32i8,  9 }, // 2*vextractf128 + 6*vpperm
                                                  // + vinsertf128
    { TTI::SK_PermuteTwoSrc,    MVT::v16i8,  1 }, // vpperm
  };

  if (ST->hasXOP())
    if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX1ShuffleTbl[] = {
    { TTI::SK_Broadcast,        MVT::v4f64,  2 }, // vperm2f128 + vpermilpd
    { TTI::SK_Broadcast,        MVT::v8f32,  2 }, // vperm2f128 + vpermilps
    { TTI::SK_Broadcast,        MVT::v4i64,  2 }, // vperm2f128 + vpermilpd
    { TTI::SK_Broadcast,        MVT::v8i32,  2 }, // vperm2f128 + vpermilps
    { TTI::SK_Broadcast,        MVT::v16i16, 3 }, // vpshuflw + vpshufd + vinsertf128
    { TTI::SK_Broadcast,        MVT::v32i8,  2 }, // vpshufb + vinsertf128

    { TTI::SK_Reverse,          MVT::v4f64,  2 }, // vperm2f128 + vpermilpd
    { TTI::SK_Reverse,          MVT::v8f32,  2 }, // vperm2f128 + vpermilps
    { TTI::SK_Reverse,          MVT::v4i64,  2 }, // vperm2f128 + vpermilpd
    { TTI::SK_Reverse,          MVT::v8i32,  2 }, // vperm2f128 + vpermilps
    { TTI::SK_Reverse,          MVT::v16i16, 4 }, // vextractf128 + 2*pshufb
                                                  // + vinsertf128
    { TTI::SK_Reverse,          MVT::v32i8,  4 }, // vextractf128 + 2*pshufb
                                                  // + vinsertf128

    { TTI::SK_Alternate,        MVT::v4i64,  1 }, // vblendpd
    { TTI::SK_Alternate,        MVT::v4f64,  1 }, // vblendpd
    { TTI::SK_Alternate,        MVT::v8i32,  1 }, // vblendps
    { TTI::SK_Alternate,        MVT::v8f32,  1 }, // vblendps
    { TTI::SK_Alternate,        MVT::v16i16, 3 }, // vpand + vpandn + vpor
    { TTI::SK_Alternate,        MVT::v32i8,  3 }, // vpand + vpandn + vpor

    { TTI::SK_PermuteSingleSrc, MVT::v4f64,  3 }, // 2*vperm2f128 + vshufpd
    { TTI::SK_PermuteSingleSrc, MVT::v4i64,  3 }, // 2*vperm2f128 + vshufpd
    { TTI::SK_PermuteSingleSrc, MVT::v8f32,  4 }, // 2*vperm2f128 + 2*vshufps
    { TTI::SK_PermuteSingleSrc, MVT::v8i32,  4 }, // 2*vperm2f128 + 2*vshufps
    { TTI::SK_PermuteSingleSrc, MVT::v16i16, 8 }, // vextractf128 + 4*pshufb
                                                  // + 2*por + vinsertf128
    { TTI::SK_PermuteSingleSrc, MVT::v32i8,  8 }, // vextractf128 + 4*pshufb
                                                  // + 2*por + vinsertf128

    { TTI::SK_PermuteTwoSrc,    MVT::v4f64,   4 }, // 2*vperm2f128 + 2*vshufpd
    { TTI::SK_PermuteTwoSrc,    MVT::v8f32,   4 }, // 2*vperm2f128 + 2*vshufps
    { TTI::SK_PermuteTwoSrc,    MVT::v4i64,   4 }, // 2*vperm2f128 + 2*vshufpd
    { TTI::SK_PermuteTwoSrc,    MVT::v8i32,   4 }, // 2*vperm2f128 + 2*vshufps
    { TTI::SK_PermuteTwoSrc,    MVT::v16i16, 15 }, // 2*vextractf128 + 8*pshufb
                                                   // + 4*por + vinsertf128
    { TTI::SK_PermuteTwoSrc,    MVT::v32i8,  15 }, // 2*vextractf128 + 8*pshufb
                                                   // + 4*por + vinsertf128
  };

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE41ShuffleTbl[] = {
    { TTI::SK_Alternate, MVT::v2i64, 1 }, // pblendw
    { TTI::SK_Alternate, MVT::v2f64, 1 }, // movsd
    { TTI::SK_Alternate, MVT::v4i32, 1 }, // pblendw
    { TTI::SK_Alternate, MVT::v4f32, 1 }, // blendps
    { TTI::SK_Alternate, MVT::v8i16, 1 }, // pblendw
    { TTI::SK_Alternate, MVT::v16i8, 1 }  // pblendvb
  };

  if (ST->hasSSE41())
    if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSSE3ShuffleTbl[] = {
    { TTI::SK_Broadcast,        MVT::v8i16, 1 }, // pshufb
    { TTI::SK_Broadcast,        MVT::v16i8, 1 }, // pshufb

    { TTI::SK_Reverse,          MVT::v8i16, 1 }, // pshufb
    { TTI::SK_Reverse,          MVT::v16i8, 1 }, // pshufb

    { TTI::SK_Alternate,        MVT::v8i16, 3 }, // 2*pshufb + por
    { TTI::SK_Alternate,        MVT::v16i8, 3 }, // 2*pshufb + por

    { TTI::SK_PermuteSingleSrc, MVT::v8i16, 1 }, // pshufb
    { TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 }, // pshufb

    { TTI::SK_PermuteTwoSrc,    MVT::v8i16, 3 }, // 2*pshufb + por
    { TTI::SK_PermuteTwoSrc,    MVT::v16i8, 3 }, // 2*pshufb + por
  };

  if (ST->hasSSSE3())
    if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE2ShuffleTbl[] = {
    { TTI::SK_Broadcast,        MVT::v2f64, 1 }, // shufpd
    { TTI::SK_Broadcast,        MVT::v2i64, 1 }, // pshufd
    { TTI::SK_Broadcast,        MVT::v4i32, 1 }, // pshufd
    { TTI::SK_Broadcast,        MVT::v8i16, 2 }, // pshuflw + pshufd
    { TTI::SK_Broadcast,        MVT::v16i8, 3 }, // unpck + pshuflw + pshufd

    { TTI::SK_Reverse,          MVT::v2f64, 1 }, // shufpd
    { TTI::SK_Reverse,          MVT::v2i64, 1 }, // pshufd
    { TTI::SK_Reverse,          MVT::v4i32, 1 }, // pshufd
    { TTI::SK_Reverse,          MVT::v8i16, 3 }, // pshuflw + pshufhw + pshufd
    { TTI::SK_Reverse,          MVT::v16i8, 9 }, // 2*pshuflw + 2*pshufhw
                                                 // + 2*pshufd + 2*unpck + packus

    { TTI::SK_Alternate,        MVT::v2i64, 1 }, // movsd
    { TTI::SK_Alternate,        MVT::v2f64, 1 }, // movsd
    { TTI::SK_Alternate,        MVT::v4i32, 2 }, // 2*shufps
    { TTI::SK_Alternate,        MVT::v8i16, 3 }, // pand + pandn + por
    { TTI::SK_Alternate,        MVT::v16i8, 3 }, // pand + pandn + por

    { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // shufpd
    { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // pshufd
    { TTI::SK_PermuteSingleSrc, MVT::v4i32, 1 }, // pshufd
    { TTI::SK_PermuteSingleSrc, MVT::v8i16, 5 }, // 2*pshuflw + 2*pshufhw
                                                 // + pshufd/unpck
    { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
                                                  // + 2*pshufd + 2*unpck + 2*packus

    { TTI::SK_PermuteTwoSrc,    MVT::v2f64,  1 }, // shufpd
    { TTI::SK_PermuteTwoSrc,    MVT::v2i64,  1 }, // shufpd
    { TTI::SK_PermuteTwoSrc,    MVT::v4i32,  2 }, // 2*{unpck,movsd,pshufd}
    { TTI::SK_PermuteTwoSrc,    MVT::v8i16,  8 }, // blend+permute
    { TTI::SK_PermuteTwoSrc,    MVT::v16i8, 13 }, // blend+permute
  };

  if (ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE1ShuffleTbl[] = {
    { TTI::SK_Broadcast,        MVT::v4f32, 1 }, // shufps
    { TTI::SK_Reverse,          MVT::v4f32, 1 }, // shufps
    { TTI::SK_Alternate,        MVT::v4f32, 2 }, // 2*shufps
    { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
    { TTI::SK_PermuteTwoSrc,    MVT::v4f32, 2 }, // 2*shufps
  };

  if (ST->hasSSE1())
    if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
}

int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                 const Instruction *I) {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // FIXME: Need a better design of the cost table to handle non-simple types of
  // potential massive combinations (elem_num x src_type x dst_type).

  static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {

  };

  // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
  // 256-bit wide vectors.

  static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {

    // v16i1 -> v16i32 - load + broadcast

  };

  static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {

  };

  static const TypeConversionCostTblEntry AVXConversionTbl[] = {

    // The generic code to compute the scalar overhead is currently broken.
    // Workaround this limitation by estimating the scalarization overhead
    // here. We have roughly 10 instructions per scalar element.
    // Multiply that by the vector width.
    // FIXME: remove that when PR19268 is fixed.

    // This node is expanded into scalarized operations but BasicTTI is overly
    // optimistic estimating its cost. It computes 3 per element (one
    // vector-extract, one scalar conversion and one vector-insert). The
    // problem is that the inserts form a read-modify-write chain so latency
    // should be factored in too. Inflating the cost per element by 1.

  };

  static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {

  };

  static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
    // These are somewhat magic numbers justified by looking at the output of
    // Intel's IACA, running some kernels and making sure when we take
    // legalization into account the throughput will be overestimated.
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },

    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },

  };

  std::pair<int, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
  std::pair<int, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst);

  if (ST->hasSSE2() && !ST->hasAVX()) {
    if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
                                                   LTDest.second, LTSrc.second))
      return LTSrc.first * Entry->Cost;
  }

  EVT SrcTy = TLI->getValueType(DL, Src);
  EVT DstTy = TLI->getValueType(DL, Dst);

  // The function getSimpleVT only handles simple value types.
  if (!SrcTy.isSimple() || !DstTy.isSimple())
    return BaseT::getCastInstrCost(Opcode, Dst, Src);
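
  // Example: a cast involving <3 x i32> has no simple MVT in this LLVM
  // version, so it cannot be matched against the fixed tables and falls
  // through to the generic BaseT cost above.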

  if (ST->hasDQI())
    if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return Entry->Cost;

  if (ST->hasAVX512())
    if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return Entry->Cost;

  if (ST->hasAVX2()) {
    if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return Entry->Cost;
  }

  if (ST->hasAVX()) {
    if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return Entry->Cost;
  }

  if (ST->hasSSE41()) {
    if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return Entry->Cost;
  }

  if (ST->hasSSE2()) {
    if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return Entry->Cost;
  }

  return BaseT::getCastInstrCost(Opcode, Dst, Src, I);
}

int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
                                   const Instruction *I) {
  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);

  MVT MTy = LT.second;

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  static const CostTblEntry SSE2CostTbl[] = {
    { ISD::SETCC, MVT::v2i64,  8 },
    { ISD::SETCC, MVT::v4i32,  1 },
    { ISD::SETCC, MVT::v8i16,  1 },
    { ISD::SETCC, MVT::v16i8,  1 },
  };

  static const CostTblEntry SSE42CostTbl[] = {
    { ISD::SETCC, MVT::v2f64,  1 },
    { ISD::SETCC, MVT::v4f32,  1 },
    { ISD::SETCC, MVT::v2i64,  1 },
  };

  static const CostTblEntry AVX1CostTbl[] = {
    { ISD::SETCC, MVT::v4f64,  1 },
    { ISD::SETCC, MVT::v8f32,  1 },
    // AVX1 does not support 8-wide integer compare.
    { ISD::SETCC, MVT::v4i64,  4 },
    { ISD::SETCC, MVT::v8i32,  4 },
    { ISD::SETCC, MVT::v16i16, 4 },
    { ISD::SETCC, MVT::v32i8,  4 },
  };

  static const CostTblEntry AVX2CostTbl[] = {
    { ISD::SETCC, MVT::v4i64,  1 },
    { ISD::SETCC, MVT::v8i32,  1 },
    { ISD::SETCC, MVT::v16i16, 1 },
    { ISD::SETCC, MVT::v32i8,  1 },
  };

  static const CostTblEntry AVX512CostTbl[] = {
    { ISD::SETCC, MVT::v8i64,  1 },
    { ISD::SETCC, MVT::v16i32, 1 },
    { ISD::SETCC, MVT::v8f64,  1 },
    { ISD::SETCC, MVT::v16f32, 1 },
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasSSE42())
    if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
}

int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
                                      ArrayRef<Type *> Tys, FastMathFlags FMF,
                                      unsigned ScalarizationCostPassed) {
  // Costs should match the codegen from:
  // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
  // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
  // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
  // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
  // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
  static const CostTblEntry AVX512CDCostTbl[] = {
    { ISD::CTLZ,       MVT::v8i64,   1 },
    { ISD::CTLZ,       MVT::v16i32,  1 },
    { ISD::CTLZ,       MVT::v32i16,  8 },
    { ISD::CTLZ,       MVT::v64i8,  20 },
    { ISD::CTLZ,       MVT::v4i64,   1 },
    { ISD::CTLZ,       MVT::v8i32,   1 },
    { ISD::CTLZ,       MVT::v16i16,  4 },
    { ISD::CTLZ,       MVT::v32i8,  10 },
    { ISD::CTLZ,       MVT::v2i64,   1 },
    { ISD::CTLZ,       MVT::v4i32,   1 },
    { ISD::CTLZ,       MVT::v8i16,   4 },
    { ISD::CTLZ,       MVT::v16i8,   4 },
  };
  static const CostTblEntry AVX512BWCostTbl[] = {
    { ISD::BITREVERSE, MVT::v8i64,   5 },
    { ISD::BITREVERSE, MVT::v16i32,  5 },
    { ISD::BITREVERSE, MVT::v32i16,  5 },
    { ISD::BITREVERSE, MVT::v64i8,   5 },
    { ISD::CTLZ,       MVT::v8i64,  23 },
    { ISD::CTLZ,       MVT::v16i32, 22 },
    { ISD::CTLZ,       MVT::v32i16, 18 },
    { ISD::CTLZ,       MVT::v64i8,  17 },
    { ISD::CTPOP,      MVT::v8i64,   7 },
    { ISD::CTPOP,      MVT::v16i32, 11 },
    { ISD::CTPOP,      MVT::v32i16,  9 },
    { ISD::CTPOP,      MVT::v64i8,   6 },
    { ISD::CTTZ,       MVT::v8i64,  10 },
    { ISD::CTTZ,       MVT::v16i32, 14 },
    { ISD::CTTZ,       MVT::v32i16, 12 },
    { ISD::CTTZ,       MVT::v64i8,   9 },
  };
  static const CostTblEntry AVX512CostTbl[] = {
    { ISD::BITREVERSE, MVT::v8i64,  36 },
    { ISD::BITREVERSE, MVT::v16i32, 24 },
    { ISD::CTLZ,       MVT::v8i64,  29 },
    { ISD::CTLZ,       MVT::v16i32, 35 },
    { ISD::CTPOP,      MVT::v8i64,  16 },
    { ISD::CTPOP,      MVT::v16i32, 24 },
    { ISD::CTTZ,       MVT::v8i64,  20 },
    { ISD::CTTZ,       MVT::v16i32, 28 },
  };
  static const CostTblEntry XOPCostTbl[] = {
    { ISD::BITREVERSE, MVT::v4i64,   4 },
    { ISD::BITREVERSE, MVT::v8i32,   4 },
    { ISD::BITREVERSE, MVT::v16i16,  4 },
    { ISD::BITREVERSE, MVT::v32i8,   4 },
    { ISD::BITREVERSE, MVT::v2i64,   1 },
    { ISD::BITREVERSE, MVT::v4i32,   1 },
    { ISD::BITREVERSE, MVT::v8i16,   1 },
    { ISD::BITREVERSE, MVT::v16i8,   1 },
    { ISD::BITREVERSE, MVT::i64,     3 },
    { ISD::BITREVERSE, MVT::i32,     3 },
    { ISD::BITREVERSE, MVT::i16,     3 },
    { ISD::BITREVERSE, MVT::i8,      3 }
  };
  static const CostTblEntry AVX2CostTbl[] = {
    { ISD::BITREVERSE, MVT::v4i64,   5 },
    { ISD::BITREVERSE, MVT::v8i32,   5 },
    { ISD::BITREVERSE, MVT::v16i16,  5 },
    { ISD::BITREVERSE, MVT::v32i8,   5 },
    { ISD::BSWAP,      MVT::v4i64,   1 },
    { ISD::BSWAP,      MVT::v8i32,   1 },
    { ISD::BSWAP,      MVT::v16i16,  1 },
    { ISD::CTLZ,       MVT::v4i64,  23 },
    { ISD::CTLZ,       MVT::v8i32,  18 },
    { ISD::CTLZ,       MVT::v16i16, 14 },
    { ISD::CTLZ,       MVT::v32i8,   9 },
    { ISD::CTPOP,      MVT::v4i64,   7 },
    { ISD::CTPOP,      MVT::v8i32,  11 },
    { ISD::CTPOP,      MVT::v16i16,  9 },
    { ISD::CTPOP,      MVT::v32i8,   6 },
    { ISD::CTTZ,       MVT::v4i64,  10 },
    { ISD::CTTZ,       MVT::v8i32,  14 },
    { ISD::CTTZ,       MVT::v16i16, 12 },
    { ISD::CTTZ,       MVT::v32i8,   9 },
    { ISD::FSQRT,      MVT::f32,     7 }, // Haswell from http://www.agner.org/
    { ISD::FSQRT,      MVT::v4f32,   7 }, // Haswell from http://www.agner.org/
    { ISD::FSQRT,      MVT::v8f32,  14 }, // Haswell from http://www.agner.org/
    { ISD::FSQRT,      MVT::f64,    14 }, // Haswell from http://www.agner.org/
    { ISD::FSQRT,      MVT::v2f64,  14 }, // Haswell from http://www.agner.org/
    { ISD::FSQRT,      MVT::v4f64,  28 }, // Haswell from http://www.agner.org/
  };
  static const CostTblEntry AVX1CostTbl[] = {
    { ISD::BITREVERSE, MVT::v4i64,  12 }, // 2 x 128-bit Op + extract/insert
    { ISD::BITREVERSE, MVT::v8i32,  12 }, // 2 x 128-bit Op + extract/insert
    { ISD::BITREVERSE, MVT::v16i16, 12 }, // 2 x 128-bit Op + extract/insert
    { ISD::BITREVERSE, MVT::v32i8,  12 }, // 2 x 128-bit Op + extract/insert
    { ISD::BSWAP,      MVT::v4i64,   4 },
    { ISD::BSWAP,      MVT::v8i32,   4 },
    { ISD::BSWAP,      MVT::v16i16,  4 },
    { ISD::CTLZ,       MVT::v4i64,  48 }, // 2 x 128-bit Op + extract/insert
    { ISD::CTLZ,       MVT::v8i32,  38 }, // 2 x 128-bit Op + extract/insert
    { ISD::CTLZ,       MVT::v16i16, 30 }, // 2 x 128-bit Op + extract/insert
    { ISD::CTLZ,       MVT::v32i8,  20 }, // 2 x 128-bit Op + extract/insert
    { ISD::CTPOP,      MVT::v4i64,  16 }, // 2 x 128-bit Op + extract/insert
    { ISD::CTPOP,      MVT::v8i32,  24 }, // 2 x 128-bit Op + extract/insert
    { ISD::CTPOP,      MVT::v16i16, 20 }, // 2 x 128-bit Op + extract/insert
    { ISD::CTPOP,      MVT::v32i8,  14 }, // 2 x 128-bit Op + extract/insert
    { ISD::CTTZ,       MVT::v4i64,  22 }, // 2 x 128-bit Op + extract/insert
    { ISD::CTTZ,       MVT::v8i32,  30 }, // 2 x 128-bit Op + extract/insert
    { ISD::CTTZ,       MVT::v16i16, 26 }, // 2 x 128-bit Op + extract/insert
    { ISD::CTTZ,       MVT::v32i8,  20 }, // 2 x 128-bit Op + extract/insert
    { ISD::FSQRT,      MVT::f32,    14 }, // SNB from http://www.agner.org/
    { ISD::FSQRT,      MVT::v4f32,  14 }, // SNB from http://www.agner.org/
    { ISD::FSQRT,      MVT::v8f32,  28 }, // SNB from http://www.agner.org/
    { ISD::FSQRT,      MVT::f64,    21 }, // SNB from http://www.agner.org/
    { ISD::FSQRT,      MVT::v2f64,  21 }, // SNB from http://www.agner.org/
    { ISD::FSQRT,      MVT::v4f64,  43 }, // SNB from http://www.agner.org/
  };
  static const CostTblEntry SSE42CostTbl[] = {
    { ISD::FSQRT, MVT::f32,   18 }, // Nehalem from http://www.agner.org/
    { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
  };
  static const CostTblEntry SSSE3CostTbl[] = {
    { ISD::BITREVERSE, MVT::v2i64,  5 },
    { ISD::BITREVERSE, MVT::v4i32,  5 },
    { ISD::BITREVERSE, MVT::v8i16,  5 },
    { ISD::BITREVERSE, MVT::v16i8,  5 },
    { ISD::BSWAP,      MVT::v2i64,  1 },
    { ISD::BSWAP,      MVT::v4i32,  1 },
    { ISD::BSWAP,      MVT::v8i16,  1 },
    { ISD::CTLZ,       MVT::v2i64, 23 },
    { ISD::CTLZ,       MVT::v4i32, 18 },
    { ISD::CTLZ,       MVT::v8i16, 14 },
    { ISD::CTLZ,       MVT::v16i8,  9 },
    { ISD::CTPOP,      MVT::v2i64,  7 },
    { ISD::CTPOP,      MVT::v4i32, 11 },
    { ISD::CTPOP,      MVT::v8i16,  9 },
    { ISD::CTPOP,      MVT::v16i8,  6 },
    { ISD::CTTZ,       MVT::v2i64, 10 },
    { ISD::CTTZ,       MVT::v4i32, 14 },
    { ISD::CTTZ,       MVT::v8i16, 12 },
    { ISD::CTTZ,       MVT::v16i8,  9 }
  };
  static const CostTblEntry SSE2CostTbl[] = {
    { ISD::BITREVERSE, MVT::v2i64, 29 },
    { ISD::BITREVERSE, MVT::v4i32, 27 },
    { ISD::BITREVERSE, MVT::v8i16, 27 },
    { ISD::BITREVERSE, MVT::v16i8, 20 },
    { ISD::BSWAP,      MVT::v2i64,  7 },
    { ISD::BSWAP,      MVT::v4i32,  7 },
    { ISD::BSWAP,      MVT::v8i16,  7 },
    { ISD::CTLZ,       MVT::v2i64, 25 },
    { ISD::CTLZ,       MVT::v4i32, 26 },
    { ISD::CTLZ,       MVT::v8i16, 20 },
    { ISD::CTLZ,       MVT::v16i8, 17 },
    { ISD::CTPOP,      MVT::v2i64, 12 },
    { ISD::CTPOP,      MVT::v4i32, 15 },
    { ISD::CTPOP,      MVT::v8i16, 13 },
    { ISD::CTPOP,      MVT::v16i8, 10 },
    { ISD::CTTZ,       MVT::v2i64, 14 },
    { ISD::CTTZ,       MVT::v4i32, 18 },
    { ISD::CTTZ,       MVT::v8i16, 16 },
    { ISD::CTTZ,       MVT::v16i8, 13 },
    { ISD::FSQRT,      MVT::f64,   32 }, // Nehalem from http://www.agner.org/
    { ISD::FSQRT,      MVT::v2f64, 32 }, // Nehalem from http://www.agner.org/
  };
  static const CostTblEntry SSE1CostTbl[] = {
    { ISD::FSQRT, MVT::f32,   28 }, // Pentium III from http://www.agner.org/
    { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
  };
  static const CostTblEntry X64CostTbl[] = { // 64-bit targets
    { ISD::BITREVERSE, MVT::i64, 14 }
  };
  static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
    { ISD::BITREVERSE, MVT::i32, 14 },
    { ISD::BITREVERSE, MVT::i16, 14 },
    { ISD::BITREVERSE, MVT::i8,  11 }
  };
1694 
1695  unsigned ISD = ISD::DELETED_NODE;
1696  switch (IID) {
1697  default:
1698  break;
1699  case Intrinsic::bitreverse:
1700  ISD = ISD::BITREVERSE;
1701  break;
1702  case Intrinsic::bswap:
1703  ISD = ISD::BSWAP;
1704  break;
1705  case Intrinsic::ctlz:
1706  ISD = ISD::CTLZ;
1707  break;
1708  case Intrinsic::ctpop:
1709  ISD = ISD::CTPOP;
1710  break;
1711  case Intrinsic::cttz:
1712  ISD = ISD::CTTZ;
1713  break;
1714  case Intrinsic::sqrt:
1715  ISD = ISD::FSQRT;
1716  break;
1717  }
1718 
1719  // Legalize the type.
1720  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
1721  MVT MTy = LT.second;
1722 
1723  // Attempt to lookup cost.
1724  if (ST->hasCDI())
1725  if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
1726  return LT.first * Entry->Cost;
1727 
1728  if (ST->hasBWI())
1729  if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
1730  return LT.first * Entry->Cost;
1731 
1732  if (ST->hasAVX512())
1733  if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
1734  return LT.first * Entry->Cost;
1735 
1736  if (ST->hasXOP())
1737  if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
1738  return LT.first * Entry->Cost;
1739 
1740  if (ST->hasAVX2())
1741  if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
1742  return LT.first * Entry->Cost;
1743 
1744  if (ST->hasAVX())
1745  if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
1746  return LT.first * Entry->Cost;
1747 
1748  if (ST->hasSSE42())
1749  if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
1750  return LT.first * Entry->Cost;
1751 
1752  if (ST->hasSSSE3())
1753  if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
1754  return LT.first * Entry->Cost;
1755 
1756  if (ST->hasSSE2())
1757  if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
1758  return LT.first * Entry->Cost;
1759 
1760  if (ST->hasSSE1())
1761  if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
1762  return LT.first * Entry->Cost;
1763 
1764  if (ST->is64Bit())
1765  if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
1766  return LT.first * Entry->Cost;
1767 
1768  if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
1769  return LT.first * Entry->Cost;
1770 
1771  return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF, ScalarizationCostPassed);
1772 }
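// Editorial note (illustrative sketch, not part of the original source): the
// lookup cascade above means "first matching feature-level table wins",
// scaled by the type-legalization split factor. For example, on an
// SSSE3-only subtarget,
//   %r = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %x)
// legalizes to MVT::v4i32 with LT.first == 1 and hits the SSSE3CostTbl entry
// { ISD::CTPOP, MVT::v4i32, 11 }, giving 1 * 11 = 11. An <8 x i32> ctpop on
// the same subtarget splits into two v4i32 halves (LT.first == 2), giving 22.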
1773 
1774 int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
1775  ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) {
1776  return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF, VF);
1777 }
1778 
1779 int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
1780  assert(Val->isVectorTy() && "This must be a vector type");
1781 
1782  Type *ScalarType = Val->getScalarType();
1783 
1784  if (Index != -1U) {
1785  // Legalize the type.
1786  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
1787 
1788  // This type is legalized to a scalar type.
1789  if (!LT.second.isVector())
1790  return 0;
1791 
1792  // The type may be split. Normalize the index to the new type.
1793  unsigned Width = LT.second.getVectorNumElements();
1794  Index = Index % Width;
1795 
1796  // Floating point scalars are already located in index #0.
1797  if (ScalarType->isFloatingPointTy() && Index == 0)
1798  return 0;
1799  }
1800 
1801  // Add to the base cost if we know that the extracted element of a vector is
1802  // destined to be moved to and used in the integer register file.
1803  int RegisterFileMoveCost = 0;
1804  if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy())
1805  RegisterFileMoveCost = 1;
1806 
1807  return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
1808 }
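// Editorial note (illustrative sketch, not part of the original source): two
// sample queries against the logic above.
//   extractelement <4 x float> %v, i32 0  ; cost 0: a FP scalar already lives
//                                         ; in lane #0 of the vector register.
//   extractelement <4 x i32*> %v, i32 2   ; base cost + 1, since the pointer
//                                         ; moves to the integer register file.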
1809 
1810 int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
1811  unsigned AddressSpace, const Instruction *I) {
1812  // Handle non-power-of-two vectors such as <3 x float>
1813  if (VectorType *VTy = dyn_cast<VectorType>(Src)) {
1814  unsigned NumElem = VTy->getVectorNumElements();
1815 
1816  // Handle a few common cases:
1817  // <3 x float>
1818  if (NumElem == 3 && VTy->getScalarSizeInBits() == 32)
1819  // Cost = 64 bit store + extract + 32 bit store.
1820  return 3;
1821 
1822  // <3 x double>
1823  if (NumElem == 3 && VTy->getScalarSizeInBits() == 64)
1824  // Cost = 128 bit store + unpack + 64 bit store.
1825  return 3;
1826 
1827  // Assume that all other non-power-of-two numbers are scalarized.
1828  if (!isPowerOf2_32(NumElem)) {
1829  int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment,
1830  AddressSpace);
1831  int SplitCost = getScalarizationOverhead(Src, Opcode == Instruction::Load,
1832  Opcode == Instruction::Store);
1833  return NumElem * Cost + SplitCost;
1834  }
1835  }
1836 
1837  // Legalize the type.
1838  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
1839  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
1840  "Invalid Opcode");
1841 
1842  // Each load/store unit costs 1.
1843  int Cost = LT.first * 1;
1844 
1845  // This isn't exactly right. We're using slow unaligned 32-byte accesses as a
1846  // proxy for a double-pumped AVX memory interface such as on Sandybridge.
1847  if (LT.second.getStoreSize() == 32 && ST->isUnalignedMem32Slow())
1848  Cost *= 2;
1849 
1850  return Cost;
1851 }
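// Editorial note (illustrative sketch, not part of the original source): a
// store of <3 x float> returns the hard-coded cost 3 from the
// non-power-of-two path above, while a 32-byte access such as a <8 x float>
// store costs LT.first == 1, doubled to 2 on subtargets where
// isUnalignedMem32Slow() holds.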
1852 
1853 int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
1854  unsigned Alignment,
1855  unsigned AddressSpace) {
1856  VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy);
1857  if (!SrcVTy)
1858  // For a scalar, take the regular cost without a mask.
1859  return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace);
1860 
1861  unsigned NumElem = SrcVTy->getVectorNumElements();
1862  VectorType *MaskTy =
1863  VectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
1864  if ((Opcode == Instruction::Load && !isLegalMaskedLoad(SrcVTy)) ||
1865  (Opcode == Instruction::Store && !isLegalMaskedStore(SrcVTy)) ||
1866  !isPowerOf2_32(NumElem)) {
1867  // Scalarization
1868  int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true);
1869  int ScalarCompareCost = getCmpSelInstrCost(
1870  Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr);
1871  int BranchCost = getCFInstrCost(Instruction::Br);
1872  int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
1873 
1874  int ValueSplitCost = getScalarizationOverhead(
1875  SrcVTy, Opcode == Instruction::Load, Opcode == Instruction::Store);
1876  int MemopCost =
1877  NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
1878  Alignment, AddressSpace);
1879  return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
1880  }
1881 
1882  // Legalize the type.
1883  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy);
1884  auto VT = TLI->getValueType(DL, SrcVTy);
1885  int Cost = 0;
1886  if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
1887  LT.second.getVectorNumElements() == NumElem)
1888  // Promotion requires expand/truncate for data and a shuffle for mask.
1889  Cost += getShuffleCost(TTI::SK_Alternate, SrcVTy, 0, nullptr) +
1890  getShuffleCost(TTI::SK_Alternate, MaskTy, 0, nullptr);
1891 
1892  else if (LT.second.getVectorNumElements() > NumElem) {
1893  VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(),
1894  LT.second.getVectorNumElements());
1895  // Expanding requires filling the mask with zeroes.
1896  Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy);
1897  }
1898  if (!ST->hasAVX512())
1899  return Cost + LT.first * 4; // Each maskmov costs 4
1900 
1901  // AVX-512 masked load/store is cheaper.
1902  return Cost + LT.first;
1903 }
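// Editorial note (illustrative sketch, not part of the original source): for
// a legal masked load of <8 x float> on an AVX2 subtarget the scalarization
// branch is skipped and, with no promotion or expansion shuffles needed, the
// result is LT.first * 4 = 4 (one maskmov); the same query on an AVX-512
// subtarget returns LT.first = 1.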
1904 
1905 int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
1906  const SCEV *Ptr) {
1907  // Address computations in vectorized code with non-consecutive addresses will
1908  // likely result in more instructions compared to scalar code where the
1909  // computation can more often be merged into the index mode. The resulting
1910  // extra micro-ops can significantly decrease throughput.
1911  unsigned NumVectorInstToHideOverhead = 10;
1912 
1913  // The cost of a strided access computation is hidden by the indexing
1914  // modes of X86 regardless of the stride value. We don't believe that there
1915  // is a difference between constant strided access in general and a constant
1916  // stride whose value is less than or equal to 64.
1917  // Even in the case of a (loop invariant) stride whose value is not known at
1918  // compile time, the address computation will not incur more than one extra
1919  // ADD instruction.
1920  if (Ty->isVectorTy() && SE) {
1921  if (!BaseT::isStridedAccess(Ptr))
1922  return NumVectorInstToHideOverhead;
1923  if (!BaseT::getConstantStrideStep(SE, Ptr))
1924  return 1;
1925  }
1926 
1927  return BaseT::getAddressComputationCost(Ty, SE, Ptr);
1928 }
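// Editorial note (illustrative sketch, not part of the original source): the
// logic above returns NumVectorInstToHideOverhead (10) for a vector access
// with no identifiable stride, 1 (a single extra ADD) for a strided access
// whose step is not a compile-time constant, and falls through to the base
// implementation for a constant-stride access.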
1929 
1930 int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
1931  bool IsPairwise) {
1932 
1933  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
1934 
1935  MVT MTy = LT.second;
1936 
1937  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1938  assert(ISD && "Invalid opcode");
1939 
1940  // We use the Intel Architecture Code Analyzer (IACA) to measure the
1941  // throughput and use that measurement as the cost.
1942 
1943  static const CostTblEntry SSE42CostTblPairWise[] = {
1944  { ISD::FADD, MVT::v2f64, 2 },
1945  { ISD::FADD, MVT::v4f32, 4 },
1946  { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
1947  { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5".
1948  { ISD::ADD, MVT::v8i16, 5 },
1949  };
1950 
1951  static const CostTblEntry AVX1CostTblPairWise[] = {
1952  { ISD::FADD, MVT::v4f32, 4 },
1953  { ISD::FADD, MVT::v4f64, 5 },
1954  { ISD::FADD, MVT::v8f32, 7 },
1955  { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
1956  { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5".
1957  { ISD::ADD, MVT::v4i64, 5 }, // The data reported by the IACA tool is "4.8".
1958  { ISD::ADD, MVT::v8i16, 5 },
1959  { ISD::ADD, MVT::v8i32, 5 },
1960  };
1961 
1962  static const CostTblEntry SSE42CostTblNoPairWise[] = {
1963  { ISD::FADD, MVT::v2f64, 2 },
1964  { ISD::FADD, MVT::v4f32, 4 },
1965  { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
1966  { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
1967  { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
1968  };
1969 
1970  static const CostTblEntry AVX1CostTblNoPairWise[] = {
1971  { ISD::FADD, MVT::v4f32, 3 },
1972  { ISD::FADD, MVT::v4f64, 3 },
1973  { ISD::FADD, MVT::v8f32, 4 },
1974  { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
1975  { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "2.8".
1976  { ISD::ADD, MVT::v4i64, 3 },
1977  { ISD::ADD, MVT::v8i16, 4 },
1978  { ISD::ADD, MVT::v8i32, 5 },
1979  };
1980 
1981  if (IsPairwise) {
1982  if (ST->hasAVX())
1983  if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
1984  return LT.first * Entry->Cost;
1985 
1986  if (ST->hasSSE42())
1987  if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
1988  return LT.first * Entry->Cost;
1989  } else {
1990  if (ST->hasAVX())
1991  if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
1992  return LT.first * Entry->Cost;
1993 
1994  if (ST->hasSSE42())
1995  if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
1996  return LT.first * Entry->Cost;
1997  }
1998 
1999  return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise);
2000 }
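// Editorial note (illustrative sketch, not part of the original source): a
// non-pairwise fadd reduction of <8 x float> on an AVX subtarget hits the
// AVX1CostTblNoPairWise entry { ISD::FADD, MVT::v8f32, 4 } and returns
// 1 * 4 = 4; the pairwise form of the same reduction costs 7.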
2001 
2002 int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy,
2003  bool IsPairwise, bool IsUnsigned) {
2004  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
2005 
2006  MVT MTy = LT.second;
2007 
2008  int ISD;
2009  if (ValTy->isIntOrIntVectorTy()) {
2010  ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN;
2011  } else {
2012  assert(ValTy->isFPOrFPVectorTy() &&
2013  "Expected floating point or integer vector type.");
2014  ISD = ISD::FMINNUM;
2015  }
2016 
2017  // We use the Intel Architecture Code Analyzer (IACA) to measure the
2018  // throughput and use that measurement as the cost.
2019 
2020  static const CostTblEntry SSE42CostTblPairWise[] = {
2021  {ISD::FMINNUM, MVT::v2f64, 3},
2022  {ISD::FMINNUM, MVT::v4f32, 2},
2023  {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8"
2024  {ISD::UMIN, MVT::v2i64, 8}, // The data reported by the IACA is "8.6"
2025  {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5"
2026  {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8"
2027  {ISD::SMIN, MVT::v8i16, 2},
2028  {ISD::UMIN, MVT::v8i16, 2},
2029  };
2030 
2031  static const CostTblEntry AVX1CostTblPairWise[] = {
2032  {ISD::FMINNUM, MVT::v4f32, 1},
2033  {ISD::FMINNUM, MVT::v4f64, 1},
2034  {ISD::FMINNUM, MVT::v8f32, 2},
2035  {ISD::SMIN, MVT::v2i64, 3},
2036  {ISD::UMIN, MVT::v2i64, 3},
2037  {ISD::SMIN, MVT::v4i32, 1},
2038  {ISD::UMIN, MVT::v4i32, 1},
2039  {ISD::SMIN, MVT::v8i16, 1},
2040  {ISD::UMIN, MVT::v8i16, 1},
2041  {ISD::SMIN, MVT::v8i32, 3},
2042  {ISD::UMIN, MVT::v8i32, 3},
2043  };
2044 
2045  static const CostTblEntry AVX2CostTblPairWise[] = {
2046  {ISD::SMIN, MVT::v4i64, 2},
2047  {ISD::UMIN, MVT::v4i64, 2},
2048  {ISD::SMIN, MVT::v8i32, 1},
2049  {ISD::UMIN, MVT::v8i32, 1},
2050  {ISD::SMIN, MVT::v16i16, 1},
2051  {ISD::UMIN, MVT::v16i16, 1},
2052  {ISD::SMIN, MVT::v32i8, 2},
2053  {ISD::UMIN, MVT::v32i8, 2},
2054  };
2055 
2056  static const CostTblEntry AVX512CostTblPairWise[] = {
2057  {ISD::FMINNUM, MVT::v8f64, 1},
2058  {ISD::FMINNUM, MVT::v16f32, 2},
2059  {ISD::SMIN, MVT::v8i64, 2},
2060  {ISD::UMIN, MVT::v8i64, 2},
2061  {ISD::SMIN, MVT::v16i32, 1},
2062  {ISD::UMIN, MVT::v16i32, 1},
2063  };
2064 
2065  static const CostTblEntry SSE42CostTblNoPairWise[] = {
2066  {ISD::FMINNUM, MVT::v2f64, 3},
2067  {ISD::FMINNUM, MVT::v4f32, 3},
2068  {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8"
2069  {ISD::UMIN, MVT::v2i64, 9}, // The data reported by the IACA is "8.6"
2070  {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5"
2071  {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8"
2072  {ISD::SMIN, MVT::v8i16, 1}, // The data reported by the IACA is "1.5"
2073  {ISD::UMIN, MVT::v8i16, 2}, // The data reported by the IACA is "1.8"
2074  };
2075 
2076  static const CostTblEntry AVX1CostTblNoPairWise[] = {
2077  {ISD::FMINNUM, MVT::v4f32, 1},
2078  {ISD::FMINNUM, MVT::v4f64, 1},
2079  {ISD::FMINNUM, MVT::v8f32, 1},
2080  {ISD::SMIN, MVT::v2i64, 3},
2081  {ISD::UMIN, MVT::v2i64, 3},
2082  {ISD::SMIN, MVT::v4i32, 1},
2083  {ISD::UMIN, MVT::v4i32, 1},
2084  {ISD::SMIN, MVT::v8i16, 1},
2085  {ISD::UMIN, MVT::v8i16, 1},
2086  {ISD::SMIN, MVT::v8i32, 2},
2087  {ISD::UMIN, MVT::v8i32, 2},
2088  };
2089 
2090  static const CostTblEntry AVX2CostTblNoPairWise[] = {
2091  {ISD::SMIN, MVT::v4i64, 1},
2092  {ISD::UMIN, MVT::v4i64, 1},
2093  {ISD::SMIN, MVT::v8i32, 1},
2094  {ISD::UMIN, MVT::v8i32, 1},
2095  {ISD::SMIN, MVT::v16i16, 1},
2096  {ISD::UMIN, MVT::v16i16, 1},
2097  {ISD::SMIN, MVT::v32i8, 1},
2098  {ISD::UMIN, MVT::v32i8, 1},
2099  };
2100 
2101  static const CostTblEntry AVX512CostTblNoPairWise[] = {
2102  {ISD::FMINNUM, MVT::v8f64, 1},
2103  {ISD::FMINNUM, MVT::v16f32, 2},
2104  {ISD::SMIN, MVT::v8i64, 1},
2105  {ISD::UMIN, MVT::v8i64, 1},
2106  {ISD::SMIN, MVT::v16i32, 1},
2107  {ISD::UMIN, MVT::v16i32, 1},
2108  };
2109 
2110  if (IsPairwise) {
2111  if (ST->hasAVX512())
2112  if (const auto *Entry = CostTableLookup(AVX512CostTblPairWise, ISD, MTy))
2113  return LT.first * Entry->Cost;
2114 
2115  if (ST->hasAVX2())
2116  if (const auto *Entry = CostTableLookup(AVX2CostTblPairWise, ISD, MTy))
2117  return LT.first * Entry->Cost;
2118 
2119  if (ST->hasAVX())
2120  if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
2121  return LT.first * Entry->Cost;
2122 
2123  if (ST->hasSSE42())
2124  if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
2125  return LT.first * Entry->Cost;
2126  } else {
2127  if (ST->hasAVX512())
2128  if (const auto *Entry =
2129  CostTableLookup(AVX512CostTblNoPairWise, ISD, MTy))
2130  return LT.first * Entry->Cost;
2131 
2132  if (ST->hasAVX2())
2133  if (const auto *Entry = CostTableLookup(AVX2CostTblNoPairWise, ISD, MTy))
2134  return LT.first * Entry->Cost;
2135 
2136  if (ST->hasAVX())
2137  if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
2138  return LT.first * Entry->Cost;
2139 
2140  if (ST->hasSSE42())
2141  if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
2142  return LT.first * Entry->Cost;
2143  }
2144 
2145  return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned);
2146 }
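// Editorial note (illustrative sketch, not part of the original source): a
// non-pairwise unsigned-min reduction of <8 x i32> on an AVX2 subtarget hits
// the AVX2CostTblNoPairWise entry { ISD::UMIN, MVT::v8i32, 1 }; the same
// query on an AVX-only subtarget costs 2 via AVX1CostTblNoPairWise.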
2147 
2148 /// \brief Calculate the cost of materializing a 64-bit value. This helper
2149 /// method might only calculate a fraction of a larger immediate. Therefore it
2150 /// is valid to return a cost of ZERO.
2151 int X86TTIImpl::getIntImmCost(int64_t Val) {
2152  if (Val == 0)
2153  return TTI::TCC_Free;
2154 
2155  if (isInt<32>(Val))
2156  return TTI::TCC_Basic;
2157 
2158  return 2 * TTI::TCC_Basic;
2159 }
2160 
2161 int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
2162  assert(Ty->isIntegerTy());
2163 
2164  unsigned BitSize = Ty->getPrimitiveSizeInBits();
2165  if (BitSize == 0)
2166  return ~0U;
2167 
2168  // Never hoist constants larger than 128bit, because this might lead to
2169  // incorrect code generation or assertions in codegen.
2170  // Fixme: Create a cost model for types larger than i128 once the codegen
2171  // issues have been fixed.
2172  if (BitSize > 128)
2173  return TTI::TCC_Free;
2174 
2175  if (Imm == 0)
2176  return TTI::TCC_Free;
2177 
2178  // Sign-extend all constants to a multiple of 64-bit.
2179  APInt ImmVal = Imm;
2180  if (BitSize & 0x3f)
2181  ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
2182 
2183  // Split the constant into 64-bit chunks and calculate the cost for each
2184  // chunk.
2185  int Cost = 0;
2186  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
2187  APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
2188  int64_t Val = Tmp.getSExtValue();
2189  Cost += getIntImmCost(Val);
2190  }
2191  // We need at least one instruction to materialize the constant.
2192  return std::max(1, Cost);
2193 }
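// Editorial note (illustrative sketch, not part of the original source): for
// an i128 immediate equal to 1 << 80, the loop above inspects two 64-bit
// chunks: the low chunk is 0 (TCC_Free) and the high chunk is 0x10000, which
// fits in 32 bits (TCC_Basic), so the total is max(1, 0 + 1) = 1.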
2194 
2195 int X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
2196  Type *Ty) {
2197  assert(Ty->isIntegerTy());
2198 
2199  unsigned BitSize = Ty->getPrimitiveSizeInBits();
2200  // There is no cost model for constants with a bit size of 0. Return TCC_Free
2201  // here, so that constant hoisting will ignore this constant.
2202  if (BitSize == 0)
2203  return TTI::TCC_Free;
2204 
2205  unsigned ImmIdx = ~0U;
2206  switch (Opcode) {
2207  default:
2208  return TTI::TCC_Free;
2209  case Instruction::GetElementPtr:
2210  // Always hoist the base address of a GetElementPtr. This prevents the
2211  // creation of new constants for every base constant that gets constant
2212  // folded with the offset.
2213  if (Idx == 0)
2214  return 2 * TTI::TCC_Basic;
2215  return TTI::TCC_Free;
2216  case Instruction::Store:
2217  ImmIdx = 0;
2218  break;
2219  case Instruction::ICmp:
2220  // This is an imperfect hack to prevent constant hoisting of
2221  // compares that might be trying to check if a 64-bit value fits in
2222  // 32-bits. The backend can optimize these cases using a right shift by 32.
2223  // Ideally we would check the compare predicate here. There also other
2224  // similar immediates the backend can use shifts for.
2225  if (Idx == 1 && Imm.getBitWidth() == 64) {
2226  uint64_t ImmVal = Imm.getZExtValue();
2227  if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
2228  return TTI::TCC_Free;
2229  }
2230  ImmIdx = 1;
2231  break;
2232  case Instruction::And:
2233  // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
2234  // by using a 32-bit operation with implicit zero extension. Detect such
2235  // immediates here as the normal path expects bit 31 to be sign extended.
2236  if (Idx == 1 && Imm.getBitWidth() == 64 && isUInt<32>(Imm.getZExtValue()))
2237  return TTI::TCC_Free;
2238  LLVM_FALLTHROUGH;
2239  case Instruction::Add:
2240  case Instruction::Sub:
2241  case Instruction::Mul:
2242  case Instruction::UDiv:
2243  case Instruction::SDiv:
2244  case Instruction::URem:
2245  case Instruction::SRem:
2246  case Instruction::Or:
2247  case Instruction::Xor:
2248  ImmIdx = 1;
2249  break;
2250  // Always return TCC_Free for the shift value of a shift instruction.
2251  case Instruction::Shl:
2252  case Instruction::LShr:
2253  case Instruction::AShr:
2254  if (Idx == 1)
2255  return TTI::TCC_Free;
2256  break;
2257  case Instruction::Trunc:
2258  case Instruction::ZExt:
2259  case Instruction::SExt:
2260  case Instruction::IntToPtr:
2261  case Instruction::PtrToInt:
2262  case Instruction::BitCast:
2263  case Instruction::PHI:
2264  case Instruction::Call:
2265  case Instruction::Select:
2266  case Instruction::Ret:
2267  case Instruction::Load:
2268  break;
2269  }
2270 
2271  if (Idx == ImmIdx) {
2272  int NumConstants = (BitSize + 63) / 64;
2273  int Cost = X86TTIImpl::getIntImmCost(Imm, Ty);
2274  return (Cost <= NumConstants * TTI::TCC_Basic)
2275  ? static_cast<int>(TTI::TCC_Free)
2276  : Cost;
2277  }
2278 
2279  return X86TTIImpl::getIntImmCost(Imm, Ty);
2280 }
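// Editorial note (illustrative sketch, not part of the original source):
// sample results of the opcode-based filtering above:
//   shl i64 %x, 63                ; shift amount (Idx == 1) -> TCC_Free
//   and i64 %x, 4294967295        ; 32-bit mask, zero-extendable -> TCC_Free
//   icmp ult i64 %x, 4294967296   ; special-cased 0x100000000 -> TCC_Free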
2281 
2282 int X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
2283  Type *Ty) {
2284  assert(Ty->isIntegerTy());
2285 
2286  unsigned BitSize = Ty->getPrimitiveSizeInBits();
2287  // There is no cost model for constants with a bit size of 0. Return TCC_Free
2288  // here, so that constant hoisting will ignore this constant.
2289  if (BitSize == 0)
2290  return TTI::TCC_Free;
2291 
2292  switch (IID) {
2293  default:
2294  return TTI::TCC_Free;
2295  case Intrinsic::sadd_with_overflow:
2296  case Intrinsic::uadd_with_overflow:
2297  case Intrinsic::ssub_with_overflow:
2298  case Intrinsic::usub_with_overflow:
2299  case Intrinsic::smul_with_overflow:
2300  case Intrinsic::umul_with_overflow:
2301  if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
2302  return TTI::TCC_Free;
2303  break;
2304  case Intrinsic::experimental_stackmap:
2305  if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
2306  return TTI::TCC_Free;
2307  break;
2308  case Intrinsic::experimental_patchpoint_void:
2309  case Intrinsic::experimental_patchpoint_i64:
2310  if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
2311  return TTI::TCC_Free;
2312  break;
2313  }
2314  return X86TTIImpl::getIntImmCost(Imm, Ty);
2315 }
2316 
2317 unsigned X86TTIImpl::getUserCost(const User *U,
2318  ArrayRef<const Value *> Operands) {
2319  if (isa<StoreInst>(U)) {
2320  Value *Ptr = U->getOperand(1);
2321  // Store instruction with index and scale costs 2 Uops.
2322  // Check the preceding GEP to identify non-const indices.
2323  if (auto GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
2324  if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
2325  return TTI::TCC_Basic * 2;
2326  }
2327  return TTI::TCC_Basic;
2328  }
2329  return BaseT::getUserCost(U, Operands);
2330 }
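// Editorial note (illustrative sketch, not part of the original source): a
// store whose address is a GEP with any non-constant index is charged
// TTI::TCC_Basic * 2 above (index and scale consume an extra uop); a store
// through a GEP with all-constant indices, or with no GEP at all, costs
// TTI::TCC_Basic.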
2331 
2332 // Return an average cost of a Gather / Scatter instruction; may be improved later.
2333 int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
2334  unsigned Alignment, unsigned AddressSpace) {
2335 
2336  assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
2337  unsigned VF = SrcVTy->getVectorNumElements();
2338 
2339  // Try to reduce the index size from 64 bits (the default for GEP)
2340  // to 32 bits. This is essential for VF 16: if the index can't be reduced
2341  // to 32 bits, the operation will use 16 x 64-bit indices, which do not fit
2342  // in a zmm register and need to be split. Also check that the base pointer
2343  // is the same for all lanes, and that there's at most one variable index.
2344  auto getIndexSizeInBits = [](Value *Ptr, const DataLayout& DL) {
2345  unsigned IndexSize = DL.getPointerSizeInBits();
2346  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
2347  if (IndexSize < 64 || !GEP)
2348  return IndexSize;
2349 
2350  unsigned NumOfVarIndices = 0;
2351  Value *Ptrs = GEP->getPointerOperand();
2352  if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
2353  return IndexSize;
2354  for (unsigned i = 1; i < GEP->getNumOperands(); ++i) {
2355  if (isa<Constant>(GEP->getOperand(i)))
2356  continue;
2357  Type *IndxTy = GEP->getOperand(i)->getType();
2358  if (IndxTy->isVectorTy())
2359  IndxTy = IndxTy->getVectorElementType();
2360  if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
2361  !isa<SExtInst>(GEP->getOperand(i))) ||
2362  ++NumOfVarIndices > 1)
2363  return IndexSize; // 64
2364  }
2365  return (unsigned)32;
2366  };
2367 
2368 
2369  // Trying to reduce IndexSize to 32 bits for vector 16.
2370  // By default the IndexSize is equal to pointer size.
2371  unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
2372  ? getIndexSizeInBits(Ptr, DL)
2373  : DL.getPointerSizeInBits();
2374 
2375  Type *IndexVTy = VectorType::get(IntegerType::get(SrcVTy->getContext(),
2376  IndexSize), VF);
2377  std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy);
2378  std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy);
2379  int SplitFactor = std::max(IdxsLT.first, SrcLT.first);
2380  if (SplitFactor > 1) {
2381  // Handle splitting of vector of pointers
2382  Type *SplitSrcTy = VectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
2383  return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment,
2384  AddressSpace);
2385  }
2386 
2387  // The gather / scatter cost is given by Intel architects. It is a rough
2388  // number since we are looking at one instruction at a time.
2389  const int GSOverhead = (Opcode == Instruction::Load)
2390  ? ST->getGatherOverhead()
2391  : ST->getScatterOverhead();
2392  return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
2393  Alignment, AddressSpace);
2394 }
2395 
2396 /// Return the cost of full scalarization of gather / scatter operation.
2397 ///
2398 /// Opcode - Load or Store instruction.
2399 /// SrcVTy - The type of the data vector that should be gathered or scattered.
2400 /// VariableMask - The mask is non-constant at compile time.
2401 /// Alignment - Alignment for one element.
2402 /// AddressSpace - pointer[s] address space.
2403 ///
2404 int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
2405  bool VariableMask, unsigned Alignment,
2406  unsigned AddressSpace) {
2407  unsigned VF = SrcVTy->getVectorNumElements();
2408 
2409  int MaskUnpackCost = 0;
2410  if (VariableMask) {
2411  VectorType *MaskTy =
2412  VectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
2413  MaskUnpackCost = getScalarizationOverhead(MaskTy, false, true);
2414  int ScalarCompareCost =
2415  getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()),
2416  nullptr);
2417  int BranchCost = getCFInstrCost(Instruction::Br);
2418  MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
2419  }
2420 
2421  // The cost of the scalar loads/stores.
2422  int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
2423  Alignment, AddressSpace);
2424 
2425  int InsertExtractCost = 0;
2426  if (Opcode == Instruction::Load)
2427  for (unsigned i = 0; i < VF; ++i)
2428  // Add the cost of inserting each scalar load into the vector
2429  InsertExtractCost +=
2430  getVectorInstrCost(Instruction::InsertElement, SrcVTy, i);
2431  else
2432  for (unsigned i = 0; i < VF; ++i)
2433  // Add the cost of extracting each element out of the data vector
2434  InsertExtractCost +=
2435  getVectorInstrCost(Instruction::ExtractElement, SrcVTy, i);
2436 
2437  return MemoryOpCost + MaskUnpackCost + InsertExtractCost;
2438 }
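// Editorial note (illustrative sketch, not part of the original source): for
// a scalarized masked gather of <4 x float> the total above decomposes into
// 4 scalar loads, the mask-unpack overhead (mask extraction plus 4
// compare-and-branch pairs when the mask is variable), and 4 insertelement
// operations to rebuild the result vector.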
2439 
2440 /// Calculate the cost of Gather / Scatter operation
2441 int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
2442  Value *Ptr, bool VariableMask,
2443  unsigned Alignment) {
2444  assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
2445  unsigned VF = SrcVTy->getVectorNumElements();
2446  PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
2447  if (!PtrTy && Ptr->getType()->isVectorTy())
2448  PtrTy = dyn_cast<PointerType>(Ptr->getType()->getVectorElementType());
2449  assert(PtrTy && "Unexpected type for Ptr argument");
2450  unsigned AddressSpace = PtrTy->getAddressSpace();
2451 
2452  bool Scalarize = false;
2453  if ((Opcode == Instruction::Load && !isLegalMaskedGather(SrcVTy)) ||
2454  (Opcode == Instruction::Store && !isLegalMaskedScatter(SrcVTy)))
2455  Scalarize = true;
2456  // Gather / Scatter for vectors of 2 elements is not profitable on KNL / SKX.
2457  // A vector-4 gather/scatter instruction does not exist on KNL.
2458  // We could extend it to 8 elements, but zeroing the upper bits of
2459  // the mask vector will add more instructions. Right now we give the scalar
2460  // cost of vector-4 for KNL. TODO: Check whether the gather/scatter instruction
2461  // is better in the VariableMask case.
2462  if (ST->hasAVX512() && (VF == 2 || (VF == 4 && !ST->hasVLX())))
2463  Scalarize = true;
2464 
2465  if (Scalarize)
2466  return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment,
2467  AddressSpace);
2468 
2469  return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
2470 }
2471 
2472 bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
2473  TargetTransformInfo::LSRCost &C2) {
2474  // X86 specific here are "instruction number 1st priority".
2475  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
2476  C1.NumIVMuls, C1.NumBaseAdds,
2477  C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
2478  std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
2479  C2.NumIVMuls, C2.NumBaseAdds,
2480  C2.ScaleCost, C2.ImmCost, C2.SetupCost);
2481 }
2482 
2483 bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
2484  // The backend can't handle a single element vector.
2485  if (isa<VectorType>(DataTy) && DataTy->getVectorNumElements() == 1)
2486  return false;
2487  Type *ScalarTy = DataTy->getScalarType();
2488  int DataWidth = isa<PointerType>(ScalarTy) ?
2489  DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();
2490 
2491  return ((DataWidth == 32 || DataWidth == 64) && ST->hasAVX()) ||
2492  ((DataWidth == 8 || DataWidth == 16) && ST->hasBWI());
2493 }
2494 
2495 bool X86TTIImpl::isLegalMaskedStore(Type *DataType) {
2496  return isLegalMaskedLoad(DataType);
2497 }
2498 
2499 bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
2500  // This function is called now in two cases: from the Loop Vectorizer
2501  // and from the Scalarizer.
2502  // When the Loop Vectorizer asks about legality of the feature,
2503  // the vectorization factor is not calculated yet. The Loop Vectorizer
2504  // sends a scalar type and the decision is based on the width of the
2505  // scalar element.
2506  // Later on, the cost model will estimate usage of this intrinsic based on
2507  // the vector type.
2508  // The Scalarizer asks again about legality. It sends a vector type.
2509  // In this case we can reject non-power-of-2 vectors.
2510  // We also reject single element vectors as the type legalizer can't
2511  // scalarize it.
2512  if (isa<VectorType>(DataTy)) {
2513  unsigned NumElts = DataTy->getVectorNumElements();
2514  if (NumElts == 1 || !isPowerOf2_32(NumElts))
2515  return false;
2516  }
2517  Type *ScalarTy = DataTy->getScalarType();
2518  int DataWidth = isa<PointerType>(ScalarTy) ?
2519  DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();
2520 
2521  // AVX-512 and Skylake AVX2 allow gather and scatter.
2522  return (DataWidth == 32 || DataWidth == 64) && (ST->hasAVX512() ||
2523  (ST->hasAVX2() && ST->hasFastGather()));
2524 }
2525 
2526 bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) {
2527  // AVX2 doesn't support scatter.
2528  if (!ST->hasAVX512())
2529  return false;
2530  return isLegalMaskedGather(DataType);
2531 }
2532 
2533 bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
2534  EVT VT = TLI->getValueType(DL, DataType);
2535  return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
2536 }
2537 
2538 bool X86TTIImpl::areInlineCompatible(const Function *Caller,
2539  const Function *Callee) const {
2540  const TargetMachine &TM = getTLI()->getTargetMachine();
2541 
2542  // Work this as a subsetting of subtarget features.
2543  const FeatureBitset &CallerBits =
2544  TM.getSubtargetImpl(*Caller)->getFeatureBits();
2545  const FeatureBitset &CalleeBits =
2546  TM.getSubtargetImpl(*Callee)->getFeatureBits();
2547 
2548  // FIXME: This is likely too limiting as it will include subtarget features
2549  // that we might not care about for inlining, but it is conservatively
2550  // correct.
2551  return (CallerBits & CalleeBits) == CalleeBits;
2552 }
2553 
2554 const TTI::MemCmpExpansionOptions *
2555 X86TTIImpl::enableMemCmpExpansion(bool IsZeroCmp) const {
2556  // Only enable vector loads for equality comparison.
2557  // Right now the vector version is not as fast, see #33329.
2558  static const auto ThreeWayOptions = [this]() {
2559  TTI::MemCmpExpansionOptions Options;
2560  if (ST->is64Bit()) {
2561  Options.LoadSizes.push_back(8);
2562  }
2563  Options.LoadSizes.push_back(4);
2564  Options.LoadSizes.push_back(2);
2565  Options.LoadSizes.push_back(1);
2566  return Options;
2567  }();
2568  static const auto EqZeroOptions = [this]() {
2569  TTI::MemCmpExpansionOptions Options;
2570  // TODO: enable AVX512 when the DAG is ready.
2571  // if (ST->hasAVX512()) Options.LoadSizes.push_back(64);
2572  if (ST->hasAVX2()) Options.LoadSizes.push_back(32);
2573  if (ST->hasSSE2()) Options.LoadSizes.push_back(16);
2574  if (ST->is64Bit()) {
2575  Options.LoadSizes.push_back(8);
2576  }
2577  Options.LoadSizes.push_back(4);
2578  Options.LoadSizes.push_back(2);
2579  Options.LoadSizes.push_back(1);
2580  return Options;
2581  }();
2582  return IsZeroCmp ? &EqZeroOptions : &ThreeWayOptions;
2583 }
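// Editorial note (illustrative sketch, not part of the original source): with
// the EqZeroOptions above, an equality test such as memcmp(a, b, 16) == 0 on
// an SSE2 subtarget can be expanded using a single 16-byte vector load per
// operand, while the three-way (ordered) variant is limited to scalar loads
// of 8/4/2/1 bytes.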
2584 
2585 bool X86TTIImpl::enableInterleavedAccessVectorization() {
2586  // TODO: We expect this to be beneficial regardless of arch,
2587  // but there are currently some unexplained performance artifacts on Atom.
2588  // As a temporary solution, disable on Atom.
2589  return !(ST->isAtom());
2590 }
2591 
2592 // Get estimation for interleaved load/store operations for AVX2.
2593 // \p Factor is the interleaved-access factor (stride) - number of
2594 // (interleaved) elements in the group.
2595 // \p Indices contains the indices for a strided load: when the
2596 // interleaved load has gaps they indicate which elements are used.
2597 // If Indices is empty (or if the number of indices is equal to the size
2598 // of the interleaved-access as given in \p Factor) the access has no gaps.
2599 //
2600 // As opposed to AVX-512, AVX2 does not have generic shuffles that allow
2601 // computing the cost using a generic formula as a function of generic
2602 // shuffles. We therefore use a lookup table instead, filled according to
2603 // the instruction sequences that codegen currently generates.
2604 int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
2605  unsigned Factor,
2606  ArrayRef<unsigned> Indices,
2607  unsigned Alignment,
2608  unsigned AddressSpace) {
2609 
2610  // We currently support only fully-interleaved groups, with no gaps.
2611  // TODO: Support also strided loads (interleaved-groups with gaps).
2612  if (Indices.size() && Indices.size() != Factor)
2613  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
2614  Alignment, AddressSpace);
2615 
2616  // VecTy for interleave memop is <VF*Factor x Elt>.
2617  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
2618  // VecTy = <12 x i32>.
2619  MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
2620 
2621  // This function can be called with VecTy=<6xi128>, Factor=3, in which case
2622  // the VF=2, while v2i128 is an unsupported MVT vector type
2623  // (see MachineValueType.h::getVectorVT()).
2624  if (!LegalVT.isVector())
2625  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
2626  Alignment, AddressSpace);
2627 
2628  unsigned VF = VecTy->getVectorNumElements() / Factor;
2629  Type *ScalarTy = VecTy->getVectorElementType();
2630 
2631  // Calculate the number of memory operations (NumOfMemOps) required
2632  // to load/store the VecTy.
2633  unsigned VecTySize = DL.getTypeStoreSize(VecTy);
2634  unsigned LegalVTSize = LegalVT.getStoreSize();
2635  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
2636 
2637  // Get the cost of one memory operation.
2638  Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
2639  LegalVT.getVectorNumElements());
2640  unsigned MemOpCost =
2641  getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
2642 
2643  VectorType *VT = VectorType::get(ScalarTy, VF);
2644  EVT ETy = TLI->getValueType(DL, VT);
2645  if (!ETy.isSimple())
2646  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
2647  Alignment, AddressSpace);
2648 
2649  // TODO: Complete for other data-types and strides.
2650  // Each combination of Stride, ElementTy and VF results in a different
2651  // sequence; The cost tables are therefore accessed with:
2652  // Factor (stride) and VectorType=VFxElemType.
2653  // The Cost accounts only for the shuffle sequence;
2654  // The cost of the loads/stores is accounted for separately.
2655  //
2656  static const CostTblEntry AVX2InterleavedLoadTbl[] = {
2657  { 2, MVT::v4i64, 6 }, //(load 8i64 and) deinterleave into 2 x 4i64
2658  { 2, MVT::v4f64, 6 }, //(load 8f64 and) deinterleave into 2 x 4f64
2659 
2660  { 3, MVT::v2i8, 10 }, //(load 6i8 and) deinterleave into 3 x 2i8
2661  { 3, MVT::v4i8, 4 }, //(load 12i8 and) deinterleave into 3 x 4i8
2662  { 3, MVT::v8i8, 9 }, //(load 24i8 and) deinterleave into 3 x 8i8
2663  { 3, MVT::v16i8, 11}, //(load 48i8 and) deinterleave into 3 x 16i8
2664  { 3, MVT::v32i8, 13}, //(load 96i8 and) deinterleave into 3 x 32i8
2665  { 3, MVT::v8f32, 17 }, //(load 24f32 and) deinterleave into 3 x 8f32
2666 
2667  { 4, MVT::v2i8, 12 }, //(load 8i8 and) deinterleave into 4 x 2i8
2668  { 4, MVT::v4i8, 4 }, //(load 16i8 and) deinterleave into 4 x 4i8
2669  { 4, MVT::v8i8, 20 }, //(load 32i8 and) deinterleave into 4 x 8i8
2670  { 4, MVT::v16i8, 39 }, //(load 64i8 and) deinterleave into 4 x 16i8
2671  { 4, MVT::v32i8, 80 }, //(load 128i8 and) deinterleave into 4 x 32i8
2672 
2673  { 8, MVT::v8f32, 40 } //(load 64f32 and) deinterleave into 8 x 8f32
2674  };
2675 
2676  static const CostTblEntry AVX2InterleavedStoreTbl[] = {
2677  { 2, MVT::v4i64, 6 }, //interleave into 2 x 4i64 into 8i64 (and store)
2678  { 2, MVT::v4f64, 6 }, //interleave into 2 x 4f64 into 8f64 (and store)
2679 
2680  { 3, MVT::v2i8, 7 }, //interleave 3 x 2i8 into 6i8 (and store)
2681  { 3, MVT::v4i8, 8 }, //interleave 3 x 4i8 into 12i8 (and store)
2682  { 3, MVT::v8i8, 11 }, //interleave 3 x 8i8 into 24i8 (and store)
2683  { 3, MVT::v16i8, 11 }, //interleave 3 x 16i8 into 48i8 (and store)
2684  { 3, MVT::v32i8, 13 }, //interleave 3 x 32i8 into 96i8 (and store)
2685 
2686  { 4, MVT::v2i8, 12 }, //interleave 4 x 2i8 into 8i8 (and store)
2687  { 4, MVT::v4i8, 9 }, //interleave 4 x 4i8 into 16i8 (and store)
2688  { 4, MVT::v8i8, 10 }, //interleave 4 x 8i8 into 32i8 (and store)
2689  { 4, MVT::v16i8, 10 }, //interleave 4 x 16i8 into 64i8 (and store)
2690  { 4, MVT::v32i8, 12 } //interleave 4 x 32i8 into 128i8 (and store)
2691  };
2692 
2693  if (Opcode == Instruction::Load) {
2694  if (const auto *Entry =
2695  CostTableLookup(AVX2InterleavedLoadTbl, Factor, ETy.getSimpleVT()))
2696  return NumOfMemOps * MemOpCost + Entry->Cost;
2697  } else {
2698  assert(Opcode == Instruction::Store &&
2699  "Expected Store Instruction at this point");
2700  if (const auto *Entry =
2701  CostTableLookup(AVX2InterleavedStoreTbl, Factor, ETy.getSimpleVT()))
2702  return NumOfMemOps * MemOpCost + Entry->Cost;
2703  }
2704 
2705  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
2706  Alignment, AddressSpace);
2707 }
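// Editorial note (illustrative sketch, not part of the original source): a
// factor-3 interleaved load of <48 x i8> (VF = 16) legalizes to two 32-byte
// memory operations and hits the AVX2InterleavedLoadTbl entry
// { 3, MVT::v16i8, 11 }, so the total is 2 * MemOpCost + 11.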
2708 
2709 // Get estimation for interleaved load/store operations and strided load.
2710 // \p Indices contains indices for strided load.
2711 // \p Factor - the factor of interleaving.
2712 // AVX-512 provides 3-src shuffles that significantly reduce the cost.
2713 int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
2714  unsigned Factor,
2715  ArrayRef<unsigned> Indices,
2716  unsigned Alignment,
2717  unsigned AddressSpace) {
2718 
2719  // VecTy for interleave memop is <VF*Factor x Elt>.
2720  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
2721  // VecTy = <12 x i32>.
2722 
2723  // Calculate the number of memory operations (NumOfMemOps) required
2724  // to load/store the VecTy.
2725  MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
2726  unsigned VecTySize = DL.getTypeStoreSize(VecTy);
2727  unsigned LegalVTSize = LegalVT.getStoreSize();
2728  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
2729 
2730  // Get the cost of one memory operation.
2731  Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
2732  LegalVT.getVectorNumElements());
2733  unsigned MemOpCost =
2734  getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
2735 
2736  unsigned VF = VecTy->getVectorNumElements() / Factor;
2737  MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);
2738 
2739  if (Opcode == Instruction::Load) {
2740  // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
2741  // contain the cost of the optimized shuffle sequence that the
2742  // X86InterleavedAccess pass will generate.
2743  // The cost of loads and stores are computed separately from the table.
2744 
2745  // X86InterleavedAccess support only the following interleaved-access group.
2746  static const CostTblEntry AVX512InterleavedLoadTbl[] = {
2747  {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8
2748  {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8
2749  {3, MVT::v64i8, 22}, //(load 192i8 and) deinterleave into 3 x 64i8
2750  };
2751 
2752  if (const auto *Entry =
2753  CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
2754  return NumOfMemOps * MemOpCost + Entry->Cost;
2755  // If an entry does not exist, fall back to the default implementation.
2756 
2757  // The kind of shuffle depends on the number of loaded values.
2758  // If we load the entire data in one register, we can use a 1-src shuffle.
2759  // Otherwise, we'll merge 2 sources in each operation.
2760  TTI::ShuffleKind ShuffleKind =
2761  (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
2762 
2763  unsigned ShuffleCost =
2764  getShuffleCost(ShuffleKind, SingleMemOpTy, 0, nullptr);
2765 
2766  unsigned NumOfLoadsInInterleaveGrp =
2767  Indices.size() ? Indices.size() : Factor;
2768  Type *ResultTy = VectorType::get(VecTy->getVectorElementType(),
2769  VecTy->getVectorNumElements() / Factor);
2770  unsigned NumOfResults =
2771  getTLI()->getTypeLegalizationCost(DL, ResultTy).first *
2772  NumOfLoadsInInterleaveGrp;
2773 
2774  // About half of the loads may be folded into shuffles when we have only
2775  // one result. If we have more than one result, we do not fold loads at all.
2776  unsigned NumOfUnfoldedLoads =
2777  NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
2778 
2779  // Get a number of shuffle operations per result.
2780  unsigned NumOfShufflesPerResult =
2781  std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
2782 
2783  // A two-source shuffle (SK_PermuteTwoSrc) clobbers one of its source operands.
2784  // When we have more than one destination, we need additional instructions
2785  // to keep the sources intact.
2786  unsigned NumOfMoves = 0;
2787  if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
2788  NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
2789 
2790  int Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
2791  NumOfUnfoldedLoads * MemOpCost + NumOfMoves;
2792 
2793  return Cost;
2794  }
2795 
2796  // Store.
2797  assert(Opcode == Instruction::Store &&
2798  "Expected Store Instruction at this point");
2799  // X86InterleavedAccess support only the following interleaved-access group.
2800  static const CostTblEntry AVX512InterleavedStoreTbl[] = {
2801  {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
2802  {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
2803  {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 192i8 (and store)
2804 
2805  {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store)
2806  {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
2807  {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
2808  {4, MVT::v64i8, 24} // interleave 4 x 64i8 into 256i8 (and store)
2809  };
2810 
2811  if (const auto *Entry =
2812  CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
2813  return NumOfMemOps * MemOpCost + Entry->Cost;
2814  // If an entry does not exist, fall back to the default implementation.
2815 
2816  // There are no strided stores at the moment, and a store can't be folded
2817  // into a shuffle.
2818  unsigned NumOfSources = Factor; // The number of values to be merged.
2819  unsigned ShuffleCost =
2820  getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, 0, nullptr);
2821  unsigned NumOfShufflesPerStore = NumOfSources - 1;
2822 
2823  // A two-source shuffle (SK_PermuteTwoSrc) clobbers one of its source operands.
2824  // We need additional instructions to keep the sources intact.
2825  unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
2826  int Cost = NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
2827  NumOfMoves;
2828  return Cost;
2829 }
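// Editorial note (illustrative sketch, not part of the original source): when
// a table entry exists, the cost is the table hit plus the memory operations,
// e.g. a factor-3 store producing <48 x i8> from 3 x <16 x i8> uses the
// AVX512InterleavedStoreTbl entry {3, MVT::v16i8, 12}; otherwise the cost is
// assembled from two-source shuffles, the memory operations, and the extra
// moves that preserve clobbered shuffle sources.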
2830 
2831 int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
2832  unsigned Factor,
2833  ArrayRef<unsigned> Indices,
2834  unsigned Alignment,
2835  unsigned AddressSpace) {
2836  auto isSupportedOnAVX512 = [](Type *VecTy, bool &RequiresBW) {
2837  RequiresBW = false;
2838  Type *EltTy = VecTy->getVectorElementType();
2839  if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
2840  EltTy->isIntegerTy(32) || EltTy->isPointerTy())
2841  return true;
2842  if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8)) {
2843  RequiresBW = true;
2844  return true;
2845  }
2846  return false;
2847  };
2848  bool RequiresBW;
2849  bool HasAVX512Solution = isSupportedOnAVX512(VecTy, RequiresBW);
2850  if (ST->hasAVX512() && HasAVX512Solution && (!RequiresBW || ST->hasBWI()))
2851  return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices,
2852  Alignment, AddressSpace);
2853  if (ST->hasAVX2())
2854  return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices,
2855  Alignment, AddressSpace);
2856 
2857  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
2858  Alignment, AddressSpace);
2859 }
bool hasAVX() const
Definition: X86Subtarget.h:449
Type * getVectorElementType() const
Definition: Type.h:368
constexpr bool isUInt< 32 >(uint64_t x)
Definition: MathExtras.h:341
X = FP_ROUND(Y, TRUNC) - Rounding &#39;Y&#39; from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:512
bool is64Bit() const
Is this x86_64? (disregarding specific ABI / programming model)
Definition: X86Subtarget.h:414
void push_back(const T &Elt)
Definition: SmallVector.h:212
A parsed version of the target data layout string in and methods for querying it. ...
Definition: DataLayout.h:109
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:569
unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info=TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value * > Args=ArrayRef< const Value * >())
Definition: BasicTTIImpl.h:469
static IntegerType * getInt1Ty(LLVMContext &C)
Definition: Type.cpp:173
llvm::Optional< unsigned > getCacheSize(TargetTransformInfo::CacheLevel Level) const
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1542
APInt sext(unsigned width) const
Sign extend to a new width.
Definition: APInt.cpp:841
GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2)
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:42
bool hasSSE41() const
Definition: X86Subtarget.h:447
Compute iterated dominance frontiers using a linear time algorithm.
Definition: AllocatorList.h:24
unsigned minRequiredElementSize(const Value *Val, bool &isSigned)
static MVT getVectorVT(MVT VT, unsigned NumElements)
Cost tables and simple lookup functions.
int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index)
int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr, bool VariableMask, unsigned Alignment)
Calculate the cost of Gather / Scatter operation.
bool hasPOPCNT() const
Definition: X86Subtarget.h:458
bool hasAVX2() const
Definition: X86Subtarget.h:450
const Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
bool isVector() const
Return true if this is a vector value type.
unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract)
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:408
This file a TargetTransformInfo::Concept conforming object specific to the X86 target machine...
The main scalar evolution driver.
unsigned getRegisterBitWidth(bool Vector) const
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:253
unsigned getVectorNumElements() const
bool isLegalMaskedScatter(Type *DataType)
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:813
unsigned getPointerSizeInBits(unsigned AS=0) const
Layout pointer size, in bits FIXME: The defaults need to be removed once all of the backends/clients ...
Definition: DataLayout.h:344
Type Conversion Cost Table.
Definition: CostTable.h:45
[US]{MIN/MAX} - Binary minimum or maximum or signed or unsigned integers.
Definition: ISDOpcodes.h:359
Hexagon Common GEP
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:227
int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, unsigned Alignment, unsigned AddressSpace)
unsigned getAtomicMemIntrinsicMaxElementSize() const
Cost Table Entry.
Definition: CostTable.h:25
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1488
unsigned getNumberOfRegisters(bool Vector)
int getGatherOverhead() const
Definition: X86Subtarget.h:495
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:130
unsigned getMaxInterleaveFactor(unsigned VF)
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:209
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
bool isUnalignedMem32Slow() const
Definition: X86Subtarget.h:494
unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, ArrayRef< Value * > Args, FastMathFlags FMF, unsigned VF=1)
Get intrinsic cost based on arguments.
Definition: BasicTTIImpl.h:857
bool isLegalMaskedStore(Type *DataType)
unsigned getAddressComputationCost(Type *Ty, ScalarEvolution *, const SCEV *)
int getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info=TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value *> Args=ArrayRef< const Value *>())
const FeatureBitset & getFeatureBits() const
getFeatureBits - Return the feature bits.
Shift and rotation operations.
Definition: ISDOpcodes.h:379
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:197
unsigned getArithmeticReductionCost(unsigned Opcode, Type *Ty, bool IsPairwise)
Try to calculate arithmetic and shuffle op costs for reduction operations.
const TypeConversionCostTblEntry * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntry > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table, TypeTy must be comparable to CompareTy by ==. ...
Definition: CostTable.h:55
unsigned getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isLSRCostLess(TargetTransformInfo::LSRCost &C1, TargetTransformInfo::LSRCost &C2)
bool hasVLX() const
Definition: X86Subtarget.h:523
unsigned getSizeInBits() const
unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, unsigned Alignment, unsigned AddressSpace)
Definition: BasicTTIImpl.h:741
unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, const Instruction *I)
Definition: BasicTTIImpl.h:663
Choose alternate elements from vector.
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1554
unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:525
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:245
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:455
PopcntSupportKind
Flags indicating the kind of support for population count.
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:891
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:200
llvm::Optional< unsigned > getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory)...
Definition: APInt.h:33
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:203
int getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr)
Reverse the order of the vector.
amdgpu Simplify well known AMD library false Value * Callee
bool hasDQI() const
Definition: X86Subtarget.h:521
MVT getVectorElementType() const
Value * getOperand(unsigned i) const
Definition: User.h:154
Class to represent pointers.
Definition: DerivedTypes.h:467
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:498
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return &#39;this&#39;.
Definition: Type.h:301
bool isFloatTy() const
Return true if this is &#39;float&#39;, a 32-bit IEEE fp type.
Definition: Type.h:147
an instruction for type-safe pointer arithmetic to access elements of arrays and structs ...
Definition: Instructions.h:837
bool isSLM() const
Definition: X86Subtarget.h:535
bool hasSSSE3() const
Definition: X86Subtarget.h:446
If not nullptr, enable inline expansion of memcmp.
* if(!EatIfPresent(lltok::kw_thread_local)) return false
ParseOptionalThreadLocal := /*empty.
Container class for subtarget features.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:421
Machine Value Type.
The instances of the Type class are immutable: once they are created, they are never changed...
Definition: Type.h:46
Simple binary floating point operators.
Definition: ISDOpcodes.h:259
bool isLegalMaskedGather(Type *DataType)
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:149
unsigned getMinMaxReductionCost(Type *Ty, Type *CondTy, bool IsPairwise, bool)
Try to calculate op costs for min/max reduction operations.
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:221
Expected to fold away in lowering.
unsigned getUserCost(const User *U, ArrayRef< const Value * > Operands)
bool isLegalMaskedLoad(Type *DataType)
const TTI::MemCmpExpansionOptions * enableMemCmpExpansion(bool IsZeroCmp) const
unsigned getAddressSpace() const
Return the address space of the Pointer type.
Definition: DerivedTypes.h:495
Merge elements from two source vectors into one with any shuffle mask.
int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace)
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
X86ProcFamilyEnum getProcFamily() const
Definition: X86Subtarget.h:531
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:283
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
bool hasXOP() const
Definition: X86Subtarget.h:471
int getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, unsigned Alignment, unsigned AddressSpace)
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:530
bool hasSSE42() const
Definition: X86Subtarget.h:448
Extended Value Type.
Definition: ValueTypes.h:34
int getArithmeticReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm)
unsigned getUserCost(const User *U, ArrayRef< const Value *> Operands)
const TargetMachine & getTargetMachine() const
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace, const Instruction *I=nullptr)
int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp)
const SCEVConstant * getConstantStrideStep(ScalarEvolution *SE, const SCEV *Ptr)
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth)
OperandValueProperties
Additional properties of an operand's values.
unsigned getCFInstrCost(unsigned Opcode)
Definition: BasicTTIImpl.h:658
bool isAtom() const
TODO: to be removed later and replaced with suitable properties.
Definition: X86Subtarget.h:534
unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp)
Definition: BasicTTIImpl.h:516
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:935
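Self-contained example of the sign-preserving shift:
  APInt X(32, uint64_t(-16), /*isSigned=*/true); // 32-bit value -16
  APInt Y = X.ashr(2);                           // arithmetic shift keeps the sign: -4
  APInt Z = X.lshr(2);                           // logical shift would zero-fill instead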
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:240
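Usage sketch (assuming an LLVMContext Ctx):
  IntegerType *I8  = IntegerType::get(Ctx, 8);  // same type as Type::getInt8Ty(Ctx)
  IntegerType *I13 = IntegerType::get(Ctx, 13); // arbitrary bit widths are allowed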
unsigned getNumOperands() const
Definition: User.h:176
constexpr bool isInt< 32 >(int64_t x)
Definition: MathExtras.h:301
AddressSpace
Definition: NVPTXBaseInfo.h:22
bool hasVBMI() const
Definition: X86Subtarget.h:481
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:382
int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, const Instruction *I=nullptr)
int getMinMaxReductionCost(Type *Ty, Type *CondTy, bool IsPairwiseForm, bool IsUnsigned)
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
unsigned getVectorNumElements() const
Definition: DerivedTypes.h:462
Class to represent vector types.
Definition: DerivedTypes.h:393
Class for arbitrary precision integers.
Definition: APInt.h:69
int getScatterOverhead() const
Definition: X86Subtarget.h:496
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:445
unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:710
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
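This translation is the usual first step of a cost query in this file; a sketch, assuming a TargetLoweringBase *TLI and an IR-level opcode Opcode:
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");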
unsigned getLoadStoreVecRegBitWidth(unsigned AS) const
const CostTblEntry * CostTableLookup(ArrayRef< CostTblEntry > Tbl, int ISD, MVT Ty)
Find in cost table, TypeTy must be comparable to CompareTy by ==.
Definition: CostTable.h:32
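Sketch of the table-lookup idiom that dominates this file; ISD and the pair LT from getTypeLegalizationCost are assumed in scope, and the table entries and costs below are illustrative, not the real X86 numbers:
  static const CostTblEntry ExampleCostTable[] = {
    { ISD::SHL, MVT::v8i16, 1 }, // illustrative cost
    { ISD::SRL, MVT::v8i16, 2 }, // illustrative cost
  };
  if (const auto *Entry = CostTableLookup(ExampleCostTable, ISD, LT.second))
    return LT.first * Entry->Cost;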
This class represents an analyzed expression in the program.
int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, const Instruction *I=nullptr)
LLVM_NODISCARD std::enable_if<!is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
Definition: Casting.h:323
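Checked-downcast sketch in the style of this file's cost hooks (Ty and Cost assumed in scope):
  if (const auto *VTy = dyn_cast<VectorType>(Ty)) {
    // dyn_cast returned non-null, so Ty really is a vector type.
    Cost *= VTy->getNumElements();
  }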
unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index)
Definition: BasicTTIImpl.h:703
SK_InsertSubvector
InsertSubvector. Index indicates start offset.
bool hasCDI() const
Definition: X86Subtarget.h:517
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:185
TCC_Basic
The cost of a typical 'add' instruction.
int getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, unsigned Alignment, unsigned AddressSpace)
bool hasSSE1() const
Definition: X86Subtarget.h:443
unsigned getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition: Type.cpp:115
LLVM Value Representation.
Definition: Value.h:73
uint64_t getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type...
Definition: DataLayout.h:386
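Usage sketch (assuming a DataLayout DL and a Type *Ty):
  uint64_t Bytes = DL.getTypeStoreSize(Ty); // e.g. 4 for float, 16 for <4 x float>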
bool hasDivRemOp(Type *DataType, bool IsSigned)
static VectorType * get(Type *ElementType, unsigned NumElements)
This static method is the primary way to construct an VectorType.
Definition: Type.cpp:593
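Usage sketch (assuming an LLVMContext Ctx):
  VectorType *V4F32 = VectorType::get(Type::getFloatTy(Ctx), 4); // <4 x float>
  VectorType *V16I8 = VectorType::get(Type::getInt8Ty(Ctx), 16); // <16 x i8>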
#define LLVM_FALLTHROUGH
LLVM_FALLTHROUGH - Mark fallthrough cases in switch statements.
Definition: Compiler.h:235
SK_Broadcast
Broadcast element 0 to all other elements.
bool hasAVX512() const
Definition: X86Subtarget.h:451
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:57
Convenience struct for specifying and reasoning about fast-math flags.
Definition: Operator.h:160
bool hasBWI() const
Definition: X86Subtarget.h:522
OperandValueKind
Additional information about an operand's possible values.
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:412
This pass exposes codegen information to IR-level passes.
Conversion operators.
Definition: ISDOpcodes.h:442
CacheLevel
The possible cache levels.
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:451
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:126
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:150
int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, ArrayRef< Type *> Tys, FastMathFlags FMF, unsigned ScalarizationCostPassed=UINT_MAX)
static IntegerType * getInt8Ty(LLVMContext &C)
Definition: Type.cpp:174
std::pair< int, MVT > getTypeLegalizationCost(const DataLayout &DL, Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
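This call is the standard preamble of nearly every cost hook here; a sketch, assuming TLI, DL, and a Type *Src are in scope:
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
  // LT.first counts legalization steps: a type that must be split pays a multiple.
  // LT.second is the legal MVT used to index the cost tables.
  return LT.first * PerOpCost; // PerOpCost is a hypothetical per-operation cost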
bool hasSSE2() const
Definition: X86Subtarget.h:444
int getIntImmCost(int64_t)
Calculate the cost of materializing a 64-bit value.
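A hedged sketch of how a 64-bit immediate might be classified (illustrative thresholds, not necessarily X86TTIImpl's exact logic):
  if (Val == 0)       return TTI::TCC_Free;  // e.g. materialized by xor, folds away
  if (isInt<32>(Val)) return TTI::TCC_Basic; // fits a sign-extended 32-bit immediate
  return 2 * TTI::TCC_Basic;                 // needs a movabs-style 64-bit move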
This file describes how to lower LLVM code to machine code.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.