1 //===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements a TargetTransformInfo analysis pass specific to the
10 /// X86 target machine. It uses the target's detailed information to provide
11 /// more precise answers to certain TTI queries, while letting the target
12 /// independent and default TTI implementations handle the rest.
13 ///
14 //===----------------------------------------------------------------------===//
15 /// A note about the cost-model numbers used below: they correspond to a
16 /// "generic" X86 CPU rather than to a concrete CPU model. Usually the numbers
17 /// correspond to the CPU where the feature first appeared. For example, if we
18 /// check Subtarget.hasSSE42() in the lookups below, the cost is based on
19 /// Nehalem, as that was the first CPU to support that feature level and thus
20 /// most likely has the worst-case cost.
21 /// Some examples of other technologies/CPUs:
22 /// SSE 3 - Pentium4 / Athlon64
23 /// SSE 4.1 - Penryn
24 /// SSE 4.2 - Nehalem
25 /// AVX - Sandy Bridge
26 /// AVX2 - Haswell
27 /// AVX-512 - Xeon Phi / Skylake
28 /// And some examples of instruction target-dependent costs (latency):
29 /// divss sqrtss rsqrtss
30 /// AMD K7 11-16 19 3
31 /// Piledriver 9-24 13-15 5
32 /// Jaguar 14 16 2
33 /// Pentium II,III 18 30 2
34 /// Nehalem 7-14 7-18 3
35 /// Haswell 10-13 11 5
36 /// TODO: Develop and implement the target dependent cost model and
37 /// specialize cost numbers for different Cost Model Targets such as throughput,
38 /// code size, latency and uop count.
39 //===----------------------------------------------------------------------===//
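// A minimal sketch of how these numbers are typically reached: client passes
// query costs through the TargetTransformInfo wrapper rather than calling
// X86TTIImpl directly. The pass plumbing below assumes legacy pass-manager
// usage and is illustrative only, not code from this file.
//
//   const TargetTransformInfo &TTI =
//       getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
//   Type *VecTy = VectorType::get(Type::getFloatTy(Ctx), 4);
//   int Cost = TTI.getArithmeticInstrCost(Instruction::FDiv, VecTy);
//   // On an SSE4.2-only subtarget this resolves to the Nehalem-based
//   // divps entry in the SSE42 table below.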
40 
41 #include "X86TargetTransformInfo.h"
42 #include "llvm/Analysis/TargetTransformInfo.h"
43 #include "llvm/CodeGen/BasicTTIImpl.h"
44 #include "llvm/CodeGen/CostTable.h"
45 #include "llvm/CodeGen/TargetLowering.h"
46 #include "llvm/IR/IntrinsicInst.h"
47 #include "llvm/Support/Debug.h"
48 
49 using namespace llvm;
50 
51 #define DEBUG_TYPE "x86tti"
52 
53 //===----------------------------------------------------------------------===//
54 //
55 // X86 cost model.
56 //
57 //===----------------------------------------------------------------------===//
58 
59 TargetTransformInfo::PopcntSupportKind
60 X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
61  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
62  // TODO: Currently the __builtin_popcount() implementation using SSE3
63  // instructions is inefficient. Once the problem is fixed, we should
64  // call ST->hasSSE3() instead of ST->hasPOPCNT().
65  return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
66 }
67 
68 llvm::Optional<unsigned> X86TTIImpl::getCacheSize(
69  TargetTransformInfo::CacheLevel Level) const {
70  switch (Level) {
71  case TargetTransformInfo::CacheLevel::L1D:
72  // - Penryn
73  // - Nehalem
74  // - Westmere
75  // - Sandy Bridge
76  // - Ivy Bridge
77  // - Haswell
78  // - Broadwell
79  // - Skylake
80  // - Kabylake
81  return 32 * 1024; // 32 KByte
82  case TargetTransformInfo::CacheLevel::L2D:
83  // - Penryn
84  // - Nehalem
85  // - Westmere
86  // - Sandy Bridge
87  // - Ivy Bridge
88  // - Haswell
89  // - Broadwell
90  // - Skylake
91  // - Kabylake
92  return 256 * 1024; // 256 KByte
93  }
94 
95  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
96 }
97 
98 llvm::Optional<unsigned> X86TTIImpl::getCacheAssociativity(
99  TargetTransformInfo::CacheLevel Level) const {
100  // - Penryn
101  // - Nehalem
102  // - Westmere
103  // - Sandy Bridge
104  // - Ivy Bridge
105  // - Haswell
106  // - Broadwell
107  // - Skylake
108  // - Kabylake
109  switch (Level) {
110  case TargetTransformInfo::CacheLevel::L1D:
111  LLVM_FALLTHROUGH;
112  case TargetTransformInfo::CacheLevel::L2D:
113  return 8;
114  }
115 
116  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
117 }
118 
119 unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) {
120  if (Vector && !ST->hasSSE1())
121  return 0;
122 
123  if (ST->is64Bit()) {
124  if (Vector && ST->hasAVX512())
125  return 32;
126  return 16;
127  }
128  return 8;
129 }
130 
131 unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) const {
132  unsigned PreferVectorWidth = ST->getPreferVectorWidth();
133  if (Vector) {
134  if (ST->hasAVX512() && PreferVectorWidth >= 512)
135  return 512;
136  if (ST->hasAVX() && PreferVectorWidth >= 256)
137  return 256;
138  if (ST->hasSSE1() && PreferVectorWidth >= 128)
139  return 128;
140  return 0;
141  }
142 
143  if (ST->is64Bit())
144  return 64;
145 
146  return 32;
147 }
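// Worked example (illustrative): on an AVX-512 subtarget compiled with
// -mprefer-vector-width=256, getPreferVectorWidth() reports 256, so the
// vector query above returns 256 rather than 512, while the scalar query
// still returns 64 on a 64-bit subtarget.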
148 
149 unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
150  return getRegisterBitWidth(true);
151 }
152 
153 unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
154  // If the loop will not be vectorized, don't interleave the loop.
155  // Let the regular unroller unroll the loop instead, which saves the
156  // overflow check and memory check cost.
157  if (VF == 1)
158  return 1;
159 
160  if (ST->isAtom())
161  return 1;
162 
163  // Sandybridge and Haswell have multiple execution ports and pipelined
164  // vector units.
165  if (ST->hasAVX())
166  return 4;
167 
168  return 2;
169 }
170 
171 int X86TTIImpl::getArithmeticInstrCost(
172  unsigned Opcode, Type *Ty,
173  TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
174  TTI::OperandValueProperties Opd1PropInfo,
175  TTI::OperandValueProperties Opd2PropInfo,
176  ArrayRef<const Value *> Args) {
177  // Legalize the type.
178  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
179 
180  int ISD = TLI->InstructionOpcodeToISD(Opcode);
181  assert(ISD && "Invalid opcode");
182 
183  static const CostTblEntry GLMCostTable[] = {
184  { ISD::FDIV, MVT::f32, 18 }, // divss
185  { ISD::FDIV, MVT::v4f32, 35 }, // divps
186  { ISD::FDIV, MVT::f64, 33 }, // divsd
187  { ISD::FDIV, MVT::v2f64, 65 }, // divpd
188  };
189 
190  if (ST->isGLM())
191  if (const auto *Entry = CostTableLookup(GLMCostTable, ISD,
192  LT.second))
193  return LT.first * Entry->Cost;
194 
195  static const CostTblEntry SLMCostTable[] = {
196  { ISD::MUL, MVT::v4i32, 11 }, // pmulld
197  { ISD::MUL, MVT::v8i16, 2 }, // pmullw
198  { ISD::MUL, MVT::v16i8, 14 }, // extend/pmullw/trunc sequence.
199  { ISD::FMUL, MVT::f64, 2 }, // mulsd
200  { ISD::FMUL, MVT::v2f64, 4 }, // mulpd
201  { ISD::FMUL, MVT::v4f32, 2 }, // mulps
202  { ISD::FDIV, MVT::f32, 17 }, // divss
203  { ISD::FDIV, MVT::v4f32, 39 }, // divps
204  { ISD::FDIV, MVT::f64, 32 }, // divsd
205  { ISD::FDIV, MVT::v2f64, 69 }, // divpd
206  { ISD::FADD, MVT::v2f64, 2 }, // addpd
207  { ISD::FSUB, MVT::v2f64, 2 }, // subpd
208  // v2i64/v4i64 mul is custom lowered as a series of long:
209  // multiplies(3), shifts(3) and adds(2)
210  // slm muldq version throughput is 2 and addq throughput 4
211  // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
212  // 2X4 (addq throughput) = 17
213  { ISD::MUL, MVT::v2i64, 17 },
214  // slm addq/subq throughput is 4
215  { ISD::ADD, MVT::v2i64, 4 },
216  { ISD::SUB, MVT::v2i64, 4 },
217  };
218 
219  if (ST->isSLM()) {
220  if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) {
221  // Check if the operands can be shrunk into a smaller datatype.
222  bool Op1Signed = false;
223  unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
224  bool Op2Signed = false;
225  unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
226 
227  bool signedMode = Op1Signed | Op2Signed;
228  unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
229 
230  if (OpMinSize <= 7)
231  return LT.first * 3; // pmullw/sext
232  if (!signedMode && OpMinSize <= 8)
233  return LT.first * 3; // pmullw/zext
234  if (OpMinSize <= 15)
235  return LT.first * 5; // pmullw/pmulhw/pshuf
236  if (!signedMode && OpMinSize <= 16)
237  return LT.first * 5; // pmullw/pmulhw/pshuf
238  }
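// Worked example (illustrative, assuming minRequiredElementSize sees through
// the extensions): a v4i32 multiply whose operands are zero-extended from i16
// takes the cheap pmullw/pmulhw path above rather than the 11-cycle pmulld
// entry:
//
//   %a32 = zext <4 x i16> %a to <4 x i32>
//   %b32 = zext <4 x i16> %b to <4 x i32>
//   %m   = mul  <4 x i32> %a32, %b32  ; OpMinSize == 16, unsigned -> LT.first * 5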
239 
240  if (const auto *Entry = CostTableLookup(SLMCostTable, ISD,
241  LT.second)) {
242  return LT.first * Entry->Cost;
243  }
244  }
245 
246  if ((ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
247  ISD == ISD::UREM) &&
248  (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
249  Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
250  Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
251  if (ISD == ISD::SDIV || ISD == ISD::SREM) {
252  // On X86, vector signed division by a power-of-two constant is
253  // normally expanded to the sequence SRA + SRL + ADD + SRA.
254  // The OperandValue properties may not be the same as that of the previous
255  // operation; conservatively assume OP_None.
256  int Cost =
257  2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info, Op2Info,
258  TargetTransformInfo::OP_None,
259  TargetTransformInfo::OP_None);
260  Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
261  TargetTransformInfo::OP_None,
262  TargetTransformInfo::OP_None);
263  Cost += getArithmeticInstrCost(Instruction::Add, Ty, Op1Info, Op2Info,
264  TargetTransformInfo::OP_None,
265  TargetTransformInfo::OP_None);
266 
267  if (ISD == ISD::SREM) {
268  // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
269  Cost += getArithmeticInstrCost(Instruction::Mul, Ty, Op1Info, Op2Info);
270  Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Op1Info, Op2Info);
271  }
272 
273  return Cost;
274  }
275 
276  // Vector unsigned division/remainder will be simplified to shifts/masks.
277  if (ISD == ISD::UDIV)
278  return getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
279  TargetTransformInfo::OP_None,
280  TargetTransformInfo::OP_None);
281 
282  if (ISD == ISD::UREM)
283  return getArithmeticInstrCost(Instruction::And, Ty, Op1Info, Op2Info,
284  TargetTransformInfo::OP_None,
285  TargetTransformInfo::OP_None);
286  }
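// Worked example of the expansion costed above, sketched in pseudo-IR for a
// signed divide by 8 (the standard SRA + SRL + ADD + SRA power-of-two
// lowering):
//
//   %sign = ashr <4 x i32> %x, <i32 31, ...>    ; all-ones for negative lanes
//   %bias = lshr <4 x i32> %sign, <i32 29, ...> ; 2^3 - 1 for negative lanes
//   %tmp  = add  <4 x i32> %x, %bias
//   %quot = ashr <4 x i32> %tmp, <i32 3, ...>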
287 
288  static const CostTblEntry AVX512BWUniformConstCostTable[] = {
289  { ISD::SHL, MVT::v64i8, 2 }, // psllw + pand.
290  { ISD::SRL, MVT::v64i8, 2 }, // psrlw + pand.
291  { ISD::SRA, MVT::v64i8, 4 }, // psrlw, pand, pxor, psubb.
292  };
293 
294  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
295  ST->hasBWI()) {
296  if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD,
297  LT.second))
298  return LT.first * Entry->Cost;
299  }
300 
301  static const CostTblEntry AVX512UniformConstCostTable[] = {
302  { ISD::SRA, MVT::v2i64, 1 },
303  { ISD::SRA, MVT::v4i64, 1 },
304  { ISD::SRA, MVT::v8i64, 1 },
305  };
306 
307  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
308  ST->hasAVX512()) {
309  if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD,
310  LT.second))
311  return LT.first * Entry->Cost;
312  }
313 
314  static const CostTblEntry AVX2UniformConstCostTable[] = {
315  { ISD::SHL, MVT::v32i8, 2 }, // psllw + pand.
316  { ISD::SRL, MVT::v32i8, 2 }, // psrlw + pand.
317  { ISD::SRA, MVT::v32i8, 4 }, // psrlw, pand, pxor, psubb.
318 
319  { ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle.
320  };
321 
322  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
323  ST->hasAVX2()) {
324  if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD,
325  LT.second))
326  return LT.first * Entry->Cost;
327  }
328 
329  static const CostTblEntry SSE2UniformConstCostTable[] = {
330  { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand.
331  { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand.
332  { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb.
333 
334  { ISD::SHL, MVT::v32i8, 4+2 }, // 2*(psllw + pand) + split.
335  { ISD::SRL, MVT::v32i8, 4+2 }, // 2*(psrlw + pand) + split.
336  { ISD::SRA, MVT::v32i8, 8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.
337  };
338 
339  // XOP has faster vXi8 shifts.
340  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
341  ST->hasSSE2() && !ST->hasXOP()) {
342  if (const auto *Entry =
343  CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
344  return LT.first * Entry->Cost;
345  }
346 
347  static const CostTblEntry AVX512BWConstCostTable[] = {
348  { ISD::SDIV, MVT::v64i8, 14 }, // 2*ext+2*pmulhw sequence
349  { ISD::SREM, MVT::v64i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
350  { ISD::UDIV, MVT::v64i8, 14 }, // 2*ext+2*pmulhw sequence
351  { ISD::UREM, MVT::v64i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
352  { ISD::SDIV, MVT::v32i16, 6 }, // vpmulhw sequence
353  { ISD::SREM, MVT::v32i16, 8 }, // vpmulhw+mul+sub sequence
354  { ISD::UDIV, MVT::v32i16, 6 }, // vpmulhuw sequence
355  { ISD::UREM, MVT::v32i16, 8 }, // vpmulhuw+mul+sub sequence
356  };
357 
358  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
359  Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
360  ST->hasBWI()) {
361  if (const auto *Entry =
362  CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
363  return LT.first * Entry->Cost;
364  }
365 
366  static const CostTblEntry AVX512ConstCostTable[] = {
367  { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
368  { ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence
369  { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
370  { ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence
371  };
372 
373  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
374  Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
375  ST->hasAVX512()) {
376  if (const auto *Entry =
377  CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
378  return LT.first * Entry->Cost;
379  }
380 
381  static const CostTblEntry AVX2ConstCostTable[] = {
382  { ISD::SDIV, MVT::v32i8, 14 }, // 2*ext+2*pmulhw sequence
383  { ISD::SREM, MVT::v32i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
384  { ISD::UDIV, MVT::v32i8, 14 }, // 2*ext+2*pmulhw sequence
385  { ISD::UREM, MVT::v32i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
386  { ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence
387  { ISD::SREM, MVT::v16i16, 8 }, // vpmulhw+mul+sub sequence
388  { ISD::UDIV, MVT::v16i16, 6 }, // vpmulhuw sequence
389  { ISD::UREM, MVT::v16i16, 8 }, // vpmulhuw+mul+sub sequence
390  { ISD::SDIV, MVT::v8i32, 15 }, // vpmuldq sequence
391  { ISD::SREM, MVT::v8i32, 19 }, // vpmuldq+mul+sub sequence
392  { ISD::UDIV, MVT::v8i32, 15 }, // vpmuludq sequence
393  { ISD::UREM, MVT::v8i32, 19 }, // vpmuludq+mul+sub sequence
394  };
395 
396  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
397  Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
398  ST->hasAVX2()) {
399  if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
400  return LT.first * Entry->Cost;
401  }
402 
403  static const CostTblEntry SSE2ConstCostTable[] = {
404  { ISD::SDIV, MVT::v32i8, 28+2 }, // 4*ext+4*pmulhw sequence + split.
405  { ISD::SREM, MVT::v32i8, 32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
406  { ISD::SDIV, MVT::v16i8, 14 }, // 2*ext+2*pmulhw sequence
407  { ISD::SREM, MVT::v16i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
408  { ISD::UDIV, MVT::v32i8, 28+2 }, // 4*ext+4*pmulhw sequence + split.
409  { ISD::UREM, MVT::v32i8, 32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
410  { ISD::UDIV, MVT::v16i8, 14 }, // 2*ext+2*pmulhw sequence
411  { ISD::UREM, MVT::v16i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
412  { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
413  { ISD::SREM, MVT::v16i16, 16+2 }, // 2*pmulhw+mul+sub sequence + split.
414  { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence
415  { ISD::SREM, MVT::v8i16, 8 }, // pmulhw+mul+sub sequence
416  { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split.
417  { ISD::UREM, MVT::v16i16, 16+2 }, // 2*pmulhuw+mul+sub sequence + split.
418  { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence
419  { ISD::UREM, MVT::v8i16, 8 }, // pmulhuw+mul+sub sequence
420  { ISD::SDIV, MVT::v8i32, 38+2 }, // 2*pmuludq sequence + split.
421  { ISD::SREM, MVT::v8i32, 48+2 }, // 2*pmuludq+mul+sub sequence + split.
422  { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence
423  { ISD::SREM, MVT::v4i32, 24 }, // pmuludq+mul+sub sequence
424  { ISD::UDIV, MVT::v8i32, 30+2 }, // 2*pmuludq sequence + split.
425  { ISD::UREM, MVT::v8i32, 40+2 }, // 2*pmuludq+mul+sub sequence + split.
426  { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence
427  { ISD::UREM, MVT::v4i32, 20 }, // pmuludq+mul+sub sequence
428  };
429 
430  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
431  Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
432  ST->hasSSE2()) {
433  // pmuldq sequence.
434  if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
435  return LT.first * 32;
436  if (ISD == ISD::SREM && LT.second == MVT::v8i32 && ST->hasAVX())
437  return LT.first * 38;
438  if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
439  return LT.first * 15;
440  if (ISD == ISD::SREM && LT.second == MVT::v4i32 && ST->hasSSE41())
441  return LT.first * 20;
442 
443  if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
444  return LT.first * Entry->Cost;
445  }
446 
447  static const CostTblEntry AVX2UniformCostTable[] = {
448  // Uniform splats are cheaper for the following instructions.
449  { ISD::SHL, MVT::v16i16, 1 }, // psllw.
450  { ISD::SRL, MVT::v16i16, 1 }, // psrlw.
451  { ISD::SRA, MVT::v16i16, 1 }, // psraw.
452  };
453 
454  if (ST->hasAVX2() &&
455  ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
456  (Op2Info == TargetTransformInfo::OK_UniformValue))) {
457  if (const auto *Entry =
458  CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
459  return LT.first * Entry->Cost;
460  }
461 
462  static const CostTblEntry SSE2UniformCostTable[] = {
463  // Uniform splats are cheaper for the following instructions.
464  { ISD::SHL, MVT::v8i16, 1 }, // psllw.
465  { ISD::SHL, MVT::v4i32, 1 }, // pslld
466  { ISD::SHL, MVT::v2i64, 1 }, // psllq.
467 
468  { ISD::SRL, MVT::v8i16, 1 }, // psrlw.
469  { ISD::SRL, MVT::v4i32, 1 }, // psrld.
470  { ISD::SRL, MVT::v2i64, 1 }, // psrlq.
471 
472  { ISD::SRA, MVT::v8i16, 1 }, // psraw.
473  { ISD::SRA, MVT::v4i32, 1 }, // psrad.
474  };
475 
476  if (ST->hasSSE2() &&
477  ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
478  (Op2Info == TargetTransformInfo::OK_UniformValue))) {
479  if (const auto *Entry =
480  CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
481  return LT.first * Entry->Cost;
482  }
483 
484  static const CostTblEntry AVX512DQCostTable[] = {
485  { ISD::MUL, MVT::v2i64, 1 },
486  { ISD::MUL, MVT::v4i64, 1 },
487  { ISD::MUL, MVT::v8i64, 1 }
488  };
489 
490  // Look for AVX512DQ lowering tricks for custom cases.
491  if (ST->hasDQI())
492  if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
493  return LT.first * Entry->Cost;
494 
495  static const CostTblEntry AVX512BWCostTable[] = {
496  { ISD::SHL, MVT::v8i16, 1 }, // vpsllvw
497  { ISD::SRL, MVT::v8i16, 1 }, // vpsrlvw
498  { ISD::SRA, MVT::v8i16, 1 }, // vpsravw
499 
500  { ISD::SHL, MVT::v16i16, 1 }, // vpsllvw
501  { ISD::SRL, MVT::v16i16, 1 }, // vpsrlvw
502  { ISD::SRA, MVT::v16i16, 1 }, // vpsravw
503 
504  { ISD::SHL, MVT::v32i16, 1 }, // vpsllvw
505  { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw
506  { ISD::SRA, MVT::v32i16, 1 }, // vpsravw
507 
508  { ISD::SHL, MVT::v64i8, 11 }, // vpblendvb sequence.
509  { ISD::SRL, MVT::v64i8, 11 }, // vpblendvb sequence.
510  { ISD::SRA, MVT::v64i8, 24 }, // vpblendvb sequence.
511 
512  { ISD::MUL, MVT::v64i8, 11 }, // extend/pmullw/trunc sequence.
513  { ISD::MUL, MVT::v32i8, 4 }, // extend/pmullw/trunc sequence.
514  { ISD::MUL, MVT::v16i8, 4 }, // extend/pmullw/trunc sequence.
515  };
516 
517  // Look for AVX512BW lowering tricks for custom cases.
518  if (ST->hasBWI())
519  if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
520  return LT.first * Entry->Cost;
521 
522  static const CostTblEntry AVX512CostTable[] = {
523  { ISD::SHL, MVT::v16i32, 1 },
524  { ISD::SRL, MVT::v16i32, 1 },
525  { ISD::SRA, MVT::v16i32, 1 },
526 
527  { ISD::SHL, MVT::v8i64, 1 },
528  { ISD::SRL, MVT::v8i64, 1 },
529 
530  { ISD::SRA, MVT::v2i64, 1 },
531  { ISD::SRA, MVT::v4i64, 1 },
532  { ISD::SRA, MVT::v8i64, 1 },
533 
534  { ISD::MUL, MVT::v32i8, 13 }, // extend/pmullw/trunc sequence.
535  { ISD::MUL, MVT::v16i8, 5 }, // extend/pmullw/trunc sequence.
536  { ISD::MUL, MVT::v16i32, 1 }, // pmulld (Skylake from agner.org)
537  { ISD::MUL, MVT::v8i32, 1 }, // pmulld (Skylake from agner.org)
538  { ISD::MUL, MVT::v4i32, 1 }, // pmulld (Skylake from agner.org)
539  { ISD::MUL, MVT::v8i64, 8 }, // 3*pmuludq/3*shift/2*add
540 
541  { ISD::FADD, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
542  { ISD::FSUB, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
543  { ISD::FMUL, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
544 
545  { ISD::FADD, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
546  { ISD::FSUB, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
547  { ISD::FMUL, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
548  };
549 
550  if (ST->hasAVX512())
551  if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
552  return LT.first * Entry->Cost;
553 
554  static const CostTblEntry AVX2ShiftCostTable[] = {
555  // Shifts on v4i64/v8i32 are legal on AVX2 even though we declare them as
556  // custom, in order to detect the cases where the shift amount is a scalar.
557  { ISD::SHL, MVT::v4i32, 1 },
558  { ISD::SRL, MVT::v4i32, 1 },
559  { ISD::SRA, MVT::v4i32, 1 },
560  { ISD::SHL, MVT::v8i32, 1 },
561  { ISD::SRL, MVT::v8i32, 1 },
562  { ISD::SRA, MVT::v8i32, 1 },
563  { ISD::SHL, MVT::v2i64, 1 },
564  { ISD::SRL, MVT::v2i64, 1 },
565  { ISD::SHL, MVT::v4i64, 1 },
566  { ISD::SRL, MVT::v4i64, 1 },
567  };
568 
569  // Look for AVX2 lowering tricks.
570  if (ST->hasAVX2()) {
571  if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
572  (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
573  Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
574  // On AVX2, a packed v16i16 shift left by a constant build_vector
575  // is lowered into a vector multiply (vpmullw).
576  return getArithmeticInstrCost(Instruction::Mul, Ty, Op1Info, Op2Info,
577  TargetTransformInfo::OP_None,
578  TargetTransformInfo::OP_None);
579 
580  if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
581  return LT.first * Entry->Cost;
582  }
583 
584  static const CostTblEntry XOPShiftCostTable[] = {
585  // 128bit shifts take 1cy, but right shifts require negation beforehand.
586  { ISD::SHL, MVT::v16i8, 1 },
587  { ISD::SRL, MVT::v16i8, 2 },
588  { ISD::SRA, MVT::v16i8, 2 },
589  { ISD::SHL, MVT::v8i16, 1 },
590  { ISD::SRL, MVT::v8i16, 2 },
591  { ISD::SRA, MVT::v8i16, 2 },
592  { ISD::SHL, MVT::v4i32, 1 },
593  { ISD::SRL, MVT::v4i32, 2 },
594  { ISD::SRA, MVT::v4i32, 2 },
595  { ISD::SHL, MVT::v2i64, 1 },
596  { ISD::SRL, MVT::v2i64, 2 },
597  { ISD::SRA, MVT::v2i64, 2 },
598  // 256bit shifts require splitting if AVX2 didn't catch them above.
599  { ISD::SHL, MVT::v32i8, 2+2 },
600  { ISD::SRL, MVT::v32i8, 4+2 },
601  { ISD::SRA, MVT::v32i8, 4+2 },
602  { ISD::SHL, MVT::v16i16, 2+2 },
603  { ISD::SRL, MVT::v16i16, 4+2 },
604  { ISD::SRA, MVT::v16i16, 4+2 },
605  { ISD::SHL, MVT::v8i32, 2+2 },
606  { ISD::SRL, MVT::v8i32, 4+2 },
607  { ISD::SRA, MVT::v8i32, 4+2 },
608  { ISD::SHL, MVT::v4i64, 2+2 },
609  { ISD::SRL, MVT::v4i64, 4+2 },
610  { ISD::SRA, MVT::v4i64, 4+2 },
611  };
612 
613  // Look for XOP lowering tricks.
614  if (ST->hasXOP()) {
615  // If the right shift is constant then we'll fold the negation so
616  // it's as cheap as a left shift.
617  int ShiftISD = ISD;
618  if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) &&
619  (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
620  Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
621  ShiftISD = ISD::SHL;
622  if (const auto *Entry =
623  CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
624  return LT.first * Entry->Cost;
625  }
626 
627  static const CostTblEntry SSE2UniformShiftCostTable[] = {
628  // Uniform splats are cheaper for the following instructions.
629  { ISD::SHL, MVT::v16i16, 2+2 }, // 2*psllw + split.
630  { ISD::SHL, MVT::v8i32, 2+2 }, // 2*pslld + split.
631  { ISD::SHL, MVT::v4i64, 2+2 }, // 2*psllq + split.
632 
633  { ISD::SRL, MVT::v16i16, 2+2 }, // 2*psrlw + split.
634  { ISD::SRL, MVT::v8i32, 2+2 }, // 2*psrld + split.
635  { ISD::SRL, MVT::v4i64, 2+2 }, // 2*psrlq + split.
636 
637  { ISD::SRA, MVT::v16i16, 2+2 }, // 2*psraw + split.
638  { ISD::SRA, MVT::v8i32, 2+2 }, // 2*psrad + split.
639  { ISD::SRA, MVT::v2i64, 4 }, // 2*psrad + shuffle.
640  { ISD::SRA, MVT::v4i64, 8+2 }, // 2*(2*psrad + shuffle) + split.
641  };
642 
643  if (ST->hasSSE2() &&
644  ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
645  (Op2Info == TargetTransformInfo::OK_UniformValue))) {
646 
647  // Handle AVX2 uniform v4i64 ISD::SRA, it's not worth a table.
648  if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2())
649  return LT.first * 4; // 2*psrad + shuffle.
650 
651  if (const auto *Entry =
652  CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second))
653  return LT.first * Entry->Cost;
654  }
655 
656  if (ISD == ISD::SHL &&
657  Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
658  MVT VT = LT.second;
659  // A vector shift left by a non-uniform constant can be lowered
660  // into a vector multiply.
661  if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
662  ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
663  ISD = ISD::MUL;
664  }
665 
666  static const CostTblEntry AVX2CostTable[] = {
667  { ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence.
668  { ISD::SHL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
669 
670  { ISD::SRL, MVT::v32i8, 11 }, // vpblendvb sequence.
671  { ISD::SRL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
672 
673  { ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence.
674  { ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence.
675  { ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence.
676  { ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence.
677 
678  { ISD::SUB, MVT::v32i8, 1 }, // psubb
679  { ISD::ADD, MVT::v32i8, 1 }, // paddb
680  { ISD::SUB, MVT::v16i16, 1 }, // psubw
681  { ISD::ADD, MVT::v16i16, 1 }, // paddw
682  { ISD::SUB, MVT::v8i32, 1 }, // psubd
683  { ISD::ADD, MVT::v8i32, 1 }, // paddd
684  { ISD::SUB, MVT::v4i64, 1 }, // psubq
685  { ISD::ADD, MVT::v4i64, 1 }, // paddq
686 
687  { ISD::MUL, MVT::v32i8, 17 }, // extend/pmullw/trunc sequence.
688  { ISD::MUL, MVT::v16i8, 7 }, // extend/pmullw/trunc sequence.
689  { ISD::MUL, MVT::v16i16, 1 }, // pmullw
690  { ISD::MUL, MVT::v8i32, 2 }, // pmulld (Haswell from agner.org)
691  { ISD::MUL, MVT::v4i64, 8 }, // 3*pmuludq/3*shift/2*add
692 
693  { ISD::FADD, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
694  { ISD::FADD, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
695  { ISD::FSUB, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
696  { ISD::FSUB, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
697  { ISD::FMUL, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
698  { ISD::FMUL, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
699 
700  { ISD::FDIV, MVT::f32, 7 }, // Haswell from http://www.agner.org/
701  { ISD::FDIV, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
702  { ISD::FDIV, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
703  { ISD::FDIV, MVT::f64, 14 }, // Haswell from http://www.agner.org/
704  { ISD::FDIV, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
705  { ISD::FDIV, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
706  };
707 
708  // Look for AVX2 lowering tricks for custom cases.
709  if (ST->hasAVX2())
710  if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
711  return LT.first * Entry->Cost;
712 
713  static const CostTblEntry AVX1CostTable[] = {
714  // We don't have to scalarize unsupported ops. We can issue two half-sized
715  // operations and we only need to extract the upper YMM half.
716  // Two ops + 1 extract + 1 insert = 4.
717  { ISD::MUL, MVT::v16i16, 4 },
718  { ISD::MUL, MVT::v8i32, 4 },
719  { ISD::SUB, MVT::v32i8, 4 },
720  { ISD::ADD, MVT::v32i8, 4 },
721  { ISD::SUB, MVT::v16i16, 4 },
722  { ISD::ADD, MVT::v16i16, 4 },
723  { ISD::SUB, MVT::v8i32, 4 },
724  { ISD::ADD, MVT::v8i32, 4 },
725  { ISD::SUB, MVT::v4i64, 4 },
726  { ISD::ADD, MVT::v4i64, 4 },
727 
728  // A v4i64 multiply is custom lowered as two split v2i64 vectors that then
729  // are lowered as a series of long multiplies(3), shifts(3) and adds(2)
730  // Because we believe v4i64 to be a legal type, we must also include the
731  // extract+insert in the cost table. Therefore, the cost here is 18
732  // instead of 8.
733  { ISD::MUL, MVT::v4i64, 18 },
734 
735  { ISD::MUL, MVT::v32i8, 26 }, // extend/pmullw/trunc sequence.
736 
737  { ISD::FDIV, MVT::f32, 14 }, // SNB from http://www.agner.org/
738  { ISD::FDIV, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
739  { ISD::FDIV, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
740  { ISD::FDIV, MVT::f64, 22 }, // SNB from http://www.agner.org/
741  { ISD::FDIV, MVT::v2f64, 22 }, // SNB from http://www.agner.org/
742  { ISD::FDIV, MVT::v4f64, 44 }, // SNB from http://www.agner.org/
743  };
744 
745  if (ST->hasAVX())
746  if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
747  return LT.first * Entry->Cost;
748 
749  static const CostTblEntry SSE42CostTable[] = {
750  { ISD::FADD, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
751  { ISD::FADD, MVT::f32, 1 }, // Nehalem from http://www.agner.org/
752  { ISD::FADD, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
753  { ISD::FADD, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
754 
755  { ISD::FSUB, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
756  { ISD::FSUB, MVT::f32 , 1 }, // Nehalem from http://www.agner.org/
757  { ISD::FSUB, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
758  { ISD::FSUB, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
759 
760  { ISD::FMUL, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
761  { ISD::FMUL, MVT::f32, 1 }, // Nehalem from http://www.agner.org/
762  { ISD::FMUL, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
763  { ISD::FMUL, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
764 
765  { ISD::FDIV, MVT::f32, 14 }, // Nehalem from http://www.agner.org/
766  { ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
767  { ISD::FDIV, MVT::f64, 22 }, // Nehalem from http://www.agner.org/
768  { ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/
769  };
770 
771  if (ST->hasSSE42())
772  if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
773  return LT.first * Entry->Cost;
774 
775  static const CostTblEntry SSE41CostTable[] = {
776  { ISD::SHL, MVT::v16i8, 11 }, // pblendvb sequence.
777  { ISD::SHL, MVT::v32i8, 2*11+2 }, // pblendvb sequence + split.
778  { ISD::SHL, MVT::v8i16, 14 }, // pblendvb sequence.
779  { ISD::SHL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
780  { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld
781  { ISD::SHL, MVT::v8i32, 2*4+2 }, // pslld/paddd/cvttps2dq/pmulld + split
782 
783  { ISD::SRL, MVT::v16i8, 12 }, // pblendvb sequence.
784  { ISD::SRL, MVT::v32i8, 2*12+2 }, // pblendvb sequence + split.
785  { ISD::SRL, MVT::v8i16, 14 }, // pblendvb sequence.
786  { ISD::SRL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
787  { ISD::SRL, MVT::v4i32, 11 }, // Shift each lane + blend.
788  { ISD::SRL, MVT::v8i32, 2*11+2 }, // Shift each lane + blend + split.
789 
790  { ISD::SRA, MVT::v16i8, 24 }, // pblendvb sequence.
791  { ISD::SRA, MVT::v32i8, 2*24+2 }, // pblendvb sequence + split.
792  { ISD::SRA, MVT::v8i16, 14 }, // pblendvb sequence.
793  { ISD::SRA, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
794  { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend.
795  { ISD::SRA, MVT::v8i32, 2*12+2 }, // Shift each lane + blend + split.
796 
797  { ISD::MUL, MVT::v4i32, 2 } // pmulld (Nehalem from agner.org)
798  };
799 
800  if (ST->hasSSE41())
801  if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
802  return LT.first * Entry->Cost;
803 
804  static const CostTblEntry SSE2CostTable[] = {
805  // We don't correctly identify costs of casts because they are marked as
806  // custom.
807  { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence.
808  { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence.
809  { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul.
810  { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence.
811  { ISD::SHL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split.
812 
813  { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence.
814  { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence.
815  { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend.
816  { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence.
817  { ISD::SRL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split.
818 
819  { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence.
820  { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence.
821  { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend.
822  { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence.
823  { ISD::SRA, MVT::v4i64, 2*12+2 }, // srl/xor/sub sequence+split.
824 
825  { ISD::MUL, MVT::v16i8, 12 }, // extend/pmullw/trunc sequence.
826  { ISD::MUL, MVT::v8i16, 1 }, // pmullw
827  { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle
828  { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add
829 
830  { ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/
831  { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/
832  { ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/
833  { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/
834 
835  { ISD::FADD, MVT::f32, 2 }, // Pentium IV from http://www.agner.org/
836  { ISD::FADD, MVT::f64, 2 }, // Pentium IV from http://www.agner.org/
837 
838  { ISD::FSUB, MVT::f32, 2 }, // Pentium IV from http://www.agner.org/
839  { ISD::FSUB, MVT::f64, 2 }, // Pentium IV from http://www.agner.org/
840  };
841 
842  if (ST->hasSSE2())
843  if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
844  return LT.first * Entry->Cost;
845 
846  static const CostTblEntry SSE1CostTable[] = {
847  { ISD::FDIV, MVT::f32, 17 }, // Pentium III from http://www.agner.org/
848  { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/
849 
850  { ISD::FADD, MVT::f32, 1 }, // Pentium III from http://www.agner.org/
851  { ISD::FADD, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/
852 
853  { ISD::FSUB, MVT::f32, 1 }, // Pentium III from http://www.agner.org/
854  { ISD::FSUB, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/
855 
856  { ISD::ADD, MVT::i8, 1 }, // Pentium III from http://www.agner.org/
857  { ISD::ADD, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
858  { ISD::ADD, MVT::i32, 1 }, // Pentium III from http://www.agner.org/
859 
860  { ISD::SUB, MVT::i8, 1 }, // Pentium III from http://www.agner.org/
861  { ISD::SUB, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
862  { ISD::SUB, MVT::i32, 1 }, // Pentium III from http://www.agner.org/
863  };
864 
865  if (ST->hasSSE1())
866  if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
867  return LT.first * Entry->Cost;
868 
869  // It is not a good idea to vectorize division. We have to scalarize it and
870  // in the process we will often end up having to spill regular
871  // registers. The overhead of division is going to dominate most kernels
872  // anyway, so try hard to prevent vectorization of division - it is
873  // generally a bad idea. Assume somewhat arbitrarily that we have to be able
874  // to hide "20 cycles" for each lane.
875  if (LT.second.isVector() && (ISD == ISD::SDIV || ISD == ISD::SREM ||
876  ISD == ISD::UDIV || ISD == ISD::UREM)) {
877  int ScalarCost = getArithmeticInstrCost(
878  Opcode, Ty->getScalarType(), Op1Info, Op2Info,
879  TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
880  return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
881  }
882 
883  // Fallback to the default implementation.
884  return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info);
885 }
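// Worked example (illustrative) of the LT.first * Entry->Cost pattern used
// throughout: a 'mul <16 x i32>' on an AVX2-only subtarget legalizes by
// splitting into two v8i32 operations (LT.first == 2, LT.second == v8i32);
// the AVX2 table prices MUL v8i32 at 2, so the reported cost is 2 * 2 = 4.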
886 
887 int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
888  Type *SubTp) {
889  // 64-bit packed float vectors (v2f32) are widened to type v4f32.
890  // 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
891  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
892 
893  // Treat Transpose as 2-op shuffles - there's no difference in lowering.
894  if (Kind == TTI::SK_Transpose)
895  Kind = TTI::SK_PermuteTwoSrc;
896 
897  // For Broadcasts we are splatting the first element from the first input
898  // register, so we only need to reference that input, and all the output
899  // registers are the same.
900  if (Kind == TTI::SK_Broadcast)
901  LT.first = 1;
902 
903  // Subvector extractions are free if they start at the beginning of a
904  // vector and cheap if the subvectors are aligned.
905  if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
906  int NumElts = LT.second.getVectorNumElements();
907  if ((Index % NumElts) == 0)
908  return 0;
909  std::pair<int, MVT> SubLT = TLI->getTypeLegalizationCost(DL, SubTp);
910  if (SubLT.second.isVector()) {
911  int NumSubElts = SubLT.second.getVectorNumElements();
912  if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
913  return SubLT.first;
914  }
915  }
916 
917  // We are going to permute multiple sources and the result will be in multiple
918  // destinations. We provide an accurate cost only for splits where the element
919  // type remains the same.
920  if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
921  MVT LegalVT = LT.second;
922  if (LegalVT.isVector() &&
923  LegalVT.getVectorElementType().getSizeInBits() ==
924  Tp->getVectorElementType()->getPrimitiveSizeInBits() &&
925  LegalVT.getVectorNumElements() < Tp->getVectorNumElements()) {
926 
927  unsigned VecTySize = DL.getTypeStoreSize(Tp);
928  unsigned LegalVTSize = LegalVT.getStoreSize();
929  // Number of source vectors after legalization:
930  unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
931  // Number of destination vectors after legalization:
932  unsigned NumOfDests = LT.first;
933 
934  Type *SingleOpTy = VectorType::get(Tp->getVectorElementType(),
935  LegalVT.getVectorNumElements());
936 
937  unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
938  return NumOfShuffles *
939  getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr);
940  }
941 
942  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
943  }
944 
945  // For 2-input shuffles, we must account for splitting the 2 inputs into many.
946  if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
947  // We assume that source and destination have the same vector type.
948  int NumOfDests = LT.first;
949  int NumOfShufflesPerDest = LT.first * 2 - 1;
950  LT.first = NumOfDests * NumOfShufflesPerDest;
951  }
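// Worked example (illustrative) of the split accounting above: a two-source
// shuffle of v32i8 on a plain SSSE3 subtarget legalizes into two v16i8 halves
// (LT.first == 2), so NumOfDests == 2 and NumOfShufflesPerDest == 3, making
// LT.first == 6; with the SSSE3 PermuteTwoSrc v16i8 entry of 3 below, the
// final cost is 6 * 3 = 18.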
952 
953  static const CostTblEntry AVX512VBMIShuffleTbl[] = {
954  {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
955  {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb
956 
957  {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
958  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb
959 
960  {TTI::SK_PermuteTwoSrc, MVT::v64i8, 1}, // vpermt2b
961  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 1}, // vpermt2b
962  {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1} // vpermt2b
963  };
964 
965  if (ST->hasVBMI())
966  if (const auto *Entry =
967  CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
968  return LT.first * Entry->Cost;
969 
970  static const CostTblEntry AVX512BWShuffleTbl[] = {
971  {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
972  {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
973 
974  {TTI::SK_Reverse, MVT::v32i16, 1}, // vpermw
975  {TTI::SK_Reverse, MVT::v16i16, 1}, // vpermw
976  {TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2
977 
978  {TTI::SK_PermuteSingleSrc, MVT::v32i16, 1}, // vpermw
979  {TTI::SK_PermuteSingleSrc, MVT::v16i16, 1}, // vpermw
980  {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // vpermw
981  {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16
982  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 3}, // vpermw + zext/trunc
983 
984  {TTI::SK_PermuteTwoSrc, MVT::v32i16, 1}, // vpermt2w
985  {TTI::SK_PermuteTwoSrc, MVT::v16i16, 1}, // vpermt2w
986  {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpermt2w
987  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 3}, // zext + vpermt2w + trunc
988  {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
989  {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3} // zext + vpermt2w + trunc
990  };
991 
992  if (ST->hasBWI())
993  if (const auto *Entry =
994  CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
995  return LT.first * Entry->Cost;
996 
997  static const CostTblEntry AVX512ShuffleTbl[] = {
998  {TTI::SK_Broadcast, MVT::v8f64, 1}, // vbroadcastpd
999  {TTI::SK_Broadcast, MVT::v16f32, 1}, // vbroadcastps
1000  {TTI::SK_Broadcast, MVT::v8i64, 1}, // vpbroadcastq
1001  {TTI::SK_Broadcast, MVT::v16i32, 1}, // vpbroadcastd
1002 
1003  {TTI::SK_Reverse, MVT::v8f64, 1}, // vpermpd
1004  {TTI::SK_Reverse, MVT::v16f32, 1}, // vpermps
1005  {TTI::SK_Reverse, MVT::v8i64, 1}, // vpermq
1006  {TTI::SK_Reverse, MVT::v16i32, 1}, // vpermd
1007 
1008  {TTI::SK_PermuteSingleSrc, MVT::v8f64, 1}, // vpermpd
1009  {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
1010  {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // vpermpd
1011  {TTI::SK_PermuteSingleSrc, MVT::v16f32, 1}, // vpermps
1012  {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
1013  {TTI::SK_PermuteSingleSrc, MVT::v4f32, 1}, // vpermps
1014  {TTI::SK_PermuteSingleSrc, MVT::v8i64, 1}, // vpermq
1015  {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
1016  {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // vpermq
1017  {TTI::SK_PermuteSingleSrc, MVT::v16i32, 1}, // vpermd
1018  {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
1019  {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // vpermd
1020  {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
1021 
1022  {TTI::SK_PermuteTwoSrc, MVT::v8f64, 1}, // vpermt2pd
1023  {TTI::SK_PermuteTwoSrc, MVT::v16f32, 1}, // vpermt2ps
1024  {TTI::SK_PermuteTwoSrc, MVT::v8i64, 1}, // vpermt2q
1025  {TTI::SK_PermuteTwoSrc, MVT::v16i32, 1}, // vpermt2d
1026  {TTI::SK_PermuteTwoSrc, MVT::v4f64, 1}, // vpermt2pd
1027  {TTI::SK_PermuteTwoSrc, MVT::v8f32, 1}, // vpermt2ps
1028  {TTI::SK_PermuteTwoSrc, MVT::v4i64, 1}, // vpermt2q
1029  {TTI::SK_PermuteTwoSrc, MVT::v8i32, 1}, // vpermt2d
1030  {TTI::SK_PermuteTwoSrc, MVT::v2f64, 1}, // vpermt2pd
1031  {TTI::SK_PermuteTwoSrc, MVT::v4f32, 1}, // vpermt2ps
1032  {TTI::SK_PermuteTwoSrc, MVT::v2i64, 1}, // vpermt2q
1033  {TTI::SK_PermuteTwoSrc, MVT::v4i32, 1} // vpermt2d
1034  };
1035 
1036  if (ST->hasAVX512())
1037  if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
1038  return LT.first * Entry->Cost;
1039 
1040  static const CostTblEntry AVX2ShuffleTbl[] = {
1041  {TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastpd
1042  {TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastps
1043  {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq
1044  {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd
1045  {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
1046  {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb
1047 
1048  {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd
1049  {TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps
1050  {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq
1051  {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd
1052  {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
1053  {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb
1054 
1055  {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
1056  {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb
1057 
1058  {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
1059  {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
1060  {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
1061  {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
1062  {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
1063  // + vpblendvb
1064  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb
1065  // + vpblendvb
1066 
1067  {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd
1068  {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3}, // 2*vpermps + vblendps
1069  {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd
1070  {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd
1071  {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
1072  // + vpblendvb
1073  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb
1074  // + vpblendvb
1075  };
1076 
1077  if (ST->hasAVX2())
1078  if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
1079  return LT.first * Entry->Cost;
1080 
1081  static const CostTblEntry XOPShuffleTbl[] = {
1082  {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd
1083  {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps
1084  {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd
1085  {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps
1086  {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
1087  // + vinsertf128
1088  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm
1089  // + vinsertf128
1090 
1091  {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
1092  // + vinsertf128
1093  {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm
1094  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm
1095  // + vinsertf128
1096  {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm
1097  };
1098 
1099  if (ST->hasXOP())
1100  if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
1101  return LT.first * Entry->Cost;
1102 
1103  static const CostTblEntry AVX1ShuffleTbl[] = {
1104  {TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1105  {TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1106  {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1107  {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1108  {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
1109  {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128
1110 
1111  {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1112  {TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1113  {TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1114  {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1115  {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
1116  // + vinsertf128
1117  {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb
1118  // + vinsertf128
1119 
1120  {TTI::SK_Select, MVT::v4i64, 1}, // vblendpd
1121  {TTI::SK_Select, MVT::v4f64, 1}, // vblendpd
1122  {TTI::SK_Select, MVT::v8i32, 1}, // vblendps
1123  {TTI::SK_Select, MVT::v8f32, 1}, // vblendps
1124  {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
1125  {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor
1126 
1127  {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd
1128  {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd
1129  {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1130  {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1131  {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
1132  // + 2*por + vinsertf128
1133  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb
1134  // + 2*por + vinsertf128
1135 
1136  {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd
1137  {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd
1138  {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1139  {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1140  {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
1141  // + 4*por + vinsertf128
1142  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb
1143  // + 4*por + vinsertf128
1144  };
1145 
1146  if (ST->hasAVX())
1147  if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
1148  return LT.first * Entry->Cost;
1149 
1150  static const CostTblEntry SSE41ShuffleTbl[] = {
1151  {TTI::SK_Select, MVT::v2i64, 1}, // pblendw
1152  {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1153  {TTI::SK_Select, MVT::v4i32, 1}, // pblendw
1154  {TTI::SK_Select, MVT::v4f32, 1}, // blendps
1155  {TTI::SK_Select, MVT::v8i16, 1}, // pblendw
1156  {TTI::SK_Select, MVT::v16i8, 1} // pblendvb
1157  };
1158 
1159  if (ST->hasSSE41())
1160  if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
1161  return LT.first * Entry->Cost;
1162 
1163  static const CostTblEntry SSSE3ShuffleTbl[] = {
1164  {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
1165  {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb
1166 
1167  {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
1168  {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb
1169 
1170  {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
1171  {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por
1172 
1173  {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
1174  {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
1175 
1176  {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
1177  {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
1178  };
1179 
1180  if (ST->hasSSSE3())
1181  if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
1182  return LT.first * Entry->Cost;
1183 
1184  static const CostTblEntry SSE2ShuffleTbl[] = {
1185  {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd
1186  {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
1187  {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
1188  {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
1189  {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd
1190 
1191  {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
1192  {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
1193  {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
1194  {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
1195  {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
1196  // + 2*pshufd + 2*unpck + packus
1197 
1198  {TTI::SK_Select, MVT::v2i64, 1}, // movsd
1199  {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1200  {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
1201  {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
1202  {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por
1203 
1204  {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
1205  {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd
1206  {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
1207  {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
1208  // + pshufd/unpck
1209  { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
1210  // + 2*pshufd + 2*unpck + 2*packus
1211 
1212  { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd
1213  { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd
1214  { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd}
1215  { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute
1216  { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
1217  };
1218 
1219  if (ST->hasSSE2())
1220  if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
1221  return LT.first * Entry->Cost;
1222 
1223  static const CostTblEntry SSE1ShuffleTbl[] = {
1224  { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
1225  { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
1226  { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps
1227  { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
1228  { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
1229  };
1230 
1231  if (ST->hasSSE1())
1232  if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
1233  return LT.first * Entry->Cost;
1234 
1235  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
1236 }
1237 
1238 int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
1239  const Instruction *I) {
1240  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1241  assert(ISD && "Invalid opcode");
1242 
1243  // FIXME: Need a better design of the cost table to handle non-simple types
1244  // and the potentially massive number of combinations (elem_num x src_type x dst_type).
1245 
1246  static const TypeConversionCostTblEntry AVX512BWConversionTbl[] {
1249 
1250  // Mask sign extend has an instruction.
1257 
1258  // Mask zero extend is a load + broadcast.
1265  };
1266 
1267  static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
1274 
1281 
1288 
1295  };
1296 
1297  // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
1298  // 256-bit wide vectors.
1299 
1300  static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
1304 
1309 
1310  // v16i1 -> v16i32 - load + broadcast
1321 
1330 
1355 
1357 
1367  };
1368 
1369  static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
1386 
1393 
1396 
1398  };
1399 
1400  static const TypeConversionCostTblEntry AVXConversionTbl[] = {
1417 
1425 
1438 
1454  // The generic code to compute the scalar overhead is currently broken.
1455  // Workaround this limitation by estimating the scalarization overhead
1456  // here. We have roughly 10 instructions per scalar element.
1457  // Multiply that by the vector width.
1458  // FIXME: remove that when PR19268 is fixed.
1461 
1464  // This node is expanded into scalarized operations but BasicTTI is overly
1465  // optimistic estimating its cost. It computes 3 per element (one
1466  // vector-extract, one scalar conversion and one vector-insert). The
1467  // problem is that the inserts form a read-modify-write chain so latency
1468  // should be factored in too. Inflating the cost per element by 1.
1471 
1474  };
1475 
1476  static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
1483 
1502 
1510 
1512  };
1513 
1514  static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
1515  // These are somewhat magic numbers justified by looking at the output of
1516  // Intel's IACA, running some kernels and making sure when we take
1517  // legalization into account the throughput will be overestimated.
1519  { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
1526 
1527  { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
1535 
1537 
1539 
1564 
1574  };
1575 
1576  std::pair<int, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
1577  std::pair<int, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst);
1578 
1579  if (ST->hasSSE2() && !ST->hasAVX()) {
1580  if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
1581  LTDest.second, LTSrc.second))
1582  return LTSrc.first * Entry->Cost;
1583  }
1584 
1585  EVT SrcTy = TLI->getValueType(DL, Src);
1586  EVT DstTy = TLI->getValueType(DL, Dst);
1587 
1588  // The function getSimpleVT only handles simple value types.
1589  if (!SrcTy.isSimple() || !DstTy.isSimple())
1590  return BaseT::getCastInstrCost(Opcode, Dst, Src);
1591 
1592  MVT SimpleSrcTy = SrcTy.getSimpleVT();
1593  MVT SimpleDstTy = DstTy.getSimpleVT();
1594 
1595  // Make sure that neither type is going to be split before using the
1596  // AVX512 tables. This handles -mprefer-vector-width=256
1597  // with -min-legal-vector-width<=256
1598  if (TLI->getTypeAction(SimpleSrcTy) != TargetLowering::TypeSplitVector &&
1599  TLI->getTypeAction(SimpleDstTy) != TargetLowering::TypeSplitVector) {
1600  if (ST->hasBWI())
1601  if (const auto *Entry = ConvertCostTableLookup(AVX512BWConversionTbl, ISD,
1602  SimpleDstTy, SimpleSrcTy))
1603  return Entry->Cost;
1604 
1605  if (ST->hasDQI())
1606  if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD,
1607  SimpleDstTy, SimpleSrcTy))
1608  return Entry->Cost;
1609 
1610  if (ST->hasAVX512())
1611  if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD,
1612  SimpleDstTy, SimpleSrcTy))
1613  return Entry->Cost;
1614  }
1615 
1616  if (ST->hasAVX2()) {
1617  if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
1618  SimpleDstTy, SimpleSrcTy))
1619  return Entry->Cost;
1620  }
1621 
1622  if (ST->hasAVX()) {
1623  if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
1624  SimpleDstTy, SimpleSrcTy))
1625  return Entry->Cost;
1626  }
1627 
1628  if (ST->hasSSE41()) {
1629  if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
1630  SimpleDstTy, SimpleSrcTy))
1631  return Entry->Cost;
1632  }
1633 
1634  if (ST->hasSSE2()) {
1635  if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
1636  SimpleDstTy, SimpleSrcTy))
1637  return Entry->Cost;
1638  }
1639 
1640  return BaseT::getCastInstrCost(Opcode, Dst, Src, I);
1641 }
1642 
1643 int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
1644  const Instruction *I) {
1645  // Legalize the type.
1646  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
1647 
1648  MVT MTy = LT.second;
1649 
1650  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1651  assert(ISD && "Invalid opcode");
1652 
1653  unsigned ExtraCost = 0;
1654  if (I && (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp)) {
1655  // Some vector comparison predicates cost extra instructions.
1656  if (MTy.isVector() &&
1657  !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
1658  (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
1659  ST->hasBWI())) {
1660  switch (cast<CmpInst>(I)->getPredicate()) {
1661  case CmpInst::Predicate::ICMP_NE:
1662  // xor(cmpeq(x,y),-1)
1663  ExtraCost = 1;
1664  break;
1665  case CmpInst::Predicate::ICMP_SGE:
1666  case CmpInst::Predicate::ICMP_SLE:
1667  // xor(cmpgt(x,y),-1)
1668  ExtraCost = 1;
1669  break;
1670  case CmpInst::Predicate::ICMP_ULT:
1671  case CmpInst::Predicate::ICMP_UGT:
1672  // cmpgt(xor(x,signbit),xor(y,signbit))
1673  // xor(cmpeq(pmaxu(x,y),x),-1)
1674  ExtraCost = 2;
1675  break;
1676  case CmpInst::Predicate::ICMP_ULE:
1677  case CmpInst::Predicate::ICMP_UGE:
1678  if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
1679  (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
1680  // cmpeq(psubus(x,y),0)
1681  // cmpeq(pminu(x,y),x)
1682  ExtraCost = 1;
1683  } else {
1684  // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
1685  ExtraCost = 3;
1686  }
1687  break;
1688  default:
1689  break;
1690  }
1691  }
1692  }
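// Worked example (illustrative): an 'icmp ult <4 x i32>' on a plain SSE2
// subtarget has no native unsigned compare, so the switch above adds
// ExtraCost == 2 for the cmpgt(xor(x,signbit),xor(y,signbit)) lowering; with
// the SSE2 SETCC v4i32 entry of 1 below, the result is
// LT.first * (2 + 1) = 3.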
1693 
1694  static const CostTblEntry AVX512BWCostTbl[] = {
1695  { ISD::SETCC, MVT::v32i16, 1 },
1696  { ISD::SETCC, MVT::v64i8, 1 },
1697 
1698  { ISD::SELECT, MVT::v32i16, 1 },
1699  { ISD::SELECT, MVT::v64i8, 1 },
1700  };
1701 
1702  static const CostTblEntry AVX512CostTbl[] = {
1703  { ISD::SETCC, MVT::v8i64, 1 },
1704  { ISD::SETCC, MVT::v16i32, 1 },
1705  { ISD::SETCC, MVT::v8f64, 1 },
1706  { ISD::SETCC, MVT::v16f32, 1 },
1707 
1708  { ISD::SELECT, MVT::v8i64, 1 },
1709  { ISD::SELECT, MVT::v16i32, 1 },
1710  { ISD::SELECT, MVT::v8f64, 1 },
1711  { ISD::SELECT, MVT::v16f32, 1 },
1712  };
1713 
1714  static const CostTblEntry AVX2CostTbl[] = {
1715  { ISD::SETCC, MVT::v4i64, 1 },
1716  { ISD::SETCC, MVT::v8i32, 1 },
1717  { ISD::SETCC, MVT::v16i16, 1 },
1718  { ISD::SETCC, MVT::v32i8, 1 },
1719 
1720  { ISD::SELECT, MVT::v4i64, 1 }, // pblendvb
1721  { ISD::SELECT, MVT::v8i32, 1 }, // pblendvb
1722  { ISD::SELECT, MVT::v16i16, 1 }, // pblendvb
1723  { ISD::SELECT, MVT::v32i8, 1 }, // pblendvb
1724  };
1725 
1726  static const CostTblEntry AVX1CostTbl[] = {
1727  { ISD::SETCC, MVT::v4f64, 1 },
1728  { ISD::SETCC, MVT::v8f32, 1 },
1729  // AVX1 does not support 8-wide integer compare.
1730  { ISD::SETCC, MVT::v4i64, 4 },
1731  { ISD::SETCC, MVT::v8i32, 4 },
1732  { ISD::SETCC, MVT::v16i16, 4 },
1733  { ISD::SETCC, MVT::v32i8, 4 },
1734 
1735  { ISD::SELECT, MVT::v4f64, 1 }, // vblendvpd
1736  { ISD::SELECT, MVT::v8f32, 1 }, // vblendvps
1737  { ISD::SELECT, MVT::v4i64, 1 }, // vblendvpd
1738  { ISD::SELECT, MVT::v8i32, 1 }, // vblendvps
1739  { ISD::SELECT, MVT::v16i16, 3 }, // vandps + vandnps + vorps
1740  { ISD::SELECT, MVT::v32i8, 3 }, // vandps + vandnps + vorps
1741  };
1742 
1743  static const CostTblEntry SSE42CostTbl[] = {
1744  { ISD::SETCC, MVT::v2f64, 1 },
1745  { ISD::SETCC, MVT::v4f32, 1 },
1746  { ISD::SETCC, MVT::v2i64, 1 },
1747  };
1748 
1749  static const CostTblEntry SSE41CostTbl[] = {
1750  { ISD::SELECT, MVT::v2f64, 1 }, // blendvpd
1751  { ISD::SELECT, MVT::v4f32, 1 }, // blendvps
1752  { ISD::SELECT, MVT::v2i64, 1 }, // pblendvb
1753  { ISD::SELECT, MVT::v4i32, 1 }, // pblendvb
1754  { ISD::SELECT, MVT::v8i16, 1 }, // pblendvb
1755  { ISD::SELECT, MVT::v16i8, 1 }, // pblendvb
1756  };
1757 
1758  static const CostTblEntry SSE2CostTbl[] = {
1759  { ISD::SETCC, MVT::v2f64, 2 },
1760  { ISD::SETCC, MVT::f64, 1 },
1761  { ISD::SETCC, MVT::v2i64, 8 },
1762  { ISD::SETCC, MVT::v4i32, 1 },
1763  { ISD::SETCC, MVT::v8i16, 1 },
1764  { ISD::SETCC, MVT::v16i8, 1 },
1765 
1766  { ISD::SELECT, MVT::v2f64, 3 }, // andpd + andnpd + orpd
1767  { ISD::SELECT, MVT::v2i64, 3 }, // pand + pandn + por
1768  { ISD::SELECT, MVT::v4i32, 3 }, // pand + pandn + por
1769  { ISD::SELECT, MVT::v8i16, 3 }, // pand + pandn + por
1770  { ISD::SELECT, MVT::v16i8, 3 }, // pand + pandn + por
1771  };
1772 
1773  static const CostTblEntry SSE1CostTbl[] = {
1774  { ISD::SETCC, MVT::v4f32, 2 },
1775  { ISD::SETCC, MVT::f32, 1 },
1776 
1777  { ISD::SELECT, MVT::v4f32, 3 }, // andps + andnps + orps
1778  };
1779 
1780  if (ST->hasBWI())
1781  if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
1782  return LT.first * (ExtraCost + Entry->Cost);
1783 
1784  if (ST->hasAVX512())
1785  if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
1786  return LT.first * (ExtraCost + Entry->Cost);
1787 
1788  if (ST->hasAVX2())
1789  if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
1790  return LT.first * (ExtraCost + Entry->Cost);
1791 
1792  if (ST->hasAVX())
1793  if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
1794  return LT.first * (ExtraCost + Entry->Cost);
1795 
1796  if (ST->hasSSE42())
1797  if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
1798  return LT.first * (ExtraCost + Entry->Cost);
1799 
1800  if (ST->hasSSE41())
1801  if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
1802  return LT.first * (ExtraCost + Entry->Cost);
1803 
1804  if (ST->hasSSE2())
1805  if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
1806  return LT.first * (ExtraCost + Entry->Cost);
1807 
1808  if (ST->hasSSE1())
1809  if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
1810  return LT.first * (ExtraCost + Entry->Cost);
1811 
1812  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
1813 }
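// Illustrative sketch (not part of the upstream file): a minimal, hypothetical
// helper showing how the cmp/select hook above could be queried for a fixed
// vector type. The helper name and context parameter are assumptions; it
// simply forwards to the member function defined above.
#if 0
static int getExampleV4I32CmpCost(X86TTIImpl &TTI, LLVMContext &Ctx) {
  // Build <4 x i32> and ask for the cost of a vector integer compare.
  Type *V4I32 = VectorType::get(Type::getInt32Ty(Ctx), 4);
  return TTI.getCmpSelInstrCost(Instruction::ICmp, V4I32, /*CondTy=*/nullptr,
                                /*I=*/nullptr);
}
#endif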
1814 
1816 
1817 int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
1818                                       ArrayRef<Type *> Tys, FastMathFlags FMF,
1819                                       unsigned ScalarizationCostPassed) {
1820  // Costs should match the codegen from:
1821  // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
1822  // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
1823  // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
1824  // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
1825  // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
1826  static const CostTblEntry AVX512CDCostTbl[] = {
1827  { ISD::CTLZ, MVT::v8i64, 1 },
1828  { ISD::CTLZ, MVT::v16i32, 1 },
1829  { ISD::CTLZ, MVT::v32i16, 8 },
1830  { ISD::CTLZ, MVT::v64i8, 20 },
1831  { ISD::CTLZ, MVT::v4i64, 1 },
1832  { ISD::CTLZ, MVT::v8i32, 1 },
1833  { ISD::CTLZ, MVT::v16i16, 4 },
1834  { ISD::CTLZ, MVT::v32i8, 10 },
1835  { ISD::CTLZ, MVT::v2i64, 1 },
1836  { ISD::CTLZ, MVT::v4i32, 1 },
1837  { ISD::CTLZ, MVT::v8i16, 4 },
1838  { ISD::CTLZ, MVT::v16i8, 4 },
1839  };
1840  static const CostTblEntry AVX512BWCostTbl[] = {
1841  { ISD::BITREVERSE, MVT::v8i64, 5 },
1842  { ISD::BITREVERSE, MVT::v16i32, 5 },
1843  { ISD::BITREVERSE, MVT::v32i16, 5 },
1844  { ISD::BITREVERSE, MVT::v64i8, 5 },
1845  { ISD::CTLZ, MVT::v8i64, 23 },
1846  { ISD::CTLZ, MVT::v16i32, 22 },
1847  { ISD::CTLZ, MVT::v32i16, 18 },
1848  { ISD::CTLZ, MVT::v64i8, 17 },
1849  { ISD::CTPOP, MVT::v8i64, 7 },
1850  { ISD::CTPOP, MVT::v16i32, 11 },
1851  { ISD::CTPOP, MVT::v32i16, 9 },
1852  { ISD::CTPOP, MVT::v64i8, 6 },
1853  { ISD::CTTZ, MVT::v8i64, 10 },
1854  { ISD::CTTZ, MVT::v16i32, 14 },
1855  { ISD::CTTZ, MVT::v32i16, 12 },
1856  { ISD::CTTZ, MVT::v64i8, 9 },
1857  { ISD::SADDSAT, MVT::v32i16, 1 },
1858  { ISD::SADDSAT, MVT::v64i8, 1 },
1859  { ISD::SSUBSAT, MVT::v32i16, 1 },
1860  { ISD::SSUBSAT, MVT::v64i8, 1 },
1861  { ISD::UADDSAT, MVT::v32i16, 1 },
1862  { ISD::UADDSAT, MVT::v64i8, 1 },
1863  { ISD::USUBSAT, MVT::v32i16, 1 },
1864  { ISD::USUBSAT, MVT::v64i8, 1 },
1865  };
1866  static const CostTblEntry AVX512CostTbl[] = {
1867  { ISD::BITREVERSE, MVT::v8i64, 36 },
1868  { ISD::BITREVERSE, MVT::v16i32, 24 },
1869  { ISD::CTLZ, MVT::v8i64, 29 },
1870  { ISD::CTLZ, MVT::v16i32, 35 },
1871  { ISD::CTPOP, MVT::v8i64, 16 },
1872  { ISD::CTPOP, MVT::v16i32, 24 },
1873  { ISD::CTTZ, MVT::v8i64, 20 },
1874  { ISD::CTTZ, MVT::v16i32, 28 },
1875  { ISD::USUBSAT, MVT::v16i32, 2 }, // pmaxud + psubd
1876  { ISD::USUBSAT, MVT::v2i64, 2 }, // pmaxuq + psubq
1877  { ISD::USUBSAT, MVT::v4i64, 2 }, // pmaxuq + psubq
1878  { ISD::USUBSAT, MVT::v8i64, 2 }, // pmaxuq + psubq
1879  { ISD::UADDSAT, MVT::v16i32, 3 }, // not + pminud + paddd
1880  { ISD::UADDSAT, MVT::v2i64, 3 }, // not + pminuq + paddq
1881  { ISD::UADDSAT, MVT::v4i64, 3 }, // not + pminuq + paddq
1882  { ISD::UADDSAT, MVT::v8i64, 3 }, // not + pminuq + paddq
1883  };
1884  static const CostTblEntry XOPCostTbl[] = {
1885  { ISD::BITREVERSE, MVT::v4i64, 4 },
1886  { ISD::BITREVERSE, MVT::v8i32, 4 },
1887  { ISD::BITREVERSE, MVT::v16i16, 4 },
1888  { ISD::BITREVERSE, MVT::v32i8, 4 },
1889  { ISD::BITREVERSE, MVT::v2i64, 1 },
1890  { ISD::BITREVERSE, MVT::v4i32, 1 },
1891  { ISD::BITREVERSE, MVT::v8i16, 1 },
1892  { ISD::BITREVERSE, MVT::v16i8, 1 },
1893  { ISD::BITREVERSE, MVT::i64, 3 },
1894  { ISD::BITREVERSE, MVT::i32, 3 },
1895  { ISD::BITREVERSE, MVT::i16, 3 },
1896  { ISD::BITREVERSE, MVT::i8, 3 }
1897  };
1898  static const CostTblEntry AVX2CostTbl[] = {
1899  { ISD::BITREVERSE, MVT::v4i64, 5 },
1900  { ISD::BITREVERSE, MVT::v8i32, 5 },
1901  { ISD::BITREVERSE, MVT::v16i16, 5 },
1902  { ISD::BITREVERSE, MVT::v32i8, 5 },
1903  { ISD::BSWAP, MVT::v4i64, 1 },
1904  { ISD::BSWAP, MVT::v8i32, 1 },
1905  { ISD::BSWAP, MVT::v16i16, 1 },
1906  { ISD::CTLZ, MVT::v4i64, 23 },
1907  { ISD::CTLZ, MVT::v8i32, 18 },
1908  { ISD::CTLZ, MVT::v16i16, 14 },
1909  { ISD::CTLZ, MVT::v32i8, 9 },
1910  { ISD::CTPOP, MVT::v4i64, 7 },
1911  { ISD::CTPOP, MVT::v8i32, 11 },
1912  { ISD::CTPOP, MVT::v16i16, 9 },
1913  { ISD::CTPOP, MVT::v32i8, 6 },
1914  { ISD::CTTZ, MVT::v4i64, 10 },
1915  { ISD::CTTZ, MVT::v8i32, 14 },
1916  { ISD::CTTZ, MVT::v16i16, 12 },
1917  { ISD::CTTZ, MVT::v32i8, 9 },
1918  { ISD::SADDSAT, MVT::v16i16, 1 },
1919  { ISD::SADDSAT, MVT::v32i8, 1 },
1920  { ISD::SSUBSAT, MVT::v16i16, 1 },
1921  { ISD::SSUBSAT, MVT::v32i8, 1 },
1922  { ISD::UADDSAT, MVT::v16i16, 1 },
1923  { ISD::UADDSAT, MVT::v32i8, 1 },
1924  { ISD::UADDSAT, MVT::v8i32, 3 }, // not + pminud + paddd
1925  { ISD::USUBSAT, MVT::v16i16, 1 },
1926  { ISD::USUBSAT, MVT::v32i8, 1 },
1927  { ISD::USUBSAT, MVT::v8i32, 2 }, // pmaxud + psubd
1928  { ISD::FSQRT, MVT::f32, 7 }, // Haswell from http://www.agner.org/
1929  { ISD::FSQRT, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
1930  { ISD::FSQRT, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
1931  { ISD::FSQRT, MVT::f64, 14 }, // Haswell from http://www.agner.org/
1932  { ISD::FSQRT, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
1933  { ISD::FSQRT, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
1934  };
1935  static const CostTblEntry AVX1CostTbl[] = {
1936  { ISD::BITREVERSE, MVT::v4i64, 12 }, // 2 x 128-bit Op + extract/insert
1937  { ISD::BITREVERSE, MVT::v8i32, 12 }, // 2 x 128-bit Op + extract/insert
1938  { ISD::BITREVERSE, MVT::v16i16, 12 }, // 2 x 128-bit Op + extract/insert
1939  { ISD::BITREVERSE, MVT::v32i8, 12 }, // 2 x 128-bit Op + extract/insert
1940  { ISD::BSWAP, MVT::v4i64, 4 },
1941  { ISD::BSWAP, MVT::v8i32, 4 },
1942  { ISD::BSWAP, MVT::v16i16, 4 },
1943  { ISD::CTLZ, MVT::v4i64, 48 }, // 2 x 128-bit Op + extract/insert
1944  { ISD::CTLZ, MVT::v8i32, 38 }, // 2 x 128-bit Op + extract/insert
1945  { ISD::CTLZ, MVT::v16i16, 30 }, // 2 x 128-bit Op + extract/insert
1946  { ISD::CTLZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
1947  { ISD::CTPOP, MVT::v4i64, 16 }, // 2 x 128-bit Op + extract/insert
1948  { ISD::CTPOP, MVT::v8i32, 24 }, // 2 x 128-bit Op + extract/insert
1949  { ISD::CTPOP, MVT::v16i16, 20 }, // 2 x 128-bit Op + extract/insert
1950  { ISD::CTPOP, MVT::v32i8, 14 }, // 2 x 128-bit Op + extract/insert
1951  { ISD::CTTZ, MVT::v4i64, 22 }, // 2 x 128-bit Op + extract/insert
1952  { ISD::CTTZ, MVT::v8i32, 30 }, // 2 x 128-bit Op + extract/insert
1953  { ISD::CTTZ, MVT::v16i16, 26 }, // 2 x 128-bit Op + extract/insert
1954  { ISD::CTTZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
1955  { ISD::SADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
1956  { ISD::SADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
1957  { ISD::SSUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
1958  { ISD::SSUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
1959  { ISD::UADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
1960  { ISD::UADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
1961  { ISD::UADDSAT, MVT::v8i32, 8 }, // 2 x 128-bit Op + extract/insert
1962  { ISD::USUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
1963  { ISD::USUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
1964  { ISD::USUBSAT, MVT::v8i32, 6 }, // 2 x 128-bit Op + extract/insert
1965  { ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/
1966  { ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
1967  { ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
1968  { ISD::FSQRT, MVT::f64, 21 }, // SNB from http://www.agner.org/
1969  { ISD::FSQRT, MVT::v2f64, 21 }, // SNB from http://www.agner.org/
1970  { ISD::FSQRT, MVT::v4f64, 43 }, // SNB from http://www.agner.org/
1971  };
1972  static const CostTblEntry GLMCostTbl[] = {
1973  { ISD::FSQRT, MVT::f32, 19 }, // sqrtss
1974  { ISD::FSQRT, MVT::v4f32, 37 }, // sqrtps
1975  { ISD::FSQRT, MVT::f64, 34 }, // sqrtsd
1976  { ISD::FSQRT, MVT::v2f64, 67 }, // sqrtpd
1977  };
1978  static const CostTblEntry SLMCostTbl[] = {
1979  { ISD::FSQRT, MVT::f32, 20 }, // sqrtss
1980  { ISD::FSQRT, MVT::v4f32, 40 }, // sqrtps
1981  { ISD::FSQRT, MVT::f64, 35 }, // sqrtsd
1982  { ISD::FSQRT, MVT::v2f64, 70 }, // sqrtpd
1983  };
1984  static const CostTblEntry SSE42CostTbl[] = {
1985  { ISD::USUBSAT, MVT::v4i32, 2 }, // pmaxud + psubd
1986  { ISD::UADDSAT, MVT::v4i32, 3 }, // not + pminud + paddd
1987  { ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/
1988  { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
1989  };
1990  static const CostTblEntry SSSE3CostTbl[] = {
1991  { ISD::BITREVERSE, MVT::v2i64, 5 },
1992  { ISD::BITREVERSE, MVT::v4i32, 5 },
1993  { ISD::BITREVERSE, MVT::v8i16, 5 },
1994  { ISD::BITREVERSE, MVT::v16i8, 5 },
1995  { ISD::BSWAP, MVT::v2i64, 1 },
1996  { ISD::BSWAP, MVT::v4i32, 1 },
1997  { ISD::BSWAP, MVT::v8i16, 1 },
1998  { ISD::CTLZ, MVT::v2i64, 23 },
1999  { ISD::CTLZ, MVT::v4i32, 18 },
2000  { ISD::CTLZ, MVT::v8i16, 14 },
2001  { ISD::CTLZ, MVT::v16i8, 9 },
2002  { ISD::CTPOP, MVT::v2i64, 7 },
2003  { ISD::CTPOP, MVT::v4i32, 11 },
2004  { ISD::CTPOP, MVT::v8i16, 9 },
2005  { ISD::CTPOP, MVT::v16i8, 6 },
2006  { ISD::CTTZ, MVT::v2i64, 10 },
2007  { ISD::CTTZ, MVT::v4i32, 14 },
2008  { ISD::CTTZ, MVT::v8i16, 12 },
2009  { ISD::CTTZ, MVT::v16i8, 9 }
2010  };
2011  static const CostTblEntry SSE2CostTbl[] = {
2012  { ISD::BITREVERSE, MVT::v2i64, 29 },
2013  { ISD::BITREVERSE, MVT::v4i32, 27 },
2014  { ISD::BITREVERSE, MVT::v8i16, 27 },
2015  { ISD::BITREVERSE, MVT::v16i8, 20 },
2016  { ISD::BSWAP, MVT::v2i64, 7 },
2017  { ISD::BSWAP, MVT::v4i32, 7 },
2018  { ISD::BSWAP, MVT::v8i16, 7 },
2019  { ISD::CTLZ, MVT::v2i64, 25 },
2020  { ISD::CTLZ, MVT::v4i32, 26 },
2021  { ISD::CTLZ, MVT::v8i16, 20 },
2022  { ISD::CTLZ, MVT::v16i8, 17 },
2023  { ISD::CTPOP, MVT::v2i64, 12 },
2024  { ISD::CTPOP, MVT::v4i32, 15 },
2025  { ISD::CTPOP, MVT::v8i16, 13 },
2026  { ISD::CTPOP, MVT::v16i8, 10 },
2027  { ISD::CTTZ, MVT::v2i64, 14 },
2028  { ISD::CTTZ, MVT::v4i32, 18 },
2029  { ISD::CTTZ, MVT::v8i16, 16 },
2030  { ISD::CTTZ, MVT::v16i8, 13 },
2031  { ISD::SADDSAT, MVT::v8i16, 1 },
2032  { ISD::SADDSAT, MVT::v16i8, 1 },
2033  { ISD::SSUBSAT, MVT::v8i16, 1 },
2034  { ISD::SSUBSAT, MVT::v16i8, 1 },
2035  { ISD::UADDSAT, MVT::v8i16, 1 },
2036  { ISD::UADDSAT, MVT::v16i8, 1 },
2037  { ISD::USUBSAT, MVT::v8i16, 1 },
2038  { ISD::USUBSAT, MVT::v16i8, 1 },
2039  { ISD::FSQRT, MVT::f64, 32 }, // Nehalem from http://www.agner.org/
2040  { ISD::FSQRT, MVT::v2f64, 32 }, // Nehalem from http://www.agner.org/
2041  };
2042  static const CostTblEntry SSE1CostTbl[] = {
2043  { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/
2044  { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
2045  };
2046  static const CostTblEntry X64CostTbl[] = { // 64-bit targets
2047  { ISD::BITREVERSE, MVT::i64, 14 },
2048  { ISD::SADDO, MVT::i64, 1 },
2049  { ISD::UADDO, MVT::i64, 1 },
2050  };
2051  static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
2052  { ISD::BITREVERSE, MVT::i32, 14 },
2053  { ISD::BITREVERSE, MVT::i16, 14 },
2054  { ISD::BITREVERSE, MVT::i8, 11 },
2055  { ISD::SADDO, MVT::i32, 1 },
2056  { ISD::SADDO, MVT::i16, 1 },
2057  { ISD::SADDO, MVT::i8, 1 },
2058  { ISD::UADDO, MVT::i32, 1 },
2059  { ISD::UADDO, MVT::i16, 1 },
2060  { ISD::UADDO, MVT::i8, 1 },
2061  };
2062 
2063  Type *OpTy = RetTy;
2064  unsigned ISD = ISD::DELETED_NODE;
2065  switch (IID) {
2066  default:
2067  break;
2068  case Intrinsic::bitreverse:
2069  ISD = ISD::BITREVERSE;
2070  break;
2071  case Intrinsic::bswap:
2072  ISD = ISD::BSWAP;
2073  break;
2074  case Intrinsic::ctlz:
2075  ISD = ISD::CTLZ;
2076  break;
2077  case Intrinsic::ctpop:
2078  ISD = ISD::CTPOP;
2079  break;
2080  case Intrinsic::cttz:
2081  ISD = ISD::CTTZ;
2082  break;
2083  case Intrinsic::sadd_sat:
2084  ISD = ISD::SADDSAT;
2085  break;
2086  case Intrinsic::ssub_sat:
2087  ISD = ISD::SSUBSAT;
2088  break;
2089  case Intrinsic::uadd_sat:
2090  ISD = ISD::UADDSAT;
2091  break;
2092  case Intrinsic::usub_sat:
2093  ISD = ISD::USUBSAT;
2094  break;
2095  case Intrinsic::sqrt:
2096  ISD = ISD::FSQRT;
2097  break;
2098  case Intrinsic::sadd_with_overflow:
2099  case Intrinsic::ssub_with_overflow:
2100  // SSUBO has same costs so don't duplicate.
2101  ISD = ISD::SADDO;
2102  OpTy = RetTy->getContainedType(0);
2103  break;
2104  case Intrinsic::uadd_with_overflow:
2105  case Intrinsic::usub_with_overflow:
2106  // USUBO has same costs so don't duplicate.
2107  ISD = ISD::UADDO;
2108  OpTy = RetTy->getContainedType(0);
2109  break;
2110  }
2111 
2112  if (ISD != ISD::DELETED_NODE) {
2113  // Legalize the type.
2114  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, OpTy);
2115  MVT MTy = LT.second;
2116 
2117  // Attempt to lookup cost.
2118  if (ST->isGLM())
2119  if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
2120  return LT.first * Entry->Cost;
2121 
2122  if (ST->isSLM())
2123  if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
2124  return LT.first * Entry->Cost;
2125 
2126  if (ST->hasCDI())
2127  if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
2128  return LT.first * Entry->Cost;
2129 
2130  if (ST->hasBWI())
2131  if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
2132  return LT.first * Entry->Cost;
2133 
2134  if (ST->hasAVX512())
2135  if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
2136  return LT.first * Entry->Cost;
2137 
2138  if (ST->hasXOP())
2139  if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
2140  return LT.first * Entry->Cost;
2141 
2142  if (ST->hasAVX2())
2143  if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
2144  return LT.first * Entry->Cost;
2145 
2146  if (ST->hasAVX())
2147  if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
2148  return LT.first * Entry->Cost;
2149 
2150  if (ST->hasSSE42())
2151  if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
2152  return LT.first * Entry->Cost;
2153 
2154  if (ST->hasSSSE3())
2155  if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
2156  return LT.first * Entry->Cost;
2157 
2158  if (ST->hasSSE2())
2159  if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
2160  return LT.first * Entry->Cost;
2161 
2162  if (ST->hasSSE1())
2163  if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
2164  return LT.first * Entry->Cost;
2165 
2166  if (ST->is64Bit())
2167  if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
2168  return LT.first * Entry->Cost;
2169 
2170  if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
2171  return LT.first * Entry->Cost;
2172  }
2173 
2174  return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF, ScalarizationCostPassed);
2175 }
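// Worked example for getIntrinsicInstrCost above (illustrative): llvm.ctpop on
// <4 x i64> with AVX2 hits the table entry { ISD::CTPOP, MVT::v4i64, 7 } with
// LT.first == 1, so the reported cost is 7. With only SSSE3 the type splits
// into two v2i64 halves (LT.first == 2) and the SSSE3 entry of 7 yields 14.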
2176 
2177 int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
2178                                       ArrayRef<Value *> Args, FastMathFlags FMF,
2179                                       unsigned VF) {
2180  static const CostTblEntry AVX512CostTbl[] = {
2181  { ISD::ROTL, MVT::v8i64, 1 },
2182  { ISD::ROTL, MVT::v4i64, 1 },
2183  { ISD::ROTL, MVT::v2i64, 1 },
2184  { ISD::ROTL, MVT::v16i32, 1 },
2185  { ISD::ROTL, MVT::v8i32, 1 },
2186  { ISD::ROTL, MVT::v4i32, 1 },
2187  { ISD::ROTR, MVT::v8i64, 1 },
2188  { ISD::ROTR, MVT::v4i64, 1 },
2189  { ISD::ROTR, MVT::v2i64, 1 },
2190  { ISD::ROTR, MVT::v16i32, 1 },
2191  { ISD::ROTR, MVT::v8i32, 1 },
2192  { ISD::ROTR, MVT::v4i32, 1 }
2193  };
2194  // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
2195  static const CostTblEntry XOPCostTbl[] = {
2196  { ISD::ROTL, MVT::v4i64, 4 },
2197  { ISD::ROTL, MVT::v8i32, 4 },
2198  { ISD::ROTL, MVT::v16i16, 4 },
2199  { ISD::ROTL, MVT::v32i8, 4 },
2200  { ISD::ROTL, MVT::v2i64, 1 },
2201  { ISD::ROTL, MVT::v4i32, 1 },
2202  { ISD::ROTL, MVT::v8i16, 1 },
2203  { ISD::ROTL, MVT::v16i8, 1 },
2204  { ISD::ROTR, MVT::v4i64, 6 },
2205  { ISD::ROTR, MVT::v8i32, 6 },
2206  { ISD::ROTR, MVT::v16i16, 6 },
2207  { ISD::ROTR, MVT::v32i8, 6 },
2208  { ISD::ROTR, MVT::v2i64, 2 },
2209  { ISD::ROTR, MVT::v4i32, 2 },
2210  { ISD::ROTR, MVT::v8i16, 2 },
2211  { ISD::ROTR, MVT::v16i8, 2 }
2212  };
2213  static const CostTblEntry X64CostTbl[] = { // 64-bit targets
2214  { ISD::ROTL, MVT::i64, 1 },
2215  { ISD::ROTR, MVT::i64, 1 },
2216  { ISD::FSHL, MVT::i64, 4 }
2217  };
2218  static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
2219  { ISD::ROTL, MVT::i32, 1 },
2220  { ISD::ROTL, MVT::i16, 1 },
2221  { ISD::ROTL, MVT::i8, 1 },
2222  { ISD::ROTR, MVT::i32, 1 },
2223  { ISD::ROTR, MVT::i16, 1 },
2224  { ISD::ROTR, MVT::i8, 1 },
2225  { ISD::FSHL, MVT::i32, 4 },
2226  { ISD::FSHL, MVT::i16, 4 },
2227  { ISD::FSHL, MVT::i8, 4 }
2228  };
2229 
2230  unsigned ISD = ISD::DELETED_NODE;
2231  switch (IID) {
2232  default:
2233  break;
2234  case Intrinsic::fshl:
2235  ISD = ISD::FSHL;
2236  if (Args[0] == Args[1])
2237  ISD = ISD::ROTL;
2238  break;
2239  case Intrinsic::fshr:
2240  // FSHR has same costs so don't duplicate.
2241  ISD = ISD::FSHL;
2242  if (Args[0] == Args[1])
2243  ISD = ISD::ROTR;
2244  break;
2245  }
2246 
2247  if (ISD != ISD::DELETED_NODE) {
2248  // Legalize the type.
2249  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
2250  MVT MTy = LT.second;
2251 
2252  // Attempt to lookup cost.
2253  if (ST->hasAVX512())
2254  if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
2255  return LT.first * Entry->Cost;
2256 
2257  if (ST->hasXOP())
2258  if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
2259  return LT.first * Entry->Cost;
2260 
2261  if (ST->is64Bit())
2262  if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
2263  return LT.first * Entry->Cost;
2264 
2265  if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
2266  return LT.first * Entry->Cost;
2267  }
2268 
2269  return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF, VF);
2270 }
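// Worked example (illustrative): llvm.fshl(x, x, c) is treated as a rotate, so
// on a 64-bit target ROTL i64 costs 1, while a genuine funnel shift with
// distinct operands falls back to the FSHL entry and costs 4.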
2271 
2272 int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
2273  assert(Val->isVectorTy() && "This must be a vector type");
2274 
2275  Type *ScalarType = Val->getScalarType();
2276 
2277  if (Index != -1U) {
2278  // Legalize the type.
2279  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
2280 
2281  // This type is legalized to a scalar type.
2282  if (!LT.second.isVector())
2283  return 0;
2284 
2285  // The type may be split. Normalize the index to the new type.
2286  unsigned Width = LT.second.getVectorNumElements();
2287  Index = Index % Width;
2288 
2289  // Floating point scalars are already located in index #0.
2290  if (ScalarType->isFloatingPointTy() && Index == 0)
2291  return 0;
2292  }
2293 
2294  // Add to the base cost if we know that the extracted element of a vector is
2295  // destined to be moved to and used in the integer register file.
2296  int RegisterFileMoveCost = 0;
2297  if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy())
2298  RegisterFileMoveCost = 1;
2299 
2300  return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
2301 }
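// Worked example (illustrative): extracting lane 5 of an <8 x i32> that
// legalizes to two <4 x i32> halves is costed as extracting lane 1 of a half,
// and extracting a pointer element adds one register-file move on top of the
// base cost.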
2302 
2303 int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
2304  unsigned AddressSpace, const Instruction *I) {
2305  // Handle non-power-of-two vectors such as <3 x float>
2306  if (VectorType *VTy = dyn_cast<VectorType>(Src)) {
2307  unsigned NumElem = VTy->getVectorNumElements();
2308 
2309  // Handle a few common cases:
2310  // <3 x float>
2311  if (NumElem == 3 && VTy->getScalarSizeInBits() == 32)
2312  // Cost = 64 bit store + extract + 32 bit store.
2313  return 3;
2314 
2315  // <3 x double>
2316  if (NumElem == 3 && VTy->getScalarSizeInBits() == 64)
2317  // Cost = 128 bit store + unpack + 64 bit store.
2318  return 3;
2319 
2320  // Assume that all other non-power-of-two numbers are scalarized.
2321  if (!isPowerOf2_32(NumElem)) {
2322  int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment,
2323  AddressSpace);
2324  int SplitCost = getScalarizationOverhead(Src, Opcode == Instruction::Load,
2325  Opcode == Instruction::Store);
2326  return NumElem * Cost + SplitCost;
2327  }
2328  }
2329 
2330  // Legalize the type.
2331  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
2332  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
2333  "Invalid Opcode");
2334 
2335  // Each load/store unit costs 1.
2336  int Cost = LT.first * 1;
2337 
2338  // This isn't exactly right. We're using slow unaligned 32-byte accesses as a
2339  // proxy for a double-pumped AVX memory interface such as on Sandybridge.
2340  if (LT.second.getStoreSize() == 32 && ST->isUnalignedMem32Slow())
2341  Cost *= 2;
2342 
2343  return Cost;
2344 }
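// Worked example (illustrative): a <3 x float> store is costed at 3 per the
// special case above, and a 32-byte vector access on a CPU that reports slow
// unaligned 32-byte memory (the Sandy Bridge proxy) is charged 2 instead of 1.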
2345 
2346 int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
2347  unsigned Alignment,
2348  unsigned AddressSpace) {
2349  VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy);
2350  if (!SrcVTy)
2351  // For a scalar type, take the regular cost without a mask.
2352  return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace);
2353 
2354  unsigned NumElem = SrcVTy->getVectorNumElements();
2355  VectorType *MaskTy =
2356  VectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
2357  if ((Opcode == Instruction::Load && !isLegalMaskedLoad(SrcVTy)) ||
2358  (Opcode == Instruction::Store && !isLegalMaskedStore(SrcVTy)) ||
2359  !isPowerOf2_32(NumElem)) {
2360  // Scalarization
2361  int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true);
2362  int ScalarCompareCost = getCmpSelInstrCost(
2363  Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr);
2364  int BranchCost = getCFInstrCost(Instruction::Br);
2365  int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
2366 
2367  int ValueSplitCost = getScalarizationOverhead(
2368  SrcVTy, Opcode == Instruction::Load, Opcode == Instruction::Store);
2369  int MemopCost =
2370  NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
2371  Alignment, AddressSpace);
2372  return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
2373  }
2374 
2375  // Legalize the type.
2376  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy);
2377  auto VT = TLI->getValueType(DL, SrcVTy);
2378  int Cost = 0;
2379  if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
2380  LT.second.getVectorNumElements() == NumElem)
2381  // Promotion requires expand/truncate for data and a shuffle for mask.
2382  Cost += getShuffleCost(TTI::SK_Select, SrcVTy, 0, nullptr) +
2383  getShuffleCost(TTI::SK_Select, MaskTy, 0, nullptr);
2384 
2385  else if (LT.second.getVectorNumElements() > NumElem) {
2386  VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(),
2387  LT.second.getVectorNumElements());
2388  // Expanding requires filling the mask with zeroes.
2389  Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy);
2390  }
2391  if (!ST->hasAVX512())
2392  return Cost + LT.first*4; // Each maskmov costs 4
2393 
2394  // AVX-512 masked load/store is cheaper.
2395  return Cost+LT.first;
2396 }
2397 
2398 int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
2399                                           const SCEV *Ptr) {
2400  // Address computations in vectorized code with non-consecutive addresses will
2401  // likely result in more instructions compared to scalar code where the
2402  // computation can more often be merged into the index mode. The resulting
2403  // extra micro-ops can significantly decrease throughput.
2404  unsigned NumVectorInstToHideOverhead = 10;
2405 
2406  // The cost of computing a strided access is hidden by the indexing
2407  // modes of X86 regardless of the stride value. We don't believe that there
2408  // is a difference between a constant strided access in general and a
2409  // constant stride value which is less than or equal to 64.
2410  // Even in the case of (loop invariant) stride whose value is not known at
2411  // compile time, the address computation will not incur more than one extra
2412  // ADD instruction.
2413  if (Ty->isVectorTy() && SE) {
2414  if (!BaseT::isStridedAccess(Ptr))
2415  return NumVectorInstToHideOverhead;
2416  if (!BaseT::getConstantStrideStep(SE, Ptr))
2417  return 1;
2418  }
2419 
2420  return BaseT::getAddressComputationCost(Ty, SE, Ptr);
2421 }
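// Worked example (illustrative): a vectorized access through a pointer that is
// not strided at all is charged 10 for the extra address-generation micro-ops,
// a strided access with an unknown (loop-invariant) step is charged 1, and a
// constant-stride access falls through to the base cost.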
2422 
2423 int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
2424  bool IsPairwise) {
2425 
2426  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
2427 
2428  MVT MTy = LT.second;
2429 
2430  int ISD = TLI->InstructionOpcodeToISD(Opcode);
2431  assert(ISD && "Invalid opcode");
2432 
2433  // We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
2434  // and use that as the cost.
2435 
2436  static const CostTblEntry SSE42CostTblPairWise[] = {
2437  { ISD::FADD, MVT::v2f64, 2 },
2438  { ISD::FADD, MVT::v4f32, 4 },
2439  { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
2440  { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5".
2441  { ISD::ADD, MVT::v8i16, 5 },
2442  };
2443 
2444  static const CostTblEntry AVX1CostTblPairWise[] = {
2445  { ISD::FADD, MVT::v4f32, 4 },
2446  { ISD::FADD, MVT::v4f64, 5 },
2447  { ISD::FADD, MVT::v8f32, 7 },
2448  { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
2449  { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5".
2450  { ISD::ADD, MVT::v4i64, 5 }, // The data reported by the IACA tool is "4.8".
2451  { ISD::ADD, MVT::v8i16, 5 },
2452  { ISD::ADD, MVT::v8i32, 5 },
2453  };
2454 
2455  static const CostTblEntry SSE42CostTblNoPairWise[] = {
2456  { ISD::FADD, MVT::v2f64, 2 },
2457  { ISD::FADD, MVT::v4f32, 4 },
2458  { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
2459  { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
2460  { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
2461  };
2462 
2463  static const CostTblEntry AVX1CostTblNoPairWise[] = {
2464  { ISD::FADD, MVT::v4f32, 3 },
2465  { ISD::FADD, MVT::v4f64, 3 },
2466  { ISD::FADD, MVT::v8f32, 4 },
2467  { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
2468  { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "2.8".
2469  { ISD::ADD, MVT::v4i64, 3 },
2470  { ISD::ADD, MVT::v8i16, 4 },
2471  { ISD::ADD, MVT::v8i32, 5 },
2472  };
2473 
2474  if (IsPairwise) {
2475  if (ST->hasAVX())
2476  if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
2477  return LT.first * Entry->Cost;
2478 
2479  if (ST->hasSSE42())
2480  if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
2481  return LT.first * Entry->Cost;
2482  } else {
2483  if (ST->hasAVX())
2484  if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
2485  return LT.first * Entry->Cost;
2486 
2487  if (ST->hasSSE42())
2488  if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
2489  return LT.first * Entry->Cost;
2490  }
2491 
2492  return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise);
2493 }
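// Worked example (illustrative): a non-pairwise add reduction of <8 x i32>
// costs 5 on AVX1 per the IACA-derived table above; with only SSE4.2 the
// vector splits into two <4 x i32> halves (LT.first == 2) and the entry of 3
// gives 6.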
2494 
2495 int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy,
2496                                        bool IsPairwise, bool IsUnsigned) {
2497  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
2498 
2499  MVT MTy = LT.second;
2500 
2501  int ISD;
2502  if (ValTy->isIntOrIntVectorTy()) {
2503  ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN;
2504  } else {
2505  assert(ValTy->isFPOrFPVectorTy() &&
2506  "Expected floating point or integer vector type.");
2507  ISD = ISD::FMINNUM;
2508  }
2509 
2510  // We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
2511  // and use that as the cost.
2512 
2513  static const CostTblEntry SSE42CostTblPairWise[] = {
2514  {ISD::FMINNUM, MVT::v2f64, 3},
2515  {ISD::FMINNUM, MVT::v4f32, 2},
2516  {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8"
2517  {ISD::UMIN, MVT::v2i64, 8}, // The data reported by the IACA is "8.6"
2518  {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5"
2519  {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8"
2520  {ISD::SMIN, MVT::v8i16, 2},
2521  {ISD::UMIN, MVT::v8i16, 2},
2522  };
2523 
2524  static const CostTblEntry AVX1CostTblPairWise[] = {
2525  {ISD::FMINNUM, MVT::v4f32, 1},
2526  {ISD::FMINNUM, MVT::v4f64, 1},
2527  {ISD::FMINNUM, MVT::v8f32, 2},
2528  {ISD::SMIN, MVT::v2i64, 3},
2529  {ISD::UMIN, MVT::v2i64, 3},
2530  {ISD::SMIN, MVT::v4i32, 1},
2531  {ISD::UMIN, MVT::v4i32, 1},
2532  {ISD::SMIN, MVT::v8i16, 1},
2533  {ISD::UMIN, MVT::v8i16, 1},
2534  {ISD::SMIN, MVT::v8i32, 3},
2535  {ISD::UMIN, MVT::v8i32, 3},
2536  };
2537 
2538  static const CostTblEntry AVX2CostTblPairWise[] = {
2539  {ISD::SMIN, MVT::v4i64, 2},
2540  {ISD::UMIN, MVT::v4i64, 2},
2541  {ISD::SMIN, MVT::v8i32, 1},
2542  {ISD::UMIN, MVT::v8i32, 1},
2543  {ISD::SMIN, MVT::v16i16, 1},
2544  {ISD::UMIN, MVT::v16i16, 1},
2545  {ISD::SMIN, MVT::v32i8, 2},
2546  {ISD::UMIN, MVT::v32i8, 2},
2547  };
2548 
2549  static const CostTblEntry AVX512CostTblPairWise[] = {
2550  {ISD::FMINNUM, MVT::v8f64, 1},
2551  {ISD::FMINNUM, MVT::v16f32, 2},
2552  {ISD::SMIN, MVT::v8i64, 2},
2553  {ISD::UMIN, MVT::v8i64, 2},
2554  {ISD::SMIN, MVT::v16i32, 1},
2555  {ISD::UMIN, MVT::v16i32, 1},
2556  };
2557 
2558  static const CostTblEntry SSE42CostTblNoPairWise[] = {
2559  {ISD::FMINNUM, MVT::v2f64, 3},
2560  {ISD::FMINNUM, MVT::v4f32, 3},
2561  {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8"
2562  {ISD::UMIN, MVT::v2i64, 9}, // The data reported by the IACA is "8.6"
2563  {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5"
2564  {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8"
2565  {ISD::SMIN, MVT::v8i16, 1}, // The data reported by the IACA is "1.5"
2566  {ISD::UMIN, MVT::v8i16, 2}, // The data reported by the IACA is "1.8"
2567  };
2568 
2569  static const CostTblEntry AVX1CostTblNoPairWise[] = {
2570  {ISD::FMINNUM, MVT::v4f32, 1},
2571  {ISD::FMINNUM, MVT::v4f64, 1},
2572  {ISD::FMINNUM, MVT::v8f32, 1},
2573  {ISD::SMIN, MVT::v2i64, 3},
2574  {ISD::UMIN, MVT::v2i64, 3},
2575  {ISD::SMIN, MVT::v4i32, 1},
2576  {ISD::UMIN, MVT::v4i32, 1},
2577  {ISD::SMIN, MVT::v8i16, 1},
2578  {ISD::UMIN, MVT::v8i16, 1},
2579  {ISD::SMIN, MVT::v8i32, 2},
2580  {ISD::UMIN, MVT::v8i32, 2},
2581  };
2582 
2583  static const CostTblEntry AVX2CostTblNoPairWise[] = {
2584  {ISD::SMIN, MVT::v4i64, 1},
2585  {ISD::UMIN, MVT::v4i64, 1},
2586  {ISD::SMIN, MVT::v8i32, 1},
2587  {ISD::UMIN, MVT::v8i32, 1},
2588  {ISD::SMIN, MVT::v16i16, 1},
2589  {ISD::UMIN, MVT::v16i16, 1},
2590  {ISD::SMIN, MVT::v32i8, 1},
2591  {ISD::UMIN, MVT::v32i8, 1},
2592  };
2593 
2594  static const CostTblEntry AVX512CostTblNoPairWise[] = {
2595  {ISD::FMINNUM, MVT::v8f64, 1},
2596  {ISD::FMINNUM, MVT::v16f32, 2},
2597  {ISD::SMIN, MVT::v8i64, 1},
2598  {ISD::UMIN, MVT::v8i64, 1},
2599  {ISD::SMIN, MVT::v16i32, 1},
2600  {ISD::UMIN, MVT::v16i32, 1},
2601  };
2602 
2603  if (IsPairwise) {
2604  if (ST->hasAVX512())
2605  if (const auto *Entry = CostTableLookup(AVX512CostTblPairWise, ISD, MTy))
2606  return LT.first * Entry->Cost;
2607 
2608  if (ST->hasAVX2())
2609  if (const auto *Entry = CostTableLookup(AVX2CostTblPairWise, ISD, MTy))
2610  return LT.first * Entry->Cost;
2611 
2612  if (ST->hasAVX())
2613  if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
2614  return LT.first * Entry->Cost;
2615 
2616  if (ST->hasSSE42())
2617  if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
2618  return LT.first * Entry->Cost;
2619  } else {
2620  if (ST->hasAVX512())
2621  if (const auto *Entry =
2622  CostTableLookup(AVX512CostTblNoPairWise, ISD, MTy))
2623  return LT.first * Entry->Cost;
2624 
2625  if (ST->hasAVX2())
2626  if (const auto *Entry = CostTableLookup(AVX2CostTblNoPairWise, ISD, MTy))
2627  return LT.first * Entry->Cost;
2628 
2629  if (ST->hasAVX())
2630  if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
2631  return LT.first * Entry->Cost;
2632 
2633  if (ST->hasSSE42())
2634  if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
2635  return LT.first * Entry->Cost;
2636  }
2637 
2638  return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned);
2639 }
2640 
2641 /// Calculate the cost of materializing a 64-bit value. This helper
2642 /// method might only calculate a fraction of a larger immediate. Therefore it
2643 /// is valid to return a cost of ZERO.
2644 int X86TTIImpl::getIntImmCost(int64_t Val) {
2645  if (Val == 0)
2646  return TTI::TCC_Free;
2647 
2648  if (isInt<32>(Val))
2649  return TTI::TCC_Basic;
2650 
2651  return 2 * TTI::TCC_Basic;
2652 }
2653 
2654 int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
2655  assert(Ty->isIntegerTy());
2656 
2657  unsigned BitSize = Ty->getPrimitiveSizeInBits();
2658  if (BitSize == 0)
2659  return ~0U;
2660 
2661  // Never hoist constants larger than 128 bits, because this might lead to
2662  // incorrect code generation or assertions in codegen.
2663  // FIXME: Create a cost model for types larger than i128 once the codegen
2664  // issues have been fixed.
2665  if (BitSize > 128)
2666  return TTI::TCC_Free;
2667 
2668  if (Imm == 0)
2669  return TTI::TCC_Free;
2670 
2671  // Sign-extend all constants to a multiple of 64-bit.
2672  APInt ImmVal = Imm;
2673  if (BitSize % 64 != 0)
2674  ImmVal = Imm.sext(alignTo(BitSize, 64));
2675 
2676  // Split the constant into 64-bit chunks and calculate the cost for each
2677  // chunk.
2678  int Cost = 0;
2679  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
2680  APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
2681  int64_t Val = Tmp.getSExtValue();
2682  Cost += getIntImmCost(Val);
2683  }
2684  // We need at least one instruction to materialize the constant.
2685  return std::max(1, Cost);
2686 }
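// Worked example (illustrative): an i128 constant whose upper chunk is just
// the sign extension of the lower one (e.g. the value 1) costs a single
// TCC_Basic, because the high 64-bit chunk materializes for free and the
// result is clamped to at least one instruction.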
2687 
2688 int X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
2689  Type *Ty) {
2690  assert(Ty->isIntegerTy());
2691 
2692  unsigned BitSize = Ty->getPrimitiveSizeInBits();
2693  // There is no cost model for constants with a bit size of 0. Return TCC_Free
2694  // here, so that constant hoisting will ignore this constant.
2695  if (BitSize == 0)
2696  return TTI::TCC_Free;
2697 
2698  unsigned ImmIdx = ~0U;
2699  switch (Opcode) {
2700  default:
2701  return TTI::TCC_Free;
2702  case Instruction::GetElementPtr:
2703  // Always hoist the base address of a GetElementPtr. This prevents the
2704  // creation of new constants for every base constant that gets constant
2705  // folded with the offset.
2706  if (Idx == 0)
2707  return 2 * TTI::TCC_Basic;
2708  return TTI::TCC_Free;
2709  case Instruction::Store:
2710  ImmIdx = 0;
2711  break;
2712  case Instruction::ICmp:
2713  // This is an imperfect hack to prevent constant hoisting of
2714  // compares that might be trying to check if a 64-bit value fits in
2715  // 32-bits. The backend can optimize these cases using a right shift by 32.
2716  // Ideally we would check the compare predicate here. There are also other
2717  // similar immediates the backend can use shifts for.
2718  if (Idx == 1 && Imm.getBitWidth() == 64) {
2719  uint64_t ImmVal = Imm.getZExtValue();
2720  if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
2721  return TTI::TCC_Free;
2722  }
2723  ImmIdx = 1;
2724  break;
2725  case Instruction::And:
2726  // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
2727  // by using a 32-bit operation with implicit zero extension. Detect such
2728  // immediates here as the normal path expects bit 31 to be sign extended.
2729  if (Idx == 1 && Imm.getBitWidth() == 64 && isUInt<32>(Imm.getZExtValue()))
2730  return TTI::TCC_Free;
2731  ImmIdx = 1;
2732  break;
2733  case Instruction::Add:
2734  case Instruction::Sub:
2735  // For add/sub, we can use the opposite instruction for INT32_MIN.
2736  if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.getZExtValue() == 0x80000000)
2737  return TTI::TCC_Free;
2738  ImmIdx = 1;
2739  break;
2740  case Instruction::UDiv:
2741  case Instruction::SDiv:
2742  case Instruction::URem:
2743  case Instruction::SRem:
2744  // Division by constant is typically expanded later into a different
2745  // instruction sequence. This completely changes the constants.
2746  // Report them as "free" to stop ConstantHoist from marking them as opaque.
2747  return TTI::TCC_Free;
2748  case Instruction::Mul:
2749  case Instruction::Or:
2750  case Instruction::Xor:
2751  ImmIdx = 1;
2752  break;
2753  // Always return TCC_Free for the shift value of a shift instruction.
2754  case Instruction::Shl:
2755  case Instruction::LShr:
2756  case Instruction::AShr:
2757  if (Idx == 1)
2758  return TTI::TCC_Free;
2759  break;
2760  case Instruction::Trunc:
2761  case Instruction::ZExt:
2762  case Instruction::SExt:
2763  case Instruction::IntToPtr:
2764  case Instruction::PtrToInt:
2765  case Instruction::BitCast:
2766  case Instruction::PHI:
2767  case Instruction::Call:
2768  case Instruction::Select:
2769  case Instruction::Ret:
2770  case Instruction::Load:
2771  break;
2772  }
2773 
2774  if (Idx == ImmIdx) {
2775  int NumConstants = divideCeil(BitSize, 64);
2776  int Cost = X86TTIImpl::getIntImmCost(Imm, Ty);
2777  return (Cost <= NumConstants * TTI::TCC_Basic)
2778  ? static_cast<int>(TTI::TCC_Free)
2779  : Cost;
2780  }
2781 
2782  return X86TTIImpl::getIntImmCost(Imm, Ty);
2783 }
2784 
2785 int X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
2786  Type *Ty) {
2787  assert(Ty->isIntegerTy());
2788 
2789  unsigned BitSize = Ty->getPrimitiveSizeInBits();
2790  // There is no cost model for constants with a bit size of 0. Return TCC_Free
2791  // here, so that constant hoisting will ignore this constant.
2792  if (BitSize == 0)
2793  return TTI::TCC_Free;
2794 
2795  switch (IID) {
2796  default:
2797  return TTI::TCC_Free;
2798  case Intrinsic::sadd_with_overflow:
2799  case Intrinsic::uadd_with_overflow:
2800  case Intrinsic::ssub_with_overflow:
2801  case Intrinsic::usub_with_overflow:
2802  case Intrinsic::smul_with_overflow:
2803  case Intrinsic::umul_with_overflow:
2804  if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
2805  return TTI::TCC_Free;
2806  break;
2807  case Intrinsic::experimental_stackmap:
2808  if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
2809  return TTI::TCC_Free;
2810  break;
2811  case Intrinsic::experimental_patchpoint_void:
2812  case Intrinsic::experimental_patchpoint_i64:
2813  if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
2814  return TTI::TCC_Free;
2815  break;
2816  }
2817  return X86TTIImpl::getIntImmCost(Imm, Ty);
2818 }
2819 
2820 unsigned X86TTIImpl::getUserCost(const User *U,
2821  ArrayRef<const Value *> Operands) {
2822  if (isa<StoreInst>(U)) {
2823  Value *Ptr = U->getOperand(1);
2824  // Store instruction with index and scale costs 2 Uops.
2825  // Check the preceding GEP to identify non-const indices.
2826  if (auto GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
2827  if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
2828  return TTI::TCC_Basic * 2;
2829  }
2830  return TTI::TCC_Basic;
2831  }
2832  return BaseT::getUserCost(U, Operands);
2833 }
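// Worked example (illustrative): a store whose address comes from a GEP with a
// non-constant index is modeled as 2 * TCC_Basic (a store with index and
// scale), while a store through a constant-indexed GEP or a plain pointer
// stays at TCC_Basic.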
2834 
2835 // Return the average cost of a Gather / Scatter instruction; may be improved later.
2836 int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
2837  unsigned Alignment, unsigned AddressSpace) {
2838 
2839  assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
2840  unsigned VF = SrcVTy->getVectorNumElements();
2841 
2842  // Try to reduce index size from 64 bit (default for GEP)
2843  // to 32. This is essential for VF 16. If the index can't be reduced to 32, the
2844  // operation will use 16 x 64-bit indices, which do not fit in a zmm register
2845  // and force a split. Also check that the base pointer is the same for all lanes,
2846  // and that there's at most one variable index.
2847  auto getIndexSizeInBits = [](Value *Ptr, const DataLayout& DL) {
2848  unsigned IndexSize = DL.getPointerSizeInBits();
2849  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
2850  if (IndexSize < 64 || !GEP)
2851  return IndexSize;
2852 
2853  unsigned NumOfVarIndices = 0;
2854  Value *Ptrs = GEP->getPointerOperand();
2855  if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
2856  return IndexSize;
2857  for (unsigned i = 1; i < GEP->getNumOperands(); ++i) {
2858  if (isa<Constant>(GEP->getOperand(i)))
2859  continue;
2860  Type *IndxTy = GEP->getOperand(i)->getType();
2861  if (IndxTy->isVectorTy())
2862  IndxTy = IndxTy->getVectorElementType();
2863  if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
2864  !isa<SExtInst>(GEP->getOperand(i))) ||
2865  ++NumOfVarIndices > 1)
2866  return IndexSize; // 64
2867  }
2868  return (unsigned)32;
2869  };
2870 
2871 
2872  // Try to reduce IndexSize to 32 bits for a 16-element vector.
2873  // By default the IndexSize is equal to pointer size.
2874  unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
2875  ? getIndexSizeInBits(Ptr, DL)
2876  : DL.getPointerSizeInBits();
2877 
2878  Type *IndexVTy = VectorType::get(IntegerType::get(SrcVTy->getContext(),
2879  IndexSize), VF);
2880  std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy);
2881  std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy);
2882  int SplitFactor = std::max(IdxsLT.first, SrcLT.first);
2883  if (SplitFactor > 1) {
2884  // Handle splitting of vector of pointers
2885  Type *SplitSrcTy = VectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
2886  return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment,
2887  AddressSpace);
2888  }
2889 
2890  // The gather / scatter cost is given by Intel architects. It is a rough
2891  // number since we are looking at one instruction at a time.
2892  const int GSOverhead = (Opcode == Instruction::Load)
2893  ? ST->getGatherOverhead()
2894  : ST->getScatterOverhead();
2895  return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
2896  Alignment, AddressSpace);
2897 }
2898 
2899 /// Return the cost of full scalarization of gather / scatter operation.
2900 ///
2901 /// Opcode - Load or Store instruction.
2902 /// SrcVTy - The type of the data vector that should be gathered or scattered.
2903 /// VariableMask - The mask is non-constant at compile time.
2904 /// Alignment - Alignment for one element.
2905 /// AddressSpace - pointer[s] address space.
2906 ///
2907 int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
2908  bool VariableMask, unsigned Alignment,
2909  unsigned AddressSpace) {
2910  unsigned VF = SrcVTy->getVectorNumElements();
2911 
2912  int MaskUnpackCost = 0;
2913  if (VariableMask) {
2914  VectorType *MaskTy =
2915  VectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
2916  MaskUnpackCost = getScalarizationOverhead(MaskTy, false, true);
2917  int ScalarCompareCost =
2918  getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()),
2919  nullptr);
2920  int BranchCost = getCFInstrCost(Instruction::Br);
2921  MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
2922  }
2923 
2924  // The cost of the scalar loads/stores.
2925  int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
2926  Alignment, AddressSpace);
2927 
2928  int InsertExtractCost = 0;
2929  if (Opcode == Instruction::Load)
2930  for (unsigned i = 0; i < VF; ++i)
2931  // Add the cost of inserting each scalar load into the vector
2932  InsertExtractCost +=
2933  getVectorInstrCost(Instruction::InsertElement, SrcVTy, i);
2934  else
2935  for (unsigned i = 0; i < VF; ++i)
2936  // Add the cost of extracting each element out of the data vector
2937  InsertExtractCost +=
2938  getVectorInstrCost(Instruction::ExtractElement, SrcVTy, i);
2939 
2940  return MemoryOpCost + MaskUnpackCost + InsertExtractCost;
2941 }
2942 
2943 /// Calculate the cost of Gather / Scatter operation
2944 int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
2945  Value *Ptr, bool VariableMask,
2946  unsigned Alignment) {
2947  assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
2948  unsigned VF = SrcVTy->getVectorNumElements();
2949  PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
2950  if (!PtrTy && Ptr->getType()->isVectorTy())
2951  PtrTy = dyn_cast<PointerType>(Ptr->getType()->getVectorElementType());
2952  assert(PtrTy && "Unexpected type for Ptr argument");
2953  unsigned AddressSpace = PtrTy->getAddressSpace();
2954 
2955  bool Scalarize = false;
2956  if ((Opcode == Instruction::Load && !isLegalMaskedGather(SrcVTy)) ||
2957  (Opcode == Instruction::Store && !isLegalMaskedScatter(SrcVTy)))
2958  Scalarize = true;
2959  // Gather / Scatter for vector 2 is not profitable on KNL / SKX
2960  // Gather / Scatter for a 2-element vector is not profitable on KNL / SKX.
2961  // A 4-element gather/scatter instruction does not exist on KNL.
2962  // the mask vector will add more instructions. Right now we give the scalar
2963  // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction
2964  // is better in the VariableMask case.
2965  if (ST->hasAVX512() && (VF == 2 || (VF == 4 && !ST->hasVLX())))
2966  Scalarize = true;
2967 
2968  if (Scalarize)
2969  return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment,
2970  AddressSpace);
2971 
2972  return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
2973 }
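// Worked example (illustrative): on an AVX-512 target a 2-element gather is
// always costed via the scalar path above, and a 4-element gather is
// scalarized as well unless VLX is available; wider legal gathers go through
// getGSVectorCost.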
2974 
2975 bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
2976                                TargetTransformInfo::LSRCost &C2) {
2977  // X86 specific here are "instruction number 1st priority".
2978  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
2979  C1.NumIVMuls, C1.NumBaseAdds,
2980  C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
2981  std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
2982  C2.NumIVMuls, C2.NumBaseAdds,
2983  C2.ScaleCost, C2.ImmCost, C2.SetupCost);
2984 }
2985 
2986 bool X86TTIImpl::canMacroFuseCmp() {
2987  return ST->hasMacroFusion();
2988 }
2989 
2990 bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
2991  // The backend can't handle a single element vector.
2992  if (isa<VectorType>(DataTy) && DataTy->getVectorNumElements() == 1)
2993  return false;
2994  Type *ScalarTy = DataTy->getScalarType();
2995  int DataWidth = isa<PointerType>(ScalarTy) ?
2996  DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();
2997 
2998  return ((DataWidth == 32 || DataWidth == 64) && ST->hasAVX()) ||
2999  ((DataWidth == 8 || DataWidth == 16) && ST->hasBWI());
3000 }
3001 
3002 bool X86TTIImpl::isLegalMaskedStore(Type *DataType) {
3003  return isLegalMaskedLoad(DataType);
3004 }
3005 
3006 bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
3007  // This function is called now in two cases: from the Loop Vectorizer
3008  // and from the Scalarizer.
3009  // When the Loop Vectorizer asks about legality of the feature,
3010  // the vectorization factor is not calculated yet. The Loop Vectorizer
3011  // sends a scalar type and the decision is based on the width of the
3012  // scalar element.
3013  // Later on, the cost model will estimate usage this intrinsic based on
3014  // the vector type.
3015  // The Scalarizer asks again about legality. It sends a vector type.
3016  // In this case we can reject non-power-of-2 vectors.
3017  // We also reject single element vectors as the type legalizer can't
3018  // scalarize it.
3019  if (isa<VectorType>(DataTy)) {
3020  unsigned NumElts = DataTy->getVectorNumElements();
3021  if (NumElts == 1 || !isPowerOf2_32(NumElts))
3022  return false;
3023  }
3024  Type *ScalarTy = DataTy->getScalarType();
3025  int DataWidth = isa<PointerType>(ScalarTy) ?
3026  DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();
3027 
3028  // Some CPUs have better gather performance than others.
3029  // TODO: Remove the explicit ST->hasAVX512()?, That would mean we would only
3030  // enable gather with a -march.
3031  return (DataWidth == 32 || DataWidth == 64) &&
3032  (ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2()));
3033 }
3034 
3035 bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) {
3036  // AVX2 doesn't support scatter
3037  if (!ST->hasAVX512())
3038  return false;
3039  return isLegalMaskedGather(DataType);
3040 }
3041 
3042 bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
3043  EVT VT = TLI->getValueType(DL, DataType);
3044  return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
3045 }
3046 
3047 bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
3048  return false;
3049 }
3050 
3051 bool X86TTIImpl::areInlineCompatible(const Function *Caller,
3052                                      const Function *Callee) const {
3053  const TargetMachine &TM = getTLI()->getTargetMachine();
3054 
3055  // Work this as a subsetting of subtarget features.
3056  const FeatureBitset &CallerBits =
3057  TM.getSubtargetImpl(*Caller)->getFeatureBits();
3058  const FeatureBitset &CalleeBits =
3059  TM.getSubtargetImpl(*Callee)->getFeatureBits();
3060 
3061  FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
3062  FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
3063  return (RealCallerBits & RealCalleeBits) == RealCalleeBits;
3064 }
3065 
3066 bool X86TTIImpl::areFunctionArgsABICompatible(
3067  const Function *Caller, const Function *Callee,
3068  SmallPtrSetImpl<Argument *> &Args) const {
3069  if (!BaseT::areFunctionArgsABICompatible(Caller, Callee, Args))
3070  return false;
3071 
3072  // If we get here, we know the target features match. If one function
3073  // considers 512-bit vectors legal and the other does not, consider them
3074  // incompatible.
3075  // FIXME Look at the arguments and only consider 512 bit or larger vectors?
3076  const TargetMachine &TM = getTLI()->getTargetMachine();
3077 
3078  return TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
3079  TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs();
3080 }
3081 
3082 const X86TTIImpl::TTI::MemCmpExpansionOptions *
3083 X86TTIImpl::enableMemCmpExpansion(bool IsZeroCmp) const {
3084  // Only enable vector loads for equality comparison.
3085  // Right now the vector version is not as fast, see #33329.
3086  static const auto ThreeWayOptions = [this]() {
3087  TTI::MemCmpExpansionOptions Options;
3088  if (ST->is64Bit()) {
3089  Options.LoadSizes.push_back(8);
3090  }
3091  Options.LoadSizes.push_back(4);
3092  Options.LoadSizes.push_back(2);
3093  Options.LoadSizes.push_back(1);
3094  return Options;
3095  }();
3096  static const auto EqZeroOptions = [this]() {
3097  TTI::MemCmpExpansionOptions Options;
3098  // TODO: enable AVX512 when the DAG is ready.
3099  // if (ST->hasAVX512()) Options.LoadSizes.push_back(64);
3100  if (ST->hasAVX2()) Options.LoadSizes.push_back(32);
3101  if (ST->hasSSE2()) Options.LoadSizes.push_back(16);
3102  if (ST->is64Bit()) {
3103  Options.LoadSizes.push_back(8);
3104  }
3105  Options.LoadSizes.push_back(4);
3106  Options.LoadSizes.push_back(2);
3107  Options.LoadSizes.push_back(1);
3108  // All GPR and vector loads can be unaligned. SIMD compare requires integer
3109  // vectors (SSE2/AVX2).
3110  Options.AllowOverlappingLoads = true;
3111  return Options;
3112  }();
3113  return IsZeroCmp ? &EqZeroOptions : &ThreeWayOptions;
3114 }
3115 
3116 bool X86TTIImpl::enableInterleavedAccessVectorization() {
3117  // TODO: We expect this to be beneficial regardless of arch,
3118  // but there are currently some unexplained performance artifacts on Atom.
3119  // As a temporary solution, disable on Atom.
3120  return !(ST->isAtom());
3121 }
3122 
3123 // Get estimation for interleaved load/store operations for AVX2.
3124 // \p Factor is the interleaved-access factor (stride) - number of
3125 // (interleaved) elements in the group.
3126 // \p Indices contains the indices for a strided load: when the
3127 // interleaved load has gaps they indicate which elements are used.
3128 // If Indices is empty (or if the number of indices is equal to the size
3129 // of the interleaved-access as given in \p Factor) the access has no gaps.
3130 //
3131 // As opposed to AVX-512, AVX2 does not have generic shuffles that allow
3132 // computing the cost using a generic formula as a function of generic
3133 // shuffles. We therefore use a lookup table instead, filled according to
3134 
3135 int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
3136  unsigned Factor,
3137  ArrayRef<unsigned> Indices,
3138  unsigned Alignment,
3139  unsigned AddressSpace,
3140  bool UseMaskForCond,
3141  bool UseMaskForGaps) {
3142 
3143  if (UseMaskForCond || UseMaskForGaps)
3144  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3145  Alignment, AddressSpace,
3146  UseMaskForCond, UseMaskForGaps);
3147 
3148  // We currently support only fully-interleaved groups, with no gaps.
3149  // TODO: Support also strided loads (interleaved-groups with gaps).
3150  if (Indices.size() && Indices.size() != Factor)
3151  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3152  Alignment, AddressSpace);
3153 
3154  // VecTy for interleave memop is <VF*Factor x Elt>.
3155  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
3156  // VecTy = <12 x i32>.
3157  MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
3158 
3159  // This function can be called with VecTy=<6xi128>, Factor=3, in which case
3160  // the VF=2, while v2i128 is an unsupported MVT vector type
3161  // (see MachineValueType.h::getVectorVT()).
3162  if (!LegalVT.isVector())
3163  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3164  Alignment, AddressSpace);
3165 
3166  unsigned VF = VecTy->getVectorNumElements() / Factor;
3167  Type *ScalarTy = VecTy->getVectorElementType();
3168 
3169  // Calculate the number of memory operations (NumOfMemOps) required
3170  // to load/store the VecTy.
3171  unsigned VecTySize = DL.getTypeStoreSize(VecTy);
3172  unsigned LegalVTSize = LegalVT.getStoreSize();
3173  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
3174 
3175  // Get the cost of one memory operation.
3176  Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
3177  LegalVT.getVectorNumElements());
3178  unsigned MemOpCost =
3179  getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
3180 
3181  VectorType *VT = VectorType::get(ScalarTy, VF);
3182  EVT ETy = TLI->getValueType(DL, VT);
3183  if (!ETy.isSimple())
3184  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3185  Alignment, AddressSpace);
3186 
3187  // TODO: Complete for other data-types and strides.
3188  // Each combination of Stride, ElementTy and VF results in a different
3189  // sequence; the cost tables are therefore accessed with:
3190  // Factor (stride) and VectorType=VFxElemType.
3191  // The cost accounts only for the shuffle sequence;
3192  // The cost of the loads/stores is accounted for separately.
3193  //
3194  static const CostTblEntry AVX2InterleavedLoadTbl[] = {
3195  { 2, MVT::v4i64, 6 }, //(load 8i64 and) deinterleave into 2 x 4i64
3196  { 2, MVT::v4f64, 6 }, //(load 8f64 and) deinterleave into 2 x 4f64
3197 
3198  { 3, MVT::v2i8, 10 }, //(load 6i8 and) deinterleave into 3 x 2i8
3199  { 3, MVT::v4i8, 4 }, //(load 12i8 and) deinterleave into 3 x 4i8
3200  { 3, MVT::v8i8, 9 }, //(load 24i8 and) deinterleave into 3 x 8i8
3201  { 3, MVT::v16i8, 11}, //(load 48i8 and) deinterleave into 3 x 16i8
3202  { 3, MVT::v32i8, 13}, //(load 96i8 and) deinterleave into 3 x 32i8
3203  { 3, MVT::v8f32, 17 }, //(load 24f32 and) deinterleave into 3 x 8f32
3204 
3205  { 4, MVT::v2i8, 12 }, //(load 8i8 and) deinterleave into 4 x 2i8
3206  { 4, MVT::v4i8, 4 }, //(load 16i8 and) deinterleave into 4 x 4i8
3207  { 4, MVT::v8i8, 20 }, //(load 32i8 and) deinterleave into 4 x 8i8
3208  { 4, MVT::v16i8, 39 }, //(load 64i8 and) deinterleave into 4 x 16i8
3209  { 4, MVT::v32i8, 80 }, //(load 128i8 and) deinterleave into 4 x 32i8
3210 
3211  { 8, MVT::v8f32, 40 } //(load 64f32 and) deinterleave into 8 x 8f32
3212  };
3213 
3214  static const CostTblEntry AVX2InterleavedStoreTbl[] = {
3215  { 2, MVT::v4i64, 6 }, //interleave into 2 x 4i64 into 8i64 (and store)
3216  { 2, MVT::v4f64, 6 }, //interleave into 2 x 4f64 into 8f64 (and store)
3217 
3218  { 3, MVT::v2i8, 7 }, //interleave 3 x 2i8 into 6i8 (and store)
3219  { 3, MVT::v4i8, 8 }, //interleave 3 x 4i8 into 12i8 (and store)
3220  { 3, MVT::v8i8, 11 }, //interleave 3 x 8i8 into 24i8 (and store)
3221  { 3, MVT::v16i8, 11 }, //interleave 3 x 16i8 into 48i8 (and store)
3222  { 3, MVT::v32i8, 13 }, //interleave 3 x 32i8 into 96i8 (and store)
3223 
3224  { 4, MVT::v2i8, 12 }, //interleave 4 x 2i8 into 8i8 (and store)
3225  { 4, MVT::v4i8, 9 }, //interleave 4 x 4i8 into 16i8 (and store)
3226  { 4, MVT::v8i8, 10 }, //interleave 4 x 8i8 into 32i8 (and store)
3227  { 4, MVT::v16i8, 10 }, //interleave 4 x 16i8 into 64i8 (and store)
3228  { 4, MVT::v32i8, 12 } //interleave 4 x 32i8 into 128i8 (and store)
3229  };
3230 
3231  if (Opcode == Instruction::Load) {
3232  if (const auto *Entry =
3233  CostTableLookup(AVX2InterleavedLoadTbl, Factor, ETy.getSimpleVT()))
3234  return NumOfMemOps * MemOpCost + Entry->Cost;
3235  } else {
3236  assert(Opcode == Instruction::Store &&
3237  "Expected Store Instruction at this point");
3238  if (const auto *Entry =
3239  CostTableLookup(AVX2InterleavedStoreTbl, Factor, ETy.getSimpleVT()))
3240  return NumOfMemOps * MemOpCost + Entry->Cost;
3241  }
3242 
3243  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3244  Alignment, AddressSpace);
3245 }
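// Worked example (illustrative, assuming AVX2): a stride-3 load of <48 x i8>
// has VF = 16, so ETy = v16i8 and the load table gives a shuffle cost of 11;
// the 48-byte vector legalizes to v32i8, giving NumOfMemOps = 2, so the
// returned cost is 2 * MemOpCost(v32i8 load) + 11.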
3246 
3247 // Get a cost estimate for interleaved load/store operations and strided loads.
3248 // \p Indices contains the indices for a strided load.
3249 // \p Factor is the interleaving factor (stride).
3250 // AVX-512 provides 3-src shuffles that significantly reduce the cost.
3251 int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
3252  unsigned Factor,
3253  ArrayRef<unsigned> Indices,
3254  unsigned Alignment,
3255  unsigned AddressSpace,
3256  bool UseMaskForCond,
3257  bool UseMaskForGaps) {
3258 
3259  if (UseMaskForCond || UseMaskForGaps)
3260  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3261  Alignment, AddressSpace,
3262  UseMaskForCond, UseMaskForGaps);
3263 
3264  // VecTy for interleave memop is <VF*Factor x Elt>.
3265  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
3266  // VecTy = <12 x i32>.
3267 
3268  // Calculate the number of memory operations (NumOfMemOps) required
3269  // to load/store the VecTy.
3270  MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
3271  unsigned VecTySize = DL.getTypeStoreSize(VecTy);
3272  unsigned LegalVTSize = LegalVT.getStoreSize();
3273  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
3274 
3275  // Get the cost of one memory operation.
3276  Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
3277  LegalVT.getVectorNumElements());
3278  unsigned MemOpCost =
3279  getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
3280 
3281  unsigned VF = VecTy->getVectorNumElements() / Factor;
3282  MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);
3283 
3284  if (Opcode == Instruction::Load) {
3285  // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
3286  // contain the cost of the optimized shuffle sequence that the
3287  // X86InterleavedAccess pass will generate.
3288  // The costs of the loads and stores are computed separately from the table.
3289 
3290  // X86InterleavedAccess supports only the following interleaved-access groups.
3291  static const CostTblEntry AVX512InterleavedLoadTbl[] = {
3292  {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8
3293  {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8
3294  {3, MVT::v64i8, 22}, //(load 192i8 and) deinterleave into 3 x 64i8
3295  };
3296 
3297  if (const auto *Entry =
3298  CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
3299  return NumOfMemOps * MemOpCost + Entry->Cost;
3300  // If an entry does not exist, fall back to the default implementation.
3301 
3302  // The kind of shuffle depends on the number of loaded values.
3303  // If we load all the data in one register, we can use a 1-src shuffle.
3304  // Otherwise, we'll merge 2 sources in each operation.
3305  TTI::ShuffleKind ShuffleKind =
3306  (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
3307 
3308  unsigned ShuffleCost =
3309  getShuffleCost(ShuffleKind, SingleMemOpTy, 0, nullptr);
3310 
3311  unsigned NumOfLoadsInInterleaveGrp =
3312  Indices.size() ? Indices.size() : Factor;
3313  Type *ResultTy = VectorType::get(VecTy->getVectorElementType(),
3314  VecTy->getVectorNumElements() / Factor);
3315  unsigned NumOfResults =
3316  getTLI()->getTypeLegalizationCost(DL, ResultTy).first *
3317  NumOfLoadsInInterleaveGrp;
3318 
3319  // About half of the loads may be folded into shuffles when we have only
3320  // one result. If we have more than one result, we do not fold loads at all.
3321  unsigned NumOfUnfoldedLoads =
3322  NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
3323 
3324  // Get the number of shuffle operations per result.
3325  unsigned NumOfShufflesPerResult =
3326  std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
3327 
3328  // The SK_PermuteTwoSrc shuffle clobbers one of the source operands.
3329  // When we have more than one destination, we need additional instructions
3330  // to keep the sources.
3331  unsigned NumOfMoves = 0;
3332  if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
3333  NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
3334 
3335  int Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
3336  NumOfUnfoldedLoads * MemOpCost + NumOfMoves;
3337 
3338  return Cost;
3339  }
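  // Worked example for the non-table load path (illustrative, assuming
  // AVX512F): a stride-2 load of <16 x i32> misses the table; it legalizes to
  // a single v16i32 memop, so ShuffleKind is SK_PermuteSingleSrc,
  // NumOfResults = 2, NumOfShufflesPerResult = 1, NumOfUnfoldedLoads = 1 and
  // NumOfMoves = 0, giving Cost = 2 * ShuffleCost + MemOpCost.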
3340 
3341  // Store.
3342  assert(Opcode == Instruction::Store &&
3343  "Expected Store Instruction at this point");
3344  // X86InterleavedAccess supports only the following interleaved-access groups.
3345  static const CostTblEntry AVX512InterleavedStoreTbl[] = {
3346  {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
3347  {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
3348  {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 192i8 (and store)
3349 
3350  {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store)
3351  {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
3352  {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
3353  {4, MVT::v64i8, 24} // interleave 4 x 64i8 into 256i8 (and store)
3354  };
3355 
3356  if (const auto *Entry =
3357  CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
3358  return NumOfMemOps * MemOpCost + Entry->Cost;
3359  // If an entry does not exist, fall back to the default implementation.
3360 
3361  // There are no strided stores at the moment, and a store can't be folded
3362  // into a shuffle.
3363  unsigned NumOfSources = Factor; // The number of values to be merged.
3364  unsigned ShuffleCost =
3365  getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, 0, nullptr);
3366  unsigned NumOfShufflesPerStore = NumOfSources - 1;
3367 
3368  // The SK_PermuteTwoSrc shuffle clobbers one of the source operands.
3369  // We need additional instructions to keep the sources.
3370  unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
3371  int Cost = NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
3372  NumOfMoves;
3373  return Cost;
3374 }
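// Worked example for the store path (illustrative, assuming AVX512F): a
// stride-2 store of <16 x i32> misses the store table; it legalizes to a
// single v16i32 memop with NumOfSources = 2, NumOfShufflesPerStore = 1 and
// NumOfMoves = 0, giving Cost = MemOpCost + ShuffleCost(SK_PermuteTwoSrc).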
3375 
3376 int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
3377  unsigned Factor,
3378  ArrayRef<unsigned> Indices,
3379  unsigned Alignment,
3380  unsigned AddressSpace,
3381  bool UseMaskForCond,
3382  bool UseMaskForGaps) {
3383  auto isSupportedOnAVX512 = [](Type *VecTy, bool HasBW) {
3384  Type *EltTy = VecTy->getVectorElementType();
3385  if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
3386  EltTy->isIntegerTy(32) || EltTy->isPointerTy())
3387  return true;
3388  if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8))
3389  return HasBW;
3390  return false;
3391  };
3392  if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI()))
3393  return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices,
3394  Alignment, AddressSpace,
3395  UseMaskForCond, UseMaskForGaps);
3396  if (ST->hasAVX2())
3397  return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices,
3398  Alignment, AddressSpace,
3399  UseMaskForCond, UseMaskForGaps);
3400 
3401  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3402  Alignment, AddressSpace,
3403  UseMaskForCond, UseMaskForGaps);
3404 }
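// Dispatch example (illustrative): for a factor-3 interleaved load of
// <48 x i8>, a target with AVX512BW satisfies isSupportedOnAVX512 (i8 needs
// HasBW) and uses the AVX-512 tables above; with AVX512F only, the i8
// element type is rejected and the query falls through to the AVX2 path;
// without AVX2 it goes to the base (BasicTTIImpl) implementation.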