LLVM 4.0.0
X86TargetTransformInfo.cpp
1 //===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 /// \file
10 /// This file implements a TargetTransformInfo analysis pass specific to the
11 /// X86 target machine. It uses the target's detailed information to provide
12 /// more precise answers to certain TTI queries, while letting the target
13 /// independent and default TTI implementations handle the rest.
14 ///
15 //===----------------------------------------------------------------------===//
16 /// A note about the cost model numbers used below: they correspond to some
17 /// "generic" X86 CPU rather than to a concrete CPU model. Usually a number
18 /// corresponds to the CPU where the feature first appeared. For example, if
19 /// the lookups below check Subtarget.hasSSE42(), the cost is based on Nehalem,
20 /// as that was the first CPU to support that feature level and thus most
21 /// likely has the worst-case cost.
22 /// Some examples of other technologies/CPUs:
23 /// SSE 3 - Pentium4 / Athlon64
24 /// SSE 4.1 - Penryn
25 /// SSE 4.2 - Nehalem
26 /// AVX - Sandy Bridge
27 /// AVX2 - Haswell
28 /// AVX-512 - Xeon Phi / Skylake
29 /// And some examples of instruction target dependent costs (latency):
30 ///                     divss     sqrtss    rsqrtss
31 ///   AMD K7            11-16     19        3
32 ///   Piledriver        9-24      13-15     5
33 ///   Jaguar            14        16        2
34 ///   Pentium II,III    18        30        2
35 ///   Nehalem           7-14      7-18      3
36 ///   Haswell           10-13     11        5
37 /// TODO: Develop and implement the target dependent cost model and
38 /// specialize cost numbers for different Cost Model Targets such as throughput,
39 /// code size, latency and uop count.
40 //===----------------------------------------------------------------------===//
41 
42 #include "X86TargetTransformInfo.h"
45 #include "llvm/IR/IntrinsicInst.h"
46 #include "llvm/Support/Debug.h"
47 #include "llvm/Target/CostTable.h"
49 
50 using namespace llvm;
51 
52 #define DEBUG_TYPE "x86tti"
53 
54 //===----------------------------------------------------------------------===//
55 //
56 // X86 cost model.
57 //
58 //===----------------------------------------------------------------------===//
59 
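// Illustrative usage sketch (not part of the original file): these hooks are
// reached through the generic TargetTransformInfo interface, so a client pass
// holding a TargetTransformInfo &TTI for an X86 function (and, hypothetically,
// an LLVMContext &Ctx) can ask, for example:
//
//   // Hypothetical query: cost of an 8 x i32 multiply on this subtarget.
//   Type *VecTy = VectorType::get(Type::getInt32Ty(Ctx), 8);
//   int MulCost = TTI.getArithmeticInstrCost(Instruction::Mul, VecTy);
//
// On an AVX2 subtarget this resolves to the { ISD::MUL, MVT::v8i32, 1 } entry
// below (a single vpmulld); on plain AVX1 it hits the split cost of 4.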
60 TargetTransformInfo::PopcntSupportKind
61 X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
62  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
63  // TODO: Currently the __builtin_popcount() implementation using SSE3
64  //   instructions is inefficient. Once the problem is fixed, we should
65  //   call ST->hasSSE3() instead of ST->hasPOPCNT().
66  return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
67 }
68 
69 unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) {
70  if (Vector && !ST->hasSSE1())
71  return 0;
72 
73  if (ST->is64Bit()) {
74  if (Vector && ST->hasAVX512())
75  return 32;
76  return 16;
77  }
78  return 8;
79 }
80 
81 unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) {
82  if (Vector) {
83  if (ST->hasAVX512())
84  return 512;
85  if (ST->hasAVX())
86  return 256;
87  if (ST->hasSSE1())
88  return 128;
89  return 0;
90  }
91 
92  if (ST->is64Bit())
93  return 64;
94 
95  return 32;
96 }
97 
98 unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
99  // If the loop will not be vectorized, don't interleave the loop.
100  // Let the regular unroller unroll the loop, which saves the overflow
101  // check and memory check cost.
102  if (VF == 1)
103  return 1;
104 
105  if (ST->isAtom())
106  return 1;
107 
108  // Sandybridge and Haswell have multiple execution ports and pipelined
109  // vector units.
110  if (ST->hasAVX())
111  return 4;
112 
113  return 2;
114 }
115 
116 int X86TTIImpl::getArithmeticInstrCost(
117     unsigned Opcode, Type *Ty,
118     TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
119     TTI::OperandValueProperties Opd1PropInfo,
120     TTI::OperandValueProperties Opd2PropInfo,
121     ArrayRef<const Value *> Args) {
122  // Legalize the type.
123  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
124 
125  int ISD = TLI->InstructionOpcodeToISD(Opcode);
126  assert(ISD && "Invalid opcode");
127 
128  static const CostTblEntry SLMCostTable[] = {
129  { ISD::MUL, MVT::v4i32, 11 }, // pmulld
130  { ISD::MUL, MVT::v8i16, 2 }, // pmullw
131  { ISD::MUL, MVT::v16i8, 14 }, // extend/pmullw/trunc sequence.
132  { ISD::FMUL, MVT::f64, 2 }, // mulsd
133  { ISD::FMUL, MVT::v2f64, 4 }, // mulpd
134  { ISD::FMUL, MVT::v4f32, 2 }, // mulps
135  { ISD::FDIV, MVT::f32, 17 }, // divss
136  { ISD::FDIV, MVT::v4f32, 39 }, // divps
137  { ISD::FDIV, MVT::f64, 32 }, // divsd
138  { ISD::FDIV, MVT::v2f64, 69 }, // divpd
139  { ISD::FADD, MVT::v2f64, 2 }, // addpd
140  { ISD::FSUB, MVT::v2f64, 2 }, // subpd
141  // v2i64/v4i64 mul is custom lowered as a series of long
142  // multiplies (3), shifts (3) and adds (2).
143  // The SLM muldq version has a throughput of 2.
144  { ISD::MUL, MVT::v2i64, 11 },
145  };
146 
147  if (ST->isSLM()) {
148  if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) {
149  // Check if the operands can be shrunk into a smaller datatype.
150  bool Op1Signed = false;
151  unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
152  bool Op2Signed = false;
153  unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
154 
155  bool signedMode = Op1Signed | Op2Signed;
156  unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
157 
158  if (OpMinSize <= 7)
159  return LT.first * 3; // pmullw/sext
160  if (!signedMode && OpMinSize <= 8)
161  return LT.first * 3; // pmullw/zext
162  if (OpMinSize <= 15)
163  return LT.first * 5; // pmullw/pmulhw/pshuf
164  if (!signedMode && OpMinSize <= 16)
165  return LT.first * 5; // pmullw/pmulhw/pshuf
166  }
167  if (const auto *Entry = CostTableLookup(SLMCostTable, ISD,
168  LT.second)) {
169  return LT.first * Entry->Cost;
170  }
171  }
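// Illustrative example (not in the original source, and assuming that
// BaseT::minRequiredElementSize reports the pre-extension width for
// zero-extended operands): on SLM, a v4i32 multiply such as
//   %a = zext <4 x i8> %x to <4 x i32>
//   %b = zext <4 x i8> %y to <4 x i32>
//   %m = mul <4 x i32> %a, %b
// would have OpMinSize <= 8 in unsigned mode, so the block above returns
// LT.first * 3 (a pmullw/zext-style sequence) instead of the 11-cycle pmulld
// entry in SLMCostTable.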
172 
173  if (ISD == ISD::SDIV &&
174      Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
175      Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
176  // On X86, vector signed division by a power-of-two constant is
177  // normally expanded to the sequence SRA + SRL + ADD + SRA.
178  // The OperandValue properties may not be the same as those of the
179  // previous operation; conservatively assume OP_None.
180  int Cost = 2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info,
181                                        Op2Info, TargetTransformInfo::OP_None,
182                                        TargetTransformInfo::OP_None);
183  Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
184                                 TargetTransformInfo::OP_None,
185                                 TargetTransformInfo::OP_None);
186  Cost += getArithmeticInstrCost(Instruction::Add, Ty, Op1Info, Op2Info,
187                                 TargetTransformInfo::OP_None,
188                                 TargetTransformInfo::OP_None);
189 
190  return Cost;
191  }
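// Illustrative example (not in the original source): a signed divide by a
// power-of-two splat constant, e.g.
//   %d = sdiv <4 x i32> %v, <i32 8, i32 8, i32 8, i32 8>
// arrives here with Op2Info == OK_UniformConstantValue and
// Opd2PropInfo == OP_PowerOf2, so its cost is built from the expansion itself
// (2 * AShr + LShr + Add on the same type) rather than from the much larger
// generic SDIV entries in the tables below.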
192 
193  static const CostTblEntry AVX512BWUniformConstCostTable[] = {
194  { ISD::SHL, MVT::v64i8, 2 }, // psllw + pand.
195  { ISD::SRL, MVT::v64i8, 2 }, // psrlw + pand.
196  { ISD::SRA, MVT::v64i8, 4 }, // psrlw, pand, pxor, psubb.
197 
198  { ISD::SDIV, MVT::v32i16, 6 }, // vpmulhw sequence
199  { ISD::UDIV, MVT::v32i16, 6 }, // vpmulhuw sequence
200  };
201 
202  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
203      ST->hasBWI()) {
204  if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD,
205  LT.second))
206  return LT.first * Entry->Cost;
207  }
208 
209  static const CostTblEntry AVX512UniformConstCostTable[] = {
210  { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
211  { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
212  };
213 
214  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
215      ST->hasAVX512()) {
216  if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD,
217  LT.second))
218  return LT.first * Entry->Cost;
219  }
220 
221  static const CostTblEntry AVX2UniformConstCostTable[] = {
222  { ISD::SHL, MVT::v32i8, 2 }, // psllw + pand.
223  { ISD::SRL, MVT::v32i8, 2 }, // psrlw + pand.
224  { ISD::SRA, MVT::v32i8, 4 }, // psrlw, pand, pxor, psubb.
225 
226  { ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle.
227 
228  { ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence
229  { ISD::UDIV, MVT::v16i16, 6 }, // vpmulhuw sequence
230  { ISD::SDIV, MVT::v8i32, 15 }, // vpmuldq sequence
231  { ISD::UDIV, MVT::v8i32, 15 }, // vpmuludq sequence
232  };
233 
234  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
235      ST->hasAVX2()) {
236  if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD,
237  LT.second))
238  return LT.first * Entry->Cost;
239  }
240 
241  static const CostTblEntry SSE2UniformConstCostTable[] = {
242  { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand.
243  { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand.
244  { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb.
245 
246  { ISD::SHL, MVT::v32i8, 4 }, // 2*(psllw + pand).
247  { ISD::SRL, MVT::v32i8, 4 }, // 2*(psrlw + pand).
248  { ISD::SRA, MVT::v32i8, 8 }, // 2*(psrlw, pand, pxor, psubb).
249 
250  { ISD::SDIV, MVT::v16i16, 12 }, // pmulhw sequence
251  { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence
252  { ISD::UDIV, MVT::v16i16, 12 }, // pmulhuw sequence
253  { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence
254  { ISD::SDIV, MVT::v8i32, 38 }, // pmuludq sequence
255  { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence
256  { ISD::UDIV, MVT::v8i32, 30 }, // pmuludq sequence
257  { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence
258  };
259 
260  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
261      ST->hasSSE2()) {
262  // pmuldq sequence.
263  if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
264  return LT.first * 30;
265  if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
266  return LT.first * 15;
267 
268  if (const auto *Entry = CostTableLookup(SSE2UniformConstCostTable, ISD,
269  LT.second))
270  return LT.first * Entry->Cost;
271  }
272 
273  static const CostTblEntry AVX2UniformCostTable[] = {
274  // Uniform splats are cheaper for the following instructions.
275  { ISD::SHL, MVT::v16i16, 1 }, // psllw.
276  { ISD::SRL, MVT::v16i16, 1 }, // psrlw.
277  { ISD::SRA, MVT::v16i16, 1 }, // psraw.
278  };
279 
280  if (ST->hasAVX2() &&
281      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
282       (Op2Info == TargetTransformInfo::OK_UniformValue))) {
283  if (const auto *Entry =
284  CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
285  return LT.first * Entry->Cost;
286  }
287 
288  static const CostTblEntry SSE2UniformCostTable[] = {
289  // Uniform splats are cheaper for the following instructions.
290  { ISD::SHL, MVT::v8i16, 1 }, // psllw.
291  { ISD::SHL, MVT::v4i32, 1 }, // pslld
292  { ISD::SHL, MVT::v2i64, 1 }, // psllq.
293 
294  { ISD::SRL, MVT::v8i16, 1 }, // psrlw.
295  { ISD::SRL, MVT::v4i32, 1 }, // psrld.
296  { ISD::SRL, MVT::v2i64, 1 }, // psrlq.
297 
298  { ISD::SRA, MVT::v8i16, 1 }, // psraw.
299  { ISD::SRA, MVT::v4i32, 1 }, // psrad.
300  };
301 
302  if (ST->hasSSE2() &&
303      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
304       (Op2Info == TargetTransformInfo::OK_UniformValue))) {
305  if (const auto *Entry =
306  CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
307  return LT.first * Entry->Cost;
308  }
309 
310  static const CostTblEntry AVX512DQCostTable[] = {
311  { ISD::MUL, MVT::v2i64, 1 },
312  { ISD::MUL, MVT::v4i64, 1 },
313  { ISD::MUL, MVT::v8i64, 1 }
314  };
315 
316  // Look for AVX512DQ lowering tricks for custom cases.
317  if (ST->hasDQI())
318  if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
319  return LT.first * Entry->Cost;
320 
321  static const CostTblEntry AVX512BWCostTable[] = {
322  { ISD::SHL, MVT::v32i16, 1 }, // vpsllvw
323  { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw
324  { ISD::SRA, MVT::v32i16, 1 }, // vpsravw
325 
326  { ISD::SHL, MVT::v64i8, 11 }, // vpblendvb sequence.
327  { ISD::SRL, MVT::v64i8, 11 }, // vpblendvb sequence.
328  { ISD::SRA, MVT::v64i8, 24 }, // vpblendvb sequence.
329 
330  { ISD::MUL, MVT::v64i8, 11 }, // extend/pmullw/trunc sequence.
331  { ISD::MUL, MVT::v32i8, 4 }, // extend/pmullw/trunc sequence.
332  { ISD::MUL, MVT::v16i8, 4 }, // extend/pmullw/trunc sequence.
333 
334  // Vectorizing division is a bad idea. See the SSE2 table for more comments.
335  { ISD::SDIV, MVT::v64i8, 64*20 },
336  { ISD::SDIV, MVT::v32i16, 32*20 },
337  { ISD::UDIV, MVT::v64i8, 64*20 },
338  { ISD::UDIV, MVT::v32i16, 32*20 }
339  };
340 
341  // Look for AVX512BW lowering tricks for custom cases.
342  if (ST->hasBWI())
343  if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
344  return LT.first * Entry->Cost;
345 
346  static const CostTblEntry AVX512CostTable[] = {
347  { ISD::SHL, MVT::v16i32, 1 },
348  { ISD::SRL, MVT::v16i32, 1 },
349  { ISD::SRA, MVT::v16i32, 1 },
350  { ISD::SHL, MVT::v8i64, 1 },
351  { ISD::SRL, MVT::v8i64, 1 },
352  { ISD::SRA, MVT::v8i64, 1 },
353 
354  { ISD::MUL, MVT::v32i8, 13 }, // extend/pmullw/trunc sequence.
355  { ISD::MUL, MVT::v16i8, 5 }, // extend/pmullw/trunc sequence.
356  { ISD::MUL, MVT::v16i32, 1 }, // pmulld
357  { ISD::MUL, MVT::v8i64, 8 }, // 3*pmuludq/3*shift/2*add
358 
359  // Vectorizing division is a bad idea. See the SSE2 table for more comments.
360  { ISD::SDIV, MVT::v16i32, 16*20 },
361  { ISD::SDIV, MVT::v8i64, 8*20 },
362  { ISD::UDIV, MVT::v16i32, 16*20 },
363  { ISD::UDIV, MVT::v8i64, 8*20 }
364  };
365 
366  if (ST->hasAVX512())
367  if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
368  return LT.first * Entry->Cost;
369 
370  static const CostTblEntry AVX2ShiftCostTable[] = {
371  // Shifts on v4i64/v8i32 are legal on AVX2, even though we declare them
372  // as custom so we can detect the cases where the shift amount is a scalar.
373  { ISD::SHL, MVT::v4i32, 1 },
374  { ISD::SRL, MVT::v4i32, 1 },
375  { ISD::SRA, MVT::v4i32, 1 },
376  { ISD::SHL, MVT::v8i32, 1 },
377  { ISD::SRL, MVT::v8i32, 1 },
378  { ISD::SRA, MVT::v8i32, 1 },
379  { ISD::SHL, MVT::v2i64, 1 },
380  { ISD::SRL, MVT::v2i64, 1 },
381  { ISD::SHL, MVT::v4i64, 1 },
382  { ISD::SRL, MVT::v4i64, 1 },
383  };
384 
385  // Look for AVX2 lowering tricks.
386  if (ST->hasAVX2()) {
387  if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
388  (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
389  Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
390  // On AVX2, a packed v16i16 shift left by a constant build_vector
391  // is lowered into a vector multiply (vpmullw).
392  return LT.first;
393 
394  if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
395  return LT.first * Entry->Cost;
396  }
397 
398  static const CostTblEntry XOPShiftCostTable[] = {
399  // 128bit shifts take 1cy, but right shifts require negation beforehand.
400  { ISD::SHL, MVT::v16i8, 1 },
401  { ISD::SRL, MVT::v16i8, 2 },
402  { ISD::SRA, MVT::v16i8, 2 },
403  { ISD::SHL, MVT::v8i16, 1 },
404  { ISD::SRL, MVT::v8i16, 2 },
405  { ISD::SRA, MVT::v8i16, 2 },
406  { ISD::SHL, MVT::v4i32, 1 },
407  { ISD::SRL, MVT::v4i32, 2 },
408  { ISD::SRA, MVT::v4i32, 2 },
409  { ISD::SHL, MVT::v2i64, 1 },
410  { ISD::SRL, MVT::v2i64, 2 },
411  { ISD::SRA, MVT::v2i64, 2 },
412  // 256bit shifts require splitting if AVX2 didn't catch them above.
413  { ISD::SHL, MVT::v32i8, 2 },
414  { ISD::SRL, MVT::v32i8, 4 },
415  { ISD::SRA, MVT::v32i8, 4 },
416  { ISD::SHL, MVT::v16i16, 2 },
417  { ISD::SRL, MVT::v16i16, 4 },
418  { ISD::SRA, MVT::v16i16, 4 },
419  { ISD::SHL, MVT::v8i32, 2 },
420  { ISD::SRL, MVT::v8i32, 4 },
421  { ISD::SRA, MVT::v8i32, 4 },
422  { ISD::SHL, MVT::v4i64, 2 },
423  { ISD::SRL, MVT::v4i64, 4 },
424  { ISD::SRA, MVT::v4i64, 4 },
425  };
426 
427  // Look for XOP lowering tricks.
428  if (ST->hasXOP())
429  if (const auto *Entry = CostTableLookup(XOPShiftCostTable, ISD, LT.second))
430  return LT.first * Entry->Cost;
431 
432  static const CostTblEntry SSE2UniformShiftCostTable[] = {
433  // Uniform splats are cheaper for the following instructions.
434  { ISD::SHL, MVT::v16i16, 2 }, // psllw.
435  { ISD::SHL, MVT::v8i32, 2 }, // pslld
436  { ISD::SHL, MVT::v4i64, 2 }, // psllq.
437 
438  { ISD::SRL, MVT::v16i16, 2 }, // psrlw.
439  { ISD::SRL, MVT::v8i32, 2 }, // psrld.
440  { ISD::SRL, MVT::v4i64, 2 }, // psrlq.
441 
442  { ISD::SRA, MVT::v16i16, 2 }, // psraw.
443  { ISD::SRA, MVT::v8i32, 2 }, // psrad.
444  { ISD::SRA, MVT::v2i64, 4 }, // 2 x psrad + shuffle.
445  { ISD::SRA, MVT::v4i64, 8 }, // 2 x psrad + shuffle.
446  };
447 
448  if (ST->hasSSE2() &&
449  ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
450  (Op2Info == TargetTransformInfo::OK_UniformValue))) {
451  if (const auto *Entry =
452  CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second))
453  return LT.first * Entry->Cost;
454  }
455 
456  if (ISD == ISD::SHL &&
457  Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
458  MVT VT = LT.second;
459  // A vector shift left by a non-uniform constant can be lowered
460  // into a vector multiply.
461  if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
462  ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
463  ISD = ISD::MUL;
464  }
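// Illustrative example (not in the original source): with SSE2, a left shift
// by a non-uniform constant vector, e.g.
//   %s = shl <8 x i16> %x, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
// is costed from here on as ISD::MUL, since it can be lowered to pmullw with a
// constant build_vector of power-of-two multipliers, giving the cheap
// { ISD::MUL, MVT::v8i16, 1 } entry in SSE2CostTable below.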
465 
466  static const CostTblEntry AVX2CostTable[] = {
467  { ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence.
468  { ISD::SHL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
469 
470  { ISD::SRL, MVT::v32i8, 11 }, // vpblendvb sequence.
471  { ISD::SRL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
472 
473  { ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence.
474  { ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence.
475  { ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence.
476  { ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence.
477 
478  { ISD::SUB, MVT::v32i8, 1 }, // psubb
479  { ISD::ADD, MVT::v32i8, 1 }, // paddb
480  { ISD::SUB, MVT::v16i16, 1 }, // psubw
481  { ISD::ADD, MVT::v16i16, 1 }, // paddw
482  { ISD::SUB, MVT::v8i32, 1 }, // psubd
483  { ISD::ADD, MVT::v8i32, 1 }, // paddd
484  { ISD::SUB, MVT::v4i64, 1 }, // psubq
485  { ISD::ADD, MVT::v4i64, 1 }, // paddq
486 
487  { ISD::MUL, MVT::v32i8, 17 }, // extend/pmullw/trunc sequence.
488  { ISD::MUL, MVT::v16i8, 7 }, // extend/pmullw/trunc sequence.
489  { ISD::MUL, MVT::v16i16, 1 }, // pmullw
490  { ISD::MUL, MVT::v8i32, 1 }, // pmulld
491  { ISD::MUL, MVT::v4i64, 8 }, // 3*pmuludq/3*shift/2*add
492 
493  { ISD::FDIV, MVT::f32, 7 }, // Haswell from http://www.agner.org/
494  { ISD::FDIV, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
495  { ISD::FDIV, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
496  { ISD::FDIV, MVT::f64, 14 }, // Haswell from http://www.agner.org/
497  { ISD::FDIV, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
498  { ISD::FDIV, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
499  };
500 
501  // Look for AVX2 lowering tricks for custom cases.
502  if (ST->hasAVX2())
503  if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
504  return LT.first * Entry->Cost;
505 
506  static const CostTblEntry AVX1CostTable[] = {
507  // We don't have to scalarize unsupported ops. We can issue two half-sized
508  // operations and we only need to extract the upper YMM half.
509  // Two ops + 1 extract + 1 insert = 4.
510  { ISD::MUL, MVT::v16i16, 4 },
511  { ISD::MUL, MVT::v8i32, 4 },
512  { ISD::SUB, MVT::v32i8, 4 },
513  { ISD::ADD, MVT::v32i8, 4 },
514  { ISD::SUB, MVT::v16i16, 4 },
515  { ISD::ADD, MVT::v16i16, 4 },
516  { ISD::SUB, MVT::v8i32, 4 },
517  { ISD::ADD, MVT::v8i32, 4 },
518  { ISD::SUB, MVT::v4i64, 4 },
519  { ISD::ADD, MVT::v4i64, 4 },
520 
521  // A v4i64 multiply is custom lowered as two split v2i64 vectors that then
522  // are lowered as a series of long multiplies (3), shifts (3) and adds (2).
523  // Because we believe v4i64 to be a legal type, we must also include the
524  // extract+insert in the cost table. Therefore, the cost here is 18
525  // instead of 8.
526  { ISD::MUL, MVT::v4i64, 18 },
527 
528  { ISD::MUL, MVT::v32i8, 26 }, // extend/pmullw/trunc sequence.
529 
530  { ISD::FDIV, MVT::f32, 14 }, // SNB from http://www.agner.org/
531  { ISD::FDIV, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
532  { ISD::FDIV, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
533  { ISD::FDIV, MVT::f64, 22 }, // SNB from http://www.agner.org/
534  { ISD::FDIV, MVT::v2f64, 22 }, // SNB from http://www.agner.org/
535  { ISD::FDIV, MVT::v4f64, 44 }, // SNB from http://www.agner.org/
536 
537  // Vectorizing division is a bad idea. See the SSE2 table for more comments.
538  { ISD::SDIV, MVT::v32i8, 32*20 },
539  { ISD::SDIV, MVT::v16i16, 16*20 },
540  { ISD::SDIV, MVT::v8i32, 8*20 },
541  { ISD::SDIV, MVT::v4i64, 4*20 },
542  { ISD::UDIV, MVT::v32i8, 32*20 },
543  { ISD::UDIV, MVT::v16i16, 16*20 },
544  { ISD::UDIV, MVT::v8i32, 8*20 },
545  { ISD::UDIV, MVT::v4i64, 4*20 },
546  };
547 
548  if (ST->hasAVX())
549  if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
550  return LT.first * Entry->Cost;
551 
552  static const CostTblEntry SSE42CostTable[] = {
553  { ISD::FDIV, MVT::f32, 14 }, // Nehalem from http://www.agner.org/
554  { ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
555  { ISD::FDIV, MVT::f64, 22 }, // Nehalem from http://www.agner.org/
556  { ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/
557  };
558 
559  if (ST->hasSSE42())
560  if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
561  return LT.first * Entry->Cost;
562 
563  static const CostTblEntry SSE41CostTable[] = {
564  { ISD::SHL, MVT::v16i8, 11 }, // pblendvb sequence.
565  { ISD::SHL, MVT::v32i8, 2*11 }, // pblendvb sequence.
566  { ISD::SHL, MVT::v8i16, 14 }, // pblendvb sequence.
567  { ISD::SHL, MVT::v16i16, 2*14 }, // pblendvb sequence.
568  { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld
569  { ISD::SHL, MVT::v8i32, 2*4 }, // pslld/paddd/cvttps2dq/pmulld
570 
571  { ISD::SRL, MVT::v16i8, 12 }, // pblendvb sequence.
572  { ISD::SRL, MVT::v32i8, 2*12 }, // pblendvb sequence.
573  { ISD::SRL, MVT::v8i16, 14 }, // pblendvb sequence.
574  { ISD::SRL, MVT::v16i16, 2*14 }, // pblendvb sequence.
575  { ISD::SRL, MVT::v4i32, 11 }, // Shift each lane + blend.
576  { ISD::SRL, MVT::v8i32, 2*11 }, // Shift each lane + blend.
577 
578  { ISD::SRA, MVT::v16i8, 24 }, // pblendvb sequence.
579  { ISD::SRA, MVT::v32i8, 2*24 }, // pblendvb sequence.
580  { ISD::SRA, MVT::v8i16, 14 }, // pblendvb sequence.
581  { ISD::SRA, MVT::v16i16, 2*14 }, // pblendvb sequence.
582  { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend.
583  { ISD::SRA, MVT::v8i32, 2*12 }, // Shift each lane + blend.
584 
585  { ISD::MUL, MVT::v4i32, 1 } // pmulld
586  };
587 
588  if (ST->hasSSE41())
589  if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
590  return LT.first * Entry->Cost;
591 
592  static const CostTblEntry SSE2CostTable[] = {
593  // We don't correctly identify costs of casts because they are marked as
594  // custom.
595  { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence.
596  { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence.
597  { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul.
598  { ISD::SHL, MVT::v8i32, 2*2*5 }, // We optimized this using mul.
599  { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence.
600  { ISD::SHL, MVT::v4i64, 2*4 }, // splat+shuffle sequence.
601 
602  { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence.
603  { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence.
604  { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend.
605  { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence.
606  { ISD::SRL, MVT::v4i64, 2*4 }, // splat+shuffle sequence.
607 
608  { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence.
609  { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence.
610  { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend.
611  { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence.
612  { ISD::SRA, MVT::v4i64, 2*12 }, // srl/xor/sub sequence.
613 
614  { ISD::MUL, MVT::v16i8, 12 }, // extend/pmullw/trunc sequence.
615  { ISD::MUL, MVT::v8i16, 1 }, // pmullw
616  { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle
617  { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add
618 
619  { ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/
620  { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/
621  { ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/
622  { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/
623 
624  // It is not a good idea to vectorize division. We have to scalarize it and
625  // in the process we will often end up having to spill regular
626  // registers. The overhead of division is going to dominate most kernels
627  // anyway, so try hard to prevent vectorization of division - it is
628  // generally a bad idea. Assume somewhat arbitrarily that we have to be able
629  // to hide "20 cycles" for each lane.
630  { ISD::SDIV, MVT::v16i8, 16*20 },
631  { ISD::SDIV, MVT::v8i16, 8*20 },
632  { ISD::SDIV, MVT::v4i32, 4*20 },
633  { ISD::SDIV, MVT::v2i64, 2*20 },
634  { ISD::UDIV, MVT::v16i8, 16*20 },
635  { ISD::UDIV, MVT::v8i16, 8*20 },
636  { ISD::UDIV, MVT::v4i32, 4*20 },
637  { ISD::UDIV, MVT::v2i64, 2*20 },
638  };
639 
640  if (ST->hasSSE2())
641  if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
642  return LT.first * Entry->Cost;
643 
644  static const CostTblEntry SSE1CostTable[] = {
645  { ISD::FDIV, MVT::f32, 17 }, // Pentium III from http://www.agner.org/
646  { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/
647  };
648 
649  if (ST->hasSSE1())
650  if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
651  return LT.first * Entry->Cost;
652 
653  // Fallback to the default implementation.
654  return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info);
655 }
656 
657 int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
658  Type *SubTp) {
659  // 64-bit packed float vectors (v2f32) are widened to type v4f32.
660  // 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
661  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
662 
663  // For broadcasts we are splatting the first element from the first input
664  // register, so we only need to reference that input, and all the output
665  // registers are the same.
666  if (Kind == TTI::SK_Broadcast)
667  LT.first = 1;
668 
669  // We are going to permute multiple sources, and the result will be in
670  // multiple destinations. Provide an accurate cost only for splits where
671  // the element type remains the same.
672  if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
673  MVT LegalVT = LT.second;
674  if (LegalVT.getVectorElementType().getSizeInBits() ==
675          Tp->getVectorElementType()->getPrimitiveSizeInBits() &&
676      LegalVT.getVectorNumElements() < Tp->getVectorNumElements()) {
677 
678  unsigned VecTySize = DL.getTypeStoreSize(Tp);
679  unsigned LegalVTSize = LegalVT.getStoreSize();
680  // Number of source vectors after legalization:
681  unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
682  // Number of destination vectors after legalization:
683  unsigned NumOfDests = LT.first;
684 
685  Type *SingleOpTy = VectorType::get(Tp->getVectorElementType(),
686  LegalVT.getVectorNumElements());
687 
688  unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
689  return NumOfShuffles *
690  getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr);
691  }
692 
693  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
694  }
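// Illustrative example (not in the original source): a single-source permute
// of <16 x i32> on a plain AVX target legalizes to LegalVT = v8i32 with
// LT.first == 2, so VecTySize = 64, LegalVTSize = 32, NumOfSrcs = 2 and
// NumOfDests = 2. The block above therefore returns
//   (2 - 1) * 2 = 2 two-source v8i32 shuffles,
// i.e. 2 * getShuffleCost(TTI::SK_PermuteTwoSrc, <8 x i32>, 0, nullptr).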
695 
696  // For 2-input shuffles, we must account for splitting the 2 inputs into many.
697  if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
698  // We assume that source and destination have the same vector type.
699  int NumOfDests = LT.first;
700  int NumOfShufflesPerDest = LT.first * 2 - 1;
701  LT.first = NumOfDests * NumOfShufflesPerDest;
702  }
703 
704  static const CostTblEntry AVX512VBMIShuffleTbl[] = {
705  { TTI::SK_Reverse, MVT::v64i8, 1 }, // vpermb
706  { TTI::SK_Reverse, MVT::v32i8, 1 }, // vpermb
707 
708  { TTI::SK_PermuteSingleSrc, MVT::v64i8, 1 }, // vpermb
709  { TTI::SK_PermuteSingleSrc, MVT::v32i8, 1 }, // vpermb
710 
711  { TTI::SK_PermuteTwoSrc, MVT::v64i8, 1 }, // vpermt2b
712  { TTI::SK_PermuteTwoSrc, MVT::v32i8, 1 }, // vpermt2b
713  { TTI::SK_PermuteTwoSrc, MVT::v16i8, 1 } // vpermt2b
714  };
715 
716  if (ST->hasVBMI())
717  if (const auto *Entry =
718  CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
719  return LT.first * Entry->Cost;
720 
721  static const CostTblEntry AVX512BWShuffleTbl[] = {
722  { TTI::SK_Broadcast, MVT::v32i16, 1 }, // vpbroadcastw
723  { TTI::SK_Broadcast, MVT::v64i8, 1 }, // vpbroadcastb
724 
725  { TTI::SK_Reverse, MVT::v32i16, 1 }, // vpermw
726  { TTI::SK_Reverse, MVT::v16i16, 1 }, // vpermw
727  { TTI::SK_Reverse, MVT::v64i8, 2 }, // pshufb + vshufi64x2
728 
729  { TTI::SK_PermuteSingleSrc, MVT::v32i16, 1 }, // vpermw
730  { TTI::SK_PermuteSingleSrc, MVT::v16i16, 1 }, // vpermw
731  { TTI::SK_PermuteSingleSrc, MVT::v8i16, 1 }, // vpermw
732  { TTI::SK_PermuteSingleSrc, MVT::v64i8, 8 }, // extend to v32i16
733  { TTI::SK_PermuteSingleSrc, MVT::v32i8, 3 }, // vpermw + zext/trunc
734 
735  { TTI::SK_PermuteTwoSrc, MVT::v32i16, 1 }, // vpermt2w
736  { TTI::SK_PermuteTwoSrc, MVT::v16i16, 1 }, // vpermt2w
737  { TTI::SK_PermuteTwoSrc, MVT::v8i16, 1 }, // vpermt2w
738  { TTI::SK_PermuteTwoSrc, MVT::v32i8, 3 }, // zext + vpermt2w + trunc
739  { TTI::SK_PermuteTwoSrc, MVT::v64i8, 19 }, // 6 * v32i8 + 1
740  { TTI::SK_PermuteTwoSrc, MVT::v16i8, 3 } // zext + vpermt2w + trunc
741  };
742 
743  if (ST->hasBWI())
744  if (const auto *Entry =
745  CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
746  return LT.first * Entry->Cost;
747 
748  static const CostTblEntry AVX512ShuffleTbl[] = {
749  { TTI::SK_Broadcast, MVT::v8f64, 1 }, // vbroadcastpd
750  { TTI::SK_Broadcast, MVT::v16f32, 1 }, // vbroadcastps
751  { TTI::SK_Broadcast, MVT::v8i64, 1 }, // vpbroadcastq
752  { TTI::SK_Broadcast, MVT::v16i32, 1 }, // vpbroadcastd
753 
754  { TTI::SK_Reverse, MVT::v8f64, 1 }, // vpermpd
755  { TTI::SK_Reverse, MVT::v16f32, 1 }, // vpermps
756  { TTI::SK_Reverse, MVT::v8i64, 1 }, // vpermq
757  { TTI::SK_Reverse, MVT::v16i32, 1 }, // vpermd
758 
759  { TTI::SK_PermuteSingleSrc, MVT::v8f64, 1 }, // vpermpd
760  { TTI::SK_PermuteSingleSrc, MVT::v4f64, 1 }, // vpermpd
761  { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // vpermpd
762  { TTI::SK_PermuteSingleSrc, MVT::v16f32, 1 }, // vpermps
763  { TTI::SK_PermuteSingleSrc, MVT::v8f32, 1 }, // vpermps
764  { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // vpermps
765  { TTI::SK_PermuteSingleSrc, MVT::v8i64, 1 }, // vpermq
766  { TTI::SK_PermuteSingleSrc, MVT::v4i64, 1 }, // vpermq
767  { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // vpermq
768  { TTI::SK_PermuteSingleSrc, MVT::v16i32, 1 }, // vpermd
769  { TTI::SK_PermuteSingleSrc, MVT::v8i32, 1 }, // vpermd
770  { TTI::SK_PermuteSingleSrc, MVT::v4i32, 1 }, // vpermd
771  { TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 }, // pshufb
772 
773  { TTI::SK_PermuteTwoSrc, MVT::v8f64, 1 }, // vpermt2pd
774  { TTI::SK_PermuteTwoSrc, MVT::v16f32, 1 }, // vpermt2ps
775  { TTI::SK_PermuteTwoSrc, MVT::v8i64, 1 }, // vpermt2q
776  { TTI::SK_PermuteTwoSrc, MVT::v16i32, 1 }, // vpermt2d
777  { TTI::SK_PermuteTwoSrc, MVT::v4f64, 1 }, // vpermt2pd
778  { TTI::SK_PermuteTwoSrc, MVT::v8f32, 1 }, // vpermt2ps
779  { TTI::SK_PermuteTwoSrc, MVT::v4i64, 1 }, // vpermt2q
780  { TTI::SK_PermuteTwoSrc, MVT::v8i32, 1 }, // vpermt2d
781  { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // vpermt2pd
782  { TTI::SK_PermuteTwoSrc, MVT::v4f32, 1 }, // vpermt2ps
783  { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // vpermt2q
784  { TTI::SK_PermuteTwoSrc, MVT::v4i32, 1 } // vpermt2d
785  };
786 
787  if (ST->hasAVX512())
788  if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
789  return LT.first * Entry->Cost;
790 
791  static const CostTblEntry AVX2ShuffleTbl[] = {
792  { TTI::SK_Broadcast, MVT::v4f64, 1 }, // vbroadcastpd
793  { TTI::SK_Broadcast, MVT::v8f32, 1 }, // vbroadcastps
794  { TTI::SK_Broadcast, MVT::v4i64, 1 }, // vpbroadcastq
795  { TTI::SK_Broadcast, MVT::v8i32, 1 }, // vpbroadcastd
796  { TTI::SK_Broadcast, MVT::v16i16, 1 }, // vpbroadcastw
797  { TTI::SK_Broadcast, MVT::v32i8, 1 }, // vpbroadcastb
798 
799  { TTI::SK_Reverse, MVT::v4f64, 1 }, // vpermpd
800  { TTI::SK_Reverse, MVT::v8f32, 1 }, // vpermps
801  { TTI::SK_Reverse, MVT::v4i64, 1 }, // vpermq
802  { TTI::SK_Reverse, MVT::v8i32, 1 }, // vpermd
803  { TTI::SK_Reverse, MVT::v16i16, 2 }, // vperm2i128 + pshufb
804  { TTI::SK_Reverse, MVT::v32i8, 2 }, // vperm2i128 + pshufb
805 
806  { TTI::SK_Alternate, MVT::v16i16, 1 }, // vpblendw
807  { TTI::SK_Alternate, MVT::v32i8, 1 } // vpblendvb
808  };
809 
810  if (ST->hasAVX2())
811  if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
812  return LT.first * Entry->Cost;
813 
814  static const CostTblEntry AVX1ShuffleTbl[] = {
815  { TTI::SK_Broadcast, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd
816  { TTI::SK_Broadcast, MVT::v8f32, 2 }, // vperm2f128 + vpermilps
817  { TTI::SK_Broadcast, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd
818  { TTI::SK_Broadcast, MVT::v8i32, 2 }, // vperm2f128 + vpermilps
819  { TTI::SK_Broadcast, MVT::v16i16, 3 }, // vpshuflw + vpshufd + vinsertf128
820  { TTI::SK_Broadcast, MVT::v32i8, 2 }, // vpshufb + vinsertf128
821 
822  { TTI::SK_Reverse, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd
823  { TTI::SK_Reverse, MVT::v8f32, 2 }, // vperm2f128 + vpermilps
824  { TTI::SK_Reverse, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd
825  { TTI::SK_Reverse, MVT::v8i32, 2 }, // vperm2f128 + vpermilps
826  { TTI::SK_Reverse, MVT::v16i16, 4 }, // vextractf128 + 2*pshufb
827  // + vinsertf128
828  { TTI::SK_Reverse, MVT::v32i8, 4 }, // vextractf128 + 2*pshufb
829  // + vinsertf128
830 
831  { TTI::SK_Alternate, MVT::v4i64, 1 }, // vblendpd
832  { TTI::SK_Alternate, MVT::v4f64, 1 }, // vblendpd
833  { TTI::SK_Alternate, MVT::v8i32, 1 }, // vblendps
834  { TTI::SK_Alternate, MVT::v8f32, 1 }, // vblendps
835  { TTI::SK_Alternate, MVT::v16i16, 3 }, // vpand + vpandn + vpor
836  { TTI::SK_Alternate, MVT::v32i8, 3 } // vpand + vpandn + vpor
837  };
838 
839  if (ST->hasAVX())
840  if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
841  return LT.first * Entry->Cost;
842 
843  static const CostTblEntry SSE41ShuffleTbl[] = {
844  { TTI::SK_Alternate, MVT::v2i64, 1 }, // pblendw
845  { TTI::SK_Alternate, MVT::v2f64, 1 }, // movsd
846  { TTI::SK_Alternate, MVT::v4i32, 1 }, // pblendw
847  { TTI::SK_Alternate, MVT::v4f32, 1 }, // blendps
848  { TTI::SK_Alternate, MVT::v8i16, 1 }, // pblendw
849  { TTI::SK_Alternate, MVT::v16i8, 1 } // pblendvb
850  };
851 
852  if (ST->hasSSE41())
853  if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
854  return LT.first * Entry->Cost;
855 
856  static const CostTblEntry SSSE3ShuffleTbl[] = {
857  { TTI::SK_Broadcast, MVT::v8i16, 1 }, // pshufb
858  { TTI::SK_Broadcast, MVT::v16i8, 1 }, // pshufb
859 
860  { TTI::SK_Reverse, MVT::v8i16, 1 }, // pshufb
861  { TTI::SK_Reverse, MVT::v16i8, 1 }, // pshufb
862 
863  { TTI::SK_Alternate, MVT::v8i16, 3 }, // pshufb + pshufb + por
864  { TTI::SK_Alternate, MVT::v16i8, 3 } // pshufb + pshufb + por
865  };
866 
867  if (ST->hasSSSE3())
868  if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
869  return LT.first * Entry->Cost;
870 
871  static const CostTblEntry SSE2ShuffleTbl[] = {
872  { TTI::SK_Broadcast, MVT::v2f64, 1 }, // shufpd
873  { TTI::SK_Broadcast, MVT::v2i64, 1 }, // pshufd
874  { TTI::SK_Broadcast, MVT::v4i32, 1 }, // pshufd
875  { TTI::SK_Broadcast, MVT::v8i16, 2 }, // pshuflw + pshufd
876  { TTI::SK_Broadcast, MVT::v16i8, 3 }, // unpck + pshuflw + pshufd
877 
878  { TTI::SK_Reverse, MVT::v2f64, 1 }, // shufpd
879  { TTI::SK_Reverse, MVT::v2i64, 1 }, // pshufd
880  { TTI::SK_Reverse, MVT::v4i32, 1 }, // pshufd
881  { TTI::SK_Reverse, MVT::v8i16, 3 }, // pshuflw + pshufhw + pshufd
882  { TTI::SK_Reverse, MVT::v16i8, 9 }, // 2*pshuflw + 2*pshufhw
883  // + 2*pshufd + 2*unpck + packus
884 
885  { TTI::SK_Alternate, MVT::v2i64, 1 }, // movsd
886  { TTI::SK_Alternate, MVT::v2f64, 1 }, // movsd
887  { TTI::SK_Alternate, MVT::v4i32, 2 }, // 2*shufps
888  { TTI::SK_Alternate, MVT::v8i16, 3 }, // pand + pandn + por
889  { TTI::SK_Alternate, MVT::v16i8, 3 } // pand + pandn + por
890  };
891 
892  if (ST->hasSSE2())
893  if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
894  return LT.first * Entry->Cost;
895 
896  static const CostTblEntry SSE1ShuffleTbl[] = {
897  { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
898  { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
899  { TTI::SK_Alternate, MVT::v4f32, 2 } // 2*shufps
900  };
901 
902  if (ST->hasSSE1())
903  if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
904  return LT.first * Entry->Cost;
905 
906  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
907 }
908 
909 int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
910  int ISD = TLI->InstructionOpcodeToISD(Opcode);
911  assert(ISD && "Invalid opcode");
912 
913  // FIXME: Need a better design of the cost table to handle non-simple types
914  // and the potentially massive combinations (elem_num x src_type x dst_type).
915 
916  static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
923 
930 
937 
944  };
945 
946  // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
947  // 256-bit wide vectors.
948 
949  static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
953 
958 
959  // v16i1 -> v16i32 - load + broadcast
970 
981 
1005 
1010  };
1011 
1012  static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
1029 
1036 
1039 
1041  };
1042 
1043  static const TypeConversionCostTblEntry AVXConversionTbl[] = {
1060 
1068 
1081 
1095  // The generic code to compute the scalar overhead is currently broken.
1096  // Work around this limitation by estimating the scalarization overhead
1097  // here. We have roughly 10 instructions per scalar element.
1098  // Multiply that by the vector width.
1099  // FIXME: remove that when PR19268 is fixed.
1104 
1107  // This node is expanded into scalarized operations but BasicTTI is overly
1108  // optimistic estimating its cost. It computes 3 per element (one
1109  // vector-extract, one scalar conversion and one vector-insert). The
1110  // problem is that the inserts form a read-modify-write chain so latency
1111  // should be factored in too. Inflating the cost per element by 1.
1114 
1117  };
1118 
1119  static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
1126 
1145 
1153 
1154  };
1155 
1156  static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
1157  // These are somewhat magic numbers justified by looking at the output of
1158  // Intel's IACA, running some kernels and making sure when we take
1159  // legalization into account the throughput will be overestimated.
1161  { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
1168 
1169  { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
1177 
1179 
1204 
1214  };
1215 
1216  std::pair<int, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
1217  std::pair<int, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst);
1218 
1219  if (ST->hasSSE2() && !ST->hasAVX()) {
1220  if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
1221  LTDest.second, LTSrc.second))
1222  return LTSrc.first * Entry->Cost;
1223  }
1224 
1225  EVT SrcTy = TLI->getValueType(DL, Src);
1226  EVT DstTy = TLI->getValueType(DL, Dst);
1227 
1228  // The function getSimpleVT only handles simple value types.
1229  if (!SrcTy.isSimple() || !DstTy.isSimple())
1230  return BaseT::getCastInstrCost(Opcode, Dst, Src);
1231 
1232  if (ST->hasDQI())
1233  if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD,
1234  DstTy.getSimpleVT(),
1235  SrcTy.getSimpleVT()))
1236  return Entry->Cost;
1237 
1238  if (ST->hasAVX512())
1239  if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD,
1240  DstTy.getSimpleVT(),
1241  SrcTy.getSimpleVT()))
1242  return Entry->Cost;
1243 
1244  if (ST->hasAVX2()) {
1245  if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
1246  DstTy.getSimpleVT(),
1247  SrcTy.getSimpleVT()))
1248  return Entry->Cost;
1249  }
1250 
1251  if (ST->hasAVX()) {
1252  if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
1253  DstTy.getSimpleVT(),
1254  SrcTy.getSimpleVT()))
1255  return Entry->Cost;
1256  }
1257 
1258  if (ST->hasSSE41()) {
1259  if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
1260  DstTy.getSimpleVT(),
1261  SrcTy.getSimpleVT()))
1262  return Entry->Cost;
1263  }
1264 
1265  if (ST->hasSSE2()) {
1266  if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
1267  DstTy.getSimpleVT(),
1268  SrcTy.getSimpleVT()))
1269  return Entry->Cost;
1270  }
1271 
1272  return BaseT::getCastInstrCost(Opcode, Dst, Src);
1273 }
1274 
1275 int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
1276  // Legalize the type.
1277  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
1278 
1279  MVT MTy = LT.second;
1280 
1281  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1282  assert(ISD && "Invalid opcode");
1283 
1284  static const CostTblEntry SSE2CostTbl[] = {
1285  { ISD::SETCC, MVT::v2i64, 8 },
1286  { ISD::SETCC, MVT::v4i32, 1 },
1287  { ISD::SETCC, MVT::v8i16, 1 },
1288  { ISD::SETCC, MVT::v16i8, 1 },
1289  };
1290 
1291  static const CostTblEntry SSE42CostTbl[] = {
1292  { ISD::SETCC, MVT::v2f64, 1 },
1293  { ISD::SETCC, MVT::v4f32, 1 },
1294  { ISD::SETCC, MVT::v2i64, 1 },
1295  };
1296 
1297  static const CostTblEntry AVX1CostTbl[] = {
1298  { ISD::SETCC, MVT::v4f64, 1 },
1299  { ISD::SETCC, MVT::v8f32, 1 },
1300  // AVX1 does not support 8-wide integer compare.
1301  { ISD::SETCC, MVT::v4i64, 4 },
1302  { ISD::SETCC, MVT::v8i32, 4 },
1303  { ISD::SETCC, MVT::v16i16, 4 },
1304  { ISD::SETCC, MVT::v32i8, 4 },
1305  };
1306 
1307  static const CostTblEntry AVX2CostTbl[] = {
1308  { ISD::SETCC, MVT::v4i64, 1 },
1309  { ISD::SETCC, MVT::v8i32, 1 },
1310  { ISD::SETCC, MVT::v16i16, 1 },
1311  { ISD::SETCC, MVT::v32i8, 1 },
1312  };
1313 
1314  static const CostTblEntry AVX512CostTbl[] = {
1315  { ISD::SETCC, MVT::v8i64, 1 },
1316  { ISD::SETCC, MVT::v16i32, 1 },
1317  { ISD::SETCC, MVT::v8f64, 1 },
1318  { ISD::SETCC, MVT::v16f32, 1 },
1319  };
1320 
1321  if (ST->hasAVX512())
1322  if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
1323  return LT.first * Entry->Cost;
1324 
1325  if (ST->hasAVX2())
1326  if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
1327  return LT.first * Entry->Cost;
1328 
1329  if (ST->hasAVX())
1330  if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
1331  return LT.first * Entry->Cost;
1332 
1333  if (ST->hasSSE42())
1334  if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
1335  return LT.first * Entry->Cost;
1336 
1337  if (ST->hasSSE2())
1338  if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
1339  return LT.first * Entry->Cost;
1340 
1341  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
1342 }
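// Illustrative example (not in the original source): an integer compare such
// as
//   %c = icmp sgt <8 x i32> %a, %b
// maps to ISD::SETCC with MVT::v8i32 and costs 1 with AVX2 (a single ymm
// vpcmpgtd) but 4 with AVX1 only, which lacks 256-bit integer compares and
// must split into two 128-bit compares plus extract/insert.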
1343 
1344 int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
1345                                       ArrayRef<Type *> Tys, FastMathFlags FMF) {
1346  // Costs should match the codegen from:
1347  // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
1348  // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
1349  // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
1350  // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
1351  // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
1352  static const CostTblEntry XOPCostTbl[] = {
1353  { ISD::BITREVERSE, MVT::v4i64, 4 },
1354  { ISD::BITREVERSE, MVT::v8i32, 4 },
1355  { ISD::BITREVERSE, MVT::v16i16, 4 },
1356  { ISD::BITREVERSE, MVT::v32i8, 4 },
1357  { ISD::BITREVERSE, MVT::v2i64, 1 },
1358  { ISD::BITREVERSE, MVT::v4i32, 1 },
1359  { ISD::BITREVERSE, MVT::v8i16, 1 },
1360  { ISD::BITREVERSE, MVT::v16i8, 1 },
1361  { ISD::BITREVERSE, MVT::i64, 3 },
1362  { ISD::BITREVERSE, MVT::i32, 3 },
1363  { ISD::BITREVERSE, MVT::i16, 3 },
1364  { ISD::BITREVERSE, MVT::i8, 3 }
1365  };
1366  static const CostTblEntry AVX2CostTbl[] = {
1367  { ISD::BITREVERSE, MVT::v4i64, 5 },
1368  { ISD::BITREVERSE, MVT::v8i32, 5 },
1369  { ISD::BITREVERSE, MVT::v16i16, 5 },
1370  { ISD::BITREVERSE, MVT::v32i8, 5 },
1371  { ISD::BSWAP, MVT::v4i64, 1 },
1372  { ISD::BSWAP, MVT::v8i32, 1 },
1373  { ISD::BSWAP, MVT::v16i16, 1 },
1374  { ISD::CTLZ, MVT::v4i64, 23 },
1375  { ISD::CTLZ, MVT::v8i32, 18 },
1376  { ISD::CTLZ, MVT::v16i16, 14 },
1377  { ISD::CTLZ, MVT::v32i8, 9 },
1378  { ISD::CTPOP, MVT::v4i64, 7 },
1379  { ISD::CTPOP, MVT::v8i32, 11 },
1380  { ISD::CTPOP, MVT::v16i16, 9 },
1381  { ISD::CTPOP, MVT::v32i8, 6 },
1382  { ISD::CTTZ, MVT::v4i64, 10 },
1383  { ISD::CTTZ, MVT::v8i32, 14 },
1384  { ISD::CTTZ, MVT::v16i16, 12 },
1385  { ISD::CTTZ, MVT::v32i8, 9 },
1386  { ISD::FSQRT, MVT::f32, 7 }, // Haswell from http://www.agner.org/
1387  { ISD::FSQRT, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
1388  { ISD::FSQRT, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
1389  { ISD::FSQRT, MVT::f64, 14 }, // Haswell from http://www.agner.org/
1390  { ISD::FSQRT, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
1391  { ISD::FSQRT, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
1392  };
1393  static const CostTblEntry AVX1CostTbl[] = {
1394  { ISD::BITREVERSE, MVT::v4i64, 10 },
1395  { ISD::BITREVERSE, MVT::v8i32, 10 },
1396  { ISD::BITREVERSE, MVT::v16i16, 10 },
1397  { ISD::BITREVERSE, MVT::v32i8, 10 },
1398  { ISD::BSWAP, MVT::v4i64, 4 },
1399  { ISD::BSWAP, MVT::v8i32, 4 },
1400  { ISD::BSWAP, MVT::v16i16, 4 },
1401  { ISD::CTLZ, MVT::v4i64, 46 },
1402  { ISD::CTLZ, MVT::v8i32, 36 },
1403  { ISD::CTLZ, MVT::v16i16, 28 },
1404  { ISD::CTLZ, MVT::v32i8, 18 },
1405  { ISD::CTPOP, MVT::v4i64, 14 },
1406  { ISD::CTPOP, MVT::v8i32, 22 },
1407  { ISD::CTPOP, MVT::v16i16, 18 },
1408  { ISD::CTPOP, MVT::v32i8, 12 },
1409  { ISD::CTTZ, MVT::v4i64, 20 },
1410  { ISD::CTTZ, MVT::v8i32, 28 },
1411  { ISD::CTTZ, MVT::v16i16, 24 },
1412  { ISD::CTTZ, MVT::v32i8, 18 },
1413  { ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/
1414  { ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
1415  { ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
1416  { ISD::FSQRT, MVT::f64, 21 }, // SNB from http://www.agner.org/
1417  { ISD::FSQRT, MVT::v2f64, 21 }, // SNB from http://www.agner.org/
1418  { ISD::FSQRT, MVT::v4f64, 43 }, // SNB from http://www.agner.org/
1419  };
1420  static const CostTblEntry SSE42CostTbl[] = {
1421  { ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/
1422  { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
1423  };
1424  static const CostTblEntry SSSE3CostTbl[] = {
1425  { ISD::BITREVERSE, MVT::v2i64, 5 },
1426  { ISD::BITREVERSE, MVT::v4i32, 5 },
1427  { ISD::BITREVERSE, MVT::v8i16, 5 },
1428  { ISD::BITREVERSE, MVT::v16i8, 5 },
1429  { ISD::BSWAP, MVT::v2i64, 1 },
1430  { ISD::BSWAP, MVT::v4i32, 1 },
1431  { ISD::BSWAP, MVT::v8i16, 1 },
1432  { ISD::CTLZ, MVT::v2i64, 23 },
1433  { ISD::CTLZ, MVT::v4i32, 18 },
1434  { ISD::CTLZ, MVT::v8i16, 14 },
1435  { ISD::CTLZ, MVT::v16i8, 9 },
1436  { ISD::CTPOP, MVT::v2i64, 7 },
1437  { ISD::CTPOP, MVT::v4i32, 11 },
1438  { ISD::CTPOP, MVT::v8i16, 9 },
1439  { ISD::CTPOP, MVT::v16i8, 6 },
1440  { ISD::CTTZ, MVT::v2i64, 10 },
1441  { ISD::CTTZ, MVT::v4i32, 14 },
1442  { ISD::CTTZ, MVT::v8i16, 12 },
1443  { ISD::CTTZ, MVT::v16i8, 9 }
1444  };
1445  static const CostTblEntry SSE2CostTbl[] = {
1446  { ISD::BSWAP, MVT::v2i64, 7 },
1447  { ISD::BSWAP, MVT::v4i32, 7 },
1448  { ISD::BSWAP, MVT::v8i16, 7 },
1449  { ISD::CTLZ, MVT::v2i64, 25 },
1450  { ISD::CTLZ, MVT::v4i32, 26 },
1451  { ISD::CTLZ, MVT::v8i16, 20 },
1452  { ISD::CTLZ, MVT::v16i8, 17 },
1453  { ISD::CTPOP, MVT::v2i64, 12 },
1454  { ISD::CTPOP, MVT::v4i32, 15 },
1455  { ISD::CTPOP, MVT::v8i16, 13 },
1456  { ISD::CTPOP, MVT::v16i8, 10 },
1457  { ISD::CTTZ, MVT::v2i64, 14 },
1458  { ISD::CTTZ, MVT::v4i32, 18 },
1459  { ISD::CTTZ, MVT::v8i16, 16 },
1460  { ISD::CTTZ, MVT::v16i8, 13 },
1461  { ISD::FSQRT, MVT::f64, 32 }, // Nehalem from http://www.agner.org/
1462  { ISD::FSQRT, MVT::v2f64, 32 }, // Nehalem from http://www.agner.org/
1463  };
1464  static const CostTblEntry SSE1CostTbl[] = {
1465  { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/
1466  { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
1467  };
1468 
1469  unsigned ISD = ISD::DELETED_NODE;
1470  switch (IID) {
1471  default:
1472  break;
1473  case Intrinsic::bitreverse:
1474  ISD = ISD::BITREVERSE;
1475  break;
1476  case Intrinsic::bswap:
1477  ISD = ISD::BSWAP;
1478  break;
1479  case Intrinsic::ctlz:
1480  ISD = ISD::CTLZ;
1481  break;
1482  case Intrinsic::ctpop:
1483  ISD = ISD::CTPOP;
1484  break;
1485  case Intrinsic::cttz:
1486  ISD = ISD::CTTZ;
1487  break;
1488  case Intrinsic::sqrt:
1489  ISD = ISD::FSQRT;
1490  break;
1491  }
1492 
1493  // Legalize the type.
1494  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
1495  MVT MTy = LT.second;
1496 
1497  // Attempt to lookup cost.
1498  if (ST->hasXOP())
1499  if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
1500  return LT.first * Entry->Cost;
1501 
1502  if (ST->hasAVX2())
1503  if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
1504  return LT.first * Entry->Cost;
1505 
1506  if (ST->hasAVX())
1507  if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
1508  return LT.first * Entry->Cost;
1509 
1510  if (ST->hasSSE42())
1511  if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
1512  return LT.first * Entry->Cost;
1513 
1514  if (ST->hasSSSE3())
1515  if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
1516  return LT.first * Entry->Cost;
1517 
1518  if (ST->hasSSE2())
1519  if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
1520  return LT.first * Entry->Cost;
1521 
1522  if (ST->hasSSE1())
1523  if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
1524  return LT.first * Entry->Cost;
1525 
1526  return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF);
1527 }
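// Illustrative example (not in the original source): a population-count
// intrinsic call
//   %p = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %v)
// maps to ISD::CTPOP with MVT::v4i32 and, on an SSSE3-capable target, is
// costed at 11 via the SSSE3CostTbl above, which the backend typically
// expands as a pshufb-based nibble lookup.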
1528 
1529 int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
1530                                       ArrayRef<Value *> Args, FastMathFlags FMF) {
1531  return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF);
1532 }
1533 
1534 int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
1535  assert(Val->isVectorTy() && "This must be a vector type");
1536 
1537  Type *ScalarType = Val->getScalarType();
1538 
1539  if (Index != -1U) {
1540  // Legalize the type.
1541  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
1542 
1543  // This type is legalized to a scalar type.
1544  if (!LT.second.isVector())
1545  return 0;
1546 
1547  // The type may be split. Normalize the index to the new type.
1548  unsigned Width = LT.second.getVectorNumElements();
1549  Index = Index % Width;
1550 
1551  // Floating point scalars are already located in index #0.
1552  if (ScalarType->isFloatingPointTy() && Index == 0)
1553  return 0;
1554  }
1555 
1556  // Add to the base cost if we know that the extracted element of a vector is
1557  // destined to be moved to and used in the integer register file.
1558  int RegisterFileMoveCost = 0;
1559  if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy())
1560  RegisterFileMoveCost = 1;
1561 
1562  return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
1563 }
1564 
1565 int X86TTIImpl::getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
1566  assert (Ty->isVectorTy() && "Can only scalarize vectors");
1567  int Cost = 0;
1568 
1569  for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
1570  if (Insert)
1571  Cost += getVectorInstrCost(Instruction::InsertElement, Ty, i);
1572  if (Extract)
1573  Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, i);
1574  }
1575 
1576  return Cost;
1577 }
1578 
1579 int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
1580  unsigned AddressSpace) {
1581  // Handle non-power-of-two vectors such as <3 x float>
1582  if (VectorType *VTy = dyn_cast<VectorType>(Src)) {
1583  unsigned NumElem = VTy->getVectorNumElements();
1584 
1585  // Handle a few common cases:
1586  // <3 x float>
1587  if (NumElem == 3 && VTy->getScalarSizeInBits() == 32)
1588  // Cost = 64 bit store + extract + 32 bit store.
1589  return 3;
1590 
1591  // <3 x double>
1592  if (NumElem == 3 && VTy->getScalarSizeInBits() == 64)
1593  // Cost = 128 bit store + unpack + 64 bit store.
1594  return 3;
1595 
1596  // Assume that all other non-power-of-two numbers are scalarized.
1597  if (!isPowerOf2_32(NumElem)) {
1598  int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment,
1599  AddressSpace);
1600  int SplitCost = getScalarizationOverhead(Src, Opcode == Instruction::Load,
1601  Opcode == Instruction::Store);
1602  return NumElem * Cost + SplitCost;
1603  }
1604  }
1605 
1606  // Legalize the type.
1607  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
1608  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
1609  "Invalid Opcode");
1610 
1611  // Each load/store unit costs 1.
1612  int Cost = LT.first * 1;
1613 
1614  // This isn't exactly right. We're using slow unaligned 32-byte accesses as a
1615  // proxy for a double-pumped AVX memory interface such as on Sandybridge.
1616  if (LT.second.getStoreSize() == 32 && ST->isUnalignedMem32Slow())
1617  Cost *= 2;
1618 
1619  return Cost;
1620 }
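// Illustrative example (not in the original source): a store of <3 x float>
// hits the special case above and returns 3 (modeled as a 64-bit store, an
// extract, and a 32-bit store), while a <5 x float> store falls into the
// generic non-power-of-two path and is costed as five scalar stores plus the
// extraction overhead from getScalarizationOverhead.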
1621 
1622 int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
1623  unsigned Alignment,
1624  unsigned AddressSpace) {
1625  VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy);
1626  if (!SrcVTy)
1627  // For a scalar, take the regular cost without a mask.
1628  return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace);
1629 
1630  unsigned NumElem = SrcVTy->getVectorNumElements();
1631  VectorType *MaskTy =
1632  VectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
1633  if ((Opcode == Instruction::Load && !isLegalMaskedLoad(SrcVTy)) ||
1634  (Opcode == Instruction::Store && !isLegalMaskedStore(SrcVTy)) ||
1635  !isPowerOf2_32(NumElem)) {
1636  // Scalarization
1637  int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true);
1638  int ScalarCompareCost = getCmpSelInstrCost(
1639  Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr);
1640  int BranchCost = getCFInstrCost(Instruction::Br);
1641  int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
1642 
1643  int ValueSplitCost = getScalarizationOverhead(
1644  SrcVTy, Opcode == Instruction::Load, Opcode == Instruction::Store);
1645  int MemopCost =
1646  NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
1647  Alignment, AddressSpace);
1648  return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
1649  }
1650 
1651  // Legalize the type.
1652  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy);
1653  auto VT = TLI->getValueType(DL, SrcVTy);
1654  int Cost = 0;
1655  if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
1656  LT.second.getVectorNumElements() == NumElem)
1657  // Promotion requires expand/truncate for data and a shuffle for mask.
1658  Cost += getShuffleCost(TTI::SK_Alternate, SrcVTy, 0, nullptr) +
1659  getShuffleCost(TTI::SK_Alternate, MaskTy, 0, nullptr);
1660 
1661  else if (LT.second.getVectorNumElements() > NumElem) {
1662  VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(),
1663  LT.second.getVectorNumElements());
1664  // Expanding requires filling the mask with zeroes.
1665  Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy);
1666  }
1667  if (!ST->hasAVX512())
1668  return Cost + LT.first * 4; // Each maskmov costs 4.
1669 
1670  // AVX-512 masked load/store is cheaper.
1671  return Cost + LT.first;
1672 }
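// Illustrative example (not in the original source): a legal masked load of
// <8 x float> (the llvm.masked.load intrinsic) is charged roughly
// LT.first * 4 on AVX/AVX2, where it lowers to vmaskmovps, but only LT.first
// on AVX-512, where masking is part of the regular load encoding; illegal or
// non-power-of-two cases take the scalarized compare-and-branch path above.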
1673 
1674 int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
1675                                           const SCEV *Ptr) {
1676  // Address computations in vectorized code with non-consecutive addresses will
1677  // likely result in more instructions compared to scalar code where the
1678  // computation can more often be merged into the index mode. The resulting
1679  // extra micro-ops can significantly decrease throughput.
1680  unsigned NumVectorInstToHideOverhead = 10;
1681 
1682  // The cost of a strided access computation is hidden by the indexing
1683  // modes of X86 regardless of the stride value. We don't believe that there
1684  // is a difference between constant strided access in general and a constant
1685  // stride value which is less than or equal to 64.
1686  // Even in the case of a (loop invariant) stride whose value is not known at
1687  // compile time, the address computation will not incur more than one extra
1688  // ADD instruction.
1689  if (Ty->isVectorTy() && SE) {
1690  if (!BaseT::isStridedAccess(Ptr))
1691  return NumVectorInstToHideOverhead;
1692  if (!BaseT::getConstantStrideStep(SE, Ptr))
1693  return 1;
1694  }
1695 
1696  return BaseT::getAddressComputationCost(Ty, SE, Ptr);
1697 }
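// Illustrative example (not in the original source): for a vectorized access
// whose pointer SCEV is not strided at all (a gather of unrelated addresses),
// this returns 10 to account for the extra address-computation micro-ops; a
// strided access with a loop-invariant but compile-time-unknown stride
// returns 1 (one extra ADD); constant-stride accesses fall through to the
// base implementation.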
1698 
1699 int X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy,
1700  bool IsPairwise) {
1701 
1702  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
1703 
1704  MVT MTy = LT.second;
1705 
1706  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1707  assert(ISD && "Invalid opcode");
1708 
1709  // We use the Intel Architecture Code Analyzer (IACA) to measure the
1710  // throughput and use it as the cost.
1711 
1712  static const CostTblEntry SSE42CostTblPairWise[] = {
1713  { ISD::FADD, MVT::v2f64, 2 },
1714  { ISD::FADD, MVT::v4f32, 4 },
1715  { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
1716  { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5".
1717  { ISD::ADD, MVT::v8i16, 5 },
1718  };
1719 
1720  static const CostTblEntry AVX1CostTblPairWise[] = {
1721  { ISD::FADD, MVT::v4f32, 4 },
1722  { ISD::FADD, MVT::v4f64, 5 },
1723  { ISD::FADD, MVT::v8f32, 7 },
1724  { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
1725  { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5".
1726  { ISD::ADD, MVT::v4i64, 5 }, // The data reported by the IACA tool is "4.8".
1727  { ISD::ADD, MVT::v8i16, 5 },
1728  { ISD::ADD, MVT::v8i32, 5 },
1729  };
1730 
1731  static const CostTblEntry SSE42CostTblNoPairWise[] = {
1732  { ISD::FADD, MVT::v2f64, 2 },
1733  { ISD::FADD, MVT::v4f32, 4 },
1734  { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
1735  { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
1736  { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
1737  };
1738 
1739  static const CostTblEntry AVX1CostTblNoPairWise[] = {
1740  { ISD::FADD, MVT::v4f32, 3 },
1741  { ISD::FADD, MVT::v4f64, 3 },
1742  { ISD::FADD, MVT::v8f32, 4 },
1743  { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
1744  { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "2.8".
1745  { ISD::ADD, MVT::v4i64, 3 },
1746  { ISD::ADD, MVT::v8i16, 4 },
1747  { ISD::ADD, MVT::v8i32, 5 },
1748  };
1749 
1750  if (IsPairwise) {
1751  if (ST->hasAVX())
1752  if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
1753  return LT.first * Entry->Cost;
1754 
1755  if (ST->hasSSE42())
1756  if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
1757  return LT.first * Entry->Cost;
1758  } else {
1759  if (ST->hasAVX())
1760  if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
1761  return LT.first * Entry->Cost;
1762 
1763  if (ST->hasSSE42())
1764  if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
1765  return LT.first * Entry->Cost;
1766  }
1767 
1768  return BaseT::getReductionCost(Opcode, ValTy, IsPairwise);
1769 }
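// Editorial sketch (not part of the LLVM source): the cost-table pattern used
// above in miniature. An (ISD opcode, type) pair is looked up in a
// per-subtarget table and the per-legalized-type cost is scaled by the number
// of legalized parts (LT.first). The table contents and type ids here are
// made up for illustration.
#include <cstdio>

struct MiniCostEntry { int ISD; int Ty; int Cost; };

static const MiniCostEntry PairwiseTbl[] = {
  { /*FADD*/ 1, /*v2f64*/ 10, 2 },
  { /*FADD*/ 1, /*v4f32*/ 11, 4 },
};

static const MiniCostEntry *miniLookup(const MiniCostEntry *Tbl, int N,
                                       int ISD, int Ty) {
  for (int I = 0; I < N; ++I)
    if (Tbl[I].ISD == ISD && Tbl[I].Ty == Ty)
      return &Tbl[I];
  return nullptr; // no entry: fall back to the generic cost model
}

int main() {
  // A v4f32 pairwise FADD reduction that legalizes to a single part (LT.first == 1).
  if (const MiniCostEntry *E = miniLookup(PairwiseTbl, 2, 1, 11))
    std::printf("cost = %d\n", 1 * E->Cost); // prints "cost = 4"
}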
1770 
1771 /// \brief Calculate the cost of materializing a 64-bit value. This helper
1772 /// method might only calculate a fraction of a larger immediate. Therefore it
1773 /// is valid to return a cost of ZERO.
1774 int X86TTIImpl::getIntImmCost(int64_t Val) {
1775  if (Val == 0)
1776  return TTI::TCC_Free;
1777 
1778  if (isInt<32>(Val))
1779  return TTI::TCC_Basic;
1780 
1781  return 2 * TTI::TCC_Basic;
1782 }
1783 
1784 int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
1785  assert(Ty->isIntegerTy());
1786 
1787  unsigned BitSize = Ty->getPrimitiveSizeInBits();
1788  if (BitSize == 0)
1789  return ~0U;
1790 
1791  // Never hoist constants larger than 128 bits, because this might lead to
1792  // incorrect code generation or assertions in codegen.
1793  // FIXME: Create a cost model for types larger than i128 once the codegen
1794  // issues have been fixed.
1795  if (BitSize > 128)
1796  return TTI::TCC_Free;
1797 
1798  if (Imm == 0)
1799  return TTI::TCC_Free;
1800 
1801  // Sign-extend all constants to a multiple of 64-bit.
1802  APInt ImmVal = Imm;
1803  if (BitSize & 0x3f)
1804  ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
1805 
1806  // Split the constant into 64-bit chunks and calculate the cost for each
1807  // chunk.
1808  int Cost = 0;
1809  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
1810  APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
1811  int64_t Val = Tmp.getSExtValue();
1812  Cost += getIntImmCost(Val);
1813  }
1814  // We need at least one instruction to materialize the constant.
1815  return std::max(1, Cost);
1816 }
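// Editorial sketch (not part of the LLVM source): the 64-bit chunking above
// redone with plain integers. Each 64-bit chunk is charged TCC_Free (0) if it
// is zero, TCC_Basic (1) if it fits in 32 bits, and 2 * TCC_Basic otherwise;
// the total is clamped to at least one instruction.
#include <algorithm>
#include <cstdint>
#include <cstdio>

static int chunkCostSketch(int64_t Val) {
  if (Val == 0) return 0;                             // TCC_Free
  if (Val >= INT32_MIN && Val <= INT32_MAX) return 1; // TCC_Basic
  return 2;                                           // 2 * TCC_Basic
}

int main() {
  // A hypothetical i128 immediate split into low and high 64-bit halves.
  int64_t Lo = 0x12345678, Hi = 0; // the high half happens to be zero
  int Cost = chunkCostSketch(Lo) + chunkCostSketch(Hi);
  std::printf("%d\n", std::max(1, Cost)); // prints 1
}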
1817 
1818 int X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
1819  Type *Ty) {
1820  assert(Ty->isIntegerTy());
1821 
1822  unsigned BitSize = Ty->getPrimitiveSizeInBits();
1823  // There is no cost model for constants with a bit size of 0. Return TCC_Free
1824  // here, so that constant hoisting will ignore this constant.
1825  if (BitSize == 0)
1826  return TTI::TCC_Free;
1827 
1828  unsigned ImmIdx = ~0U;
1829  switch (Opcode) {
1830  default:
1831  return TTI::TCC_Free;
1832  case Instruction::GetElementPtr:
1833  // Always hoist the base address of a GetElementPtr. This prevents the
1834  // creation of new constants for every base constant that gets constant
1835  // folded with the offset.
1836  if (Idx == 0)
1837  return 2 * TTI::TCC_Basic;
1838  return TTI::TCC_Free;
1839  case Instruction::Store:
1840  ImmIdx = 0;
1841  break;
1842  case Instruction::ICmp:
1843  // This is an imperfect hack to prevent constant hoisting of
1844  // compares that might be trying to check if a 64-bit value fits in
1845  // 32 bits. The backend can optimize these cases using a right shift by 32.
1846  // Ideally we would check the compare predicate here. There are also other
1847  // similar immediates the backend can use shifts for.
1848  if (Idx == 1 && Imm.getBitWidth() == 64) {
1849  uint64_t ImmVal = Imm.getZExtValue();
1850  if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
1851  return TTI::TCC_Free;
1852  }
1853  ImmIdx = 1;
1854  break;
1855  case Instruction::And:
1856  // We support 64-bit ANDs with immediates that have 32 bits of leading zeroes
1857  // by using a 32-bit operation with implicit zero extension. Detect such
1858  // immediates here as the normal path expects bit 31 to be sign extended.
1859  if (Idx == 1 && Imm.getBitWidth() == 64 && isUInt<32>(Imm.getZExtValue()))
1860  return TTI::TCC_Free;
1861  LLVM_FALLTHROUGH;
1862  case Instruction::Add:
1863  case Instruction::Sub:
1864  case Instruction::Mul:
1865  case Instruction::UDiv:
1866  case Instruction::SDiv:
1867  case Instruction::URem:
1868  case Instruction::SRem:
1869  case Instruction::Or:
1870  case Instruction::Xor:
1871  ImmIdx = 1;
1872  break;
1873  // Always return TCC_Free for the shift value of a shift instruction.
1874  case Instruction::Shl:
1875  case Instruction::LShr:
1876  case Instruction::AShr:
1877  if (Idx == 1)
1878  return TTI::TCC_Free;
1879  break;
1880  case Instruction::Trunc:
1881  case Instruction::ZExt:
1882  case Instruction::SExt:
1883  case Instruction::IntToPtr:
1884  case Instruction::PtrToInt:
1885  case Instruction::BitCast:
1886  case Instruction::PHI:
1887  case Instruction::Call:
1888  case Instruction::Select:
1889  case Instruction::Ret:
1890  case Instruction::Load:
1891  break;
1892  }
1893 
1894  if (Idx == ImmIdx) {
1895  int NumConstants = (BitSize + 63) / 64;
1896  int Cost = X86TTIImpl::getIntImmCost(Imm, Ty);
1897  return (Cost <= NumConstants * TTI::TCC_Basic)
1898  ? static_cast<int>(TTI::TCC_Free)
1899  : Cost;
1900  }
1901 
1902  return X86TTIImpl::getIntImmCost(Imm, Ty);
1903 }
1904 
1905 int X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
1906  Type *Ty) {
1907  assert(Ty->isIntegerTy());
1908 
1909  unsigned BitSize = Ty->getPrimitiveSizeInBits();
1910  // There is no cost model for constants with a bit size of 0. Return TCC_Free
1911  // here, so that constant hoisting will ignore this constant.
1912  if (BitSize == 0)
1913  return TTI::TCC_Free;
1914 
1915  switch (IID) {
1916  default:
1917  return TTI::TCC_Free;
1918  case Intrinsic::sadd_with_overflow:
1919  case Intrinsic::uadd_with_overflow:
1920  case Intrinsic::ssub_with_overflow:
1921  case Intrinsic::usub_with_overflow:
1922  case Intrinsic::smul_with_overflow:
1923  case Intrinsic::umul_with_overflow:
1924  if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
1925  return TTI::TCC_Free;
1926  break;
1927  case Intrinsic::experimental_stackmap:
1928  if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
1929  return TTI::TCC_Free;
1930  break;
1931  case Intrinsic::experimental_patchpoint_void:
1932  case Intrinsic::experimental_patchpoint_i64:
1933  if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
1934  return TTI::TCC_Free;
1935  break;
1936  }
1937  return X86TTIImpl::getIntImmCost(Imm, Ty);
1938 }
1939 
1940 // Return an average cost of a Gather / Scatter instruction; this may be refined later.
1941 int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
1942  unsigned Alignment, unsigned AddressSpace) {
1943 
1944  assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
1945  unsigned VF = SrcVTy->getVectorNumElements();
1946 
1947  // Try to reduce the index size from 64 bits (the default for GEP)
1948  // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
1949  // operation will use 16 x 64 indices, which do not fit in a zmm register and
1950  // need to be split. Also check that the base pointer is the same for all lanes,
1951  // and that there's at most one variable index.
1952  auto getIndexSizeInBits = [](Value *Ptr, const DataLayout& DL) {
1953  unsigned IndexSize = DL.getPointerSizeInBits();
1954  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
1955  if (IndexSize < 64 || !GEP)
1956  return IndexSize;
1957 
1958  unsigned NumOfVarIndices = 0;
1959  Value *Ptrs = GEP->getPointerOperand();
1960  if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
1961  return IndexSize;
1962  for (unsigned i = 1; i < GEP->getNumOperands(); ++i) {
1963  if (isa<Constant>(GEP->getOperand(i)))
1964  continue;
1965  Type *IndxTy = GEP->getOperand(i)->getType();
1966  if (IndxTy->isVectorTy())
1967  IndxTy = IndxTy->getVectorElementType();
1968  if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
1969  !isa<SExtInst>(GEP->getOperand(i))) ||
1970  ++NumOfVarIndices > 1)
1971  return IndexSize; // 64
1972  }
1973  return (unsigned)32;
1974  };
1975 
1976 
1977  // Try to reduce IndexSize to 32 bits for 16-element vectors.
1978  // By default the IndexSize is equal to the pointer size.
1979  unsigned IndexSize = (VF >= 16) ? getIndexSizeInBits(Ptr, DL) :
1980  DL.getPointerSizeInBits();
1981 
1982  Type *IndexVTy = VectorType::get(IntegerType::get(SrcVTy->getContext(),
1983  IndexSize), VF);
1984  std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy);
1985  std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy);
1986  int SplitFactor = std::max(IdxsLT.first, SrcLT.first);
1987  if (SplitFactor > 1) {
1988  // Handle splitting of vector of pointers
1989  Type *SplitSrcTy = VectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
1990  return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment,
1991  AddressSpace);
1992  }
1993 
1994  // The gather / scatter cost is given by Intel architects. It is a rough
1995  // number since we are looking at one instruction at a time.
1996  const int GSOverhead = 2;
1997  return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
1998  Alignment, AddressSpace);
1999 }
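// Editorial sketch (not part of the LLVM source): the final formula above for
// the case where no splitting is needed (SplitFactor == 1), with a
// hypothetical per-lane scalar memory-op cost.
inline unsigned gatherScatterVectorCostSketch(unsigned VF,
                                              unsigned ScalarMemOpCost) {
  const unsigned GSOverhead = 2; // rough per-instruction overhead (Intel estimate)
  return GSOverhead + VF * ScalarMemOpCost;
}
// e.g. a 16-lane gather with a unit scalar load cost: 2 + 16 * 1 == 18.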
2000 
2001 /// Return the cost of full scalarization of a gather / scatter operation.
2002 ///
2003 /// Opcode - Load or Store instruction.
2004 /// SrcVTy - The type of the data vector that should be gathered or scattered.
2005 /// VariableMask - The mask is non-constant at compile time.
2006 /// Alignment - Alignment for one element.
2007 /// AddressSpace - the address space of the pointer(s).
2008 ///
2009 int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
2010  bool VariableMask, unsigned Alignment,
2011  unsigned AddressSpace) {
2012  unsigned VF = SrcVTy->getVectorNumElements();
2013 
2014  int MaskUnpackCost = 0;
2015  if (VariableMask) {
2016  VectorType *MaskTy =
2017  VectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
2018  MaskUnpackCost = getScalarizationOverhead(MaskTy, false, true);
2019  int ScalarCompareCost =
2020  getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()),
2021  nullptr);
2022  int BranchCost = getCFInstrCost(Instruction::Br);
2023  MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
2024  }
2025 
2026  // The cost of the scalar loads/stores.
2027  int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
2028  Alignment, AddressSpace);
2029 
2030  int InsertExtractCost = 0;
2031  if (Opcode == Instruction::Load)
2032  for (unsigned i = 0; i < VF; ++i)
2033  // Add the cost of inserting each scalar load into the vector
2034  InsertExtractCost +=
2035  getVectorInstrCost(Instruction::InsertElement, SrcVTy, i);
2036  else
2037  for (unsigned i = 0; i < VF; ++i)
2038  // Add the cost of extracting each element out of the data vector
2039  InsertExtractCost +=
2040  getVectorInstrCost(Instruction::ExtractElement, SrcVTy, i);
2041 
2042  return MemoryOpCost + MaskUnpackCost + InsertExtractCost;
2043 }
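// Editorial sketch (not part of the LLVM source): the scalarized gather/scatter
// cost above with hypothetical unit costs (mask extract = icmp = branch =
// scalar memop = insert/extract = 1).
inline unsigned gatherScatterScalarCostSketch(unsigned VF, bool VariableMask) {
  unsigned MaskUnpackCost =
      VariableMask ? VF * (1 /*extract*/ + 1 /*icmp*/ + 1 /*br*/) : 0;
  unsigned MemoryOpCost = VF * 1;      // VF scalar loads or stores
  unsigned InsertExtractCost = VF * 1; // build or deconstruct the data vector
  return MemoryOpCost + MaskUnpackCost + InsertExtractCost;
}
// e.g. a variable-mask 8-lane gather: 8 + 8 * 3 + 8 == 40 under these assumptions.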
2044 
2045 /// Calculate the cost of Gather / Scatter operation
2046 int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
2047  Value *Ptr, bool VariableMask,
2048  unsigned Alignment) {
2049  assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
2050  unsigned VF = SrcVTy->getVectorNumElements();
2051  PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
2052  if (!PtrTy && Ptr->getType()->isVectorTy())
2053  PtrTy = dyn_cast<PointerType>(Ptr->getType()->getVectorElementType());
2054  assert(PtrTy && "Unexpected type for Ptr argument");
2055  unsigned AddressSpace = PtrTy->getAddressSpace();
2056 
2057  bool Scalarize = false;
2058  if ((Opcode == Instruction::Load && !isLegalMaskedGather(SrcVTy)) ||
2059  (Opcode == Instruction::Store && !isLegalMaskedScatter(SrcVTy)))
2060  Scalarize = true;
2061  // Gather / Scatter for 2-element vectors is not profitable on KNL / SKX.
2062  // A 4-element gather/scatter instruction does not exist on KNL.
2063  // We could extend it to 8 elements, but zeroing the upper bits of
2064  // the mask vector would add more instructions. Right now we give the scalar
2065  // cost of a 4-element vector on KNL. TODO: Check whether the gather/scatter
2066  // instruction is better in the VariableMask case.
2067  if (VF == 2 || (VF == 4 && !ST->hasVLX()))
2068  Scalarize = true;
2069 
2070  if (Scalarize)
2071  return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment,
2072  AddressSpace);
2073 
2074  return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
2075 }
2076 
2077 bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
2078  Type *ScalarTy = DataTy->getScalarType();
2079  int DataWidth = isa<PointerType>(ScalarTy) ?
2080  DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();
2081 
2082  return ((DataWidth == 32 || DataWidth == 64) && ST->hasAVX()) ||
2083  ((DataWidth == 8 || DataWidth == 16) && ST->hasBWI());
2084 }
2085 
2086 bool X86TTIImpl::isLegalMaskedStore(Type *DataType) {
2087  return isLegalMaskedLoad(DataType);
2088 }
2089 
2090 bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
2091  // This function is currently called in two cases: from the Loop Vectorizer
2092  // and from the Scalarizer.
2093  // When the Loop Vectorizer asks about the legality of the feature,
2094  // the vectorization factor is not calculated yet. The Loop Vectorizer
2095  // sends a scalar type and the decision is based on the width of the
2096  // scalar element.
2097  // Later on, the cost model will estimate usage of this intrinsic based on
2098  // the vector type.
2099  // The Scalarizer asks again about legality. It sends a vector type.
2099  // The Scalarizer asks again about legality. It sends a vector type.
2100  // In this case we can reject non-power-of-2 vectors.
2101  if (isa<VectorType>(DataTy) && !isPowerOf2_32(DataTy->getVectorNumElements()))
2102  return false;
2103  Type *ScalarTy = DataTy->getScalarType();
2104  int DataWidth = isa<PointerType>(ScalarTy) ?
2105  DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();
2106 
2107  // AVX-512 allows gather and scatter
2108  return (DataWidth == 32 || DataWidth == 64) && ST->hasAVX512();
2109 }
2110 
2111 bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) {
2112  return isLegalMaskedGather(DataType);
2113 }
2114 
2115 bool X86TTIImpl::areInlineCompatible(const Function *Caller,
2116  const Function *Callee) const {
2117  const TargetMachine &TM = getTLI()->getTargetMachine();
2118 
2119  // Treat this as a subset check on the subtarget features.
2120  const FeatureBitset &CallerBits =
2121  TM.getSubtargetImpl(*Caller)->getFeatureBits();
2122  const FeatureBitset &CalleeBits =
2123  TM.getSubtargetImpl(*Callee)->getFeatureBits();
2124 
2125  // FIXME: This is likely too limiting as it will include subtarget features
2126  // that we might not care about for inlining, but it is conservatively
2127  // correct.
2128  return (CallerBits & CalleeBits) == CalleeBits;
2129 }
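// Editorial sketch (not part of the LLVM source): the subset test above, with
// std::bitset standing in for FeatureBitset. The callee may only be inlined if
// every subtarget feature it was compiled for is also available in the caller.
#include <bitset>
#include <cstdio>

int main() {
  std::bitset<8> CallerBits("1111"); // e.g. caller compiled for SSE2..AVX2
  std::bitset<8> CalleeBits("0011"); // e.g. callee compiled for SSE2..SSE4.2 only
  bool Compatible = (CallerBits & CalleeBits) == CalleeBits;
  std::printf("%s\n", Compatible ? "inline ok" : "do not inline"); // prints "inline ok"
}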
2130 
2131 bool X86TTIImpl::enableInterleavedAccessVectorization() {
2132  // TODO: We expect this to be beneficial regardless of arch,
2133  // but there are currently some unexplained performance artifacts on Atom.
2134  // As a temporary solution, disable on Atom.
2135  return !(ST->isAtom() || ST->isSLM());
2136 }
2137 
2138 // Get an estimate for interleaved load/store operations and strided loads.
2139 // \p Indices contains indices for strided load.
2140 // \p Factor - the factor of interleaving.
2141 // AVX-512 provides 3-src shuffles that significantly reduce the cost.
2142 int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
2143  unsigned Factor,
2144  ArrayRef<unsigned> Indices,
2145  unsigned Alignment,
2146  unsigned AddressSpace) {
2147 
2148  // VecTy for interleave memop is <VF*Factor x Elt>.
2149  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
2150  // VecTy = <12 x i32>.
2151 
2152  // Calculate the number of memory operations (NumOfMemOps) required
2153  // to load/store the VecTy.
2154  MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
2155  unsigned VecTySize = DL.getTypeStoreSize(VecTy);
2156  unsigned LegalVTSize = LegalVT.getStoreSize();
2157  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
2158 
2159  // Get the cost of one memory operation.
2160  Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
2161  LegalVT.getVectorNumElements());
2162  unsigned MemOpCost =
2163  getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
2164 
2165  if (Opcode == Instruction::Load) {
2166  // The kind of shuffle depends on the number of loaded values.
2167  // If we load the entire data in one register, we can use a 1-src shuffle.
2168  // Otherwise, we'll merge 2 sources in each operation.
2169  TTI::ShuffleKind ShuffleKind =
2170  (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
2171 
2172  unsigned ShuffleCost =
2173  getShuffleCost(ShuffleKind, SingleMemOpTy, 0, nullptr);
2174 
2175  unsigned NumOfLoadsInInterleaveGrp =
2176  Indices.size() ? Indices.size() : Factor;
2177  Type *ResultTy = VectorType::get(VecTy->getVectorElementType(),
2178  VecTy->getVectorNumElements() / Factor);
2179  unsigned NumOfResults =
2180  getTLI()->getTypeLegalizationCost(DL, ResultTy).first *
2181  NumOfLoadsInInterleaveGrp;
2182 
2183  // About half of the loads may be folded into shuffles when we have only
2184  // one result. If we have more than one result, we do not fold loads at all.
2185  unsigned NumOfUnfoldedLoads =
2186  NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
2187 
2188  // Get a number of shuffle operations per result.
2189  unsigned NumOfShufflesPerResult =
2190  std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
2191 
2192  // The SK_MergeTwoSrc shuffle clobbers one of its source operands.
2193  // When we have more than one destination, we need additional instructions
2194  // to preserve the sources.
2195  unsigned NumOfMoves = 0;
2196  if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
2197  NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
2198 
2199  int Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
2200  NumOfUnfoldedLoads * MemOpCost + NumOfMoves;
2201 
2202  return Cost;
2203  }
2204 
2205  // Store.
2206  assert(Opcode == Instruction::Store &&
2207  "Expected Store Instruction at this point");
2208 
2209  // There are no strided stores at the moment, and a store can't be
2210  // folded into a shuffle.
2211  unsigned NumOfSources = Factor; // The number of values to be merged.
2212  unsigned ShuffleCost =
2213  getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, 0, nullptr);
2214  unsigned NumOfShufflesPerStore = NumOfSources - 1;
2215 
2216  // The SK_MergeTwoSrc shuffle clobbers one of its source operands.
2217  // We need additional instructions to preserve the sources.
2218  unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
2219  int Cost = NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
2220  NumOfMoves;
2221  return Cost;
2222 }
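// Editorial sketch (not part of the LLVM source): the NumOfMemOps computation
// above with concrete, hypothetical sizes. VF = 4, Factor = 3, i32 elements
// gives a <12 x i32> vector of 48 bytes; a 64-byte (zmm) legal type covers it
// in one memory operation, while a 32-byte (ymm) legal type needs two.
#include <cstdio>

static unsigned numMemOpsSketch(unsigned VecTySizeBytes, unsigned LegalVTSizeBytes) {
  return (VecTySizeBytes + LegalVTSizeBytes - 1) / LegalVTSizeBytes; // round up
}

int main() {
  std::printf("%u %u\n", numMemOpsSketch(48, 64), numMemOpsSketch(48, 32)); // prints "1 2"
}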
2223 
2224 int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
2225  unsigned Factor,
2226  ArrayRef<unsigned> Indices,
2227  unsigned Alignment,
2228  unsigned AddressSpace) {
2229  auto isSupportedOnAVX512 = [](Type *VecTy, bool &RequiresBW) {
2230  RequiresBW = false;
2231  Type *EltTy = VecTy->getVectorElementType();
2232  if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
2233  EltTy->isIntegerTy(32) || EltTy->isPointerTy())
2234  return true;
2235  if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8)) {
2236  RequiresBW = true;
2237  return true;
2238  }
2239  return false;
2240  };
2241  bool RequiresBW;
2242  bool HasAVX512Solution = isSupportedOnAVX512(VecTy, RequiresBW);
2243  if (ST->hasAVX512() && HasAVX512Solution && (!RequiresBW || ST->hasBWI()))
2244  return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices,
2245  Alignment, AddressSpace);
2246  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
2247  Alignment, AddressSpace);
2248 }