1 //===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements a TargetTransformInfo analysis pass specific to the
10 /// X86 target machine. It uses the target's detailed information to provide
11 /// more precise answers to certain TTI queries, while letting the target
12 /// independent and default TTI implementations handle the rest.
13 ///
14 //===----------------------------------------------------------------------===//
15 /// A note about the cost model numbers used below: the numbers correspond to
16 /// some "generic" X86 CPU rather than to a concrete CPU model. Usually the
17 /// numbers correspond to the CPU where the feature first appeared. For
18 /// example, if we do Subtarget.hasSSE42() in the lookups below, the cost is
19 /// based on Nehalem, as that was the first CPU to support that feature level
20 /// and thus most likely has the worst case cost.
21 /// Some examples of other technologies/CPUs:
22 /// SSE 3 - Pentium4 / Athlon64
23 /// SSE 4.1 - Penryn
24 /// SSE 4.2 - Nehalem
25 /// AVX - Sandy Bridge
26 /// AVX2 - Haswell
27 /// AVX-512 - Xeon Phi / Skylake
28 /// And some examples of instruction target dependent costs (latency)
29 /// divss sqrtss rsqrtss
30 /// AMD K7 11-16 19 3
31 /// Piledriver 9-24 13-15 5
32 /// Jaguar 14 16 2
33 /// Pentium II,III 18 30 2
34 /// Nehalem 7-14 7-18 3
35 /// Haswell 10-13 11 5
36 /// TODO: Develop and implement the target dependent cost model and
37 /// specialize cost numbers for different Cost Model Targets such as throughput,
38 /// code size, latency and uop count.
39 //===----------------------------------------------------------------------===//
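// A minimal sketch of how these costs are consumed (assuming an IR Function F
// and the legacy pass manager's TargetTransformInfoWrapperPass; the exact
// setup varies by client):
//
//   const TargetTransformInfo &TTI =
//       getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
//   Type *VecTy = VectorType::get(Type::getFloatTy(F.getContext()), 4);
//   // On an SSE2-only subtarget this returns 39 (divps, see SSE2CostTable).
//   int Cost = TTI.getArithmeticInstrCost(Instruction::FDiv, VecTy);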
40 
41 #include "X86TargetTransformInfo.h"
42 #include "llvm/Analysis/TargetTransformInfo.h"
43 #include "llvm/CodeGen/BasicTTIImpl.h"
44 #include "llvm/CodeGen/CostTable.h"
45 #include "llvm/CodeGen/TargetLowering.h"
46 #include "llvm/IR/IntrinsicInst.h"
47 #include "llvm/Support/Debug.h"
48 
49 using namespace llvm;
50 
51 #define DEBUG_TYPE "x86tti"
52 
53 //===----------------------------------------------------------------------===//
54 //
55 // X86 cost model.
56 //
57 //===----------------------------------------------------------------------===//
58 
59 TargetTransformInfo::PopcntSupportKind
60 X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
61  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
62  // TODO: Currently the __builtin_popcount() implementation using SSE3
63  // instructions is inefficient. Once the problem is fixed, we should
64  // call ST->hasSSE3() instead of ST->hasPOPCNT().
65  return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
66 }
67 
68 llvm::Optional<unsigned> X86TTIImpl::getCacheSize(
69   TargetTransformInfo::CacheLevel Level) const {
70  switch (Level) {
71  case TargetTransformInfo::CacheLevel::L1D:
72  // - Penryn
73  // - Nehalem
74  // - Westmere
75  // - Sandy Bridge
76  // - Ivy Bridge
77  // - Haswell
78  // - Broadwell
79  // - Skylake
80  // - Kabylake
81  return 32 * 1024; // 32 KByte
82  case TargetTransformInfo::CacheLevel::L2D:
83  // - Penryn
84  // - Nehalem
85  // - Westmere
86  // - Sandy Bridge
87  // - Ivy Bridge
88  // - Haswell
89  // - Broadwell
90  // - Skylake
91  // - Kabylake
92  return 256 * 1024; // 256 KByte
93  }
94 
95  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
96 }
97 
98 llvm::Optional<unsigned> X86TTIImpl::getCacheAssociativity(
99   TargetTransformInfo::CacheLevel Level) const {
100  // - Penryn
101  // - Nehalem
102  // - Westmere
103  // - Sandy Bridge
104  // - Ivy Bridge
105  // - Haswell
106  // - Broadwell
107  // - Skylake
108  // - Kabylake
109  switch (Level) {
110  case TargetTransformInfo::CacheLevel::L1D:
111  LLVM_FALLTHROUGH;
112  case TargetTransformInfo::CacheLevel::L2D:
113  return 8;
114  }
115 
116  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
117 }
118 
119 unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
120  bool Vector = (ClassID == 1);
121  if (Vector && !ST->hasSSE1())
122  return 0;
123 
124  if (ST->is64Bit()) {
125  if (Vector && ST->hasAVX512())
126  return 32;
127  return 16;
128  }
129  return 8;
130 }
131 
132 unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) const {
133  unsigned PreferVectorWidth = ST->getPreferVectorWidth();
134  if (Vector) {
135  if (ST->hasAVX512() && PreferVectorWidth >= 512)
136  return 512;
137  if (ST->hasAVX() && PreferVectorWidth >= 256)
138  return 256;
139  if (ST->hasSSE1() && PreferVectorWidth >= 128)
140  return 128;
141  return 0;
142  }
143 
144  if (ST->is64Bit())
145  return 64;
146 
147  return 32;
148 }
149 
150 unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
151  return getRegisterBitWidth(true);
152 }
153 
154 unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
155  // If the loop will not be vectorized, don't interleave the loop.
156  // Let the regular unroller unroll the loop instead, which saves the
157  // overflow check and memory check cost.
158  if (VF == 1)
159  return 1;
160 
161  if (ST->isAtom())
162  return 1;
163 
164  // Sandybridge and Haswell have multiple execution ports and pipelined
165  // vector units.
166  if (ST->hasAVX())
167  return 4;
168 
169  return 2;
170 }
171 
172 int X86TTIImpl::getArithmeticInstrCost(
173  unsigned Opcode, Type *Ty,
174  TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
175  TTI::OperandValueProperties Opd1PropInfo,
176  TTI::OperandValueProperties Opd2PropInfo,
177  ArrayRef<const Value *> Args) {
178  // Legalize the type.
179  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
180 
181  int ISD = TLI->InstructionOpcodeToISD(Opcode);
182  assert(ISD && "Invalid opcode");
183 
184  static const CostTblEntry GLMCostTable[] = {
185  { ISD::FDIV, MVT::f32, 18 }, // divss
186  { ISD::FDIV, MVT::v4f32, 35 }, // divps
187  { ISD::FDIV, MVT::f64, 33 }, // divsd
188  { ISD::FDIV, MVT::v2f64, 65 }, // divpd
189  };
190 
191  if (ST->isGLM())
192  if (const auto *Entry = CostTableLookup(GLMCostTable, ISD,
193  LT.second))
194  return LT.first * Entry->Cost;
195 
196  static const CostTblEntry SLMCostTable[] = {
197  { ISD::MUL, MVT::v4i32, 11 }, // pmulld
198  { ISD::MUL, MVT::v8i16, 2 }, // pmullw
199  { ISD::MUL, MVT::v16i8, 14 }, // extend/pmullw/trunc sequence.
200  { ISD::FMUL, MVT::f64, 2 }, // mulsd
201  { ISD::FMUL, MVT::v2f64, 4 }, // mulpd
202  { ISD::FMUL, MVT::v4f32, 2 }, // mulps
203  { ISD::FDIV, MVT::f32, 17 }, // divss
204  { ISD::FDIV, MVT::v4f32, 39 }, // divps
205  { ISD::FDIV, MVT::f64, 32 }, // divsd
206  { ISD::FDIV, MVT::v2f64, 69 }, // divpd
207  { ISD::FADD, MVT::v2f64, 2 }, // addpd
208  { ISD::FSUB, MVT::v2f64, 2 }, // subpd
209  // v2i64/v4i64 mul is custom lowered as a series of long
210  // multiplies(3), shifts(3) and adds(2).
211  // SLM muldq throughput is 2 and addq throughput is 4,
212  // thus: 3x2 (muldq throughput) + 3x1 (shift throughput) +
213  // 2x4 (addq throughput) = 17 (see the worked breakdown after this table).
214  { ISD::MUL, MVT::v2i64, 17 },
215  // slm addq\subq throughput is 4
216  { ISD::ADD, MVT::v2i64, 4 },
217  { ISD::SUB, MVT::v2i64, 4 },
218  };
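  // Worked breakdown for the v2i64 MUL entry above (using the SLM throughputs
  // quoted in the table comment): the custom lowering emits 3 multiplies,
  // 3 shifts and 2 adds, so the cost is
  //   3 * 2 (pmuludq) + 3 * 1 (shift) + 2 * 4 (paddq) = 6 + 3 + 8 = 17.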
219 
220  if (ST->isSLM()) {
221  if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) {
222  // Check if the operands can be shrunk into a smaller datatype.
223  bool Op1Signed = false;
224  unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
225  bool Op2Signed = false;
226  unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
227 
228  bool signedMode = Op1Signed | Op2Signed;
229  unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
230 
231  if (OpMinSize <= 7)
232  return LT.first * 3; // pmullw/sext
233  if (!signedMode && OpMinSize <= 8)
234  return LT.first * 3; // pmullw/zext
235  if (OpMinSize <= 15)
236  return LT.first * 5; // pmullw/pmulhw/pshuf
237  if (!signedMode && OpMinSize <= 16)
238  return LT.first * 5; // pmullw/pmulhw/pshuf
239  }
240 
241  if (const auto *Entry = CostTableLookup(SLMCostTable, ISD,
242  LT.second)) {
243  return LT.first * Entry->Cost;
244  }
245  }
246 
247  if ((ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
248  ISD == ISD::UREM) &&
249  (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
250  Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
251  Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
252  if (ISD == ISD::SDIV || ISD == ISD::SREM) {
253  // On X86, vector signed division by a power-of-two constant is
254  // normally expanded to the sequence SRA + SRL + ADD + SRA.
255  // The OperandValue properties may not be the same as those of the previous
256  // operation; conservatively assume OP_None.
257  int Cost =
258  2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info, Op2Info,
259  TargetTransformInfo::OP_None,
260  TargetTransformInfo::OP_None);
261  Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
262  TargetTransformInfo::OP_None,
263  TargetTransformInfo::OP_None);
264  Cost += getArithmeticInstrCost(Instruction::Add, Ty, Op1Info, Op2Info,
265  TargetTransformInfo::OP_None,
266  TargetTransformInfo::OP_None);
267 
268  if (ISD == ISD::SREM) {
269  // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
270  Cost += getArithmeticInstrCost(Instruction::Mul, Ty, Op1Info, Op2Info);
271  Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Op1Info, Op2Info);
272  }
273 
274  return Cost;
275  }
276 
277  // Vector unsigned division/remainder will be simplified to shifts/masks.
278  if (ISD == ISD::UDIV)
279  return getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
280  TargetTransformInfo::OP_None,
281  TargetTransformInfo::OP_None);
282 
283  if (ISD == ISD::UREM)
284  return getArithmeticInstrCost(Instruction::And, Ty, Op1Info, Op2Info,
285  TargetTransformInfo::OP_None,
286  TargetTransformInfo::OP_None);
287  }
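  // Worked example (a rough sketch, assuming SSE2 and a v4i32 sdiv by a
  // uniform power-of-2 constant): the SRA + SRL + ADD + SRA expansion above
  // costs roughly 2 * Cost(AShr) + Cost(LShr) + Cost(Add) = 2*1 + 1 + 1 = 4
  // with the SSE2 uniform-shift costs below, versus the 19 charged for a
  // general constant divisor in SSE2ConstCostTable.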
288 
289  static const CostTblEntry AVX512BWUniformConstCostTable[] = {
290  { ISD::SHL, MVT::v64i8, 2 }, // psllw + pand.
291  { ISD::SRL, MVT::v64i8, 2 }, // psrlw + pand.
292  { ISD::SRA, MVT::v64i8, 4 }, // psrlw, pand, pxor, psubb.
293  };
294 
295  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
296  ST->hasBWI()) {
297  if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD,
298  LT.second))
299  return LT.first * Entry->Cost;
300  }
301 
302  static const CostTblEntry AVX512UniformConstCostTable[] = {
303  { ISD::SRA, MVT::v2i64, 1 },
304  { ISD::SRA, MVT::v4i64, 1 },
305  { ISD::SRA, MVT::v8i64, 1 },
306  };
307 
308  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
309  ST->hasAVX512()) {
310  if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD,
311  LT.second))
312  return LT.first * Entry->Cost;
313  }
314 
315  static const CostTblEntry AVX2UniformConstCostTable[] = {
316  { ISD::SHL, MVT::v32i8, 2 }, // psllw + pand.
317  { ISD::SRL, MVT::v32i8, 2 }, // psrlw + pand.
318  { ISD::SRA, MVT::v32i8, 4 }, // psrlw, pand, pxor, psubb.
319 
320  { ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle.
321  };
322 
323  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
324  ST->hasAVX2()) {
325  if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD,
326  LT.second))
327  return LT.first * Entry->Cost;
328  }
329 
330  static const CostTblEntry SSE2UniformConstCostTable[] = {
331  { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand.
332  { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand.
333  { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb.
334 
335  { ISD::SHL, MVT::v32i8, 4+2 }, // 2*(psllw + pand) + split.
336  { ISD::SRL, MVT::v32i8, 4+2 }, // 2*(psrlw + pand) + split.
337  { ISD::SRA, MVT::v32i8, 8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.
338  };
339 
340  // XOP has faster vXi8 shifts.
341  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
342  ST->hasSSE2() && !ST->hasXOP()) {
343  if (const auto *Entry =
344  CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
345  return LT.first * Entry->Cost;
346  }
347 
348  static const CostTblEntry AVX512BWConstCostTable[] = {
349  { ISD::SDIV, MVT::v64i8, 14 }, // 2*ext+2*pmulhw sequence
350  { ISD::SREM, MVT::v64i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
351  { ISD::UDIV, MVT::v64i8, 14 }, // 2*ext+2*pmulhw sequence
352  { ISD::UREM, MVT::v64i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
353  { ISD::SDIV, MVT::v32i16, 6 }, // vpmulhw sequence
354  { ISD::SREM, MVT::v32i16, 8 }, // vpmulhw+mul+sub sequence
355  { ISD::UDIV, MVT::v32i16, 6 }, // vpmulhuw sequence
356  { ISD::UREM, MVT::v32i16, 8 }, // vpmulhuw+mul+sub sequence
357  };
358 
359  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
360  Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
361  ST->hasBWI()) {
362  if (const auto *Entry =
363  CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
364  return LT.first * Entry->Cost;
365  }
366 
367  static const CostTblEntry AVX512ConstCostTable[] = {
368  { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
369  { ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence
370  { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
371  { ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence
372  };
373 
374  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
375  Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
376  ST->hasAVX512()) {
377  if (const auto *Entry =
378  CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
379  return LT.first * Entry->Cost;
380  }
381 
382  static const CostTblEntry AVX2ConstCostTable[] = {
383  { ISD::SDIV, MVT::v32i8, 14 }, // 2*ext+2*pmulhw sequence
384  { ISD::SREM, MVT::v32i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
385  { ISD::UDIV, MVT::v32i8, 14 }, // 2*ext+2*pmulhw sequence
386  { ISD::UREM, MVT::v32i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
387  { ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence
388  { ISD::SREM, MVT::v16i16, 8 }, // vpmulhw+mul+sub sequence
389  { ISD::UDIV, MVT::v16i16, 6 }, // vpmulhuw sequence
390  { ISD::UREM, MVT::v16i16, 8 }, // vpmulhuw+mul+sub sequence
391  { ISD::SDIV, MVT::v8i32, 15 }, // vpmuldq sequence
392  { ISD::SREM, MVT::v8i32, 19 }, // vpmuldq+mul+sub sequence
393  { ISD::UDIV, MVT::v8i32, 15 }, // vpmuludq sequence
394  { ISD::UREM, MVT::v8i32, 19 }, // vpmuludq+mul+sub sequence
395  };
396 
397  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
398  Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
399  ST->hasAVX2()) {
400  if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
401  return LT.first * Entry->Cost;
402  }
403 
404  static const CostTblEntry SSE2ConstCostTable[] = {
405  { ISD::SDIV, MVT::v32i8, 28+2 }, // 4*ext+4*pmulhw sequence + split.
406  { ISD::SREM, MVT::v32i8, 32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
407  { ISD::SDIV, MVT::v16i8, 14 }, // 2*ext+2*pmulhw sequence
408  { ISD::SREM, MVT::v16i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
409  { ISD::UDIV, MVT::v32i8, 28+2 }, // 4*ext+4*pmulhw sequence + split.
410  { ISD::UREM, MVT::v32i8, 32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
411  { ISD::UDIV, MVT::v16i8, 14 }, // 2*ext+2*pmulhw sequence
412  { ISD::UREM, MVT::v16i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
413  { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
414  { ISD::SREM, MVT::v16i16, 16+2 }, // 2*pmulhw+mul+sub sequence + split.
415  { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence
416  { ISD::SREM, MVT::v8i16, 8 }, // pmulhw+mul+sub sequence
417  { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split.
418  { ISD::UREM, MVT::v16i16, 16+2 }, // 2*pmulhuw+mul+sub sequence + split.
419  { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence
420  { ISD::UREM, MVT::v8i16, 8 }, // pmulhuw+mul+sub sequence
421  { ISD::SDIV, MVT::v8i32, 38+2 }, // 2*pmuludq sequence + split.
422  { ISD::SREM, MVT::v8i32, 48+2 }, // 2*pmuludq+mul+sub sequence + split.
423  { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence
424  { ISD::SREM, MVT::v4i32, 24 }, // pmuludq+mul+sub sequence
425  { ISD::UDIV, MVT::v8i32, 30+2 }, // 2*pmuludq sequence + split.
426  { ISD::UREM, MVT::v8i32, 40+2 }, // 2*pmuludq+mul+sub sequence + split.
427  { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence
428  { ISD::UREM, MVT::v4i32, 20 }, // pmuludq+mul+sub sequence
429  };
430 
431  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
432  Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
433  ST->hasSSE2()) {
434  // pmuldq sequence.
435  if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
436  return LT.first * 32;
437  if (ISD == ISD::SREM && LT.second == MVT::v8i32 && ST->hasAVX())
438  return LT.first * 38;
439  if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
440  return LT.first * 15;
441  if (ISD == ISD::SREM && LT.second == MVT::v4i32 && ST->hasSSE41())
442  return LT.first * 20;
443 
444  if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
445  return LT.first * Entry->Cost;
446  }
447 
448  static const CostTblEntry AVX2UniformCostTable[] = {
449  // Uniform splats are cheaper for the following instructions.
450  { ISD::SHL, MVT::v16i16, 1 }, // psllw.
451  { ISD::SRL, MVT::v16i16, 1 }, // psrlw.
452  { ISD::SRA, MVT::v16i16, 1 }, // psraw.
453  };
454 
455  if (ST->hasAVX2() &&
456  ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
457  (Op2Info == TargetTransformInfo::OK_UniformValue))) {
458  if (const auto *Entry =
459  CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
460  return LT.first * Entry->Cost;
461  }
462 
463  static const CostTblEntry SSE2UniformCostTable[] = {
464  // Uniform splats are cheaper for the following instructions.
465  { ISD::SHL, MVT::v8i16, 1 }, // psllw.
466  { ISD::SHL, MVT::v4i32, 1 }, // pslld
467  { ISD::SHL, MVT::v2i64, 1 }, // psllq.
468 
469  { ISD::SRL, MVT::v8i16, 1 }, // psrlw.
470  { ISD::SRL, MVT::v4i32, 1 }, // psrld.
471  { ISD::SRL, MVT::v2i64, 1 }, // psrlq.
472 
473  { ISD::SRA, MVT::v8i16, 1 }, // psraw.
474  { ISD::SRA, MVT::v4i32, 1 }, // psrad.
475  };
476 
477  if (ST->hasSSE2() &&
478  ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
479  (Op2Info == TargetTransformInfo::OK_UniformValue))) {
480  if (const auto *Entry =
481  CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
482  return LT.first * Entry->Cost;
483  }
484 
485  static const CostTblEntry AVX512DQCostTable[] = {
486  { ISD::MUL, MVT::v2i64, 1 },
487  { ISD::MUL, MVT::v4i64, 1 },
488  { ISD::MUL, MVT::v8i64, 1 }
489  };
490 
491  // Look for AVX512DQ lowering tricks for custom cases.
492  if (ST->hasDQI())
493  if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
494  return LT.first * Entry->Cost;
495 
496  static const CostTblEntry AVX512BWCostTable[] = {
497  { ISD::SHL, MVT::v8i16, 1 }, // vpsllvw
498  { ISD::SRL, MVT::v8i16, 1 }, // vpsrlvw
499  { ISD::SRA, MVT::v8i16, 1 }, // vpsravw
500 
501  { ISD::SHL, MVT::v16i16, 1 }, // vpsllvw
502  { ISD::SRL, MVT::v16i16, 1 }, // vpsrlvw
503  { ISD::SRA, MVT::v16i16, 1 }, // vpsravw
504 
505  { ISD::SHL, MVT::v32i16, 1 }, // vpsllvw
506  { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw
507  { ISD::SRA, MVT::v32i16, 1 }, // vpsravw
508 
509  { ISD::SHL, MVT::v64i8, 11 }, // vpblendvb sequence.
510  { ISD::SRL, MVT::v64i8, 11 }, // vpblendvb sequence.
511  { ISD::SRA, MVT::v64i8, 24 }, // vpblendvb sequence.
512 
513  { ISD::MUL, MVT::v64i8, 11 }, // extend/pmullw/trunc sequence.
514  { ISD::MUL, MVT::v32i8, 4 }, // extend/pmullw/trunc sequence.
515  { ISD::MUL, MVT::v16i8, 4 }, // extend/pmullw/trunc sequence.
516  };
517 
518  // Look for AVX512BW lowering tricks for custom cases.
519  if (ST->hasBWI())
520  if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
521  return LT.first * Entry->Cost;
522 
523  static const CostTblEntry AVX512CostTable[] = {
524  { ISD::SHL, MVT::v16i32, 1 },
525  { ISD::SRL, MVT::v16i32, 1 },
526  { ISD::SRA, MVT::v16i32, 1 },
527 
528  { ISD::SHL, MVT::v8i64, 1 },
529  { ISD::SRL, MVT::v8i64, 1 },
530 
531  { ISD::SRA, MVT::v2i64, 1 },
532  { ISD::SRA, MVT::v4i64, 1 },
533  { ISD::SRA, MVT::v8i64, 1 },
534 
535  { ISD::MUL, MVT::v32i8, 13 }, // extend/pmullw/trunc sequence.
536  { ISD::MUL, MVT::v16i8, 5 }, // extend/pmullw/trunc sequence.
537  { ISD::MUL, MVT::v16i32, 1 }, // pmulld (Skylake from agner.org)
538  { ISD::MUL, MVT::v8i32, 1 }, // pmulld (Skylake from agner.org)
539  { ISD::MUL, MVT::v4i32, 1 }, // pmulld (Skylake from agner.org)
540  { ISD::MUL, MVT::v8i64, 8 }, // 3*pmuludq/3*shift/2*add
541 
542  { ISD::FADD, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
543  { ISD::FSUB, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
544  { ISD::FMUL, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
545 
546  { ISD::FADD, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
547  { ISD::FSUB, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
548  { ISD::FMUL, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
549  };
550 
551  if (ST->hasAVX512())
552  if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
553  return LT.first * Entry->Cost;
554 
555  static const CostTblEntry AVX2ShiftCostTable[] = {
556  // Shifts on v4i64/v8i32 on AVX2 are legal even though we declare them as
557  // custom so that we can detect the cases where the shift amount is a scalar.
558  { ISD::SHL, MVT::v4i32, 1 },
559  { ISD::SRL, MVT::v4i32, 1 },
560  { ISD::SRA, MVT::v4i32, 1 },
561  { ISD::SHL, MVT::v8i32, 1 },
562  { ISD::SRL, MVT::v8i32, 1 },
563  { ISD::SRA, MVT::v8i32, 1 },
564  { ISD::SHL, MVT::v2i64, 1 },
565  { ISD::SRL, MVT::v2i64, 1 },
566  { ISD::SHL, MVT::v4i64, 1 },
567  { ISD::SRL, MVT::v4i64, 1 },
568  };
569 
570  // Look for AVX2 lowering tricks.
571  if (ST->hasAVX2()) {
572  if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
573  (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
574  Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
575  // On AVX2, a packed v16i16 shift left by a constant build_vector
576  // is lowered into a vector multiply (vpmullw).
577  return getArithmeticInstrCost(Instruction::Mul, Ty, Op1Info, Op2Info,
578  TargetTransformInfo::OP_None,
579  TargetTransformInfo::OP_None);
580 
581  if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
582  return LT.first * Entry->Cost;
583  }
584 
585  static const CostTblEntry XOPShiftCostTable[] = {
586  // 128bit shifts take 1cy, but right shifts require negation beforehand.
587  { ISD::SHL, MVT::v16i8, 1 },
588  { ISD::SRL, MVT::v16i8, 2 },
589  { ISD::SRA, MVT::v16i8, 2 },
590  { ISD::SHL, MVT::v8i16, 1 },
591  { ISD::SRL, MVT::v8i16, 2 },
592  { ISD::SRA, MVT::v8i16, 2 },
593  { ISD::SHL, MVT::v4i32, 1 },
594  { ISD::SRL, MVT::v4i32, 2 },
595  { ISD::SRA, MVT::v4i32, 2 },
596  { ISD::SHL, MVT::v2i64, 1 },
597  { ISD::SRL, MVT::v2i64, 2 },
598  { ISD::SRA, MVT::v2i64, 2 },
599  // 256bit shifts require splitting if AVX2 didn't catch them above.
600  { ISD::SHL, MVT::v32i8, 2+2 },
601  { ISD::SRL, MVT::v32i8, 4+2 },
602  { ISD::SRA, MVT::v32i8, 4+2 },
603  { ISD::SHL, MVT::v16i16, 2+2 },
604  { ISD::SRL, MVT::v16i16, 4+2 },
605  { ISD::SRA, MVT::v16i16, 4+2 },
606  { ISD::SHL, MVT::v8i32, 2+2 },
607  { ISD::SRL, MVT::v8i32, 4+2 },
608  { ISD::SRA, MVT::v8i32, 4+2 },
609  { ISD::SHL, MVT::v4i64, 2+2 },
610  { ISD::SRL, MVT::v4i64, 4+2 },
611  { ISD::SRA, MVT::v4i64, 4+2 },
612  };
613 
614  // Look for XOP lowering tricks.
615  if (ST->hasXOP()) {
616  // If the right shift is constant then we'll fold the negation so
617  // it's as cheap as a left shift.
618  int ShiftISD = ISD;
619  if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) &&
620  (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
621  Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
622  ShiftISD = ISD::SHL;
623  if (const auto *Entry =
624  CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
625  return LT.first * Entry->Cost;
626  }
627 
628  static const CostTblEntry SSE2UniformShiftCostTable[] = {
629  // Uniform splats are cheaper for the following instructions.
630  { ISD::SHL, MVT::v16i16, 2+2 }, // 2*psllw + split.
631  { ISD::SHL, MVT::v8i32, 2+2 }, // 2*pslld + split.
632  { ISD::SHL, MVT::v4i64, 2+2 }, // 2*psllq + split.
633 
634  { ISD::SRL, MVT::v16i16, 2+2 }, // 2*psrlw + split.
635  { ISD::SRL, MVT::v8i32, 2+2 }, // 2*psrld + split.
636  { ISD::SRL, MVT::v4i64, 2+2 }, // 2*psrlq + split.
637 
638  { ISD::SRA, MVT::v16i16, 2+2 }, // 2*psraw + split.
639  { ISD::SRA, MVT::v8i32, 2+2 }, // 2*psrad + split.
640  { ISD::SRA, MVT::v2i64, 4 }, // 2*psrad + shuffle.
641  { ISD::SRA, MVT::v4i64, 8+2 }, // 2*(2*psrad + shuffle) + split.
642  };
643 
644  if (ST->hasSSE2() &&
645  ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
646  (Op2Info == TargetTransformInfo::OK_UniformValue))) {
647 
648  // Handle AVX2 uniform v4i64 ISD::SRA, it's not worth a table.
649  if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2())
650  return LT.first * 4; // 2*psrad + shuffle.
651 
652  if (const auto *Entry =
653  CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second))
654  return LT.first * Entry->Cost;
655  }
656 
657  if (ISD == ISD::SHL &&
658  Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
659  MVT VT = LT.second;
660  // Vector shift left by non uniform constant can be lowered
661  // into vector multiply.
662  if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
663  ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
664  ISD = ISD::MUL;
665  }
666 
667  static const CostTblEntry AVX2CostTable[] = {
668  { ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence.
669  { ISD::SHL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
670 
671  { ISD::SRL, MVT::v32i8, 11 }, // vpblendvb sequence.
672  { ISD::SRL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
673 
674  { ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence.
675  { ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence.
676  { ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence.
677  { ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence.
678 
679  { ISD::SUB, MVT::v32i8, 1 }, // psubb
680  { ISD::ADD, MVT::v32i8, 1 }, // paddb
681  { ISD::SUB, MVT::v16i16, 1 }, // psubw
682  { ISD::ADD, MVT::v16i16, 1 }, // paddw
683  { ISD::SUB, MVT::v8i32, 1 }, // psubd
684  { ISD::ADD, MVT::v8i32, 1 }, // paddd
685  { ISD::SUB, MVT::v4i64, 1 }, // psubq
686  { ISD::ADD, MVT::v4i64, 1 }, // paddq
687 
688  { ISD::MUL, MVT::v32i8, 17 }, // extend/pmullw/trunc sequence.
689  { ISD::MUL, MVT::v16i8, 7 }, // extend/pmullw/trunc sequence.
690  { ISD::MUL, MVT::v16i16, 1 }, // pmullw
691  { ISD::MUL, MVT::v8i32, 2 }, // pmulld (Haswell from agner.org)
692  { ISD::MUL, MVT::v4i64, 8 }, // 3*pmuludq/3*shift/2*add
693 
694  { ISD::FADD, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
695  { ISD::FADD, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
696  { ISD::FSUB, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
697  { ISD::FSUB, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
698  { ISD::FMUL, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
699  { ISD::FMUL, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
700 
701  { ISD::FDIV, MVT::f32, 7 }, // Haswell from http://www.agner.org/
702  { ISD::FDIV, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
703  { ISD::FDIV, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
704  { ISD::FDIV, MVT::f64, 14 }, // Haswell from http://www.agner.org/
705  { ISD::FDIV, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
706  { ISD::FDIV, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
707  };
708 
709  // Look for AVX2 lowering tricks for custom cases.
710  if (ST->hasAVX2())
711  if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
712  return LT.first * Entry->Cost;
713 
714  static const CostTblEntry AVX1CostTable[] = {
715  // We don't have to scalarize unsupported ops. We can issue two half-sized
716  // operations and we only need to extract the upper YMM half.
717  // Two ops + 1 extract + 1 insert = 4.
718  { ISD::MUL, MVT::v16i16, 4 },
719  { ISD::MUL, MVT::v8i32, 4 },
720  { ISD::SUB, MVT::v32i8, 4 },
721  { ISD::ADD, MVT::v32i8, 4 },
722  { ISD::SUB, MVT::v16i16, 4 },
723  { ISD::ADD, MVT::v16i16, 4 },
724  { ISD::SUB, MVT::v8i32, 4 },
725  { ISD::ADD, MVT::v8i32, 4 },
726  { ISD::SUB, MVT::v4i64, 4 },
727  { ISD::ADD, MVT::v4i64, 4 },
728 
729  // A v4i64 multiply is custom lowered as two split v2i64 vectors that then
730  // are lowered as a series of long multiplies(3), shifts(3) and adds(2)
731  // Because we believe v4i64 to be a legal type, we must also include the
732  // extract+insert in the cost table. Therefore, the cost here is 18
733  // instead of 8.
734  { ISD::MUL, MVT::v4i64, 18 },
735 
736  { ISD::MUL, MVT::v32i8, 26 }, // extend/pmullw/trunc sequence.
737 
738  { ISD::FDIV, MVT::f32, 14 }, // SNB from http://www.agner.org/
739  { ISD::FDIV, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
740  { ISD::FDIV, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
741  { ISD::FDIV, MVT::f64, 22 }, // SNB from http://www.agner.org/
742  { ISD::FDIV, MVT::v2f64, 22 }, // SNB from http://www.agner.org/
743  { ISD::FDIV, MVT::v4f64, 44 }, // SNB from http://www.agner.org/
744  };
745 
746  if (ST->hasAVX())
747  if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
748  return LT.first * Entry->Cost;
749 
750  static const CostTblEntry SSE42CostTable[] = {
751  { ISD::FADD, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
752  { ISD::FADD, MVT::f32, 1 }, // Nehalem from http://www.agner.org/
753  { ISD::FADD, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
754  { ISD::FADD, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
755 
756  { ISD::FSUB, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
757  { ISD::FSUB, MVT::f32 , 1 }, // Nehalem from http://www.agner.org/
758  { ISD::FSUB, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
759  { ISD::FSUB, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
760 
761  { ISD::FMUL, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
762  { ISD::FMUL, MVT::f32, 1 }, // Nehalem from http://www.agner.org/
763  { ISD::FMUL, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
764  { ISD::FMUL, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
765 
766  { ISD::FDIV, MVT::f32, 14 }, // Nehalem from http://www.agner.org/
767  { ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
768  { ISD::FDIV, MVT::f64, 22 }, // Nehalem from http://www.agner.org/
769  { ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/
770  };
771 
772  if (ST->hasSSE42())
773  if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
774  return LT.first * Entry->Cost;
775 
776  static const CostTblEntry SSE41CostTable[] = {
777  { ISD::SHL, MVT::v16i8, 11 }, // pblendvb sequence.
778  { ISD::SHL, MVT::v32i8, 2*11+2 }, // pblendvb sequence + split.
779  { ISD::SHL, MVT::v8i16, 14 }, // pblendvb sequence.
780  { ISD::SHL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
781  { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld
782  { ISD::SHL, MVT::v8i32, 2*4+2 }, // pslld/paddd/cvttps2dq/pmulld + split
783 
784  { ISD::SRL, MVT::v16i8, 12 }, // pblendvb sequence.
785  { ISD::SRL, MVT::v32i8, 2*12+2 }, // pblendvb sequence + split.
786  { ISD::SRL, MVT::v8i16, 14 }, // pblendvb sequence.
787  { ISD::SRL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
788  { ISD::SRL, MVT::v4i32, 11 }, // Shift each lane + blend.
789  { ISD::SRL, MVT::v8i32, 2*11+2 }, // Shift each lane + blend + split.
790 
791  { ISD::SRA, MVT::v16i8, 24 }, // pblendvb sequence.
792  { ISD::SRA, MVT::v32i8, 2*24+2 }, // pblendvb sequence + split.
793  { ISD::SRA, MVT::v8i16, 14 }, // pblendvb sequence.
794  { ISD::SRA, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
795  { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend.
796  { ISD::SRA, MVT::v8i32, 2*12+2 }, // Shift each lane + blend + split.
797 
798  { ISD::MUL, MVT::v4i32, 2 } // pmulld (Nehalem from agner.org)
799  };
800 
801  if (ST->hasSSE41())
802  if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
803  return LT.first * Entry->Cost;
804 
805  static const CostTblEntry SSE2CostTable[] = {
806  // We don't correctly identify costs of casts because they are marked as
807  // custom.
808  { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence.
809  { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence.
810  { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul.
811  { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence.
812  { ISD::SHL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split.
813 
814  { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence.
815  { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence.
816  { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend.
817  { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence.
818  { ISD::SRL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split.
819 
820  { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence.
821  { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence.
822  { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend.
823  { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence.
824  { ISD::SRA, MVT::v4i64, 2*12+2 }, // srl/xor/sub sequence+split.
825 
826  { ISD::MUL, MVT::v16i8, 12 }, // extend/pmullw/trunc sequence.
827  { ISD::MUL, MVT::v8i16, 1 }, // pmullw
828  { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle
829  { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add
830 
831  { ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/
832  { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/
833  { ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/
834  { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/
835 
836  { ISD::FADD, MVT::f32, 2 }, // Pentium IV from http://www.agner.org/
837  { ISD::FADD, MVT::f64, 2 }, // Pentium IV from http://www.agner.org/
838 
839  { ISD::FSUB, MVT::f32, 2 }, // Pentium IV from http://www.agner.org/
840  { ISD::FSUB, MVT::f64, 2 }, // Pentium IV from http://www.agner.org/
841  };
842 
843  if (ST->hasSSE2())
844  if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
845  return LT.first * Entry->Cost;
846 
847  static const CostTblEntry SSE1CostTable[] = {
848  { ISD::FDIV, MVT::f32, 17 }, // Pentium III from http://www.agner.org/
849  { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/
850 
851  { ISD::FADD, MVT::f32, 1 }, // Pentium III from http://www.agner.org/
852  { ISD::FADD, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/
853 
854  { ISD::FSUB, MVT::f32, 1 }, // Pentium III from http://www.agner.org/
855  { ISD::FSUB, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/
856 
857  { ISD::ADD, MVT::i8, 1 }, // Pentium III from http://www.agner.org/
858  { ISD::ADD, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
859  { ISD::ADD, MVT::i32, 1 }, // Pentium III from http://www.agner.org/
860 
861  { ISD::SUB, MVT::i8, 1 }, // Pentium III from http://www.agner.org/
862  { ISD::SUB, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
863  { ISD::SUB, MVT::i32, 1 }, // Pentium III from http://www.agner.org/
864  };
865 
866  if (ST->hasSSE1())
867  if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
868  return LT.first * Entry->Cost;
869 
870  // It is not a good idea to vectorize division. We have to scalarize it and
871  // in the process we will often end up spilling regular registers. The
872  // overhead of division is going to dominate most kernels anyway, so try
873  // hard to prevent vectorization of division - it is generally a bad idea.
874  // Assume somewhat arbitrarily that we have to be able to hide "20 cycles"
875  // for each lane.
876  if (LT.second.isVector() && (ISD == ISD::SDIV || ISD == ISD::SREM ||
877  ISD == ISD::UDIV || ISD == ISD::UREM)) {
878  int ScalarCost = getArithmeticInstrCost(
879  Opcode, Ty->getScalarType(), Op1Info, Op2Info,
880  TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
881  return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
882  }
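  // Worked example (a rough sketch): for a v4i32 udiv, LT.first is 1 and the
  // type has 4 lanes, so the reported cost is 20 * 1 * 4 * ScalarCost =
  // 80 * ScalarCost - deliberately large so the vectorizers keep division
  // scalar.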
883 
884  // Fallback to the default implementation.
885  return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info);
886 }
887 
888 int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
889  Type *SubTp) {
890  // 64-bit packed float vectors (v2f32) are widened to type v4f32.
891  // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
892  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
893 
894  // Treat Transpose as 2-op shuffles - there's no difference in lowering.
895  if (Kind == TTI::SK_Transpose)
896  Kind = TTI::SK_PermuteTwoSrc;
897 
898  // For Broadcasts we are splatting the first element from the first input
899  // register, so we only need to reference that input; all of the output
900  // registers are the same.
901  if (Kind == TTI::SK_Broadcast)
902  LT.first = 1;
903 
904  // Subvector extractions are free if they start at the beginning of a
905  // vector and cheap if the subvectors are aligned.
906  if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
907  int NumElts = LT.second.getVectorNumElements();
908  if ((Index % NumElts) == 0)
909  return 0;
910  std::pair<int, MVT> SubLT = TLI->getTypeLegalizationCost(DL, SubTp);
911  if (SubLT.second.isVector()) {
912  int NumSubElts = SubLT.second.getVectorNumElements();
913  if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
914  return SubLT.first;
915  // Handle some cases for widening legalization. For now we only handle
916  // cases where the original subvector was naturally aligned and evenly
917  // fit in its legalized subvector type.
918  // FIXME: Remove some of the alignment restrictions.
919  // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
920  // vectors.
921  int OrigSubElts = SubTp->getVectorNumElements();
922  if (NumSubElts > OrigSubElts &&
923  (Index % OrigSubElts) == 0 && (NumSubElts % OrigSubElts) == 0 &&
924  LT.second.getVectorElementType() ==
925  SubLT.second.getVectorElementType() &&
926  LT.second.getVectorElementType().getSizeInBits() ==
927  SubLT.second.getVectorElementType().getSizeInBits()) {
928  assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
929  "Unexpected number of elements!");
930  Type *VecTy = VectorType::get(Tp->getVectorElementType(),
931  LT.second.getVectorNumElements());
932  Type *SubTy = VectorType::get(Tp->getVectorElementType(),
933  SubLT.second.getVectorNumElements());
934  int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
935  int ExtractCost = getShuffleCost(TTI::SK_ExtractSubvector, VecTy,
936  ExtractIndex, SubTy);
937 
938  // If the original size is 32-bits or more, we can use pshufd. Otherwise
939  // if we have SSSE3 we can use pshufb.
940  if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
941  return ExtractCost + 1; // pshufd or pshufb
942 
943  assert(SubTp->getPrimitiveSizeInBits() == 16 &&
944  "Unexpected vector size");
945 
946  return ExtractCost + 2; // worst case pshufhw + pshufd
947  }
948  }
949  }
950 
951  // We are going to permute multiple sources and the result will be in
952  // multiple destinations. We provide an accurate cost only for splits where
953  // the element type remains the same.
954  if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
955  MVT LegalVT = LT.second;
956  if (LegalVT.isVector() &&
957  LegalVT.getVectorElementType().getSizeInBits() ==
958  Tp->getVectorElementType()->getPrimitiveSizeInBits() &&
959  LegalVT.getVectorNumElements() < Tp->getVectorNumElements()) {
960 
961  unsigned VecTySize = DL.getTypeStoreSize(Tp);
962  unsigned LegalVTSize = LegalVT.getStoreSize();
963  // Number of source vectors after legalization:
964  unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
965  // Number of destination vectors after legalization:
966  unsigned NumOfDests = LT.first;
967 
968  Type *SingleOpTy = VectorType::get(Tp->getVectorElementType(),
969  LegalVT.getVectorNumElements());
970 
971  unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
972  return NumOfShuffles *
973  getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr);
974  }
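  // Worked example for the split computation above (a sketch): a single-source
  // permute of v32i16 on an AVX2 target legalizes to two v16i16 registers, so
  // NumOfSrcs = NumOfDests = 2 and NumOfShuffles = (2 - 1) * 2 = 2; each is
  // costed as a v16i16 two-source permute (7 in AVX2ShuffleTbl), giving 14.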
975 
976  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
977  }
978 
979  // For 2-input shuffles, we must account for splitting the 2 inputs into many.
980  if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
981  // We assume that source and destination have the same vector type.
982  int NumOfDests = LT.first;
983  int NumOfShufflesPerDest = LT.first * 2 - 1;
984  LT.first = NumOfDests * NumOfShufflesPerDest;
985  }
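  // Worked example (a sketch): a two-source shuffle of v32i16 on an AVX2
  // target has LT.first = 2, so NumOfDests = 2 and NumOfShufflesPerDest = 3,
  // and the v16i16 two-source permute cost of 7 below yields 2 * 3 * 7 = 42.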
986 
987  static const CostTblEntry AVX512VBMIShuffleTbl[] = {
988  {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
989  {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb
990 
991  {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
992  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb
993 
994  {TTI::SK_PermuteTwoSrc, MVT::v64i8, 1}, // vpermt2b
995  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 1}, // vpermt2b
996  {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1} // vpermt2b
997  };
998 
999  if (ST->hasVBMI())
1000  if (const auto *Entry =
1001  CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
1002  return LT.first * Entry->Cost;
1003 
1004  static const CostTblEntry AVX512BWShuffleTbl[] = {
1005  {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
1006  {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
1007 
1008  {TTI::SK_Reverse, MVT::v32i16, 1}, // vpermw
1009  {TTI::SK_Reverse, MVT::v16i16, 1}, // vpermw
1010  {TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2
1011 
1012  {TTI::SK_PermuteSingleSrc, MVT::v32i16, 1}, // vpermw
1013  {TTI::SK_PermuteSingleSrc, MVT::v16i16, 1}, // vpermw
1014  {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // vpermw
1015  {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16
1016  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 3}, // vpermw + zext/trunc
1017 
1018  {TTI::SK_PermuteTwoSrc, MVT::v32i16, 1}, // vpermt2w
1019  {TTI::SK_PermuteTwoSrc, MVT::v16i16, 1}, // vpermt2w
1020  {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpermt2w
1021  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 3}, // zext + vpermt2w + trunc
1022  {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
1023  {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3} // zext + vpermt2w + trunc
1024  };
1025 
1026  if (ST->hasBWI())
1027  if (const auto *Entry =
1028  CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
1029  return LT.first * Entry->Cost;
1030 
1031  static const CostTblEntry AVX512ShuffleTbl[] = {
1032  {TTI::SK_Broadcast, MVT::v8f64, 1}, // vbroadcastpd
1033  {TTI::SK_Broadcast, MVT::v16f32, 1}, // vbroadcastps
1034  {TTI::SK_Broadcast, MVT::v8i64, 1}, // vpbroadcastq
1035  {TTI::SK_Broadcast, MVT::v16i32, 1}, // vpbroadcastd
1036 
1037  {TTI::SK_Reverse, MVT::v8f64, 1}, // vpermpd
1038  {TTI::SK_Reverse, MVT::v16f32, 1}, // vpermps
1039  {TTI::SK_Reverse, MVT::v8i64, 1}, // vpermq
1040  {TTI::SK_Reverse, MVT::v16i32, 1}, // vpermd
1041 
1042  {TTI::SK_PermuteSingleSrc, MVT::v8f64, 1}, // vpermpd
1043  {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
1044  {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // vpermpd
1045  {TTI::SK_PermuteSingleSrc, MVT::v16f32, 1}, // vpermps
1046  {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
1047  {TTI::SK_PermuteSingleSrc, MVT::v4f32, 1}, // vpermps
1048  {TTI::SK_PermuteSingleSrc, MVT::v8i64, 1}, // vpermq
1049  {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
1050  {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // vpermq
1051  {TTI::SK_PermuteSingleSrc, MVT::v16i32, 1}, // vpermd
1052  {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
1053  {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // vpermd
1054  {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
1055 
1056  {TTI::SK_PermuteTwoSrc, MVT::v8f64, 1}, // vpermt2pd
1057  {TTI::SK_PermuteTwoSrc, MVT::v16f32, 1}, // vpermt2ps
1058  {TTI::SK_PermuteTwoSrc, MVT::v8i64, 1}, // vpermt2q
1059  {TTI::SK_PermuteTwoSrc, MVT::v16i32, 1}, // vpermt2d
1060  {TTI::SK_PermuteTwoSrc, MVT::v4f64, 1}, // vpermt2pd
1061  {TTI::SK_PermuteTwoSrc, MVT::v8f32, 1}, // vpermt2ps
1062  {TTI::SK_PermuteTwoSrc, MVT::v4i64, 1}, // vpermt2q
1063  {TTI::SK_PermuteTwoSrc, MVT::v8i32, 1}, // vpermt2d
1064  {TTI::SK_PermuteTwoSrc, MVT::v2f64, 1}, // vpermt2pd
1065  {TTI::SK_PermuteTwoSrc, MVT::v4f32, 1}, // vpermt2ps
1066  {TTI::SK_PermuteTwoSrc, MVT::v2i64, 1}, // vpermt2q
1067  {TTI::SK_PermuteTwoSrc, MVT::v4i32, 1} // vpermt2d
1068  };
1069 
1070  if (ST->hasAVX512())
1071  if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
1072  return LT.first * Entry->Cost;
1073 
1074  static const CostTblEntry AVX2ShuffleTbl[] = {
1075  {TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastpd
1076  {TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastps
1077  {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq
1078  {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd
1079  {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
1080  {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb
1081 
1082  {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd
1083  {TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps
1084  {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq
1085  {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd
1086  {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
1087  {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb
1088 
1089  {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
1090  {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb
1091 
1092  {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
1093  {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
1094  {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
1095  {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
1096  {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
1097  // + vpblendvb
1098  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb
1099  // + vpblendvb
1100 
1101  {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd
1102  {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3}, // 2*vpermps + vblendps
1103  {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd
1104  {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd
1105  {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
1106  // + vpblendvb
1107  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb
1108  // + vpblendvb
1109  };
1110 
1111  if (ST->hasAVX2())
1112  if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
1113  return LT.first * Entry->Cost;
1114 
1115  static const CostTblEntry XOPShuffleTbl[] = {
1116  {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd
1117  {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps
1118  {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd
1119  {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps
1120  {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
1121  // + vinsertf128
1122  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm
1123  // + vinsertf128
1124 
1125  {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
1126  // + vinsertf128
1127  {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm
1128  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm
1129  // + vinsertf128
1130  {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm
1131  };
1132 
1133  if (ST->hasXOP())
1134  if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
1135  return LT.first * Entry->Cost;
1136 
1137  static const CostTblEntry AVX1ShuffleTbl[] = {
1138  {TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1139  {TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1140  {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1141  {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1142  {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
1143  {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128
1144 
1145  {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1146  {TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1147  {TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1148  {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1149  {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
1150  // + vinsertf128
1151  {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb
1152  // + vinsertf128
1153 
1154  {TTI::SK_Select, MVT::v4i64, 1}, // vblendpd
1155  {TTI::SK_Select, MVT::v4f64, 1}, // vblendpd
1156  {TTI::SK_Select, MVT::v8i32, 1}, // vblendps
1157  {TTI::SK_Select, MVT::v8f32, 1}, // vblendps
1158  {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
1159  {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor
1160 
1161  {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd
1162  {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd
1163  {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1164  {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1165  {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
1166  // + 2*por + vinsertf128
1167  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb
1168  // + 2*por + vinsertf128
1169 
1170  {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd
1171  {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd
1172  {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1173  {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1174  {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
1175  // + 4*por + vinsertf128
1176  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb
1177  // + 4*por + vinsertf128
1178  };
1179 
1180  if (ST->hasAVX())
1181  if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
1182  return LT.first * Entry->Cost;
1183 
1184  static const CostTblEntry SSE41ShuffleTbl[] = {
1185  {TTI::SK_Select, MVT::v2i64, 1}, // pblendw
1186  {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1187  {TTI::SK_Select, MVT::v4i32, 1}, // pblendw
1188  {TTI::SK_Select, MVT::v4f32, 1}, // blendps
1189  {TTI::SK_Select, MVT::v8i16, 1}, // pblendw
1190  {TTI::SK_Select, MVT::v16i8, 1} // pblendvb
1191  };
1192 
1193  if (ST->hasSSE41())
1194  if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
1195  return LT.first * Entry->Cost;
1196 
1197  static const CostTblEntry SSSE3ShuffleTbl[] = {
1198  {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
1199  {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb
1200 
1201  {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
1202  {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb
1203 
1204  {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
1205  {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por
1206 
1207  {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
1208  {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
1209 
1210  {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
1211  {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
1212  };
1213 
1214  if (ST->hasSSSE3())
1215  if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
1216  return LT.first * Entry->Cost;
1217 
1218  static const CostTblEntry SSE2ShuffleTbl[] = {
1219  {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd
1220  {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
1221  {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
1222  {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
1223  {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd
1224 
1225  {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
1226  {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
1227  {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
1228  {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
1229  {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
1230  // + 2*pshufd + 2*unpck + packus
1231 
1232  {TTI::SK_Select, MVT::v2i64, 1}, // movsd
1233  {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1234  {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
1235  {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
1236  {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por
1237 
1238  {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
1239  {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd
1240  {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
1241  {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
1242  // + pshufd/unpck
1243  { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
1244  // + 2*pshufd + 2*unpck + 2*packus
1245 
1246  { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd
1247  { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd
1248  { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd}
1249  { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute
1250  { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
1251  };
1252 
1253  if (ST->hasSSE2())
1254  if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
1255  return LT.first * Entry->Cost;
1256 
1257  static const CostTblEntry SSE1ShuffleTbl[] = {
1258  { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
1259  { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
1260  { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps
1261  { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
1262  { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
1263  };
1264 
1265  if (ST->hasSSE1())
1266  if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
1267  return LT.first * Entry->Cost;
1268 
1269  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
1270 }
1271 
1272 int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
1273  const Instruction *I) {
1274  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1275  assert(ISD && "Invalid opcode");
1276 
1277  // FIXME: Need a better design of the cost table to handle non-simple types
1278  // and the potentially massive number of combinations
1279  // (elem_num x src_type x dst_type).
1279 
1280  static const TypeConversionCostTblEntry AVX512BWConversionTbl[] {
1283 
1284  // Mask sign extend has an instruction.
1291 
1292  // Mask zero extend is a load + broadcast.
1299  };
1300 
1301  static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
1308 
1315 
1322 
1329  };
1330 
1331  // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
1332  // 256-bit wide vectors.
1333 
1334  static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
1338 
1343 
1344  // v16i1 -> v16i32 - load + broadcast
1357 
1366 
1391 
1395 
1405  };
1406 
1407  static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
1424 
1431 
1434 
1436  };
1437 
1438  static const TypeConversionCostTblEntry AVXConversionTbl[] = {
1455 
1466 
1479 
1495  // The generic code to compute the scalar overhead is currently broken.
1496  // Work around this limitation by estimating the scalarization overhead
1497  // here. We have roughly 10 instructions per scalar element.
1498  // Multiply that by the vector width.
1499  // FIXME: remove that when PR19268 is fixed.
1502 
1505  // This node is expanded into scalarized operations, but BasicTTI is overly
1506  // optimistic when estimating its cost. It computes 3 per element (one
1507  // vector-extract, one scalar conversion and one vector-insert). The
1508  // problem is that the inserts form a read-modify-write chain, so latency
1509  // should be factored in too. Inflate the cost per element by 1.
1512 
1515  };
1516 
1517  static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
1524 
1543 
1551  { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1 }, // PSHUFB
1552 
1554  };
1555 
1556  static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
1557  // These are somewhat magic numbers justified by looking at the output of
1558  // Intel's IACA, running some kernels and making sure that, when we take
1559  // legalization into account, the throughput will be overestimated.
1561  { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
1569 
1570  { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
1578 
1580 
1584 
1609 
1610  { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // PAND+PACKUSWB
1614  { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 3 }, // PAND+3*PACKUSWB
1622  { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 4 }, // PAND+3*PACKUSWB
1623  { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 2 }, // PSHUFD+PSHUFLW
1624  { ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1 }, // PSHUFD
1625  };
1626 
1627  std::pair<int, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
1628  std::pair<int, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst);
1629 
1630  if (ST->hasSSE2() && !ST->hasAVX()) {
1631  if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
1632  LTDest.second, LTSrc.second))
1633  return LTSrc.first * Entry->Cost;
1634  }
1635 
1636  EVT SrcTy = TLI->getValueType(DL, Src);
1637  EVT DstTy = TLI->getValueType(DL, Dst);
1638 
1639  // The function getSimpleVT only handles simple value types.
1640  if (!SrcTy.isSimple() || !DstTy.isSimple())
1641  return BaseT::getCastInstrCost(Opcode, Dst, Src);
1642 
1643  MVT SimpleSrcTy = SrcTy.getSimpleVT();
1644  MVT SimpleDstTy = DstTy.getSimpleVT();
1645 
1646  // Make sure that neither type is going to be split before using the
1647  // AVX512 tables. This handles -mprefer-vector-width=256
1648  // with -min-legal-vector-width<=256
1649  if (TLI->getTypeAction(SimpleSrcTy) != TargetLowering::TypeSplitVector &&
1650  TLI->getTypeAction(SimpleDstTy) != TargetLowering::TypeSplitVector) {
1651  if (ST->hasBWI())
1652  if (const auto *Entry = ConvertCostTableLookup(AVX512BWConversionTbl, ISD,
1653  SimpleDstTy, SimpleSrcTy))
1654  return Entry->Cost;
1655 
1656  if (ST->hasDQI())
1657  if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD,
1658  SimpleDstTy, SimpleSrcTy))
1659  return Entry->Cost;
1660 
1661  if (ST->hasAVX512())
1662  if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD,
1663  SimpleDstTy, SimpleSrcTy))
1664  return Entry->Cost;
1665  }
1666 
1667  if (ST->hasAVX2()) {
1668  if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
1669  SimpleDstTy, SimpleSrcTy))
1670  return Entry->Cost;
1671  }
1672 
1673  if (ST->hasAVX()) {
1674  if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
1675  SimpleDstTy, SimpleSrcTy))
1676  return Entry->Cost;
1677  }
1678 
1679  if (ST->hasSSE41()) {
1680  if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
1681  SimpleDstTy, SimpleSrcTy))
1682  return Entry->Cost;
1683  }
1684 
1685  if (ST->hasSSE2()) {
1686  if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
1687  SimpleDstTy, SimpleSrcTy))
1688  return Entry->Cost;
1689  }
1690 
1691  return BaseT::getCastInstrCost(Opcode, Dst, Src, I);
1692 }
1693 
1694 int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
1695  const Instruction *I) {
1696  // Legalize the type.
1697  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
1698 
1699  MVT MTy = LT.second;
1700 
1701  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1702  assert(ISD && "Invalid opcode");
1703 
1704  unsigned ExtraCost = 0;
1705  if (I && (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp)) {
1706  // Some vector comparison predicates cost extra instructions.
1707  if (MTy.isVector() &&
1708  !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
1709  (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
1710  ST->hasBWI())) {
1711  switch (cast<CmpInst>(I)->getPredicate()) {
1712  case CmpInst::Predicate::ICMP_NE:
1713  // xor(cmpeq(x,y),-1)
1714  ExtraCost = 1;
1715  break;
1716  case CmpInst::Predicate::ICMP_SGE:
1717  case CmpInst::Predicate::ICMP_SLE:
1718  // xor(cmpgt(x,y),-1)
1719  ExtraCost = 1;
1720  break;
1721  case CmpInst::Predicate::ICMP_ULT:
1722  case CmpInst::Predicate::ICMP_UGT:
1723  // cmpgt(xor(x,signbit),xor(y,signbit))
1724  // xor(cmpeq(pmaxu(x,y),x),-1)
1725  ExtraCost = 2;
1726  break;
1727  case CmpInst::Predicate::ICMP_ULE:
1728  case CmpInst::Predicate::ICMP_UGE:
1729  if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
1730  (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
1731  // cmpeq(psubus(x,y),0)
1732  // cmpeq(pminu(x,y),x)
1733  ExtraCost = 1;
1734  } else {
1735  // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
1736  ExtraCost = 3;
1737  }
1738  break;
1739  default:
1740  break;
1741  }
1742  }
1743  }
1744 
1745  static const CostTblEntry SLMCostTbl[] = {
1746  // slm pcmpeq/pcmpgt throughput is 2
1747  { ISD::SETCC, MVT::v2i64, 2 },
1748  };
1749 
1750  static const CostTblEntry AVX512BWCostTbl[] = {
1751  { ISD::SETCC, MVT::v32i16, 1 },
1752  { ISD::SETCC, MVT::v64i8, 1 },
1753 
1754  { ISD::SELECT, MVT::v32i16, 1 },
1755  { ISD::SELECT, MVT::v64i8, 1 },
1756  };
1757 
1758  static const CostTblEntry AVX512CostTbl[] = {
1759  { ISD::SETCC, MVT::v8i64, 1 },
1760  { ISD::SETCC, MVT::v16i32, 1 },
1761  { ISD::SETCC, MVT::v8f64, 1 },
1762  { ISD::SETCC, MVT::v16f32, 1 },
1763 
1764  { ISD::SELECT, MVT::v8i64, 1 },
1765  { ISD::SELECT, MVT::v16i32, 1 },
1766  { ISD::SELECT, MVT::v8f64, 1 },
1767  { ISD::SELECT, MVT::v16f32, 1 },
1768  };
1769 
1770  static const CostTblEntry AVX2CostTbl[] = {
1771  { ISD::SETCC, MVT::v4i64, 1 },
1772  { ISD::SETCC, MVT::v8i32, 1 },
1773  { ISD::SETCC, MVT::v16i16, 1 },
1774  { ISD::SETCC, MVT::v32i8, 1 },
1775 
1776  { ISD::SELECT, MVT::v4i64, 1 }, // pblendvb
1777  { ISD::SELECT, MVT::v8i32, 1 }, // pblendvb
1778  { ISD::SELECT, MVT::v16i16, 1 }, // pblendvb
1779  { ISD::SELECT, MVT::v32i8, 1 }, // pblendvb
1780  };
1781 
1782  static const CostTblEntry AVX1CostTbl[] = {
1783  { ISD::SETCC, MVT::v4f64, 1 },
1784  { ISD::SETCC, MVT::v8f32, 1 },
1785  // AVX1 does not support 8-wide integer compare.
1786  { ISD::SETCC, MVT::v4i64, 4 },
1787  { ISD::SETCC, MVT::v8i32, 4 },
1788  { ISD::SETCC, MVT::v16i16, 4 },
1789  { ISD::SETCC, MVT::v32i8, 4 },
1790 
1791  { ISD::SELECT, MVT::v4f64, 1 }, // vblendvpd
1792  { ISD::SELECT, MVT::v8f32, 1 }, // vblendvps
1793  { ISD::SELECT, MVT::v4i64, 1 }, // vblendvpd
1794  { ISD::SELECT, MVT::v8i32, 1 }, // vblendvps
1795  { ISD::SELECT, MVT::v16i16, 3 }, // vandps + vandnps + vorps
1796  { ISD::SELECT, MVT::v32i8, 3 }, // vandps + vandnps + vorps
1797  };
1798 
1799  static const CostTblEntry SSE42CostTbl[] = {
1800  { ISD::SETCC, MVT::v2f64, 1 },
1801  { ISD::SETCC, MVT::v4f32, 1 },
1802  { ISD::SETCC, MVT::v2i64, 1 },
1803  };
1804 
1805  static const CostTblEntry SSE41CostTbl[] = {
1806  { ISD::SELECT, MVT::v2f64, 1 }, // blendvpd
1807  { ISD::SELECT, MVT::v4f32, 1 }, // blendvps
1808  { ISD::SELECT, MVT::v2i64, 1 }, // pblendvb
1809  { ISD::SELECT, MVT::v4i32, 1 }, // pblendvb
1810  { ISD::SELECT, MVT::v8i16, 1 }, // pblendvb
1811  { ISD::SELECT, MVT::v16i8, 1 }, // pblendvb
1812  };
1813 
1814  static const CostTblEntry SSE2CostTbl[] = {
1815  { ISD::SETCC, MVT::v2f64, 2 },
1816  { ISD::SETCC, MVT::f64, 1 },
1817  { ISD::SETCC, MVT::v2i64, 8 },
1818  { ISD::SETCC, MVT::v4i32, 1 },
1819  { ISD::SETCC, MVT::v8i16, 1 },
1820  { ISD::SETCC, MVT::v16i8, 1 },
1821 
1822  { ISD::SELECT, MVT::v2f64, 3 }, // andpd + andnpd + orpd
1823  { ISD::SELECT, MVT::v2i64, 3 }, // pand + pandn + por
1824  { ISD::SELECT, MVT::v4i32, 3 }, // pand + pandn + por
1825  { ISD::SELECT, MVT::v8i16, 3 }, // pand + pandn + por
1826  { ISD::SELECT, MVT::v16i8, 3 }, // pand + pandn + por
1827  };
1828 
1829  static const CostTblEntry SSE1CostTbl[] = {
1830  { ISD::SETCC, MVT::v4f32, 2 },
1831  { ISD::SETCC, MVT::f32, 1 },
1832 
1833  { ISD::SELECT, MVT::v4f32, 3 }, // andps + andnps + orps
1834  };
1835 
1836  if (ST->isSLM())
1837  if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
1838  return LT.first * (ExtraCost + Entry->Cost);
1839 
1840  if (ST->hasBWI())
1841  if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
1842  return LT.first * (ExtraCost + Entry->Cost);
1843 
1844  if (ST->hasAVX512())
1845  if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
1846  return LT.first * (ExtraCost + Entry->Cost);
1847 
1848  if (ST->hasAVX2())
1849  if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
1850  return LT.first * (ExtraCost + Entry->Cost);
1851 
1852  if (ST->hasAVX())
1853  if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
1854  return LT.first * (ExtraCost + Entry->Cost);
1855 
1856  if (ST->hasSSE42())
1857  if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
1858  return LT.first * (ExtraCost + Entry->Cost);
1859 
1860  if (ST->hasSSE41())
1861  if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
1862  return LT.first * (ExtraCost + Entry->Cost);
1863 
1864  if (ST->hasSSE2())
1865  if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
1866  return LT.first * (ExtraCost + Entry->Cost);
1867 
1868  if (ST->hasSSE1())
1869  if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
1870  return LT.first * (ExtraCost + Entry->Cost);
1871 
1872  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
1873 }
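The ExtraCost computation above charges the instructions needed to emulate predicates that have no native vector compare. A small sketch of that mapping, using a hypothetical predicate enum instead of CmpInst::Predicate; the real check also depends on element width and on SSE4.1/XOP/AVX-512 availability:

    enum class Pred { EQ, NE, SGT, SGE, SLE, ULT, UGT, ULE, UGE };

    // Extra instructions needed to emulate a vector compare predicate when
    // the target lacks native unsigned/inverted compares (sketch).
    int extraCmpCost(Pred P, bool HasUnsignedMinMax) {
      switch (P) {
      case Pred::NE:
      case Pred::SGE:
      case Pred::SLE:
        return 1;                     // invert a cmpeq/cmpgt result
      case Pred::ULT:
      case Pred::UGT:
        return 2;                     // flip sign bits, then signed cmpgt
      case Pred::ULE:
      case Pred::UGE:
        return HasUnsignedMinMax ? 1  // cmpeq(pminu(x,y),x)
                                 : 3; // flip sign bits, cmpgt, invert
      default:
        return 0;                     // EQ and signed GT/LT map directly
      }
    }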
1874 
1876 int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
1877                                        ArrayRef<Type *> Tys, FastMathFlags FMF,
1879                                        unsigned ScalarizationCostPassed) {
1880  // Costs should match the codegen from:
1881  // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
1882  // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
1883  // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
1884  // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
1885  // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
1886  static const CostTblEntry AVX512CDCostTbl[] = {
1887  { ISD::CTLZ, MVT::v8i64, 1 },
1888  { ISD::CTLZ, MVT::v16i32, 1 },
1889  { ISD::CTLZ, MVT::v32i16, 8 },
1890  { ISD::CTLZ, MVT::v64i8, 20 },
1891  { ISD::CTLZ, MVT::v4i64, 1 },
1892  { ISD::CTLZ, MVT::v8i32, 1 },
1893  { ISD::CTLZ, MVT::v16i16, 4 },
1894  { ISD::CTLZ, MVT::v32i8, 10 },
1895  { ISD::CTLZ, MVT::v2i64, 1 },
1896  { ISD::CTLZ, MVT::v4i32, 1 },
1897  { ISD::CTLZ, MVT::v8i16, 4 },
1898  { ISD::CTLZ, MVT::v16i8, 4 },
1899  };
1900  static const CostTblEntry AVX512BWCostTbl[] = {
1901  { ISD::BITREVERSE, MVT::v8i64, 5 },
1902  { ISD::BITREVERSE, MVT::v16i32, 5 },
1903  { ISD::BITREVERSE, MVT::v32i16, 5 },
1904  { ISD::BITREVERSE, MVT::v64i8, 5 },
1905  { ISD::CTLZ, MVT::v8i64, 23 },
1906  { ISD::CTLZ, MVT::v16i32, 22 },
1907  { ISD::CTLZ, MVT::v32i16, 18 },
1908  { ISD::CTLZ, MVT::v64i8, 17 },
1909  { ISD::CTPOP, MVT::v8i64, 7 },
1910  { ISD::CTPOP, MVT::v16i32, 11 },
1911  { ISD::CTPOP, MVT::v32i16, 9 },
1912  { ISD::CTPOP, MVT::v64i8, 6 },
1913  { ISD::CTTZ, MVT::v8i64, 10 },
1914  { ISD::CTTZ, MVT::v16i32, 14 },
1915  { ISD::CTTZ, MVT::v32i16, 12 },
1916  { ISD::CTTZ, MVT::v64i8, 9 },
1917  { ISD::SADDSAT, MVT::v32i16, 1 },
1918  { ISD::SADDSAT, MVT::v64i8, 1 },
1919  { ISD::SSUBSAT, MVT::v32i16, 1 },
1920  { ISD::SSUBSAT, MVT::v64i8, 1 },
1921  { ISD::UADDSAT, MVT::v32i16, 1 },
1922  { ISD::UADDSAT, MVT::v64i8, 1 },
1923  { ISD::USUBSAT, MVT::v32i16, 1 },
1924  { ISD::USUBSAT, MVT::v64i8, 1 },
1925  };
1926  static const CostTblEntry AVX512CostTbl[] = {
1927  { ISD::BITREVERSE, MVT::v8i64, 36 },
1928  { ISD::BITREVERSE, MVT::v16i32, 24 },
1929  { ISD::CTLZ, MVT::v8i64, 29 },
1930  { ISD::CTLZ, MVT::v16i32, 35 },
1931  { ISD::CTPOP, MVT::v8i64, 16 },
1932  { ISD::CTPOP, MVT::v16i32, 24 },
1933  { ISD::CTTZ, MVT::v8i64, 20 },
1934  { ISD::CTTZ, MVT::v16i32, 28 },
1935  { ISD::USUBSAT, MVT::v16i32, 2 }, // pmaxud + psubd
1936  { ISD::USUBSAT, MVT::v2i64, 2 }, // pmaxuq + psubq
1937  { ISD::USUBSAT, MVT::v4i64, 2 }, // pmaxuq + psubq
1938  { ISD::USUBSAT, MVT::v8i64, 2 }, // pmaxuq + psubq
1939  { ISD::UADDSAT, MVT::v16i32, 3 }, // not + pminud + paddd
1940  { ISD::UADDSAT, MVT::v2i64, 3 }, // not + pminuq + paddq
1941  { ISD::UADDSAT, MVT::v4i64, 3 }, // not + pminuq + paddq
1942  { ISD::UADDSAT, MVT::v8i64, 3 }, // not + pminuq + paddq
1943  };
1944  static const CostTblEntry XOPCostTbl[] = {
1945  { ISD::BITREVERSE, MVT::v4i64, 4 },
1946  { ISD::BITREVERSE, MVT::v8i32, 4 },
1947  { ISD::BITREVERSE, MVT::v16i16, 4 },
1948  { ISD::BITREVERSE, MVT::v32i8, 4 },
1949  { ISD::BITREVERSE, MVT::v2i64, 1 },
1950  { ISD::BITREVERSE, MVT::v4i32, 1 },
1951  { ISD::BITREVERSE, MVT::v8i16, 1 },
1952  { ISD::BITREVERSE, MVT::v16i8, 1 },
1953  { ISD::BITREVERSE, MVT::i64, 3 },
1954  { ISD::BITREVERSE, MVT::i32, 3 },
1955  { ISD::BITREVERSE, MVT::i16, 3 },
1956  { ISD::BITREVERSE, MVT::i8, 3 }
1957  };
1958  static const CostTblEntry AVX2CostTbl[] = {
1959  { ISD::BITREVERSE, MVT::v4i64, 5 },
1960  { ISD::BITREVERSE, MVT::v8i32, 5 },
1961  { ISD::BITREVERSE, MVT::v16i16, 5 },
1962  { ISD::BITREVERSE, MVT::v32i8, 5 },
1963  { ISD::BSWAP, MVT::v4i64, 1 },
1964  { ISD::BSWAP, MVT::v8i32, 1 },
1965  { ISD::BSWAP, MVT::v16i16, 1 },
1966  { ISD::CTLZ, MVT::v4i64, 23 },
1967  { ISD::CTLZ, MVT::v8i32, 18 },
1968  { ISD::CTLZ, MVT::v16i16, 14 },
1969  { ISD::CTLZ, MVT::v32i8, 9 },
1970  { ISD::CTPOP, MVT::v4i64, 7 },
1971  { ISD::CTPOP, MVT::v8i32, 11 },
1972  { ISD::CTPOP, MVT::v16i16, 9 },
1973  { ISD::CTPOP, MVT::v32i8, 6 },
1974  { ISD::CTTZ, MVT::v4i64, 10 },
1975  { ISD::CTTZ, MVT::v8i32, 14 },
1976  { ISD::CTTZ, MVT::v16i16, 12 },
1977  { ISD::CTTZ, MVT::v32i8, 9 },
1978  { ISD::SADDSAT, MVT::v16i16, 1 },
1979  { ISD::SADDSAT, MVT::v32i8, 1 },
1980  { ISD::SSUBSAT, MVT::v16i16, 1 },
1981  { ISD::SSUBSAT, MVT::v32i8, 1 },
1982  { ISD::UADDSAT, MVT::v16i16, 1 },
1983  { ISD::UADDSAT, MVT::v32i8, 1 },
1984  { ISD::UADDSAT, MVT::v8i32, 3 }, // not + pminud + paddd
1985  { ISD::USUBSAT, MVT::v16i16, 1 },
1986  { ISD::USUBSAT, MVT::v32i8, 1 },
1987  { ISD::USUBSAT, MVT::v8i32, 2 }, // pmaxud + psubd
1988  { ISD::FSQRT, MVT::f32, 7 }, // Haswell from http://www.agner.org/
1989  { ISD::FSQRT, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
1990  { ISD::FSQRT, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
1991  { ISD::FSQRT, MVT::f64, 14 }, // Haswell from http://www.agner.org/
1992  { ISD::FSQRT, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
1993  { ISD::FSQRT, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
1994  };
1995  static const CostTblEntry AVX1CostTbl[] = {
1996  { ISD::BITREVERSE, MVT::v4i64, 12 }, // 2 x 128-bit Op + extract/insert
1997  { ISD::BITREVERSE, MVT::v8i32, 12 }, // 2 x 128-bit Op + extract/insert
1998  { ISD::BITREVERSE, MVT::v16i16, 12 }, // 2 x 128-bit Op + extract/insert
1999  { ISD::BITREVERSE, MVT::v32i8, 12 }, // 2 x 128-bit Op + extract/insert
2000  { ISD::BSWAP, MVT::v4i64, 4 },
2001  { ISD::BSWAP, MVT::v8i32, 4 },
2002  { ISD::BSWAP, MVT::v16i16, 4 },
2003  { ISD::CTLZ, MVT::v4i64, 48 }, // 2 x 128-bit Op + extract/insert
2004  { ISD::CTLZ, MVT::v8i32, 38 }, // 2 x 128-bit Op + extract/insert
2005  { ISD::CTLZ, MVT::v16i16, 30 }, // 2 x 128-bit Op + extract/insert
2006  { ISD::CTLZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
2007  { ISD::CTPOP, MVT::v4i64, 16 }, // 2 x 128-bit Op + extract/insert
2008  { ISD::CTPOP, MVT::v8i32, 24 }, // 2 x 128-bit Op + extract/insert
2009  { ISD::CTPOP, MVT::v16i16, 20 }, // 2 x 128-bit Op + extract/insert
2010  { ISD::CTPOP, MVT::v32i8, 14 }, // 2 x 128-bit Op + extract/insert
2011  { ISD::CTTZ, MVT::v4i64, 22 }, // 2 x 128-bit Op + extract/insert
2012  { ISD::CTTZ, MVT::v8i32, 30 }, // 2 x 128-bit Op + extract/insert
2013  { ISD::CTTZ, MVT::v16i16, 26 }, // 2 x 128-bit Op + extract/insert
2014  { ISD::CTTZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
2015  { ISD::SADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2016  { ISD::SADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2017  { ISD::SSUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2018  { ISD::SSUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2019  { ISD::UADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2020  { ISD::UADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2021  { ISD::UADDSAT, MVT::v8i32, 8 }, // 2 x 128-bit Op + extract/insert
2022  { ISD::USUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2023  { ISD::USUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2024  { ISD::USUBSAT, MVT::v8i32, 6 }, // 2 x 128-bit Op + extract/insert
2025  { ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/
2026  { ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
2027  { ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
2028  { ISD::FSQRT, MVT::f64, 21 }, // SNB from http://www.agner.org/
2029  { ISD::FSQRT, MVT::v2f64, 21 }, // SNB from http://www.agner.org/
2030  { ISD::FSQRT, MVT::v4f64, 43 }, // SNB from http://www.agner.org/
2031  };
2032  static const CostTblEntry GLMCostTbl[] = {
2033  { ISD::FSQRT, MVT::f32, 19 }, // sqrtss
2034  { ISD::FSQRT, MVT::v4f32, 37 }, // sqrtps
2035  { ISD::FSQRT, MVT::f64, 34 }, // sqrtsd
2036  { ISD::FSQRT, MVT::v2f64, 67 }, // sqrtpd
2037  };
2038  static const CostTblEntry SLMCostTbl[] = {
2039  { ISD::FSQRT, MVT::f32, 20 }, // sqrtss
2040  { ISD::FSQRT, MVT::v4f32, 40 }, // sqrtps
2041  { ISD::FSQRT, MVT::f64, 35 }, // sqrtsd
2042  { ISD::FSQRT, MVT::v2f64, 70 }, // sqrtpd
2043  };
2044  static const CostTblEntry SSE42CostTbl[] = {
2045  { ISD::USUBSAT, MVT::v4i32, 2 }, // pmaxud + psubd
2046  { ISD::UADDSAT, MVT::v4i32, 3 }, // not + pminud + paddd
2047  { ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/
2048  { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
2049  };
2050  static const CostTblEntry SSSE3CostTbl[] = {
2051  { ISD::BITREVERSE, MVT::v2i64, 5 },
2052  { ISD::BITREVERSE, MVT::v4i32, 5 },
2053  { ISD::BITREVERSE, MVT::v8i16, 5 },
2054  { ISD::BITREVERSE, MVT::v16i8, 5 },
2055  { ISD::BSWAP, MVT::v2i64, 1 },
2056  { ISD::BSWAP, MVT::v4i32, 1 },
2057  { ISD::BSWAP, MVT::v8i16, 1 },
2058  { ISD::CTLZ, MVT::v2i64, 23 },
2059  { ISD::CTLZ, MVT::v4i32, 18 },
2060  { ISD::CTLZ, MVT::v8i16, 14 },
2061  { ISD::CTLZ, MVT::v16i8, 9 },
2062  { ISD::CTPOP, MVT::v2i64, 7 },
2063  { ISD::CTPOP, MVT::v4i32, 11 },
2064  { ISD::CTPOP, MVT::v8i16, 9 },
2065  { ISD::CTPOP, MVT::v16i8, 6 },
2066  { ISD::CTTZ, MVT::v2i64, 10 },
2067  { ISD::CTTZ, MVT::v4i32, 14 },
2068  { ISD::CTTZ, MVT::v8i16, 12 },
2069  { ISD::CTTZ, MVT::v16i8, 9 }
2070  };
2071  static const CostTblEntry SSE2CostTbl[] = {
2072  { ISD::BITREVERSE, MVT::v2i64, 29 },
2073  { ISD::BITREVERSE, MVT::v4i32, 27 },
2074  { ISD::BITREVERSE, MVT::v8i16, 27 },
2075  { ISD::BITREVERSE, MVT::v16i8, 20 },
2076  { ISD::BSWAP, MVT::v2i64, 7 },
2077  { ISD::BSWAP, MVT::v4i32, 7 },
2078  { ISD::BSWAP, MVT::v8i16, 7 },
2079  { ISD::CTLZ, MVT::v2i64, 25 },
2080  { ISD::CTLZ, MVT::v4i32, 26 },
2081  { ISD::CTLZ, MVT::v8i16, 20 },
2082  { ISD::CTLZ, MVT::v16i8, 17 },
2083  { ISD::CTPOP, MVT::v2i64, 12 },
2084  { ISD::CTPOP, MVT::v4i32, 15 },
2085  { ISD::CTPOP, MVT::v8i16, 13 },
2086  { ISD::CTPOP, MVT::v16i8, 10 },
2087  { ISD::CTTZ, MVT::v2i64, 14 },
2088  { ISD::CTTZ, MVT::v4i32, 18 },
2089  { ISD::CTTZ, MVT::v8i16, 16 },
2090  { ISD::CTTZ, MVT::v16i8, 13 },
2091  { ISD::SADDSAT, MVT::v8i16, 1 },
2092  { ISD::SADDSAT, MVT::v16i8, 1 },
2093  { ISD::SSUBSAT, MVT::v8i16, 1 },
2094  { ISD::SSUBSAT, MVT::v16i8, 1 },
2095  { ISD::UADDSAT, MVT::v8i16, 1 },
2096  { ISD::UADDSAT, MVT::v16i8, 1 },
2097  { ISD::USUBSAT, MVT::v8i16, 1 },
2098  { ISD::USUBSAT, MVT::v16i8, 1 },
2099  { ISD::FSQRT, MVT::f64, 32 }, // Nehalem from http://www.agner.org/
2100  { ISD::FSQRT, MVT::v2f64, 32 }, // Nehalem from http://www.agner.org/
2101  };
2102  static const CostTblEntry SSE1CostTbl[] = {
2103  { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/
2104  { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
2105  };
2106  static const CostTblEntry LZCNT64CostTbl[] = { // 64-bit targets
2107  { ISD::CTLZ, MVT::i64, 1 },
2108  };
2109  static const CostTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
2110  { ISD::CTLZ, MVT::i32, 1 },
2111  { ISD::CTLZ, MVT::i16, 1 },
2112  { ISD::CTLZ, MVT::i8, 1 },
2113  };
2114  static const CostTblEntry POPCNT64CostTbl[] = { // 64-bit targets
2115  { ISD::CTPOP, MVT::i64, 1 },
2116  };
2117  static const CostTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
2118  { ISD::CTPOP, MVT::i32, 1 },
2119  { ISD::CTPOP, MVT::i16, 1 },
2120  { ISD::CTPOP, MVT::i8, 1 },
2121  };
2122  static const CostTblEntry X64CostTbl[] = { // 64-bit targets
2123  { ISD::BITREVERSE, MVT::i64, 14 },
2124  { ISD::CTLZ, MVT::i64, 4 }, // BSR+XOR or BSR+XOR+CMOV
2125  { ISD::CTPOP, MVT::i64, 10 },
2126  { ISD::SADDO, MVT::i64, 1 },
2127  { ISD::UADDO, MVT::i64, 1 },
2128  };
2129  static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
2130  { ISD::BITREVERSE, MVT::i32, 14 },
2131  { ISD::BITREVERSE, MVT::i16, 14 },
2132  { ISD::BITREVERSE, MVT::i8, 11 },
2133  { ISD::CTLZ, MVT::i32, 4 }, // BSR+XOR or BSR+XOR+CMOV
2134  { ISD::CTLZ, MVT::i16, 4 }, // BSR+XOR or BSR+XOR+CMOV
2135  { ISD::CTLZ, MVT::i8, 4 }, // BSR+XOR or BSR+XOR+CMOV
2136  { ISD::CTPOP, MVT::i32, 8 },
2137  { ISD::CTPOP, MVT::i16, 9 },
2138  { ISD::CTPOP, MVT::i8, 7 },
2139  { ISD::SADDO, MVT::i32, 1 },
2140  { ISD::SADDO, MVT::i16, 1 },
2141  { ISD::SADDO, MVT::i8, 1 },
2142  { ISD::UADDO, MVT::i32, 1 },
2143  { ISD::UADDO, MVT::i16, 1 },
2144  { ISD::UADDO, MVT::i8, 1 },
2145  };
2146 
2147  Type *OpTy = RetTy;
2148  unsigned ISD = ISD::DELETED_NODE;
2149  switch (IID) {
2150  default:
2151  break;
2152  case Intrinsic::bitreverse:
2153  ISD = ISD::BITREVERSE;
2154  break;
2155  case Intrinsic::bswap:
2156  ISD = ISD::BSWAP;
2157  break;
2158  case Intrinsic::ctlz:
2159  ISD = ISD::CTLZ;
2160  break;
2161  case Intrinsic::ctpop:
2162  ISD = ISD::CTPOP;
2163  break;
2164  case Intrinsic::cttz:
2165  ISD = ISD::CTTZ;
2166  break;
2167  case Intrinsic::sadd_sat:
2168  ISD = ISD::SADDSAT;
2169  break;
2170  case Intrinsic::ssub_sat:
2171  ISD = ISD::SSUBSAT;
2172  break;
2173  case Intrinsic::uadd_sat:
2174  ISD = ISD::UADDSAT;
2175  break;
2176  case Intrinsic::usub_sat:
2177  ISD = ISD::USUBSAT;
2178  break;
2179  case Intrinsic::sqrt:
2180  ISD = ISD::FSQRT;
2181  break;
2182  case Intrinsic::sadd_with_overflow:
2183  case Intrinsic::ssub_with_overflow:
2184  // SSUBO has same costs so don't duplicate.
2185  ISD = ISD::SADDO;
2186  OpTy = RetTy->getContainedType(0);
2187  break;
2188  case Intrinsic::uadd_with_overflow:
2189  case Intrinsic::usub_with_overflow:
2190  // USUBO has same costs so don't duplicate.
2191  ISD = ISD::UADDO;
2192  OpTy = RetTy->getContainedType(0);
2193  break;
2194  }
2195 
2196  if (ISD != ISD::DELETED_NODE) {
2197  // Legalize the type.
2198  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, OpTy);
2199  MVT MTy = LT.second;
2200 
2201  // Attempt to lookup cost.
2202  if (ST->isGLM())
2203  if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
2204  return LT.first * Entry->Cost;
2205 
2206  if (ST->isSLM())
2207  if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
2208  return LT.first * Entry->Cost;
2209 
2210  if (ST->hasCDI())
2211  if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
2212  return LT.first * Entry->Cost;
2213 
2214  if (ST->hasBWI())
2215  if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
2216  return LT.first * Entry->Cost;
2217 
2218  if (ST->hasAVX512())
2219  if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
2220  return LT.first * Entry->Cost;
2221 
2222  if (ST->hasXOP())
2223  if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
2224  return LT.first * Entry->Cost;
2225 
2226  if (ST->hasAVX2())
2227  if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
2228  return LT.first * Entry->Cost;
2229 
2230  if (ST->hasAVX())
2231  if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
2232  return LT.first * Entry->Cost;
2233 
2234  if (ST->hasSSE42())
2235  if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
2236  return LT.first * Entry->Cost;
2237 
2238  if (ST->hasSSSE3())
2239  if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
2240  return LT.first * Entry->Cost;
2241 
2242  if (ST->hasSSE2())
2243  if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
2244  return LT.first * Entry->Cost;
2245 
2246  if (ST->hasSSE1())
2247  if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
2248  return LT.first * Entry->Cost;
2249 
2250  if (ST->hasLZCNT()) {
2251  if (ST->is64Bit())
2252  if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
2253  return LT.first * Entry->Cost;
2254 
2255  if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
2256  return LT.first * Entry->Cost;
2257  }
2258 
2259  if (ST->hasPOPCNT()) {
2260  if (ST->is64Bit())
2261  if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
2262  return LT.first * Entry->Cost;
2263 
2264  if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
2265  return LT.first * Entry->Cost;
2266  }
2267 
2268  // TODO - add BMI (TZCNT) scalar handling
2269 
2270  if (ST->is64Bit())
2271  if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
2272  return LT.first * Entry->Cost;
2273 
2274  if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
2275  return LT.first * Entry->Cost;
2276  }
2277 
2278  return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF, ScalarizationCostPassed);
2279 }
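The intrinsic path above works by first translating the intrinsic ID into the ISD opcode that keys the per-feature tables, then reusing the same tier walk. A reduced sketch of that dispatch with stand-in enums (not the real Intrinsic::ID or ISD values):

    // Stand-ins for the real Intrinsic::ID and ISD opcode enums.
    enum class IID { Bitreverse, Bswap, Ctlz, Ctpop, Cttz, Sqrt, Other };
    enum class ISDOp { BITREVERSE, BSWAP, CTLZ, CTPOP, CTTZ, FSQRT, None };

    // Map an intrinsic to the ISD node whose cost tables should be consulted;
    // anything unmapped falls back to the target-independent implementation.
    ISDOp mapToISD(IID Id) {
      switch (Id) {
      case IID::Bitreverse: return ISDOp::BITREVERSE;
      case IID::Bswap:      return ISDOp::BSWAP;
      case IID::Ctlz:       return ISDOp::CTLZ;
      case IID::Ctpop:      return ISDOp::CTPOP;
      case IID::Cttz:       return ISDOp::CTTZ;
      case IID::Sqrt:       return ISDOp::FSQRT;
      default:              return ISDOp::None;
      }
    }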
2280 
2282 int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
2283                                        ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) {
2284  static const CostTblEntry AVX512CostTbl[] = {
2285  { ISD::ROTL, MVT::v8i64, 1 },
2286  { ISD::ROTL, MVT::v4i64, 1 },
2287  { ISD::ROTL, MVT::v2i64, 1 },
2288  { ISD::ROTL, MVT::v16i32, 1 },
2289  { ISD::ROTL, MVT::v8i32, 1 },
2290  { ISD::ROTL, MVT::v4i32, 1 },
2291  { ISD::ROTR, MVT::v8i64, 1 },
2292  { ISD::ROTR, MVT::v4i64, 1 },
2293  { ISD::ROTR, MVT::v2i64, 1 },
2294  { ISD::ROTR, MVT::v16i32, 1 },
2295  { ISD::ROTR, MVT::v8i32, 1 },
2296  { ISD::ROTR, MVT::v4i32, 1 }
2297  };
2298  // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
2299  static const CostTblEntry XOPCostTbl[] = {
2300  { ISD::ROTL, MVT::v4i64, 4 },
2301  { ISD::ROTL, MVT::v8i32, 4 },
2302  { ISD::ROTL, MVT::v16i16, 4 },
2303  { ISD::ROTL, MVT::v32i8, 4 },
2304  { ISD::ROTL, MVT::v2i64, 1 },
2305  { ISD::ROTL, MVT::v4i32, 1 },
2306  { ISD::ROTL, MVT::v8i16, 1 },
2307  { ISD::ROTL, MVT::v16i8, 1 },
2308  { ISD::ROTR, MVT::v4i64, 6 },
2309  { ISD::ROTR, MVT::v8i32, 6 },
2310  { ISD::ROTR, MVT::v16i16, 6 },
2311  { ISD::ROTR, MVT::v32i8, 6 },
2312  { ISD::ROTR, MVT::v2i64, 2 },
2313  { ISD::ROTR, MVT::v4i32, 2 },
2314  { ISD::ROTR, MVT::v8i16, 2 },
2315  { ISD::ROTR, MVT::v16i8, 2 }
2316  };
2317  static const CostTblEntry X64CostTbl[] = { // 64-bit targets
2318  { ISD::ROTL, MVT::i64, 1 },
2319  { ISD::ROTR, MVT::i64, 1 },
2320  { ISD::FSHL, MVT::i64, 4 }
2321  };
2322  static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
2323  { ISD::ROTL, MVT::i32, 1 },
2324  { ISD::ROTL, MVT::i16, 1 },
2325  { ISD::ROTL, MVT::i8, 1 },
2326  { ISD::ROTR, MVT::i32, 1 },
2327  { ISD::ROTR, MVT::i16, 1 },
2328  { ISD::ROTR, MVT::i8, 1 },
2329  { ISD::FSHL, MVT::i32, 4 },
2330  { ISD::FSHL, MVT::i16, 4 },
2331  { ISD::FSHL, MVT::i8, 4 }
2332  };
2333 
2334  unsigned ISD = ISD::DELETED_NODE;
2335  switch (IID) {
2336  default:
2337  break;
2338  case Intrinsic::fshl:
2339  ISD = ISD::FSHL;
2340  if (Args[0] == Args[1])
2341  ISD = ISD::ROTL;
2342  break;
2343  case Intrinsic::fshr:
2344  // FSHR has same costs so don't duplicate.
2345  ISD = ISD::FSHL;
2346  if (Args[0] == Args[1])
2347  ISD = ISD::ROTR;
2348  break;
2349  }
2350 
2351  if (ISD != ISD::DELETED_NODE) {
2352  // Legalize the type.
2353  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
2354  MVT MTy = LT.second;
2355 
2356  // Attempt to lookup cost.
2357  if (ST->hasAVX512())
2358  if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
2359  return LT.first * Entry->Cost;
2360 
2361  if (ST->hasXOP())
2362  if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
2363  return LT.first * Entry->Cost;
2364 
2365  if (ST->is64Bit())
2366  if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
2367  return LT.first * Entry->Cost;
2368 
2369  if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
2370  return LT.first * Entry->Cost;
2371  }
2372 
2373  return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF, VF);
2374 }
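The tables above price rotates far cheaper than general funnel shifts, and the dispatch treats fshl/fshr with identical first and second operands as rotates. A minimal sketch of that classification (hypothetical names):

    enum class ShiftKind { FunnelShift, RotateLeft, RotateRight };

    // fshl(x, x, c) is rotl(x, c) and fshr(x, x, c) is rotr(x, c); FSHR is
    // otherwise modeled with the FSHL costs, so one funnel-shift kind suffices.
    ShiftKind classifyFunnelShift(const void *Arg0, const void *Arg1,
                                  bool IsLeft) {
      if (Arg0 == Arg1)
        return IsLeft ? ShiftKind::RotateLeft : ShiftKind::RotateRight;
      return ShiftKind::FunnelShift;
    }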
2375 
2376 int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
2377  assert(Val->isVectorTy() && "This must be a vector type");
2378 
2379  Type *ScalarType = Val->getScalarType();
2380 
2381  if (Index != -1U) {
2382  // Legalize the type.
2383  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
2384 
2385  // This type is legalized to a scalar type.
2386  if (!LT.second.isVector())
2387  return 0;
2388 
2389  // The type may be split. Normalize the index to the new type.
2390  unsigned Width = LT.second.getVectorNumElements();
2391  Index = Index % Width;
2392 
2393  // Floating point scalars are already located in index #0.
2394  if (ScalarType->isFloatingPointTy() && Index == 0)
2395  return 0;
2396  }
2397 
2398  // Add to the base cost if we know that the extracted element of a vector is
2399  // destined to be moved to and used in the integer register file.
2400  int RegisterFileMoveCost = 0;
2401  if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy())
2402  RegisterFileMoveCost = 1;
2403 
2404  return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
2405 }
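When legalization splits a wide vector, the element index above is reduced modulo the legal vector width, and lane 0 of a floating-point vector is free because the scalar already lives there. A short illustration of that normalization (sketch, assuming LegalWidth > 0):

    // Normalize an insert/extract index to the legalized vector width.
    unsigned normalizeIndex(unsigned Index, unsigned LegalWidth) {
      return Index % LegalWidth;
    }

    // Example: lane 9 of a <16 x float> that legalizes to four <4 x float>
    // registers becomes lane 9 % 4 == 1 of one piece; lane 8 becomes lane 0,
    // which costs nothing for floating-point scalars.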
2406 
2407 int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
2408  unsigned AddressSpace, const Instruction *I) {
2409  // Handle non-power-of-two vectors such as <3 x float>
2410  if (VectorType *VTy = dyn_cast<VectorType>(Src)) {
2411  unsigned NumElem = VTy->getVectorNumElements();
2412 
2413  // Handle a few common cases:
2414  // <3 x float>
2415  if (NumElem == 3 && VTy->getScalarSizeInBits() == 32)
2416  // Cost = 64 bit store + extract + 32 bit store.
2417  return 3;
2418 
2419  // <3 x double>
2420  if (NumElem == 3 && VTy->getScalarSizeInBits() == 64)
2421  // Cost = 128 bit store + unpack + 64 bit store.
2422  return 3;
2423 
2424  // Assume that all other non-power-of-two numbers are scalarized.
2425  if (!isPowerOf2_32(NumElem)) {
2426  int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment,
2427  AddressSpace);
2428  int SplitCost = getScalarizationOverhead(Src, Opcode == Instruction::Load,
2429  Opcode == Instruction::Store);
2430  return NumElem * Cost + SplitCost;
2431  }
2432  }
2433 
2434  // Legalize the type.
2435  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
2436  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
2437  "Invalid Opcode");
2438 
2439  // Each load/store unit costs 1.
2440  int Cost = LT.first * 1;
2441 
2442  // This isn't exactly right. We're using slow unaligned 32-byte accesses as a
2443  // proxy for a double-pumped AVX memory interface such as on Sandybridge.
2444  if (LT.second.getStoreSize() == 32 && ST->isUnalignedMem32Slow())
2445  Cost *= 2;
2446 
2447  return Cost;
2448 }
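For power-of-two vectors the estimate above is one unit per legalized register, doubled when the access is 32 bytes wide on subtargets where unaligned 32-byte memory operations are slow. A compact sketch of that final step:

    // Sketch: one unit per legalized piece, doubled for 32-byte accesses on
    // targets with slow unaligned 32-byte memory (a proxy for a double-pumped
    // 256-bit memory interface).
    int vectorMemOpCost(int NumLegalizedPieces, unsigned StoreSizeBytes,
                        bool SlowUnalignedMem32) {
      int Cost = NumLegalizedPieces;
      if (StoreSizeBytes == 32 && SlowUnalignedMem32)
        Cost *= 2;
      return Cost;
    }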
2449 
2450 int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
2451  unsigned Alignment,
2452  unsigned AddressSpace) {
2453  bool IsLoad = (Instruction::Load == Opcode);
2454  bool IsStore = (Instruction::Store == Opcode);
2455 
2456  VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy);
2457  if (!SrcVTy)
2458  // For a scalar type, take the regular (unmasked) cost.
2459  return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace);
2460 
2461  unsigned NumElem = SrcVTy->getVectorNumElements();
2462  VectorType *MaskTy =
2463  VectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
2464  if ((IsLoad && !isLegalMaskedLoad(SrcVTy, MaybeAlign(Alignment))) ||
2465  (IsStore && !isLegalMaskedStore(SrcVTy, MaybeAlign(Alignment))) ||
2466  !isPowerOf2_32(NumElem)) {
2467  // Scalarization
2468  int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true);
2469  int ScalarCompareCost = getCmpSelInstrCost(
2470  Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr);
2471  int BranchCost = getCFInstrCost(Instruction::Br);
2472  int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
2473 
2474  int ValueSplitCost = getScalarizationOverhead(SrcVTy, IsLoad, IsStore);
2475  int MemopCost =
2476  NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
2477  Alignment, AddressSpace);
2478  return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
2479  }
2480 
2481  // Legalize the type.
2482  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy);
2483  auto VT = TLI->getValueType(DL, SrcVTy);
2484  int Cost = 0;
2485  if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
2486  LT.second.getVectorNumElements() == NumElem)
2487  // Promotion requires expand/truncate for data and a shuffle for mask.
2488  Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, 0, nullptr) +
2489  getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, 0, nullptr);
2490 
2491  else if (LT.second.getVectorNumElements() > NumElem) {
2492  VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(),
2493  LT.second.getVectorNumElements());
2494  // Expanding requires filling the wider mask with zeroes.
2495  Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy);
2496  }
2497 
2498  // Pre-AVX512 - each maskmov load costs 2 + store costs ~8.
2499  if (!ST->hasAVX512())
2500  return Cost + LT.first * (IsLoad ? 2 : 8);
2501 
2502  // AVX-512 masked load/store is cheaper.
2503  return Cost + LT.first;
2504 }
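When a masked load/store has to be scalarized, the estimate above adds up four pieces: per-element scalar memory ops, unpacking the mask, a compare plus branch per mask bit, and splitting or rebuilding the value vector. A sketch of that sum, with all component costs assumed to come from the corresponding TTI queries:

    // Sketch of the scalarized masked load/store estimate.
    int scalarizedMaskedMemCost(unsigned NumElem, int ScalarMemCost,
                                int MaskSplitCost, int ValueSplitCost,
                                int ScalarCmpCost, int BranchCost) {
      int MaskCmpCost = int(NumElem) * (BranchCost + ScalarCmpCost);
      int MemopCost = int(NumElem) * ScalarMemCost;
      return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
    }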
2505 
2506 int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
2507                                            const SCEV *Ptr) {
2508  // Address computations in vectorized code with non-consecutive addresses will
2509  // likely result in more instructions compared to scalar code where the
2510  // computation can more often be merged into the index mode. The resulting
2511  // extra micro-ops can significantly decrease throughput.
2512  const unsigned NumVectorInstToHideOverhead = 10;
2513 
2514  // Cost modeling of Strided Access Computation is hidden by the indexing
2515  // modes of X86 regardless of the stride value. We don't believe that there
2516  // is a difference between constant strided access in general and a constant
2517  // stride whose value is less than or equal to 64.
2518  // Even in the case of (loop invariant) stride whose value is not known at
2519  // compile time, the address computation will not incur more than one extra
2520  // ADD instruction.
2521  if (Ty->isVectorTy() && SE) {
2522  if (!BaseT::isStridedAccess(Ptr))
2523  return NumVectorInstToHideOverhead;
2524  if (!BaseT::getConstantStrideStep(SE, Ptr))
2525  return 1;
2526  }
2527 
2528  return BaseT::getAddressComputationCost(Ty, SE, Ptr);
2529 }
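The heuristic above distinguishes three cases for vectorized address computation. A condensed sketch:

    // Scattered (non-strided) vector addresses pay a large penalty, a stride
    // that is loop-invariant but unknown at compile time costs one extra ADD,
    // and a known-constant stride folds into the addressing mode (base cost).
    int vectorAddressComputationCost(bool IsStrided, bool HasConstantStride,
                                     int BaseCost) {
      if (!IsStrided)
        return 10; // NumVectorInstToHideOverhead
      if (!HasConstantStride)
        return 1;
      return BaseCost;
    }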
2530 
2531 int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
2532  bool IsPairwise) {
2533  // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
2534  // and make it as the cost.
2535 
2536  static const CostTblEntry SSE2CostTblPairWise[] = {
2537  { ISD::FADD, MVT::v2f64, 2 },
2538  { ISD::FADD, MVT::v4f32, 4 },
2539  { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
2540  { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32.
2541  { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5".
2542  { ISD::ADD, MVT::v2i16, 3 }, // FIXME: chosen to be less than v4i16
2543  { ISD::ADD, MVT::v4i16, 4 }, // FIXME: chosen to be less than v8i16
2544  { ISD::ADD, MVT::v8i16, 5 },
2545  { ISD::ADD, MVT::v2i8, 2 },
2546  { ISD::ADD, MVT::v4i8, 2 },
2547  { ISD::ADD, MVT::v8i8, 2 },
2548  { ISD::ADD, MVT::v16i8, 3 },
2549  };
2550 
2551  static const CostTblEntry AVX1CostTblPairWise[] = {
2552  { ISD::FADD, MVT::v4f64, 5 },
2553  { ISD::FADD, MVT::v8f32, 7 },
2554  { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
2555  { ISD::ADD, MVT::v4i64, 5 }, // The data reported by the IACA tool is "4.8".
2556  { ISD::ADD, MVT::v8i32, 5 },
2557  { ISD::ADD, MVT::v16i16, 6 },
2558  { ISD::ADD, MVT::v32i8, 4 },
2559  };
2560 
2561  static const CostTblEntry SSE2CostTblNoPairWise[] = {
2562  { ISD::FADD, MVT::v2f64, 2 },
2563  { ISD::FADD, MVT::v4f32, 4 },
2564  { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
2565  { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
2566  { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
2567  { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3".
2568  { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3".
2569  { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
2570  { ISD::ADD, MVT::v2i8, 2 },
2571  { ISD::ADD, MVT::v4i8, 2 },
2572  { ISD::ADD, MVT::v8i8, 2 },
2573  { ISD::ADD, MVT::v16i8, 3 },
2574  };
2575 
2576  static const CostTblEntry AVX1CostTblNoPairWise[] = {
2577  { ISD::FADD, MVT::v4f64, 3 },
2578  { ISD::FADD, MVT::v4f32, 3 },
2579  { ISD::FADD, MVT::v8f32, 4 },
2580  { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
2581  { ISD::ADD, MVT::v4i64, 3 },
2582  { ISD::ADD, MVT::v8i32, 5 },
2583  { ISD::ADD, MVT::v16i16, 5 },
2584  { ISD::ADD, MVT::v32i8, 4 },
2585  };
2586 
2587  int ISD = TLI->InstructionOpcodeToISD(Opcode);
2588  assert(ISD && "Invalid opcode");
2589 
2590  // Before legalizing the type, give a chance to look up illegal narrow types
2591  // in the table.
2592  // FIXME: Is there a better way to do this?
2593  EVT VT = TLI->getValueType(DL, ValTy);
2594  if (VT.isSimple()) {
2595  MVT MTy = VT.getSimpleVT();
2596  if (IsPairwise) {
2597  if (ST->hasAVX())
2598  if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
2599  return Entry->Cost;
2600 
2601  if (ST->hasSSE2())
2602  if (const auto *Entry = CostTableLookup(SSE2CostTblPairWise, ISD, MTy))
2603  return Entry->Cost;
2604  } else {
2605  if (ST->hasAVX())
2606  if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
2607  return Entry->Cost;
2608 
2609  if (ST->hasSSE2())
2610  if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
2611  return Entry->Cost;
2612  }
2613  }
2614 
2615  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
2616 
2617  MVT MTy = LT.second;
2618 
2619  if (IsPairwise) {
2620  if (ST->hasAVX())
2621  if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
2622  return LT.first * Entry->Cost;
2623 
2624  if (ST->hasSSE2())
2625  if (const auto *Entry = CostTableLookup(SSE2CostTblPairWise, ISD, MTy))
2626  return LT.first * Entry->Cost;
2627  } else {
2628  if (ST->hasAVX())
2629  if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
2630  return LT.first * Entry->Cost;
2631 
2632  if (ST->hasSSE2())
2633  if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
2634  return LT.first * Entry->Cost;
2635  }
2636 
2637  static const CostTblEntry AVX2BoolReduction[] = {
2638  { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp
2639  { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp
2640  { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp
2641  { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp
2642  };
2643 
2644  static const CostTblEntry AVX1BoolReduction[] = {
2645  { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp
2646  { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp
2647  { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
2648  { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
2649  { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp
2650  { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp
2651  { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
2652  { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
2653  };
2654 
2655  static const CostTblEntry SSE2BoolReduction[] = {
2656  { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp
2657  { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp
2658  { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp
2659  { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp
2660  { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp
2661  { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp
2662  { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp
2663  { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp
2664  };
2665 
2666  // Handle bool allof/anyof patterns.
2667  if (ValTy->getVectorElementType()->isIntegerTy(1)) {
2668  if (ST->hasAVX2())
2669  if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
2670  return LT.first * Entry->Cost;
2671  if (ST->hasAVX())
2672  if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
2673  return LT.first * Entry->Cost;
2674  if (ST->hasSSE2())
2675  if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
2676  return LT.first * Entry->Cost;
2677  }
2678 
2679  return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise);
2680 }
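The boolean allof/anyof rows above price each reduction at a movemask plus a compare. The corresponding SSE2 sequence, written out with intrinsics and assuming the mask vector holds 0x00/0xFF per byte lane as produced by pcmpeq/pcmpgt:

    #include <emmintrin.h> // SSE2

    // AND-reduction ("all lanes true"): every mask bit must be set.
    bool allOf(__m128i Mask) { return _mm_movemask_epi8(Mask) == 0xFFFF; }

    // OR-reduction ("any lane true"): at least one mask bit set.
    bool anyOf(__m128i Mask) { return _mm_movemask_epi8(Mask) != 0; }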
2681 
2682 int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy,
2683                                         bool IsPairwise, bool IsUnsigned) {
2684  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
2685 
2686  MVT MTy = LT.second;
2687 
2688  int ISD;
2689  if (ValTy->isIntOrIntVectorTy()) {
2690  ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN;
2691  } else {
2692  assert(ValTy->isFPOrFPVectorTy() &&
2693  "Expected floating point or integer vector type.");
2694  ISD = ISD::FMINNUM;
2695  }
2696 
2697  // We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
2698  // and use that as the cost.
2699 
2700  static const CostTblEntry SSE1CostTblPairWise[] = {
2701  {ISD::FMINNUM, MVT::v4f32, 4},
2702  };
2703 
2704  static const CostTblEntry SSE2CostTblPairWise[] = {
2705  {ISD::FMINNUM, MVT::v2f64, 3},
2706  {ISD::SMIN, MVT::v2i64, 6},
2707  {ISD::UMIN, MVT::v2i64, 8},
2708  {ISD::SMIN, MVT::v4i32, 6},
2709  {ISD::UMIN, MVT::v4i32, 8},
2710  {ISD::SMIN, MVT::v8i16, 4},
2711  {ISD::UMIN, MVT::v8i16, 6},
2712  {ISD::SMIN, MVT::v16i8, 8},
2713  {ISD::UMIN, MVT::v16i8, 6},
2714  };
2715 
2716  static const CostTblEntry SSE41CostTblPairWise[] = {
2717  {ISD::FMINNUM, MVT::v4f32, 2},
2718  {ISD::SMIN, MVT::v2i64, 9},
2719  {ISD::UMIN, MVT::v2i64,10},
2720  {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5"
2721  {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8"
2722  {ISD::SMIN, MVT::v8i16, 2},
2723  {ISD::UMIN, MVT::v8i16, 2},
2724  {ISD::SMIN, MVT::v16i8, 3},
2725  {ISD::UMIN, MVT::v16i8, 3},
2726  };
2727 
2728  static const CostTblEntry SSE42CostTblPairWise[] = {
2729  {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8"
2730  {ISD::UMIN, MVT::v2i64, 8}, // The data reported by the IACA is "8.6"
2731  };
2732 
2733  static const CostTblEntry AVX1CostTblPairWise[] = {
2734  {ISD::FMINNUM, MVT::v4f32, 1},
2735  {ISD::FMINNUM, MVT::v4f64, 1},
2736  {ISD::FMINNUM, MVT::v8f32, 2},
2737  {ISD::SMIN, MVT::v2i64, 3},
2738  {ISD::UMIN, MVT::v2i64, 3},
2739  {ISD::SMIN, MVT::v4i32, 1},
2740  {ISD::UMIN, MVT::v4i32, 1},
2741  {ISD::SMIN, MVT::v8i16, 1},
2742  {ISD::UMIN, MVT::v8i16, 1},
2743  {ISD::SMIN, MVT::v16i8, 2},
2744  {ISD::UMIN, MVT::v16i8, 2},
2745  {ISD::SMIN, MVT::v4i64, 7},
2746  {ISD::UMIN, MVT::v4i64, 7},
2747  {ISD::SMIN, MVT::v8i32, 3},
2748  {ISD::UMIN, MVT::v8i32, 3},
2749  {ISD::SMIN, MVT::v16i16, 3},
2750  {ISD::UMIN, MVT::v16i16, 3},
2751  {ISD::SMIN, MVT::v32i8, 3},
2752  {ISD::UMIN, MVT::v32i8, 3},
2753  };
2754 
2755  static const CostTblEntry AVX2CostTblPairWise[] = {
2756  {ISD::SMIN, MVT::v4i64, 2},
2757  {ISD::UMIN, MVT::v4i64, 2},
2758  {ISD::SMIN, MVT::v8i32, 1},
2759  {ISD::UMIN, MVT::v8i32, 1},
2760  {ISD::SMIN, MVT::v16i16, 1},
2761  {ISD::UMIN, MVT::v16i16, 1},
2762  {ISD::SMIN, MVT::v32i8, 2},
2763  {ISD::UMIN, MVT::v32i8, 2},
2764  };
2765 
2766  static const CostTblEntry AVX512CostTblPairWise[] = {
2767  {ISD::FMINNUM, MVT::v8f64, 1},
2768  {ISD::FMINNUM, MVT::v16f32, 2},
2769  {ISD::SMIN, MVT::v8i64, 2},
2770  {ISD::UMIN, MVT::v8i64, 2},
2771  {ISD::SMIN, MVT::v16i32, 1},
2772  {ISD::UMIN, MVT::v16i32, 1},
2773  };
2774 
2775  static const CostTblEntry SSE1CostTblNoPairWise[] = {
2776  {ISD::FMINNUM, MVT::v4f32, 4},
2777  };
2778 
2779  static const CostTblEntry SSE2CostTblNoPairWise[] = {
2780  {ISD::FMINNUM, MVT::v2f64, 3},
2781  {ISD::SMIN, MVT::v2i64, 6},
2782  {ISD::UMIN, MVT::v2i64, 8},
2783  {ISD::SMIN, MVT::v4i32, 6},
2784  {ISD::UMIN, MVT::v4i32, 8},
2785  {ISD::SMIN, MVT::v8i16, 4},
2786  {ISD::UMIN, MVT::v8i16, 6},
2787  {ISD::SMIN, MVT::v16i8, 8},
2788  {ISD::UMIN, MVT::v16i8, 6},
2789  };
2790 
2791  static const CostTblEntry SSE41CostTblNoPairWise[] = {
2792  {ISD::FMINNUM, MVT::v4f32, 3},
2793  {ISD::SMIN, MVT::v2i64, 9},
2794  {ISD::UMIN, MVT::v2i64,11},
2795  {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5"
2796  {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8"
2797  {ISD::SMIN, MVT::v8i16, 1}, // The data reported by the IACA is "1.5"
2798  {ISD::UMIN, MVT::v8i16, 2}, // The data reported by the IACA is "1.8"
2799  {ISD::SMIN, MVT::v16i8, 3},
2800  {ISD::UMIN, MVT::v16i8, 3},
2801  };
2802 
2803  static const CostTblEntry SSE42CostTblNoPairWise[] = {
2804  {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8"
2805  {ISD::UMIN, MVT::v2i64, 9}, // The data reported by the IACA is "8.6"
2806  };
2807 
2808  static const CostTblEntry AVX1CostTblNoPairWise[] = {
2809  {ISD::FMINNUM, MVT::v4f32, 1},
2810  {ISD::FMINNUM, MVT::v4f64, 1},
2811  {ISD::FMINNUM, MVT::v8f32, 1},
2812  {ISD::SMIN, MVT::v2i64, 3},
2813  {ISD::UMIN, MVT::v2i64, 3},
2814  {ISD::SMIN, MVT::v4i32, 1},
2815  {ISD::UMIN, MVT::v4i32, 1},
2816  {ISD::SMIN, MVT::v8i16, 1},
2817  {ISD::UMIN, MVT::v8i16, 1},
2818  {ISD::SMIN, MVT::v16i8, 2},
2819  {ISD::UMIN, MVT::v16i8, 2},
2820  {ISD::SMIN, MVT::v4i64, 7},
2821  {ISD::UMIN, MVT::v4i64, 7},
2822  {ISD::SMIN, MVT::v8i32, 2},
2823  {ISD::UMIN, MVT::v8i32, 2},
2824  {ISD::SMIN, MVT::v16i16, 2},
2825  {ISD::UMIN, MVT::v16i16, 2},
2826  {ISD::SMIN, MVT::v32i8, 2},
2827  {ISD::UMIN, MVT::v32i8, 2},
2828  };
2829 
2830  static const CostTblEntry AVX2CostTblNoPairWise[] = {
2831  {ISD::SMIN, MVT::v4i64, 1},
2832  {ISD::UMIN, MVT::v4i64, 1},
2833  {ISD::SMIN, MVT::v8i32, 1},
2834  {ISD::UMIN, MVT::v8i32, 1},
2835  {ISD::SMIN, MVT::v16i16, 1},
2836  {ISD::UMIN, MVT::v16i16, 1},
2837  {ISD::SMIN, MVT::v32i8, 1},
2838  {ISD::UMIN, MVT::v32i8, 1},
2839  };
2840 
2841  static const CostTblEntry AVX512CostTblNoPairWise[] = {
2842  {ISD::FMINNUM, MVT::v8f64, 1},
2843  {ISD::FMINNUM, MVT::v16f32, 2},
2844  {ISD::SMIN, MVT::v8i64, 1},
2845  {ISD::UMIN, MVT::v8i64, 1},
2846  {ISD::SMIN, MVT::v16i32, 1},
2847  {ISD::UMIN, MVT::v16i32, 1},
2848  };
2849 
2850  if (IsPairwise) {
2851  if (ST->hasAVX512())
2852  if (const auto *Entry = CostTableLookup(AVX512CostTblPairWise, ISD, MTy))
2853  return LT.first * Entry->Cost;
2854 
2855  if (ST->hasAVX2())
2856  if (const auto *Entry = CostTableLookup(AVX2CostTblPairWise, ISD, MTy))
2857  return LT.first * Entry->Cost;
2858 
2859  if (ST->hasAVX())
2860  if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
2861  return LT.first * Entry->Cost;
2862 
2863  if (ST->hasSSE42())
2864  if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
2865  return LT.first * Entry->Cost;
2866 
2867  if (ST->hasSSE41())
2868  if (const auto *Entry = CostTableLookup(SSE41CostTblPairWise, ISD, MTy))
2869  return LT.first * Entry->Cost;
2870 
2871  if (ST->hasSSE2())
2872  if (const auto *Entry = CostTableLookup(SSE2CostTblPairWise, ISD, MTy))
2873  return LT.first * Entry->Cost;
2874 
2875  if (ST->hasSSE1())
2876  if (const auto *Entry = CostTableLookup(SSE1CostTblPairWise, ISD, MTy))
2877  return LT.first * Entry->Cost;
2878  } else {
2879  if (ST->hasAVX512())
2880  if (const auto *Entry =
2881  CostTableLookup(AVX512CostTblNoPairWise, ISD, MTy))
2882  return LT.first * Entry->Cost;
2883 
2884  if (ST->hasAVX2())
2885  if (const auto *Entry = CostTableLookup(AVX2CostTblNoPairWise, ISD, MTy))
2886  return LT.first * Entry->Cost;
2887 
2888  if (ST->hasAVX())
2889  if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
2890  return LT.first * Entry->Cost;
2891 
2892  if (ST->hasSSE42())
2893  if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
2894  return LT.first * Entry->Cost;
2895 
2896  if (ST->hasSSE41())
2897  if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy))
2898  return LT.first * Entry->Cost;
2899 
2900  if (ST->hasSSE2())
2901  if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
2902  return LT.first * Entry->Cost;
2903 
2904  if (ST->hasSSE1())
2905  if (const auto *Entry = CostTableLookup(SSE1CostTblNoPairWise, ISD, MTy))
2906  return LT.first * Entry->Cost;
2907  }
2908 
2909  return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned);
2910 }
2911 
2912 /// Calculate the cost of materializing a 64-bit value. This helper
2913 /// method might only calculate a fraction of a larger immediate. Therefore it
2914 /// is valid to return a cost of ZERO.
2915 int X86TTIImpl::getIntImmCost(int64_t Val) {
2916  if (Val == 0)
2917  return TTI::TCC_Free;
2918 
2919  if (isInt<32>(Val))
2920  return TTI::TCC_Basic;
2921 
2922  return 2 * TTI::TCC_Basic;
2923 }
2924 
2925 int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
2926  assert(Ty->isIntegerTy());
2927 
2928  unsigned BitSize = Ty->getPrimitiveSizeInBits();
2929  if (BitSize == 0)
2930  return ~0U;
2931 
2932  // Never hoist constants larger than 128bit, because this might lead to
2933  // incorrect code generation or assertions in codegen.
2934  // Fixme: Create a cost model for types larger than i128 once the codegen
2935  // issues have been fixed.
2936  if (BitSize > 128)
2937  return TTI::TCC_Free;
2938 
2939  if (Imm == 0)
2940  return TTI::TCC_Free;
2941 
2942  // Sign-extend all constants to a multiple of 64-bit.
2943  APInt ImmVal = Imm;
2944  if (BitSize % 64 != 0)
2945  ImmVal = Imm.sext(alignTo(BitSize, 64));
2946 
2947  // Split the constant into 64-bit chunks and calculate the cost for each
2948  // chunk.
2949  int Cost = 0;
2950  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
2951  APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
2952  int64_t Val = Tmp.getSExtValue();
2953  Cost += getIntImmCost(Val);
2954  }
2955  // We need at least one instruction to materialize the constant.
2956  return std::max(1, Cost);
2957 }
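The splitting above means, for example, that an arbitrary 128-bit constant is charged for two 64-bit chunks while something like (i128 1) costs a single basic unit. A small sketch of the chunked estimate, with the per-chunk rule from getIntImmCost(int64_t) inlined:

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Per-chunk rule: zero is free, a value that fits a signed 32-bit
    // immediate costs one basic unit, anything else (movabs-style) costs two.
    int chunkCost(int64_t Chunk) {
      if (Chunk == 0)
        return 0;
      if (Chunk >= INT32_MIN && Chunk <= INT32_MAX)
        return 1;
      return 2;
    }

    // Sum over the 64-bit chunks of the sign-extended constant, charging at
    // least one instruction to materialize it.
    int immMaterializationCost(const std::vector<int64_t> &Chunks) {
      int Cost = 0;
      for (int64_t C : Chunks)
        Cost += chunkCost(C);
      return std::max(1, Cost);
    }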
2958 
2959 int X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
2960  Type *Ty) {
2961  assert(Ty->isIntegerTy());
2962 
2963  unsigned BitSize = Ty->getPrimitiveSizeInBits();
2964  // There is no cost model for constants with a bit size of 0. Return TCC_Free
2965  // here, so that constant hoisting will ignore this constant.
2966  if (BitSize == 0)
2967  return TTI::TCC_Free;
2968 
2969  unsigned ImmIdx = ~0U;
2970  switch (Opcode) {
2971  default:
2972  return TTI::TCC_Free;
2973  case Instruction::GetElementPtr:
2974  // Always hoist the base address of a GetElementPtr. This prevents the
2975  // creation of new constants for every base constant that gets constant
2976  // folded with the offset.
2977  if (Idx == 0)
2978  return 2 * TTI::TCC_Basic;
2979  return TTI::TCC_Free;
2980  case Instruction::Store:
2981  ImmIdx = 0;
2982  break;
2983  case Instruction::ICmp:
2984  // This is an imperfect hack to prevent constant hoisting of
2985  // compares that might be trying to check if a 64-bit value fits in
2986  // 32-bits. The backend can optimize these cases using a right shift by 32.
2987  // Ideally we would check the compare predicate here. There are also other
2988  // similar immediates the backend can use shifts for.
2989  if (Idx == 1 && Imm.getBitWidth() == 64) {
2990  uint64_t ImmVal = Imm.getZExtValue();
2991  if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
2992  return TTI::TCC_Free;
2993  }
2994  ImmIdx = 1;
2995  break;
2996  case Instruction::And:
2997  // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
2998  // by using a 32-bit operation with implicit zero extension. Detect such
2999  // immediates here as the normal path expects bit 31 to be sign extended.
3000  if (Idx == 1 && Imm.getBitWidth() == 64 && isUInt<32>(Imm.getZExtValue()))
3001  return TTI::TCC_Free;
3002  ImmIdx = 1;
3003  break;
3004  case Instruction::Add:
3005  case Instruction::Sub:
3006  // For add/sub, we can use the opposite instruction for INT32_MIN.
3007  if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.getZExtValue() == 0x80000000)
3008  return TTI::TCC_Free;
3009  ImmIdx = 1;
3010  break;
3011  case Instruction::UDiv:
3012  case Instruction::SDiv:
3013  case Instruction::URem:
3014  case Instruction::SRem:
3015  // Division by constant is typically expanded later into a different
3016  // instruction sequence. This completely changes the constants.
3017  // Report them as "free" to stop ConstantHoist from marking them as opaque.
3018  return TTI::TCC_Free;
3019  case Instruction::Mul:
3020  case Instruction::Or:
3021  case Instruction::Xor:
3022  ImmIdx = 1;
3023  break;
3024  // Always return TCC_Free for the shift value of a shift instruction.
3025  case Instruction::Shl:
3026  case Instruction::LShr:
3027  case Instruction::AShr:
3028  if (Idx == 1)
3029  return TTI::TCC_Free;
3030  break;
3031  case Instruction::Trunc:
3032  case Instruction::ZExt:
3033  case Instruction::SExt:
3034  case Instruction::IntToPtr:
3035  case Instruction::PtrToInt:
3036  case Instruction::BitCast:
3037  case Instruction::PHI:
3038  case Instruction::Call:
3039  case Instruction::Select:
3040  case Instruction::Ret:
3041  case Instruction::Load:
3042  break;
3043  }
3044 
3045  if (Idx == ImmIdx) {
3046  int NumConstants = divideCeil(BitSize, 64);
3047  int Cost = X86TTIImpl::getIntImmCost(Imm, Ty);
3048  return (Cost <= NumConstants * TTI::TCC_Basic)
3049  ? static_cast<int>(TTI::TCC_Free)
3050  : Cost;
3051  }
3052 
3053  return X86TTIImpl::getIntImmCost(Imm, Ty);
3054 }
3055 
3056 int X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
3057  Type *Ty) {
3058  assert(Ty->isIntegerTy());
3059 
3060  unsigned BitSize = Ty->getPrimitiveSizeInBits();
3061  // There is no cost model for constants with a bit size of 0. Return TCC_Free
3062  // here, so that constant hoisting will ignore this constant.
3063  if (BitSize == 0)
3064  return TTI::TCC_Free;
3065 
3066  switch (IID) {
3067  default:
3068  return TTI::TCC_Free;
3069  case Intrinsic::sadd_with_overflow:
3070  case Intrinsic::uadd_with_overflow:
3071  case Intrinsic::ssub_with_overflow:
3072  case Intrinsic::usub_with_overflow:
3073  case Intrinsic::smul_with_overflow:
3074  case Intrinsic::umul_with_overflow:
3075  if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
3076  return TTI::TCC_Free;
3077  break;
3078  case Intrinsic::experimental_stackmap:
3079  if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
3080  return TTI::TCC_Free;
3081  break;
3082  case Intrinsic::experimental_patchpoint_void:
3083  case Intrinsic::experimental_patchpoint_i64:
3084  if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
3085  return TTI::TCC_Free;
3086  break;
3087  }
3088  return X86TTIImpl::getIntImmCost(Imm, Ty);
3089 }
3090 
3091 unsigned X86TTIImpl::getUserCost(const User *U,
3092                                  ArrayRef<const Value *> Operands) {
3093  if (isa<StoreInst>(U)) {
3094  Value *Ptr = U->getOperand(1);
3095  // Store instruction with index and scale costs 2 Uops.
3096  // Check the preceding GEP to identify non-const indices.
3097  if (auto GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
3098  if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
3099  return TTI::TCC_Basic * 2;
3100  }
3101  return TTI::TCC_Basic;
3102  }
3103  return BaseT::getUserCost(U, Operands);
3104 }
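The store special case above charges two basic units when the address comes from a GEP with any non-constant index, since the store then needs an indexed/scaled addressing mode. A distilled sketch (TCC_Basic assumed to be 1 here):

    // Stores addressed through a GEP with a variable index cost two basic
    // units; everything else costs one.
    int storeUserCost(bool AddressIsGEP, bool AllGEPIndicesAreConstant) {
      const int TCC_Basic = 1;
      if (AddressIsGEP && !AllGEPIndicesAreConstant)
        return 2 * TCC_Basic;
      return TCC_Basic;
    }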
3105 
3106 // Return an average cost of Gather / Scatter instruction, maybe improved later
3107 int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
3108  unsigned Alignment, unsigned AddressSpace) {
3109 
3110  assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
3111  unsigned VF = SrcVTy->getVectorNumElements();
3112 
3113  // Try to reduce index size from 64 bit (default for GEP)
3114  // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
3115  // operation will use 16 x 64-bit indices, which do not fit in a zmm register
3116  // and need to be split. Also check that the base pointer is the same for all lanes,
3117  // and that there's at most one variable index.
3118  auto getIndexSizeInBits = [](Value *Ptr, const DataLayout& DL) {
3119  unsigned IndexSize = DL.getPointerSizeInBits();
3120  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
3121  if (IndexSize < 64 || !GEP)
3122  return IndexSize;
3123 
3124  unsigned NumOfVarIndices = 0;
3125  Value *Ptrs = GEP->getPointerOperand();
3126  if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
3127  return IndexSize;
3128  for (unsigned i = 1; i < GEP->getNumOperands(); ++i) {
3129  if (isa<Constant>(GEP->getOperand(i)))
3130  continue;
3131  Type *IndxTy = GEP->getOperand(i)->getType();
3132  if (IndxTy->isVectorTy())
3133  IndxTy = IndxTy->getVectorElementType();
3134  if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
3135  !isa<SExtInst>(GEP->getOperand(i))) ||
3136  ++NumOfVarIndices > 1)
3137  return IndexSize; // 64
3138  }
3139  return (unsigned)32;
3140  };
3141 
3142 
3143  // Trying to reduce IndexSize to 32 bits for vector 16.
3144  // By default the IndexSize is equal to pointer size.
3145  unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
3146  ? getIndexSizeInBits(Ptr, DL)
3147                           : DL.getPointerSizeInBits();
3148 
3149  Type *IndexVTy = VectorType::get(IntegerType::get(SrcVTy->getContext(),
3150  IndexSize), VF);
3151  std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy);
3152  std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy);
3153  int SplitFactor = std::max(IdxsLT.first, SrcLT.first);
3154  if (SplitFactor > 1) {
3155  // Handle splitting of vector of pointers
3156  Type *SplitSrcTy = VectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
3157  return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment,
3158  AddressSpace);
3159  }
3160 
3161  // The gather / scatter cost is given by Intel architects. It is a rough
3162  // number since we are looking at one instruction at a time.
3163  const int GSOverhead = (Opcode == Instruction::Load)
3164  ? ST->getGatherOverhead()
3165  : ST->getScatterOverhead();
3166  return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
3167  Alignment, AddressSpace);
3168 }
3169 
3170 /// Return the cost of full scalarization of gather / scatter operation.
3171 ///
3172 /// Opcode - Load or Store instruction.
3173 /// SrcVTy - The type of the data vector that should be gathered or scattered.
3174 /// VariableMask - The mask is non-constant at compile time.
3175 /// Alignment - Alignment for one element.
3176 /// AddressSpace - pointer[s] address space.
3177 ///
3178 int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
3179  bool VariableMask, unsigned Alignment,
3180  unsigned AddressSpace) {
3181  unsigned VF = SrcVTy->getVectorNumElements();
3182 
3183  int MaskUnpackCost = 0;
3184  if (VariableMask) {
3185  VectorType *MaskTy =
3186  VectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
3187  MaskUnpackCost = getScalarizationOverhead(MaskTy, false, true);
3188  int ScalarCompareCost =
3189  getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()),
3190  nullptr);
3191  int BranchCost = getCFInstrCost(Instruction::Br);
3192  MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
3193  }
3194 
3195  // The cost of the scalar loads/stores.
3196  int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
3197  Alignment, AddressSpace);
3198 
3199  int InsertExtractCost = 0;
3200  if (Opcode == Instruction::Load)
3201  for (unsigned i = 0; i < VF; ++i)
3202  // Add the cost of inserting each scalar load into the vector
3203  InsertExtractCost +=
3204  getVectorInstrCost(Instruction::InsertElement, SrcVTy, i);
3205  else
3206  for (unsigned i = 0; i < VF; ++i)
3207  // Add the cost of extracting each element out of the data vector
3208  InsertExtractCost +=
3209  getVectorInstrCost(Instruction::ExtractElement, SrcVTy, i);
3210 
3211  return MemoryOpCost + MaskUnpackCost + InsertExtractCost;
3212 }
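// For instance, a scalarized v8f64 gather with a variable mask adds up to
// 8 scalar loads, 8 vector inserts, 8 scalar compares and 8 branches, plus the
// overhead of extracting the 8 mask bits.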
3213 
3214 /// Calculate the cost of Gather / Scatter operation
3215 int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
3216  Value *Ptr, bool VariableMask,
3217  unsigned Alignment) {
3218  assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
3219  unsigned VF = SrcVTy->getVectorNumElements();
3220  PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
3221  if (!PtrTy && Ptr->getType()->isVectorTy())
3222  PtrTy = dyn_cast<PointerType>(Ptr->getType()->getVectorElementType());
3223  assert(PtrTy && "Unexpected type for Ptr argument");
3224  unsigned AddressSpace = PtrTy->getAddressSpace();
3225 
3226  bool Scalarize = false;
3227  if ((Opcode == Instruction::Load && !isLegalMaskedGather(SrcVTy)) ||
3228  (Opcode == Instruction::Store && !isLegalMaskedScatter(SrcVTy)))
3229  Scalarize = true;
3230  // Gather / Scatter for vectors of 2 elements is not profitable on KNL / SKX.
3231  // A vector-4 variant of the gather/scatter instruction does not exist on KNL.
3232  // We can extend it to 8 elements, but zeroing the upper bits of
3233  // the mask vector will add more instructions. Right now we give the scalar
3234  // cost of vector-4 for KNL. TODO: Check whether the gather/scatter instruction
3235  // is better in the VariableMask case.
3236  if (ST->hasAVX512() && (VF == 2 || (VF == 4 && !ST->hasVLX())))
3237  Scalarize = true;
3238 
3239  if (Scalarize)
3240  return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment,
3241  AddressSpace);
3242 
3243  return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
3244 }
3245 
3246 bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
3247  TargetTransformInfo::LSRCost &C2) {
3248  // X86-specific: the number of instructions has first priority.
3249  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
3250  C1.NumIVMuls, C1.NumBaseAdds,
3251  C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
3252  std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
3253  C2.NumIVMuls, C2.NumBaseAdds,
3254  C2.ScaleCost, C2.ImmCost, C2.SetupCost);
3255 }
3256 
3257 bool X86TTIImpl::canMacroFuseCmp() {
3258  return ST->hasMacroFusion() || ST->hasBranchFusion();
3259 }
3260 
3261 bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment) {
3262  if (!ST->hasAVX())
3263  return false;
3264 
3265  // The backend can't handle a single element vector.
3266  if (isa<VectorType>(DataTy) && DataTy->getVectorNumElements() == 1)
3267  return false;
3268  Type *ScalarTy = DataTy->getScalarType();
3269 
3270  if (ScalarTy->isPointerTy())
3271  return true;
3272 
3273  if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
3274  return true;
3275 
3276  if (!ScalarTy->isIntegerTy())
3277  return false;
3278 
3279  unsigned IntWidth = ScalarTy->getIntegerBitWidth();
3280  return IntWidth == 32 || IntWidth == 64 ||
3281  ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
3282 }
3283 
3284 bool X86TTIImpl::isLegalMaskedStore(Type *DataType, MaybeAlign Alignment) {
3285  return isLegalMaskedLoad(DataType, Alignment);
3286 }
3287 
3288 bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) {
3289  unsigned DataSize = DL.getTypeStoreSize(DataType);
3290  // The only supported nontemporal loads are for aligned vectors of 16 or 32
3291  // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2
3292  // (the equivalent stores only require AVX).
3293  if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
3294  return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2();
3295 
3296  return false;
3297 }
3298 
3299 bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) {
3300  unsigned DataSize = DL.getTypeStoreSize(DataType);
3301 
3302  // SSE4A supports nontemporal stores of float and double at arbitrary
3303  // alignment.
3304  if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
3305  return true;
3306 
3307  // Besides the SSE4A subtarget exception above, only aligned stores are
3308  // available nontemporally on any other subtarget. And only stores with a size
3309  // of 4..32 bytes (powers of 2 only) are permitted.
3310  if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
3311  !isPowerOf2_32(DataSize))
3312  return false;
3313 
3314  // 32-byte vector nontemporal stores are supported by AVX (the equivalent
3315  // loads require AVX2).
3316  if (DataSize == 32)
3317  return ST->hasAVX();
3318  else if (DataSize == 16)
3319  return ST->hasSSE1();
3320  return true;
3321 }
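// For example, a 16-byte aligned <4 x float> nontemporal store only needs SSE1,
// a 32-byte aligned <8 x float> store needs AVX, and any store that is
// under-aligned or larger than 32 bytes is rejected here.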
3322 
3323 bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) {
3324  if (!isa<VectorType>(DataTy))
3325  return false;
3326 
3327  if (!ST->hasAVX512())
3328  return false;
3329 
3330  // The backend can't handle a single element vector.
3331  if (DataTy->getVectorNumElements() == 1)
3332  return false;
3333 
3334  Type *ScalarTy = DataTy->getVectorElementType();
3335 
3336  if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
3337  return true;
3338 
3339  if (!ScalarTy->isIntegerTy())
3340  return false;
3341 
3342  unsigned IntWidth = ScalarTy->getIntegerBitWidth();
3343  return IntWidth == 32 || IntWidth == 64 ||
3344  ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
3345 }
3346 
3347 bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy) {
3348  return isLegalMaskedExpandLoad(DataTy);
3349 }
3350 
3351 bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
3352  // Some CPUs have better gather performance than others.
3353  // TODO: Remove the explicit ST->hasAVX512()? That would mean we would only
3354  // enable gather with an appropriate -march.
3355  if (!(ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2())))
3356  return false;
3357 
3358  // This function is currently called in two cases: from the Loop Vectorizer
3359  // and from the Scalarizer.
3360  // When the Loop Vectorizer asks about legality of the feature,
3361  // the vectorization factor is not calculated yet. The Loop Vectorizer
3362  // sends a scalar type and the decision is based on the width of the
3363  // scalar element.
3364  // Later on, the cost model will estimate the cost of this intrinsic based on
3365  // the vector type.
3366  // The Scalarizer asks again about legality. It sends a vector type.
3367  // In this case we can reject non-power-of-2 vectors.
3368  // We also reject single element vectors as the type legalizer can't
3369  // scalarize them.
3370  if (isa<VectorType>(DataTy)) {
3371  unsigned NumElts = DataTy->getVectorNumElements();
3372  if (NumElts == 1 || !isPowerOf2_32(NumElts))
3373  return false;
3374  }
3375  Type *ScalarTy = DataTy->getScalarType();
3376  if (ScalarTy->isPointerTy())
3377  return true;
3378 
3379  if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
3380  return true;
3381 
3382  if (!ScalarTy->isIntegerTy())
3383  return false;
3384 
3385  unsigned IntWidth = ScalarTy->getIntegerBitWidth();
3386  return IntWidth == 32 || IntWidth == 64;
3387 }
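// For example, <8 x i32>, <8 x float> and vectors of pointers are accepted here
// (given AVX-512, or AVX2 with fast gather), while <1 x i64>, <3 x i32> and
// 16-bit element types are rejected.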
3388 
3389 bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) {
3390  // AVX2 doesn't support scatter.
3391  if (!ST->hasAVX512())
3392  return false;
3393  return isLegalMaskedGather(DataType);
3394 }
3395 
3396 bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
3397  EVT VT = TLI->getValueType(DL, DataType);
3398  return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
3399 }
3400 
3401 bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
3402  return false;
3403 }
3404 
3405 bool X86TTIImpl::areInlineCompatible(const Function *Caller,
3406  const Function *Callee) const {
3407  const TargetMachine &TM = getTLI()->getTargetMachine();
3408 
3409  // Work this as a subsetting of subtarget features.
3410  const FeatureBitset &CallerBits =
3411  TM.getSubtargetImpl(*Caller)->getFeatureBits();
3412  const FeatureBitset &CalleeBits =
3413  TM.getSubtargetImpl(*Callee)->getFeatureBits();
3414 
3415  FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
3416  FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
3417  return (RealCallerBits & RealCalleeBits) == RealCalleeBits;
3418 }
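// For example, a callee compiled with +avx2 cannot be inlined into a caller
// that only has +sse2, since the callee's (non-ignored) feature bits are not a
// subset of the caller's.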
3419 
3420 bool X86TTIImpl::areFunctionArgsABICompatible(
3421  const Function *Caller, const Function *Callee,
3422  SmallPtrSetImpl<Argument *> &Args) const {
3423  if (!BaseT::areFunctionArgsABICompatible(Caller, Callee, Args))
3424  return false;
3425 
3426  // If we get here, we know the target features match. If one function
3427  // considers 512-bit vectors legal and the other does not, consider them
3428  // incompatible.
3429  // FIXME Look at the arguments and only consider 512 bit or larger vectors?
3430  const TargetMachine &TM = getTLI()->getTargetMachine();
3431 
3432  return TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
3433  TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs();
3434 }
3435 
3436 X86TTIImpl::TTI::MemCmpExpansionOptions
3437 X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
3438  TTI::MemCmpExpansionOptions Options;
3439  Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
3440  Options.NumLoadsPerBlock = 2;
3441  if (IsZeroCmp) {
3442  // Only enable vector loads for equality comparison. Right now the vector
3443  // version is not as fast for three way compare (see #33329).
3444  const unsigned PreferredWidth = ST->getPreferVectorWidth();
3445  if (PreferredWidth >= 512 && ST->hasAVX512()) Options.LoadSizes.push_back(64);
3446  if (PreferredWidth >= 256 && ST->hasAVX2()) Options.LoadSizes.push_back(32);
3447  if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
3448  // All GPR and vector loads can be unaligned. SIMD compare requires integer
3449  // vectors (SSE2/AVX2).
3450  Options.AllowOverlappingLoads = true;
3451  }
3452  if (ST->is64Bit()) {
3453  Options.LoadSizes.push_back(8);
3454  }
3455  Options.LoadSizes.push_back(4);
3456  Options.LoadSizes.push_back(2);
3457  Options.LoadSizes.push_back(1);
3458  return Options;
3459 }
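// For example, on a 64-bit AVX-512 target with a 512-bit preferred vector
// width, an equality memcmp may use load sizes {64, 32, 16, 8, 4, 2, 1} with
// overlapping loads allowed, while a three-way memcmp only gets {8, 4, 2, 1}.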
3460 
3461 bool X86TTIImpl::enableInterleavedAccessVectorization() {
3462  // TODO: We expect this to be beneficial regardless of arch,
3463  // but there are currently some unexplained performance artifacts on Atom.
3464  // As a temporary solution, disable on Atom.
3465  return !(ST->isAtom());
3466 }
3467 
3468 // Get estimation for interleaved load/store operations for AVX2.
3469 // \p Factor is the interleaved-access factor (stride) - number of
3470 // (interleaved) elements in the group.
3471 // \p Indices contains the indices for a strided load: when the
3472 // interleaved load has gaps they indicate which elements are used.
3473 // If Indices is empty (or if the number of indices is equal to the size
3474 // of the interleaved-access as given in \p Factor) the access has no gaps.
3475 //
3476 // As opposed to AVX-512, AVX2 does not have generic shuffles that allow
3477 // computing the cost using a generic formula as a function of generic
3478 // shuffles. We therefore use a lookup table instead, filled according to
3479 // the instruction sequences that codegen currently generates.
3480 int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
3481  unsigned Factor,
3482  ArrayRef<unsigned> Indices,
3483  unsigned Alignment,
3484  unsigned AddressSpace,
3485  bool UseMaskForCond,
3486  bool UseMaskForGaps) {
3487 
3488  if (UseMaskForCond || UseMaskForGaps)
3489  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3490  Alignment, AddressSpace,
3491  UseMaskForCond, UseMaskForGaps);
3492 
3493  // We currently support only fully-interleaved groups, with no gaps.
3494  // TODO: Support also strided loads (interleaved-groups with gaps).
3495  if (Indices.size() && Indices.size() != Factor)
3496  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3497  Alignment, AddressSpace);
3498 
3499  // VecTy for interleave memop is <VF*Factor x Elt>.
3500  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
3501  // VecTy = <12 x i32>.
3502  MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
3503 
3504  // This function can be called with VecTy=<6xi128>, Factor=3, in which case
3505  // the VF=2, while v2i128 is an unsupported MVT vector type
3506  // (see MachineValueType.h::getVectorVT()).
3507  if (!LegalVT.isVector())
3508  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3509  Alignment, AddressSpace);
3510 
3511  unsigned VF = VecTy->getVectorNumElements() / Factor;
3512  Type *ScalarTy = VecTy->getVectorElementType();
3513 
3514  // Calculate the number of memory operations (NumOfMemOps), required
3515  // for load/store the VecTy.
3516  unsigned VecTySize = DL.getTypeStoreSize(VecTy);
3517  unsigned LegalVTSize = LegalVT.getStoreSize();
3518  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
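  // E.g. VecTy = <12 x i32> on AVX2 legalizes to v8i32 halves, so VecTySize = 48,
  // LegalVTSize = 32 and NumOfMemOps = 2.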
3519 
3520  // Get the cost of one memory operation.
3521  Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
3522  LegalVT.getVectorNumElements());
3523  unsigned MemOpCost =
3524  getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
3525 
3526  VectorType *VT = VectorType::get(ScalarTy, VF);
3527  EVT ETy = TLI->getValueType(DL, VT);
3528  if (!ETy.isSimple())
3529  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3530  Alignment, AddressSpace);
3531 
3532  // TODO: Complete for other data-types and strides.
3533  // Each combination of Stride, ElementTy and VF results in a different
3534  // sequence; the cost tables are therefore accessed with:
3535  // Factor (stride) and VectorType=VFxElemType.
3536  // The Cost accounts only for the shuffle sequence;
3537  // the cost of the loads/stores is accounted for separately.
3538  //
3539  static const CostTblEntry AVX2InterleavedLoadTbl[] = {
3540  { 2, MVT::v4i64, 6 }, //(load 8i64 and) deinterleave into 2 x 4i64
3541  { 2, MVT::v4f64, 6 }, //(load 8f64 and) deinterleave into 2 x 4f64
3542 
3543  { 3, MVT::v2i8, 10 }, //(load 6i8 and) deinterleave into 3 x 2i8
3544  { 3, MVT::v4i8, 4 }, //(load 12i8 and) deinterleave into 3 x 4i8
3545  { 3, MVT::v8i8, 9 }, //(load 24i8 and) deinterleave into 3 x 8i8
3546  { 3, MVT::v16i8, 11}, //(load 48i8 and) deinterleave into 3 x 16i8
3547  { 3, MVT::v32i8, 13}, //(load 96i8 and) deinterleave into 3 x 32i8
3548  { 3, MVT::v8f32, 17 }, //(load 24f32 and)deinterleave into 3 x 8f32
3549 
3550  { 4, MVT::v2i8, 12 }, //(load 8i8 and) deinterleave into 4 x 2i8
3551  { 4, MVT::v4i8, 4 }, //(load 16i8 and) deinterleave into 4 x 4i8
3552  { 4, MVT::v8i8, 20 }, //(load 32i8 and) deinterleave into 4 x 8i8
3553  { 4, MVT::v16i8, 39 }, //(load 64i8 and) deinterleave into 4 x 16i8
3554  { 4, MVT::v32i8, 80 }, //(load 128i8 and) deinterleave into 4 x 32i8
3555 
3556  { 8, MVT::v8f32, 40 } //(load 64f32 and)deinterleave into 8 x 8f32
3557  };
3558 
3559  static const CostTblEntry AVX2InterleavedStoreTbl[] = {
3560  { 2, MVT::v4i64, 6 }, //interleave into 2 x 4i64 into 8i64 (and store)
3561  { 2, MVT::v4f64, 6 }, //interleave into 2 x 4f64 into 8f64 (and store)
3562 
3563  { 3, MVT::v2i8, 7 }, //interleave 3 x 2i8 into 6i8 (and store)
3564  { 3, MVT::v4i8, 8 }, //interleave 3 x 4i8 into 12i8 (and store)
3565  { 3, MVT::v8i8, 11 }, //interleave 3 x 8i8 into 24i8 (and store)
3566  { 3, MVT::v16i8, 11 }, //interleave 3 x 16i8 into 48i8 (and store)
3567  { 3, MVT::v32i8, 13 }, //interleave 3 x 32i8 into 96i8 (and store)
3568 
3569  { 4, MVT::v2i8, 12 }, //interleave 4 x 2i8 into 8i8 (and store)
3570  { 4, MVT::v4i8, 9 }, //interleave 4 x 4i8 into 16i8 (and store)
3571  { 4, MVT::v8i8, 10 }, //interleave 4 x 8i8 into 32i8 (and store)
3572  { 4, MVT::v16i8, 10 }, //interleave 4 x 16i8 into 64i8 (and store)
3573  { 4, MVT::v32i8, 12 } //interleave 4 x 32i8 into 128i8 (and store)
3574  };
3575 
3576  if (Opcode == Instruction::Load) {
3577  if (const auto *Entry =
3578  CostTableLookup(AVX2InterleavedLoadTbl, Factor, ETy.getSimpleVT()))
3579  return NumOfMemOps * MemOpCost + Entry->Cost;
3580  } else {
3581  assert(Opcode == Instruction::Store &&
3582  "Expected Store Instruction at this point");
3583  if (const auto *Entry =
3584  CostTableLookup(AVX2InterleavedStoreTbl, Factor, ETy.getSimpleVT()))
3585  return NumOfMemOps * MemOpCost + Entry->Cost;
3586  }
3587 
3588  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3589  Alignment, AddressSpace);
3590 }
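// Example: an interleaved load with Factor = 3 and VecTy = <24 x i8> (VF = 8)
// on AVX2 is costed as one v32i8-sized load plus the table entry of 9 for the
// 3 x v8i8 deinterleaving shuffle sequence.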
3591 
3592 // Get estimation for interleaved load/store operations and strided load.
3593 // \p Indices contains indices for strided load.
3594 // \p Factor - the factor of interleaving.
3595 // AVX-512 provides 3-src shuffles that significantly reduce the cost.
3596 int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
3597  unsigned Factor,
3598  ArrayRef<unsigned> Indices,
3599  unsigned Alignment,
3600  unsigned AddressSpace,
3601  bool UseMaskForCond,
3602  bool UseMaskForGaps) {
3603 
3604  if (UseMaskForCond || UseMaskForGaps)
3605  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3606  Alignment, AddressSpace,
3607  UseMaskForCond, UseMaskForGaps);
3608 
3609  // VecTy for interleave memop is <VF*Factor x Elt>.
3610  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
3611  // VecTy = <12 x i32>.
3612 
3613  // Calculate the number of memory operations (NumOfMemOps), required
3614  // for load/store the VecTy.
3615  MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
3616  unsigned VecTySize = DL.getTypeStoreSize(VecTy);
3617  unsigned LegalVTSize = LegalVT.getStoreSize();
3618  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
3619 
3620  // Get the cost of one memory operation.
3621  Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
3622  LegalVT.getVectorNumElements());
3623  unsigned MemOpCost =
3624  getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
3625 
3626  unsigned VF = VecTy->getVectorNumElements() / Factor;
3627  MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);
3628 
3629  if (Opcode == Instruction::Load) {
3630  // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
3631  // contain the cost of the optimized shuffle sequence that the
3632  // X86InterleavedAccess pass will generate.
3633  // The cost of loads and stores are computed separately from the table.
3634  // The cost of loads and stores is computed separately from the table.
3635  // X86InterleavedAccess supports only the following interleaved-access groups.
3636  static const CostTblEntry AVX512InterleavedLoadTbl[] = {
3637  {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8
3638  {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8
3639  {3, MVT::v64i8, 22}, //(load 192i8 and) deinterleave into 3 x 64i8
3640  };
3641 
3642  if (const auto *Entry =
3643  CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
3644  return NumOfMemOps * MemOpCost + Entry->Cost;
3645  // If an entry does not exist, fall back to the default implementation.
3646 
3647  // The kind of shuffle depends on the number of loaded values.
3648  // If we load the entire data in one register, we can use a 1-src shuffle.
3649  // Otherwise, we'll merge 2 sources in each operation.
3650  TTI::ShuffleKind ShuffleKind =
3651  (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
3652 
3653  unsigned ShuffleCost =
3654  getShuffleCost(ShuffleKind, SingleMemOpTy, 0, nullptr);
3655 
3656  unsigned NumOfLoadsInInterleaveGrp =
3657  Indices.size() ? Indices.size() : Factor;
3658  Type *ResultTy = VectorType::get(VecTy->getVectorElementType(),
3659  VecTy->getVectorNumElements() / Factor);
3660  unsigned NumOfResults =
3661  getTLI()->getTypeLegalizationCost(DL, ResultTy).first *
3662  NumOfLoadsInInterleaveGrp;
3663 
3664  // About half of the loads may be folded in shuffles when we have only
3665  // one result. If we have more than one result, we do not fold loads at all.
3666  unsigned NumOfUnfoldedLoads =
3667  NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
3668 
3669  // Get the number of shuffle operations per result.
3670  unsigned NumOfShufflesPerResult =
3671  std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
3672 
3673  // The SK_PermuteTwoSrc shuffle clobbers one of its source operands.
3674  // When we have more than one destination, we need additional instructions
3675  // to keep sources.
3676  unsigned NumOfMoves = 0;
3677  if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
3678  NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
3679 
3680  int Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
3681  NumOfUnfoldedLoads * MemOpCost + NumOfMoves;
3682 
3683  return Cost;
3684  }
3685 
3686  // Store.
3687  assert(Opcode == Instruction::Store &&
3688  "Expected Store Instruction at this point");
3689  // X86InterleavedAccess supports only the following interleaved-access groups.
3690  static const CostTblEntry AVX512InterleavedStoreTbl[] = {
3691  {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
3692  {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
3693  {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 192i8 (and store)
3694 
3695  {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store)
3696  {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
3697  {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
3698  {4, MVT::v64i8, 24} // interleave 4 x 64i8 into 256i8 (and store)
3699  };
3700 
3701  if (const auto *Entry =
3702  CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
3703  return NumOfMemOps * MemOpCost + Entry->Cost;
3704  // If an entry does not exist, fall back to the default implementation.
3705 
3706  // There are no strided stores at the moment, and a store can't be folded
3707  // into a shuffle.
3708  unsigned NumOfSources = Factor; // The number of values to be merged.
3709  unsigned ShuffleCost =
3710  getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, 0, nullptr);
3711  unsigned NumOfShufflesPerStore = NumOfSources - 1;
3712 
3713  // The SK_PermuteTwoSrc shuffle clobbers one of its source operands.
3714  // We need additional instructions to keep sources.
3715  unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
3716  int Cost = NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
3717  NumOfMoves;
3718  return Cost;
3719 }
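// In the generic store path above, a Factor of 3 therefore charges each of the
// NumOfMemOps stored registers one memory op plus two two-source shuffles, and
// adds NumOfMemOps extra moves to preserve the clobbered shuffle sources.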
3720 
3721 int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
3722  unsigned Factor,
3723  ArrayRef<unsigned> Indices,
3724  unsigned Alignment,
3725  unsigned AddressSpace,
3726  bool UseMaskForCond,
3727  bool UseMaskForGaps) {
3728  auto isSupportedOnAVX512 = [](Type *VecTy, bool HasBW) {
3729  Type *EltTy = VecTy->getVectorElementType();
3730  if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
3731  EltTy->isIntegerTy(32) || EltTy->isPointerTy())
3732  return true;
3733  if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8))
3734  return HasBW;
3735  return false;
3736  };
3737  if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI()))
3738  return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices,
3739  Alignment, AddressSpace,
3740  UseMaskForCond, UseMaskForGaps);
3741  if (ST->hasAVX2())
3742  return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices,
3743  Alignment, AddressSpace,
3744  UseMaskForCond, UseMaskForGaps);
3745 
3746  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3747  Alignment, AddressSpace,
3748  UseMaskForCond, UseMaskForGaps);
3749 }