1 //===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements a TargetTransformInfo analysis pass specific to the
10 /// X86 target machine. It uses the target's detailed information to provide
11 /// more precise answers to certain TTI queries, while letting the target
12 /// independent and default TTI implementations handle the rest.
13 ///
14 //===----------------------------------------------------------------------===//
15 /// A note about the cost model numbers used below: they correspond to some
16 /// "generic" X86 CPU rather than a concrete CPU model. Usually the numbers
17 /// correspond to the CPU where the feature first appeared. For example, if we
18 /// do Subtarget.hasSSE42() in the lookups below, the cost is based on Nehalem,
19 /// as that was the first CPU to support that feature level and thus most
20 /// likely has the worst-case cost.
21 /// Some examples of other technologies/CPUs:
22 /// SSE 3 - Pentium4 / Athlon64
23 /// SSE 4.1 - Penryn
24 /// SSE 4.2 - Nehalem
25 /// AVX - Sandy Bridge
26 /// AVX2 - Haswell
27 /// AVX-512 - Xeon Phi / Skylake
28 /// And some examples of target-dependent instruction costs (latency):
29 /// divss sqrtss rsqrtss
30 /// AMD K7 11-16 19 3
31 /// Piledriver 9-24 13-15 5
32 /// Jaguar 14 16 2
33 /// Pentium II,III 18 30 2
34 /// Nehalem 7-14 7-18 3
35 /// Haswell 10-13 11 5
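/// As an illustrative example of how these numbers get used: with SSE4.2
/// enabled, the SSE42CostTable below costs ISD::FDIV on MVT::v2f64 at 22,
/// which corresponds to Nehalem's divpd latency (per agner.org) rather than
/// that of any newer core.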
36 /// TODO: Develop and implement the target dependent cost model and
37 /// specialize cost numbers for different Cost Model Targets such as throughput,
38 /// code size, latency and uop count.
39 //===----------------------------------------------------------------------===//
40 
41 #include "X86TargetTransformInfo.h"
42 #include "llvm/Analysis/TargetTransformInfo.h"
43 #include "llvm/CodeGen/BasicTTIImpl.h"
44 #include "llvm/CodeGen/CostTable.h"
45 #include "llvm/CodeGen/TargetLowering.h"
46 #include "llvm/IR/IntrinsicInst.h"
47 #include "llvm/Support/Debug.h"
48 
49 using namespace llvm;
50 
51 #define DEBUG_TYPE "x86tti"
52 
53 //===----------------------------------------------------------------------===//
54 //
55 // X86 cost model.
56 //
57 //===----------------------------------------------------------------------===//
58 
59 TargetTransformInfo::PopcntSupportKind
60 X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
61  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
62  // TODO: Currently the __builtin_popcount() implementation using SSE3
63  // instructions is inefficient. Once the problem is fixed, we should
64  // call ST->hasSSE3() instead of ST->hasPOPCNT().
65  return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
66 }
67 
68 llvm::Optional<unsigned> X86TTIImpl::getCacheSize(
69     TargetTransformInfo::CacheLevel Level) const {
70  switch (Level) {
71  case TargetTransformInfo::CacheLevel::L1D:
72  // - Penryn
73  // - Nehalem
74  // - Westmere
75  // - Sandy Bridge
76  // - Ivy Bridge
77  // - Haswell
78  // - Broadwell
79  // - Skylake
80  // - Kabylake
81  return 32 * 1024; // 32 KByte
82  case TargetTransformInfo::CacheLevel::L2D:
83  // - Penryn
84  // - Nehalem
85  // - Westmere
86  // - Sandy Bridge
87  // - Ivy Bridge
88  // - Haswell
89  // - Broadwell
90  // - Skylake
91  // - Kabylake
92  return 256 * 1024; // 256 KByte
93  }
94 
95  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
96 }
97 
98 llvm::Optional<unsigned> X86TTIImpl::getCacheAssociativity(
99     TargetTransformInfo::CacheLevel Level) const {
100  // - Penryn
101  // - Nehalem
102  // - Westmere
103  // - Sandy Bridge
104  // - Ivy Bridge
105  // - Haswell
106  // - Broadwell
107  // - Skylake
108  // - Kabylake
109  switch (Level) {
110  case TargetTransformInfo::CacheLevel::L1D:
111  LLVM_FALLTHROUGH;
112  case TargetTransformInfo::CacheLevel::L2D:
113  return 8;
114  }
115 
116  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
117 }
118 
119 unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) {
120  if (Vector && !ST->hasSSE1())
121  return 0;
122 
123  if (ST->is64Bit()) {
124  if (Vector && ST->hasAVX512())
125  return 32;
126  return 16;
127  }
128  return 8;
129 }
130 
131 unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) const {
132  unsigned PreferVectorWidth = ST->getPreferVectorWidth();
133  if (Vector) {
134  if (ST->hasAVX512() && PreferVectorWidth >= 512)
135  return 512;
136  if (ST->hasAVX() && PreferVectorWidth >= 256)
137  return 256;
138  if (ST->hasSSE1() && PreferVectorWidth >= 128)
139  return 128;
140  return 0;
141  }
142 
143  if (ST->is64Bit())
144  return 64;
145 
146  return 32;
147 }
148 
149 unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
150  return getRegisterBitWidth(true);
151 }
152 
153 unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
154  // If the loop will not be vectorized, don't interleave the loop.
155  // Let the regular unroller unroll the loop, which saves the overflow
156  // check and memory check cost.
157  if (VF == 1)
158  return 1;
159 
160  if (ST->isAtom())
161  return 1;
162 
163  // Sandybridge and Haswell have multiple execution ports and pipelined
164  // vector units.
165  if (ST->hasAVX())
166  return 4;
167 
168  return 2;
169 }
170 
171 int X86TTIImpl::getArithmeticInstrCost(
172     unsigned Opcode, Type *Ty,
173     TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
174     TTI::OperandValueProperties Opd1PropInfo,
175     TTI::OperandValueProperties Opd2PropInfo,
176     ArrayRef<const Value *> Args) {
177  // Legalize the type.
178  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
179 
180  int ISD = TLI->InstructionOpcodeToISD(Opcode);
181  assert(ISD && "Invalid opcode");
182 
183  static const CostTblEntry GLMCostTable[] = {
184  { ISD::FDIV, MVT::f32, 18 }, // divss
185  { ISD::FDIV, MVT::v4f32, 35 }, // divps
186  { ISD::FDIV, MVT::f64, 33 }, // divsd
187  { ISD::FDIV, MVT::v2f64, 65 }, // divpd
188  };
189 
190  if (ST->isGLM())
191  if (const auto *Entry = CostTableLookup(GLMCostTable, ISD,
192  LT.second))
193  return LT.first * Entry->Cost;
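  // Illustrative example: on a Goldmont target, an FDIV on <8 x float>
  // legalizes to two v4f32 operations (LT.first == 2), so the divps entry
  // above yields a cost of 2 * 35 = 70.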
194 
195  static const CostTblEntry SLMCostTable[] = {
196  { ISD::MUL, MVT::v4i32, 11 }, // pmulld
197  { ISD::MUL, MVT::v8i16, 2 }, // pmullw
198  { ISD::MUL, MVT::v16i8, 14 }, // extend/pmullw/trunc sequence.
199  { ISD::FMUL, MVT::f64, 2 }, // mulsd
200  { ISD::FMUL, MVT::v2f64, 4 }, // mulpd
201  { ISD::FMUL, MVT::v4f32, 2 }, // mulps
202  { ISD::FDIV, MVT::f32, 17 }, // divss
203  { ISD::FDIV, MVT::v4f32, 39 }, // divps
204  { ISD::FDIV, MVT::f64, 32 }, // divsd
205  { ISD::FDIV, MVT::v2f64, 69 }, // divpd
206  { ISD::FADD, MVT::v2f64, 2 }, // addpd
207  { ISD::FSUB, MVT::v2f64, 2 }, // subpd
208  // v2i64/v4i64 mul is custom lowered as a series of long:
209  // multiplies(3), shifts(3) and adds(2)
210  // slm muldq version throughput is 2 and addq throughput 4
211  // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
212  // 2X4 (addq throughput) = 17
213  { ISD::MUL, MVT::v2i64, 17 },
214  // slm addq\subq throughput is 4
215  { ISD::ADD, MVT::v2i64, 4 },
216  { ISD::SUB, MVT::v2i64, 4 },
217  };
218 
219  if (ST->isSLM()) {
220  if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) {
221  // Check if the operands can be shrunk into a smaller datatype.
222  bool Op1Signed = false;
223  unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
224  bool Op2Signed = false;
225  unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
226 
227  bool signedMode = Op1Signed | Op2Signed;
228  unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
229 
230  if (OpMinSize <= 7)
231  return LT.first * 3; // pmullw/sext
232  if (!signedMode && OpMinSize <= 8)
233  return LT.first * 3; // pmullw/zext
234  if (OpMinSize <= 15)
235  return LT.first * 5; // pmullw/pmulhw/pshuf
236  if (!signedMode && OpMinSize <= 16)
237  return LT.first * 5; // pmullw/pmulhw/pshuf
238  }
239 
240  if (const auto *Entry = CostTableLookup(SLMCostTable, ISD,
241  LT.second)) {
242  return LT.first * Entry->Cost;
243  }
244  }
245 
246  if ((ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
247  ISD == ISD::UREM) &&
248  (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
249  Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
250  Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
251  if (ISD == ISD::SDIV || ISD == ISD::SREM) {
252  // On X86, vector signed division by a constant power of two is
253  // normally expanded to the sequence SRA + SRL + ADD + SRA.
254  // The OperandValue properties may not be the same as that of the previous
255  // operation; conservatively assume OP_None.
256  int Cost =
257  2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info, Op2Info,
258  TargetTransformInfo::OP_None,
259  TargetTransformInfo::OP_None);
260  Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
261  TargetTransformInfo::OP_None,
262  TargetTransformInfo::OP_None);
263  Cost += getArithmeticInstrCost(Instruction::Add, Ty, Op1Info, Op2Info,
264  TargetTransformInfo::OP_None,
265  TargetTransformInfo::OP_None);
266 
267  if (ISD == ISD::SREM) {
268  // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
269  Cost += getArithmeticInstrCost(Instruction::Mul, Ty, Op1Info, Op2Info);
270  Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Op1Info, Op2Info);
271  }
272 
273  return Cost;
274  }
275 
276  // Vector unsigned division/remainder will be simplified to shifts/masks.
277  if (ISD == ISD::UDIV)
278  return getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
279  TargetTransformInfo::OP_None,
280  TargetTransformInfo::OP_None);
281 
282  if (ISD == ISD::UREM)
283  return getArithmeticInstrCost(Instruction::And, Ty, Op1Info, Op2Info,
284  TargetTransformInfo::OP_None,
285  TargetTransformInfo::OP_None);
286  }
287 
288  static const CostTblEntry AVX512BWUniformConstCostTable[] = {
289  { ISD::SHL, MVT::v64i8, 2 }, // psllw + pand.
290  { ISD::SRL, MVT::v64i8, 2 }, // psrlw + pand.
291  { ISD::SRA, MVT::v64i8, 4 }, // psrlw, pand, pxor, psubb.
292  };
293 
294  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
295  ST->hasBWI()) {
296  if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD,
297  LT.second))
298  return LT.first * Entry->Cost;
299  }
300 
301  static const CostTblEntry AVX512UniformConstCostTable[] = {
302  { ISD::SRA, MVT::v2i64, 1 },
303  { ISD::SRA, MVT::v4i64, 1 },
304  { ISD::SRA, MVT::v8i64, 1 },
305  };
306 
307  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
308  ST->hasAVX512()) {
309  if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD,
310  LT.second))
311  return LT.first * Entry->Cost;
312  }
313 
314  static const CostTblEntry AVX2UniformConstCostTable[] = {
315  { ISD::SHL, MVT::v32i8, 2 }, // psllw + pand.
316  { ISD::SRL, MVT::v32i8, 2 }, // psrlw + pand.
317  { ISD::SRA, MVT::v32i8, 4 }, // psrlw, pand, pxor, psubb.
318 
319  { ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle.
320  };
321 
322  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
323  ST->hasAVX2()) {
324  if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD,
325  LT.second))
326  return LT.first * Entry->Cost;
327  }
328 
329  static const CostTblEntry SSE2UniformConstCostTable[] = {
330  { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand.
331  { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand.
332  { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb.
333 
334  { ISD::SHL, MVT::v32i8, 4+2 }, // 2*(psllw + pand) + split.
335  { ISD::SRL, MVT::v32i8, 4+2 }, // 2*(psrlw + pand) + split.
336  { ISD::SRA, MVT::v32i8, 8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.
337  };
338 
339  // XOP has faster vXi8 shifts.
340  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
341  ST->hasSSE2() && !ST->hasXOP()) {
342  if (const auto *Entry =
343  CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
344  return LT.first * Entry->Cost;
345  }
346 
347  static const CostTblEntry AVX512BWConstCostTable[] = {
348  { ISD::SDIV, MVT::v64i8, 14 }, // 2*ext+2*pmulhw sequence
349  { ISD::SREM, MVT::v64i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
350  { ISD::UDIV, MVT::v64i8, 14 }, // 2*ext+2*pmulhw sequence
351  { ISD::UREM, MVT::v64i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
352  { ISD::SDIV, MVT::v32i16, 6 }, // vpmulhw sequence
353  { ISD::SREM, MVT::v32i16, 8 }, // vpmulhw+mul+sub sequence
354  { ISD::UDIV, MVT::v32i16, 6 }, // vpmulhuw sequence
355  { ISD::UREM, MVT::v32i16, 8 }, // vpmulhuw+mul+sub sequence
356  };
357 
358  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
359  Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
360  ST->hasBWI()) {
361  if (const auto *Entry =
362  CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
363  return LT.first * Entry->Cost;
364  }
365 
366  static const CostTblEntry AVX512ConstCostTable[] = {
367  { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
368  { ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence
369  { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
370  { ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence
371  };
372 
373  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
374  Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
375  ST->hasAVX512()) {
376  if (const auto *Entry =
377  CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
378  return LT.first * Entry->Cost;
379  }
380 
381  static const CostTblEntry AVX2ConstCostTable[] = {
382  { ISD::SDIV, MVT::v32i8, 14 }, // 2*ext+2*pmulhw sequence
383  { ISD::SREM, MVT::v32i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
384  { ISD::UDIV, MVT::v32i8, 14 }, // 2*ext+2*pmulhw sequence
385  { ISD::UREM, MVT::v32i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
386  { ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence
387  { ISD::SREM, MVT::v16i16, 8 }, // vpmulhw+mul+sub sequence
388  { ISD::UDIV, MVT::v16i16, 6 }, // vpmulhuw sequence
389  { ISD::UREM, MVT::v16i16, 8 }, // vpmulhuw+mul+sub sequence
390  { ISD::SDIV, MVT::v8i32, 15 }, // vpmuldq sequence
391  { ISD::SREM, MVT::v8i32, 19 }, // vpmuldq+mul+sub sequence
392  { ISD::UDIV, MVT::v8i32, 15 }, // vpmuludq sequence
393  { ISD::UREM, MVT::v8i32, 19 }, // vpmuludq+mul+sub sequence
394  };
395 
396  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
397  Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
398  ST->hasAVX2()) {
399  if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
400  return LT.first * Entry->Cost;
401  }
402 
403  static const CostTblEntry SSE2ConstCostTable[] = {
404  { ISD::SDIV, MVT::v32i8, 28+2 }, // 4*ext+4*pmulhw sequence + split.
405  { ISD::SREM, MVT::v32i8, 32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
406  { ISD::SDIV, MVT::v16i8, 14 }, // 2*ext+2*pmulhw sequence
407  { ISD::SREM, MVT::v16i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
408  { ISD::UDIV, MVT::v32i8, 28+2 }, // 4*ext+4*pmulhw sequence + split.
409  { ISD::UREM, MVT::v32i8, 32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
410  { ISD::UDIV, MVT::v16i8, 14 }, // 2*ext+2*pmulhw sequence
411  { ISD::UREM, MVT::v16i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
412  { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
413  { ISD::SREM, MVT::v16i16, 16+2 }, // 2*pmulhw+mul+sub sequence + split.
414  { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence
415  { ISD::SREM, MVT::v8i16, 8 }, // pmulhw+mul+sub sequence
416  { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split.
417  { ISD::UREM, MVT::v16i16, 16+2 }, // 2*pmulhuw+mul+sub sequence + split.
418  { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence
419  { ISD::UREM, MVT::v8i16, 8 }, // pmulhuw+mul+sub sequence
420  { ISD::SDIV, MVT::v8i32, 38+2 }, // 2*pmuludq sequence + split.
421  { ISD::SREM, MVT::v8i32, 48+2 }, // 2*pmuludq+mul+sub sequence + split.
422  { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence
423  { ISD::SREM, MVT::v4i32, 24 }, // pmuludq+mul+sub sequence
424  { ISD::UDIV, MVT::v8i32, 30+2 }, // 2*pmuludq sequence + split.
425  { ISD::UREM, MVT::v8i32, 40+2 }, // 2*pmuludq+mul+sub sequence + split.
426  { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence
427  { ISD::UREM, MVT::v4i32, 20 }, // pmuludq+mul+sub sequence
428  };
429 
430  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
431  Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
432  ST->hasSSE2()) {
433  // pmuldq sequence.
434  if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
435  return LT.first * 32;
436  if (ISD == ISD::SREM && LT.second == MVT::v8i32 && ST->hasAVX())
437  return LT.first * 38;
438  if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
439  return LT.first * 15;
440  if (ISD == ISD::SREM && LT.second == MVT::v4i32 && ST->hasSSE41())
441  return LT.first * 20;
442 
443  if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
444  return LT.first * Entry->Cost;
445  }
446 
447  static const CostTblEntry AVX2UniformCostTable[] = {
448  // Uniform splats are cheaper for the following instructions.
449  { ISD::SHL, MVT::v16i16, 1 }, // psllw.
450  { ISD::SRL, MVT::v16i16, 1 }, // psrlw.
451  { ISD::SRA, MVT::v16i16, 1 }, // psraw.
452  };
453 
454  if (ST->hasAVX2() &&
455  ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
456  (Op2Info == TargetTransformInfo::OK_UniformValue))) {
457  if (const auto *Entry =
458  CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
459  return LT.first * Entry->Cost;
460  }
461 
462  static const CostTblEntry SSE2UniformCostTable[] = {
463  // Uniform splats are cheaper for the following instructions.
464  { ISD::SHL, MVT::v8i16, 1 }, // psllw.
465  { ISD::SHL, MVT::v4i32, 1 }, // pslld
466  { ISD::SHL, MVT::v2i64, 1 }, // psllq.
467 
468  { ISD::SRL, MVT::v8i16, 1 }, // psrlw.
469  { ISD::SRL, MVT::v4i32, 1 }, // psrld.
470  { ISD::SRL, MVT::v2i64, 1 }, // psrlq.
471 
472  { ISD::SRA, MVT::v8i16, 1 }, // psraw.
473  { ISD::SRA, MVT::v4i32, 1 }, // psrad.
474  };
475 
476  if (ST->hasSSE2() &&
477  ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
478  (Op2Info == TargetTransformInfo::OK_UniformValue))) {
479  if (const auto *Entry =
480  CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
481  return LT.first * Entry->Cost;
482  }
483 
484  static const CostTblEntry AVX512DQCostTable[] = {
485  { ISD::MUL, MVT::v2i64, 1 },
486  { ISD::MUL, MVT::v4i64, 1 },
487  { ISD::MUL, MVT::v8i64, 1 }
488  };
489 
490  // Look for AVX512DQ lowering tricks for custom cases.
491  if (ST->hasDQI())
492  if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
493  return LT.first * Entry->Cost;
494 
495  static const CostTblEntry AVX512BWCostTable[] = {
496  { ISD::SHL, MVT::v8i16, 1 }, // vpsllvw
497  { ISD::SRL, MVT::v8i16, 1 }, // vpsrlvw
498  { ISD::SRA, MVT::v8i16, 1 }, // vpsravw
499 
500  { ISD::SHL, MVT::v16i16, 1 }, // vpsllvw
501  { ISD::SRL, MVT::v16i16, 1 }, // vpsrlvw
502  { ISD::SRA, MVT::v16i16, 1 }, // vpsravw
503 
504  { ISD::SHL, MVT::v32i16, 1 }, // vpsllvw
505  { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw
506  { ISD::SRA, MVT::v32i16, 1 }, // vpsravw
507 
508  { ISD::SHL, MVT::v64i8, 11 }, // vpblendvb sequence.
509  { ISD::SRL, MVT::v64i8, 11 }, // vpblendvb sequence.
510  { ISD::SRA, MVT::v64i8, 24 }, // vpblendvb sequence.
511 
512  { ISD::MUL, MVT::v64i8, 11 }, // extend/pmullw/trunc sequence.
513  { ISD::MUL, MVT::v32i8, 4 }, // extend/pmullw/trunc sequence.
514  { ISD::MUL, MVT::v16i8, 4 }, // extend/pmullw/trunc sequence.
515  };
516 
517  // Look for AVX512BW lowering tricks for custom cases.
518  if (ST->hasBWI())
519  if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
520  return LT.first * Entry->Cost;
521 
522  static const CostTblEntry AVX512CostTable[] = {
523  { ISD::SHL, MVT::v16i32, 1 },
524  { ISD::SRL, MVT::v16i32, 1 },
525  { ISD::SRA, MVT::v16i32, 1 },
526 
527  { ISD::SHL, MVT::v8i64, 1 },
528  { ISD::SRL, MVT::v8i64, 1 },
529 
530  { ISD::SRA, MVT::v2i64, 1 },
531  { ISD::SRA, MVT::v4i64, 1 },
532  { ISD::SRA, MVT::v8i64, 1 },
533 
534  { ISD::MUL, MVT::v32i8, 13 }, // extend/pmullw/trunc sequence.
535  { ISD::MUL, MVT::v16i8, 5 }, // extend/pmullw/trunc sequence.
536  { ISD::MUL, MVT::v16i32, 1 }, // pmulld (Skylake from agner.org)
537  { ISD::MUL, MVT::v8i32, 1 }, // pmulld (Skylake from agner.org)
538  { ISD::MUL, MVT::v4i32, 1 }, // pmulld (Skylake from agner.org)
539  { ISD::MUL, MVT::v8i64, 8 }, // 3*pmuludq/3*shift/2*add
540 
541  { ISD::FADD, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
542  { ISD::FSUB, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
543  { ISD::FMUL, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
544 
545  { ISD::FADD, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
546  { ISD::FSUB, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
547  { ISD::FMUL, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
548  };
549 
550  if (ST->hasAVX512())
551  if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
552  return LT.first * Entry->Cost;
553 
554  static const CostTblEntry AVX2ShiftCostTable[] = {
555  // Shifts on v4i64/v8i32 are legal on AVX2, even though we declare them
556  // as custom in order to detect cases where the shift amount is a scalar.
557  { ISD::SHL, MVT::v4i32, 1 },
558  { ISD::SRL, MVT::v4i32, 1 },
559  { ISD::SRA, MVT::v4i32, 1 },
560  { ISD::SHL, MVT::v8i32, 1 },
561  { ISD::SRL, MVT::v8i32, 1 },
562  { ISD::SRA, MVT::v8i32, 1 },
563  { ISD::SHL, MVT::v2i64, 1 },
564  { ISD::SRL, MVT::v2i64, 1 },
565  { ISD::SHL, MVT::v4i64, 1 },
566  { ISD::SRL, MVT::v4i64, 1 },
567  };
568 
569  // Look for AVX2 lowering tricks.
570  if (ST->hasAVX2()) {
571  if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
572  (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
573  Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
574  // On AVX2, a packed v16i16 shift left by a constant build_vector
575  // is lowered into a vector multiply (vpmullw).
576  return getArithmeticInstrCost(Instruction::Mul, Ty, Op1Info, Op2Info,
577  TargetTransformInfo::OP_None,
578  TargetTransformInfo::OP_None);
579 
580  if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
581  return LT.first * Entry->Cost;
582  }
583 
584  static const CostTblEntry XOPShiftCostTable[] = {
585  // 128bit shifts take 1cy, but right shifts require negation beforehand.
586  { ISD::SHL, MVT::v16i8, 1 },
587  { ISD::SRL, MVT::v16i8, 2 },
588  { ISD::SRA, MVT::v16i8, 2 },
589  { ISD::SHL, MVT::v8i16, 1 },
590  { ISD::SRL, MVT::v8i16, 2 },
591  { ISD::SRA, MVT::v8i16, 2 },
592  { ISD::SHL, MVT::v4i32, 1 },
593  { ISD::SRL, MVT::v4i32, 2 },
594  { ISD::SRA, MVT::v4i32, 2 },
595  { ISD::SHL, MVT::v2i64, 1 },
596  { ISD::SRL, MVT::v2i64, 2 },
597  { ISD::SRA, MVT::v2i64, 2 },
598  // 256bit shifts require splitting if AVX2 didn't catch them above.
599  { ISD::SHL, MVT::v32i8, 2+2 },
600  { ISD::SRL, MVT::v32i8, 4+2 },
601  { ISD::SRA, MVT::v32i8, 4+2 },
602  { ISD::SHL, MVT::v16i16, 2+2 },
603  { ISD::SRL, MVT::v16i16, 4+2 },
604  { ISD::SRA, MVT::v16i16, 4+2 },
605  { ISD::SHL, MVT::v8i32, 2+2 },
606  { ISD::SRL, MVT::v8i32, 4+2 },
607  { ISD::SRA, MVT::v8i32, 4+2 },
608  { ISD::SHL, MVT::v4i64, 2+2 },
609  { ISD::SRL, MVT::v4i64, 4+2 },
610  { ISD::SRA, MVT::v4i64, 4+2 },
611  };
612 
613  // Look for XOP lowering tricks.
614  if (ST->hasXOP()) {
615  // If the right shift is constant then we'll fold the negation so
616  // it's as cheap as a left shift.
617  int ShiftISD = ISD;
618  if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) &&
619  (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
620  Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
621  ShiftISD = ISD::SHL;
622  if (const auto *Entry =
623  CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
624  return LT.first * Entry->Cost;
625  }
626 
627  static const CostTblEntry SSE2UniformShiftCostTable[] = {
628  // Uniform splats are cheaper for the following instructions.
629  { ISD::SHL, MVT::v16i16, 2+2 }, // 2*psllw + split.
630  { ISD::SHL, MVT::v8i32, 2+2 }, // 2*pslld + split.
631  { ISD::SHL, MVT::v4i64, 2+2 }, // 2*psllq + split.
632 
633  { ISD::SRL, MVT::v16i16, 2+2 }, // 2*psrlw + split.
634  { ISD::SRL, MVT::v8i32, 2+2 }, // 2*psrld + split.
635  { ISD::SRL, MVT::v4i64, 2+2 }, // 2*psrlq + split.
636 
637  { ISD::SRA, MVT::v16i16, 2+2 }, // 2*psraw + split.
638  { ISD::SRA, MVT::v8i32, 2+2 }, // 2*psrad + split.
639  { ISD::SRA, MVT::v2i64, 4 }, // 2*psrad + shuffle.
640  { ISD::SRA, MVT::v4i64, 8+2 }, // 2*(2*psrad + shuffle) + split.
641  };
642 
643  if (ST->hasSSE2() &&
644  ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
645  (Op2Info == TargetTransformInfo::OK_UniformValue))) {
646 
647  // Handle AVX2 uniform v4i64 ISD::SRA, it's not worth a table.
648  if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2())
649  return LT.first * 4; // 2*psrad + shuffle.
650 
651  if (const auto *Entry =
652  CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second))
653  return LT.first * Entry->Cost;
654  }
655 
656  if (ISD == ISD::SHL &&
657  Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
658  MVT VT = LT.second;
659  // A vector shift left by a non-uniform constant can be lowered
660  // into a vector multiply.
661  if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
662  ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
663  ISD = ISD::MUL;
664  }
665 
666  static const CostTblEntry AVX2CostTable[] = {
667  { ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence.
668  { ISD::SHL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
669 
670  { ISD::SRL, MVT::v32i8, 11 }, // vpblendvb sequence.
671  { ISD::SRL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
672 
673  { ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence.
674  { ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence.
675  { ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence.
676  { ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence.
677 
678  { ISD::SUB, MVT::v32i8, 1 }, // psubb
679  { ISD::ADD, MVT::v32i8, 1 }, // paddb
680  { ISD::SUB, MVT::v16i16, 1 }, // psubw
681  { ISD::ADD, MVT::v16i16, 1 }, // paddw
682  { ISD::SUB, MVT::v8i32, 1 }, // psubd
683  { ISD::ADD, MVT::v8i32, 1 }, // paddd
684  { ISD::SUB, MVT::v4i64, 1 }, // psubq
685  { ISD::ADD, MVT::v4i64, 1 }, // paddq
686 
687  { ISD::MUL, MVT::v32i8, 17 }, // extend/pmullw/trunc sequence.
688  { ISD::MUL, MVT::v16i8, 7 }, // extend/pmullw/trunc sequence.
689  { ISD::MUL, MVT::v16i16, 1 }, // pmullw
690  { ISD::MUL, MVT::v8i32, 2 }, // pmulld (Haswell from agner.org)
691  { ISD::MUL, MVT::v4i64, 8 }, // 3*pmuludq/3*shift/2*add
692 
693  { ISD::FADD, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
694  { ISD::FADD, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
695  { ISD::FSUB, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
696  { ISD::FSUB, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
697  { ISD::FMUL, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
698  { ISD::FMUL, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
699 
700  { ISD::FDIV, MVT::f32, 7 }, // Haswell from http://www.agner.org/
701  { ISD::FDIV, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
702  { ISD::FDIV, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
703  { ISD::FDIV, MVT::f64, 14 }, // Haswell from http://www.agner.org/
704  { ISD::FDIV, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
705  { ISD::FDIV, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
706  };
707 
708  // Look for AVX2 lowering tricks for custom cases.
709  if (ST->hasAVX2())
710  if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
711  return LT.first * Entry->Cost;
712 
713  static const CostTblEntry AVX1CostTable[] = {
714  // We don't have to scalarize unsupported ops. We can issue two half-sized
715  // operations and we only need to extract the upper YMM half.
716  // Two ops + 1 extract + 1 insert = 4.
717  { ISD::MUL, MVT::v16i16, 4 },
718  { ISD::MUL, MVT::v8i32, 4 },
719  { ISD::SUB, MVT::v32i8, 4 },
720  { ISD::ADD, MVT::v32i8, 4 },
721  { ISD::SUB, MVT::v16i16, 4 },
722  { ISD::ADD, MVT::v16i16, 4 },
723  { ISD::SUB, MVT::v8i32, 4 },
724  { ISD::ADD, MVT::v8i32, 4 },
725  { ISD::SUB, MVT::v4i64, 4 },
726  { ISD::ADD, MVT::v4i64, 4 },
727 
728  // A v4i64 multiply is custom lowered as two split v2i64 vectors that then
729  // are lowered as a series of long multiplies(3), shifts(3) and adds(2)
730  // Because we believe v4i64 to be a legal type, we must also include the
731  // extract+insert in the cost table. Therefore, the cost here is 18
732  // instead of 8.
733  { ISD::MUL, MVT::v4i64, 18 },
734 
735  { ISD::MUL, MVT::v32i8, 26 }, // extend/pmullw/trunc sequence.
736 
737  { ISD::FDIV, MVT::f32, 14 }, // SNB from http://www.agner.org/
738  { ISD::FDIV, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
739  { ISD::FDIV, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
740  { ISD::FDIV, MVT::f64, 22 }, // SNB from http://www.agner.org/
741  { ISD::FDIV, MVT::v2f64, 22 }, // SNB from http://www.agner.org/
742  { ISD::FDIV, MVT::v4f64, 44 }, // SNB from http://www.agner.org/
743  };
744 
745  if (ST->hasAVX())
746  if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
747  return LT.first * Entry->Cost;
748 
749  static const CostTblEntry SSE42CostTable[] = {
750  { ISD::FADD, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
751  { ISD::FADD, MVT::f32, 1 }, // Nehalem from http://www.agner.org/
752  { ISD::FADD, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
753  { ISD::FADD, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
754 
755  { ISD::FSUB, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
756  { ISD::FSUB, MVT::f32 , 1 }, // Nehalem from http://www.agner.org/
757  { ISD::FSUB, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
758  { ISD::FSUB, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
759 
760  { ISD::FMUL, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
761  { ISD::FMUL, MVT::f32, 1 }, // Nehalem from http://www.agner.org/
762  { ISD::FMUL, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
763  { ISD::FMUL, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
764 
765  { ISD::FDIV, MVT::f32, 14 }, // Nehalem from http://www.agner.org/
766  { ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
767  { ISD::FDIV, MVT::f64, 22 }, // Nehalem from http://www.agner.org/
768  { ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/
769  };
770 
771  if (ST->hasSSE42())
772  if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
773  return LT.first * Entry->Cost;
774 
775  static const CostTblEntry SSE41CostTable[] = {
776  { ISD::SHL, MVT::v16i8, 11 }, // pblendvb sequence.
777  { ISD::SHL, MVT::v32i8, 2*11+2 }, // pblendvb sequence + split.
778  { ISD::SHL, MVT::v8i16, 14 }, // pblendvb sequence.
779  { ISD::SHL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
780  { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld
781  { ISD::SHL, MVT::v8i32, 2*4+2 }, // pslld/paddd/cvttps2dq/pmulld + split
782 
783  { ISD::SRL, MVT::v16i8, 12 }, // pblendvb sequence.
784  { ISD::SRL, MVT::v32i8, 2*12+2 }, // pblendvb sequence + split.
785  { ISD::SRL, MVT::v8i16, 14 }, // pblendvb sequence.
786  { ISD::SRL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
787  { ISD::SRL, MVT::v4i32, 11 }, // Shift each lane + blend.
788  { ISD::SRL, MVT::v8i32, 2*11+2 }, // Shift each lane + blend + split.
789 
790  { ISD::SRA, MVT::v16i8, 24 }, // pblendvb sequence.
791  { ISD::SRA, MVT::v32i8, 2*24+2 }, // pblendvb sequence + split.
792  { ISD::SRA, MVT::v8i16, 14 }, // pblendvb sequence.
793  { ISD::SRA, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
794  { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend.
795  { ISD::SRA, MVT::v8i32, 2*12+2 }, // Shift each lane + blend + split.
796 
797  { ISD::MUL, MVT::v4i32, 2 } // pmulld (Nehalem from agner.org)
798  };
799 
800  if (ST->hasSSE41())
801  if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
802  return LT.first * Entry->Cost;
803 
804  static const CostTblEntry SSE2CostTable[] = {
805  // We don't correctly identify costs of casts because they are marked as
806  // custom.
807  { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence.
808  { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence.
809  { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul.
810  { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence.
811  { ISD::SHL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split.
812 
813  { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence.
814  { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence.
815  { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend.
816  { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence.
817  { ISD::SRL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split.
818 
819  { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence.
820  { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence.
821  { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend.
822  { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence.
823  { ISD::SRA, MVT::v4i64, 2*12+2 }, // srl/xor/sub sequence+split.
824 
825  { ISD::MUL, MVT::v16i8, 12 }, // extend/pmullw/trunc sequence.
826  { ISD::MUL, MVT::v8i16, 1 }, // pmullw
827  { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle
828  { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add
829 
830  { ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/
831  { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/
832  { ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/
833  { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/
834 
835  { ISD::FADD, MVT::f32, 2 }, // Pentium IV from http://www.agner.org/
836  { ISD::FADD, MVT::f64, 2 }, // Pentium IV from http://www.agner.org/
837 
838  { ISD::FSUB, MVT::f32, 2 }, // Pentium IV from http://www.agner.org/
839  { ISD::FSUB, MVT::f64, 2 }, // Pentium IV from http://www.agner.org/
840  };
841 
842  if (ST->hasSSE2())
843  if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
844  return LT.first * Entry->Cost;
845 
846  static const CostTblEntry SSE1CostTable[] = {
847  { ISD::FDIV, MVT::f32, 17 }, // Pentium III from http://www.agner.org/
848  { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/
849 
850  { ISD::FADD, MVT::f32, 1 }, // Pentium III from http://www.agner.org/
851  { ISD::FADD, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/
852 
853  { ISD::FSUB, MVT::f32, 1 }, // Pentium III from http://www.agner.org/
854  { ISD::FSUB, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/
855 
856  { ISD::ADD, MVT::i8, 1 }, // Pentium III from http://www.agner.org/
857  { ISD::ADD, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
858  { ISD::ADD, MVT::i32, 1 }, // Pentium III from http://www.agner.org/
859 
860  { ISD::SUB, MVT::i8, 1 }, // Pentium III from http://www.agner.org/
861  { ISD::SUB, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
862  { ISD::SUB, MVT::i32, 1 }, // Pentium III from http://www.agner.org/
863  };
864 
865  if (ST->hasSSE1())
866  if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
867  return LT.first * Entry->Cost;
868 
869  // It is not a good idea to vectorize division. We have to scalarize it and
870  // in the process we will often end up having to spill regular
871  // registers. The overhead of division is going to dominate most kernels
872  // anyway, so try hard to prevent vectorization of division - it is
873  // generally a bad idea. Assume somewhat arbitrarily that we have to be able
874  // to hide "20 cycles" for each lane.
875  if (LT.second.isVector() && (ISD == ISD::SDIV || ISD == ISD::SREM ||
876  ISD == ISD::UDIV || ISD == ISD::UREM)) {
877  int ScalarCost = getArithmeticInstrCost(
878  Opcode, Ty->getScalarType(), Op1Info, Op2Info,
879  TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
880  return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
881  }
882 
883  // Fallback to the default implementation.
884  return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info);
885 }
886 
887 int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
888                                Type *SubTp) {
889  // 64-bit packed float vectors (v2f32) are widened to type v4f32.
890  // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
891  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
892 
893  // Treat Transpose as 2-op shuffles - there's no difference in lowering.
894  if (Kind == TTI::SK_Transpose)
895  Kind = TTI::SK_PermuteTwoSrc;
896 
897  // For Broadcasts we are splatting the first element from the first input
898  // register, so only need to reference that input and all the output
899  // registers are the same.
900  if (Kind == TTI::SK_Broadcast)
901  LT.first = 1;
902 
903  // Subvector extractions are free if they start at the beginning of a
904  // vector and cheap if the subvectors are aligned.
905  if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
906  int NumElts = LT.second.getVectorNumElements();
907  if ((Index % NumElts) == 0)
908  return 0;
909  std::pair<int, MVT> SubLT = TLI->getTypeLegalizationCost(DL, SubTp);
910  if (SubLT.second.isVector()) {
911  int NumSubElts = SubLT.second.getVectorNumElements();
912  if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
913  return SubLT.first;
914  // Handle some cases for widening legalization. For now we only handle
915  // cases where the original subvector was naturally aligned and evenly
916  // fit in its legalized subvector type.
917  // FIXME: Remove some of the alignment restrictions.
918  // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
919  // vectors.
920  int OrigSubElts = SubTp->getVectorNumElements();
921  if (NumSubElts > OrigSubElts &&
922  (Index % OrigSubElts) == 0 && (NumSubElts % OrigSubElts) == 0 &&
923  LT.second.getVectorElementType() ==
924  SubLT.second.getVectorElementType() &&
925  LT.second.getVectorElementType().getSizeInBits() ==
926  Tp->getVectorElementType()->getPrimitiveSizeInBits()) {
927  assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
928  "Unexpected number of elements!");
929  Type *VecTy = VectorType::get(Tp->getVectorElementType(),
930  LT.second.getVectorNumElements());
931  Type *SubTy = VectorType::get(Tp->getVectorElementType(),
932  SubLT.second.getVectorNumElements());
933  int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
934  int ExtractCost = getShuffleCost(TTI::SK_ExtractSubvector, VecTy,
935  ExtractIndex, SubTy);
936 
937  // If the original size is 32-bits or more, we can use pshufd. Otherwise
938  // if we have SSSE3 we can use pshufb.
939  if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
940  return ExtractCost + 1; // pshufd or pshufb
941 
942  assert(SubTp->getPrimitiveSizeInBits() == 16 &&
943  "Unexpected vector size");
944 
945  return ExtractCost + 2; // worst case pshufhw + pshufd
946  }
947  }
948  }
949 
950  // We are going to permute multiple sources and the result will be in multiple
951  // destinations. We provide an accurate cost only for splits where the element
952  // type remains the same.
953  if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
954  MVT LegalVT = LT.second;
955  if (LegalVT.isVector() &&
956  LegalVT.getVectorElementType().getSizeInBits() ==
957  Tp->getVectorElementType()->getPrimitiveSizeInBits() &&
958  LegalVT.getVectorNumElements() < Tp->getVectorNumElements()) {
959 
960  unsigned VecTySize = DL.getTypeStoreSize(Tp);
961  unsigned LegalVTSize = LegalVT.getStoreSize();
962  // Number of source vectors after legalization:
963  unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
964  // Number of destination vectors after legalization:
965  unsigned NumOfDests = LT.first;
966 
967  Type *SingleOpTy = VectorType::get(Tp->getVectorElementType(),
968  LegalVT.getVectorNumElements());
969 
970  unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
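  // Illustrative example: a single-source permute of <32 x i16> on an AVX2
  // target without BWI legalizes to LegalVT == v16i16 with LT.first == 2,
  // so NumOfSrcs == 2 and NumOfDests == 2, i.e. the cost below is that of
  // two v16i16 two-source shuffles.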
971  return NumOfShuffles *
972  getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr);
973  }
974 
975  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
976  }
977 
978  // For 2-input shuffles, we must account for splitting the 2 inputs into many.
979  if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
980  // We assume that source and destination have the same vector type.
981  int NumOfDests = LT.first;
982  int NumOfShufflesPerDest = LT.first * 2 - 1;
983  LT.first = NumOfDests * NumOfShufflesPerDest;
984  }
985 
986  static const CostTblEntry AVX512VBMIShuffleTbl[] = {
987  {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
988  {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb
989 
990  {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
991  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb
992 
993  {TTI::SK_PermuteTwoSrc, MVT::v64i8, 1}, // vpermt2b
994  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 1}, // vpermt2b
995  {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1} // vpermt2b
996  };
997 
998  if (ST->hasVBMI())
999  if (const auto *Entry =
1000  CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
1001  return LT.first * Entry->Cost;
1002 
1003  static const CostTblEntry AVX512BWShuffleTbl[] = {
1004  {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
1005  {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
1006 
1007  {TTI::SK_Reverse, MVT::v32i16, 1}, // vpermw
1008  {TTI::SK_Reverse, MVT::v16i16, 1}, // vpermw
1009  {TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2
1010 
1011  {TTI::SK_PermuteSingleSrc, MVT::v32i16, 1}, // vpermw
1012  {TTI::SK_PermuteSingleSrc, MVT::v16i16, 1}, // vpermw
1013  {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // vpermw
1014  {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16
1015  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 3}, // vpermw + zext/trunc
1016 
1017  {TTI::SK_PermuteTwoSrc, MVT::v32i16, 1}, // vpermt2w
1018  {TTI::SK_PermuteTwoSrc, MVT::v16i16, 1}, // vpermt2w
1019  {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpermt2w
1020  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 3}, // zext + vpermt2w + trunc
1021  {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
1022  {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3} // zext + vpermt2w + trunc
1023  };
1024 
1025  if (ST->hasBWI())
1026  if (const auto *Entry =
1027  CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
1028  return LT.first * Entry->Cost;
1029 
1030  static const CostTblEntry AVX512ShuffleTbl[] = {
1031  {TTI::SK_Broadcast, MVT::v8f64, 1}, // vbroadcastpd
1032  {TTI::SK_Broadcast, MVT::v16f32, 1}, // vbroadcastps
1033  {TTI::SK_Broadcast, MVT::v8i64, 1}, // vpbroadcastq
1034  {TTI::SK_Broadcast, MVT::v16i32, 1}, // vpbroadcastd
1035 
1036  {TTI::SK_Reverse, MVT::v8f64, 1}, // vpermpd
1037  {TTI::SK_Reverse, MVT::v16f32, 1}, // vpermps
1038  {TTI::SK_Reverse, MVT::v8i64, 1}, // vpermq
1039  {TTI::SK_Reverse, MVT::v16i32, 1}, // vpermd
1040 
1041  {TTI::SK_PermuteSingleSrc, MVT::v8f64, 1}, // vpermpd
1042  {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
1043  {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // vpermpd
1044  {TTI::SK_PermuteSingleSrc, MVT::v16f32, 1}, // vpermps
1045  {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
1046  {TTI::SK_PermuteSingleSrc, MVT::v4f32, 1}, // vpermps
1047  {TTI::SK_PermuteSingleSrc, MVT::v8i64, 1}, // vpermq
1048  {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
1049  {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // vpermq
1050  {TTI::SK_PermuteSingleSrc, MVT::v16i32, 1}, // vpermd
1051  {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
1052  {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // vpermd
1053  {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
1054 
1055  {TTI::SK_PermuteTwoSrc, MVT::v8f64, 1}, // vpermt2pd
1056  {TTI::SK_PermuteTwoSrc, MVT::v16f32, 1}, // vpermt2ps
1057  {TTI::SK_PermuteTwoSrc, MVT::v8i64, 1}, // vpermt2q
1058  {TTI::SK_PermuteTwoSrc, MVT::v16i32, 1}, // vpermt2d
1059  {TTI::SK_PermuteTwoSrc, MVT::v4f64, 1}, // vpermt2pd
1060  {TTI::SK_PermuteTwoSrc, MVT::v8f32, 1}, // vpermt2ps
1061  {TTI::SK_PermuteTwoSrc, MVT::v4i64, 1}, // vpermt2q
1062  {TTI::SK_PermuteTwoSrc, MVT::v8i32, 1}, // vpermt2d
1063  {TTI::SK_PermuteTwoSrc, MVT::v2f64, 1}, // vpermt2pd
1064  {TTI::SK_PermuteTwoSrc, MVT::v4f32, 1}, // vpermt2ps
1065  {TTI::SK_PermuteTwoSrc, MVT::v2i64, 1}, // vpermt2q
1066  {TTI::SK_PermuteTwoSrc, MVT::v4i32, 1} // vpermt2d
1067  };
1068 
1069  if (ST->hasAVX512())
1070  if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
1071  return LT.first * Entry->Cost;
1072 
1073  static const CostTblEntry AVX2ShuffleTbl[] = {
1074  {TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastpd
1075  {TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastps
1076  {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq
1077  {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd
1078  {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
1079  {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb
1080 
1081  {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd
1082  {TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps
1083  {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq
1084  {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd
1085  {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
1086  {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb
1087 
1088  {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
1089  {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb
1090 
1091  {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
1092  {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
1093  {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
1094  {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
1095  {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
1096  // + vpblendvb
1097  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb
1098  // + vpblendvb
1099 
1100  {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd
1101  {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3}, // 2*vpermps + vblendps
1102  {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd
1103  {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd
1104  {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
1105  // + vpblendvb
1106  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb
1107  // + vpblendvb
1108  };
1109 
1110  if (ST->hasAVX2())
1111  if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
1112  return LT.first * Entry->Cost;
1113 
1114  static const CostTblEntry XOPShuffleTbl[] = {
1115  {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd
1116  {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps
1117  {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd
1118  {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps
1119  {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
1120  // + vinsertf128
1121  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm
1122  // + vinsertf128
1123 
1124  {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
1125  // + vinsertf128
1126  {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm
1127  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm
1128  // + vinsertf128
1129  {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm
1130  };
1131 
1132  if (ST->hasXOP())
1133  if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
1134  return LT.first * Entry->Cost;
1135 
1136  static const CostTblEntry AVX1ShuffleTbl[] = {
1137  {TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1138  {TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1139  {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1140  {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1141  {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
1142  {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128
1143 
1144  {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1145  {TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1146  {TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1147  {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1148  {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
1149  // + vinsertf128
1150  {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb
1151  // + vinsertf128
1152 
1153  {TTI::SK_Select, MVT::v4i64, 1}, // vblendpd
1154  {TTI::SK_Select, MVT::v4f64, 1}, // vblendpd
1155  {TTI::SK_Select, MVT::v8i32, 1}, // vblendps
1156  {TTI::SK_Select, MVT::v8f32, 1}, // vblendps
1157  {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
1158  {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor
1159 
1160  {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd
1161  {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd
1162  {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1163  {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1164  {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
1165  // + 2*por + vinsertf128
1166  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb
1167  // + 2*por + vinsertf128
1168 
1169  {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd
1170  {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd
1171  {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1172  {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1173  {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
1174  // + 4*por + vinsertf128
1175  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb
1176  // + 4*por + vinsertf128
1177  };
1178 
1179  if (ST->hasAVX())
1180  if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
1181  return LT.first * Entry->Cost;
1182 
1183  static const CostTblEntry SSE41ShuffleTbl[] = {
1184  {TTI::SK_Select, MVT::v2i64, 1}, // pblendw
1185  {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1186  {TTI::SK_Select, MVT::v4i32, 1}, // pblendw
1187  {TTI::SK_Select, MVT::v4f32, 1}, // blendps
1188  {TTI::SK_Select, MVT::v8i16, 1}, // pblendw
1189  {TTI::SK_Select, MVT::v16i8, 1} // pblendvb
1190  };
1191 
1192  if (ST->hasSSE41())
1193  if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
1194  return LT.first * Entry->Cost;
1195 
1196  static const CostTblEntry SSSE3ShuffleTbl[] = {
1197  {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
1198  {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb
1199 
1200  {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
1201  {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb
1202 
1203  {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
1204  {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por
1205 
1206  {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
1207  {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
1208 
1209  {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
1210  {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
1211  };
1212 
1213  if (ST->hasSSSE3())
1214  if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
1215  return LT.first * Entry->Cost;
1216 
1217  static const CostTblEntry SSE2ShuffleTbl[] = {
1218  {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd
1219  {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
1220  {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
1221  {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
1222  {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd
1223 
1224  {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
1225  {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
1226  {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
1227  {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
1228  {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
1229  // + 2*pshufd + 2*unpck + packus
1230 
1231  {TTI::SK_Select, MVT::v2i64, 1}, // movsd
1232  {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1233  {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
1234  {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
1235  {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por
1236 
1237  {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
1238  {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd
1239  {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
1240  {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
1241  // + pshufd/unpck
1242  { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
1243  // + 2*pshufd + 2*unpck + 2*packus
1244 
1245  { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd
1246  { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd
1247  { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd}
1248  { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute
1249  { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
1250  };
1251 
1252  if (ST->hasSSE2())
1253  if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
1254  return LT.first * Entry->Cost;
1255 
1256  static const CostTblEntry SSE1ShuffleTbl[] = {
1257  { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
1258  { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
1259  { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps
1260  { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
1261  { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
1262  };
1263 
1264  if (ST->hasSSE1())
1265  if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
1266  return LT.first * Entry->Cost;
1267 
1268  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
1269 }
1270 
1271 int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
1272  const Instruction *I) {
1273  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1274  assert(ISD && "Invalid opcode");
1275 
1276  // FIXME: Need a better design of the cost table to handle non-simple types of
1277  // potential massive combinations (elem_num x src_type x dst_type).
1278 
1279  static const TypeConversionCostTblEntry AVX512BWConversionTbl[] {
1282 
1283  // Mask sign extend has an instruction.
1290 
1291  // Mask zero extend is a load + broadcast.
1298  };
1299 
1300  static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
1307 
1314 
1321 
1328  };
1329 
1330  // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
1331  // 256-bit wide vectors.
1332 
1333  static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
1337 
1342 
1343  // v16i1 -> v16i32 - load + broadcast
1356 
1365 
1390 
1392 
1402  };
1403 
1404  static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
1421 
1428 
1431 
1433  };
1434 
1435  static const TypeConversionCostTblEntry AVXConversionTbl[] = {
1452 
1460 
1473 
1489  // The generic code to compute the scalar overhead is currently broken.
1490  // Workaround this limitation by estimating the scalarization overhead
1491  // here. We have roughly 10 instructions per scalar element.
1492  // Multiply that by the vector width.
1493  // FIXME: remove that when PR19268 is fixed.
1496 
1499  // This node is expanded into scalarized operations but BasicTTI is overly
1500  // optimistic estimating its cost. It computes 3 per element (one
1501  // vector-extract, one scalar conversion and one vector-insert). The
1502  // problem is that the inserts form a read-modify-write chain so latency
1503  // should be factored in too. Inflating the cost per element by 1.
1506 
1509  };
1510 
1511  static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
1518 
1537 
1545 
1547  };
1548 
1549  static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
1550  // These are somewhat magic numbers justified by looking at the output of
1551  // Intel's IACA, running some kernels and making sure when we take
1552  // legalization into account the throughput will be overestimated.
1554  { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
1561 
1562  { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
1570 
1572 
1574 
1599 
1609  };
1610 
1611  std::pair<int, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
1612  std::pair<int, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst);
1613 
1614  if (ST->hasSSE2() && !ST->hasAVX()) {
1615  if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
1616  LTDest.second, LTSrc.second))
1617  return LTSrc.first * Entry->Cost;
1618  }
1619 
1620  EVT SrcTy = TLI->getValueType(DL, Src);
1621  EVT DstTy = TLI->getValueType(DL, Dst);
1622 
1623  // The function getSimpleVT only handles simple value types.
1624  if (!SrcTy.isSimple() || !DstTy.isSimple())
1625  return BaseT::getCastInstrCost(Opcode, Dst, Src);
1626 
1627  MVT SimpleSrcTy = SrcTy.getSimpleVT();
1628  MVT SimpleDstTy = DstTy.getSimpleVT();
1629 
1630  // Make sure that neither type is going to be split before using the
1631  // AVX512 tables. This handles -mprefer-vector-width=256
1632  // with -min-legal-vector-width<=256
1633  if (TLI->getTypeAction(SimpleSrcTy) != TargetLowering::TypeSplitVector &&
1634  TLI->getTypeAction(SimpleDstTy) != TargetLowering::TypeSplitVector) {
1635  if (ST->hasBWI())
1636  if (const auto *Entry = ConvertCostTableLookup(AVX512BWConversionTbl, ISD,
1637  SimpleDstTy, SimpleSrcTy))
1638  return Entry->Cost;
1639 
1640  if (ST->hasDQI())
1641  if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD,
1642  SimpleDstTy, SimpleSrcTy))
1643  return Entry->Cost;
1644 
1645  if (ST->hasAVX512())
1646  if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD,
1647  SimpleDstTy, SimpleSrcTy))
1648  return Entry->Cost;
1649  }
1650 
1651  if (ST->hasAVX2()) {
1652  if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
1653  SimpleDstTy, SimpleSrcTy))
1654  return Entry->Cost;
1655  }
1656 
1657  if (ST->hasAVX()) {
1658  if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
1659  SimpleDstTy, SimpleSrcTy))
1660  return Entry->Cost;
1661  }
1662 
1663  if (ST->hasSSE41()) {
1664  if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
1665  SimpleDstTy, SimpleSrcTy))
1666  return Entry->Cost;
1667  }
1668 
1669  if (ST->hasSSE2()) {
1670  if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
1671  SimpleDstTy, SimpleSrcTy))
1672  return Entry->Cost;
1673  }
1674 
1675  return BaseT::getCastInstrCost(Opcode, Dst, Src, I);
1676 }
1677 
1678 int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
1679  const Instruction *I) {
1680  // Legalize the type.
1681  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
1682 
1683  MVT MTy = LT.second;
1684 
1685  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1686  assert(ISD && "Invalid opcode");
1687 
1688  unsigned ExtraCost = 0;
1689  if (I && (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp)) {
1690  // Some vector comparison predicates cost extra instructions.
1691  if (MTy.isVector() &&
1692  !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
1693  (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
1694  ST->hasBWI())) {
1695  switch (cast<CmpInst>(I)->getPredicate()) {
1696  case CmpInst::Predicate::ICMP_NE:
1697  // xor(cmpeq(x,y),-1)
1698  ExtraCost = 1;
1699  break;
1700  case CmpInst::Predicate::ICMP_SGE:
1701  case CmpInst::Predicate::ICMP_SLE:
1702  // xor(cmpgt(x,y),-1)
1703  ExtraCost = 1;
1704  break;
1705  case CmpInst::Predicate::ICMP_ULT:
1706  case CmpInst::Predicate::ICMP_UGT:
1707  // cmpgt(xor(x,signbit),xor(y,signbit))
1708  // xor(cmpeq(pmaxu(x,y),x),-1)
1709  ExtraCost = 2;
1710  break;
1711  case CmpInst::Predicate::ICMP_ULE:
1712  case CmpInst::Predicate::ICMP_UGE:
1713  if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
1714  (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
1715  // cmpeq(psubus(x,y),0)
1716  // cmpeq(pminu(x,y),x)
1717  ExtraCost = 1;
1718  } else {
1719  // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
1720  ExtraCost = 3;
1721  }
1722  break;
1723  default:
1724  break;
1725  }
1726  }
1727  }
1728 
1729  static const CostTblEntry AVX512BWCostTbl[] = {
1730  { ISD::SETCC, MVT::v32i16, 1 },
1731  { ISD::SETCC, MVT::v64i8, 1 },
1732 
1733  { ISD::SELECT, MVT::v32i16, 1 },
1734  { ISD::SELECT, MVT::v64i8, 1 },
1735  };
1736 
1737  static const CostTblEntry AVX512CostTbl[] = {
1738  { ISD::SETCC, MVT::v8i64, 1 },
1739  { ISD::SETCC, MVT::v16i32, 1 },
1740  { ISD::SETCC, MVT::v8f64, 1 },
1741  { ISD::SETCC, MVT::v16f32, 1 },
1742 
1743  { ISD::SELECT, MVT::v8i64, 1 },
1744  { ISD::SELECT, MVT::v16i32, 1 },
1745  { ISD::SELECT, MVT::v8f64, 1 },
1746  { ISD::SELECT, MVT::v16f32, 1 },
1747  };
1748 
1749  static const CostTblEntry AVX2CostTbl[] = {
1750  { ISD::SETCC, MVT::v4i64, 1 },
1751  { ISD::SETCC, MVT::v8i32, 1 },
1752  { ISD::SETCC, MVT::v16i16, 1 },
1753  { ISD::SETCC, MVT::v32i8, 1 },
1754 
1755  { ISD::SELECT, MVT::v4i64, 1 }, // pblendvb
1756  { ISD::SELECT, MVT::v8i32, 1 }, // pblendvb
1757  { ISD::SELECT, MVT::v16i16, 1 }, // pblendvb
1758  { ISD::SELECT, MVT::v32i8, 1 }, // pblendvb
1759  };
1760 
1761  static const CostTblEntry AVX1CostTbl[] = {
1762  { ISD::SETCC, MVT::v4f64, 1 },
1763  { ISD::SETCC, MVT::v8f32, 1 },
1764  // AVX1 does not support 8-wide integer compare.
1765  { ISD::SETCC, MVT::v4i64, 4 },
1766  { ISD::SETCC, MVT::v8i32, 4 },
1767  { ISD::SETCC, MVT::v16i16, 4 },
1768  { ISD::SETCC, MVT::v32i8, 4 },
1769 
1770  { ISD::SELECT, MVT::v4f64, 1 }, // vblendvpd
1771  { ISD::SELECT, MVT::v8f32, 1 }, // vblendvps
1772  { ISD::SELECT, MVT::v4i64, 1 }, // vblendvpd
1773  { ISD::SELECT, MVT::v8i32, 1 }, // vblendvps
1774  { ISD::SELECT, MVT::v16i16, 3 }, // vandps + vandnps + vorps
1775  { ISD::SELECT, MVT::v32i8, 3 }, // vandps + vandnps + vorps
1776  };
1777 
1778  static const CostTblEntry SSE42CostTbl[] = {
1779  { ISD::SETCC, MVT::v2f64, 1 },
1780  { ISD::SETCC, MVT::v4f32, 1 },
1781  { ISD::SETCC, MVT::v2i64, 1 },
1782  };
1783 
1784  static const CostTblEntry SSE41CostTbl[] = {
1785  { ISD::SELECT, MVT::v2f64, 1 }, // blendvpd
1786  { ISD::SELECT, MVT::v4f32, 1 }, // blendvps
1787  { ISD::SELECT, MVT::v2i64, 1 }, // pblendvb
1788  { ISD::SELECT, MVT::v4i32, 1 }, // pblendvb
1789  { ISD::SELECT, MVT::v8i16, 1 }, // pblendvb
1790  { ISD::SELECT, MVT::v16i8, 1 }, // pblendvb
1791  };
1792 
1793  static const CostTblEntry SSE2CostTbl[] = {
1794  { ISD::SETCC, MVT::v2f64, 2 },
1795  { ISD::SETCC, MVT::f64, 1 },
1796  { ISD::SETCC, MVT::v2i64, 8 },
1797  { ISD::SETCC, MVT::v4i32, 1 },
1798  { ISD::SETCC, MVT::v8i16, 1 },
1799  { ISD::SETCC, MVT::v16i8, 1 },
1800 
1801  { ISD::SELECT, MVT::v2f64, 3 }, // andpd + andnpd + orpd
1802  { ISD::SELECT, MVT::v2i64, 3 }, // pand + pandn + por
1803  { ISD::SELECT, MVT::v4i32, 3 }, // pand + pandn + por
1804  { ISD::SELECT, MVT::v8i16, 3 }, // pand + pandn + por
1805  { ISD::SELECT, MVT::v16i8, 3 }, // pand + pandn + por
1806  };
1807 
1808  static const CostTblEntry SSE1CostTbl[] = {
1809  { ISD::SETCC, MVT::v4f32, 2 },
1810  { ISD::SETCC, MVT::f32, 1 },
1811 
1812  { ISD::SELECT, MVT::v4f32, 3 }, // andps + andnps + orps
1813  };
1814 
1815  if (ST->hasBWI())
1816  if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
1817  return LT.first * (ExtraCost + Entry->Cost);
1818 
1819  if (ST->hasAVX512())
1820  if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
1821  return LT.first * (ExtraCost + Entry->Cost);
1822 
1823  if (ST->hasAVX2())
1824  if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
1825  return LT.first * (ExtraCost + Entry->Cost);
1826 
1827  if (ST->hasAVX())
1828  if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
1829  return LT.first * (ExtraCost + Entry->Cost);
1830 
1831  if (ST->hasSSE42())
1832  if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
1833  return LT.first * (ExtraCost + Entry->Cost);
1834 
1835  if (ST->hasSSE41())
1836  if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
1837  return LT.first * (ExtraCost + Entry->Cost);
1838 
1839  if (ST->hasSSE2())
1840  if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
1841  return LT.first * (ExtraCost + Entry->Cost);
1842 
1843  if (ST->hasSSE1())
1844  if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
1845  return LT.first * (ExtraCost + Entry->Cost);
1846 
1847  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
1848 }
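// Worked example (editorial sketch, not part of the original file): an
// unsigned "<=" compare of <16 x i8> on an SSE2-only subtarget, assuming a
// hypothetical X86TTIImpl `TTI`, an LLVMContext `Ctx`, and an ICmpInst
// `*ULE` whose predicate is ICMP_ULE:
//   Type *V16I8 = VectorType::get(Type::getInt8Ty(Ctx), 16);
//   int Cost = TTI.getCmpSelInstrCost(Instruction::ICmp, V16I8, nullptr, ULE);
// The scalar width is 8 < 32 and SSE2 is available, so ExtraCost = 1
// (cmpeq(pminu(x,y),x)), SSE2CostTbl gives SETCC v16i8 = 1, and LT.first = 1,
// for a total of 2. A v16i8 select costs 3 on SSE2 (pand + pandn + por)
// versus 1 with an SSE4.1 pblendvb.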
1849 
1851 
1854 int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, ArrayRef<Type *> Tys, FastMathFlags FMF, unsigned ScalarizationCostPassed) {
1855  // Costs should match the codegen from:
1856  // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
1857  // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
1858  // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
1859  // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
1860  // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
1861  static const CostTblEntry AVX512CDCostTbl[] = {
1862  { ISD::CTLZ, MVT::v8i64, 1 },
1863  { ISD::CTLZ, MVT::v16i32, 1 },
1864  { ISD::CTLZ, MVT::v32i16, 8 },
1865  { ISD::CTLZ, MVT::v64i8, 20 },
1866  { ISD::CTLZ, MVT::v4i64, 1 },
1867  { ISD::CTLZ, MVT::v8i32, 1 },
1868  { ISD::CTLZ, MVT::v16i16, 4 },
1869  { ISD::CTLZ, MVT::v32i8, 10 },
1870  { ISD::CTLZ, MVT::v2i64, 1 },
1871  { ISD::CTLZ, MVT::v4i32, 1 },
1872  { ISD::CTLZ, MVT::v8i16, 4 },
1873  { ISD::CTLZ, MVT::v16i8, 4 },
1874  };
1875  static const CostTblEntry AVX512BWCostTbl[] = {
1876  { ISD::BITREVERSE, MVT::v8i64, 5 },
1877  { ISD::BITREVERSE, MVT::v16i32, 5 },
1878  { ISD::BITREVERSE, MVT::v32i16, 5 },
1879  { ISD::BITREVERSE, MVT::v64i8, 5 },
1880  { ISD::CTLZ, MVT::v8i64, 23 },
1881  { ISD::CTLZ, MVT::v16i32, 22 },
1882  { ISD::CTLZ, MVT::v32i16, 18 },
1883  { ISD::CTLZ, MVT::v64i8, 17 },
1884  { ISD::CTPOP, MVT::v8i64, 7 },
1885  { ISD::CTPOP, MVT::v16i32, 11 },
1886  { ISD::CTPOP, MVT::v32i16, 9 },
1887  { ISD::CTPOP, MVT::v64i8, 6 },
1888  { ISD::CTTZ, MVT::v8i64, 10 },
1889  { ISD::CTTZ, MVT::v16i32, 14 },
1890  { ISD::CTTZ, MVT::v32i16, 12 },
1891  { ISD::CTTZ, MVT::v64i8, 9 },
1892  { ISD::SADDSAT, MVT::v32i16, 1 },
1893  { ISD::SADDSAT, MVT::v64i8, 1 },
1894  { ISD::SSUBSAT, MVT::v32i16, 1 },
1895  { ISD::SSUBSAT, MVT::v64i8, 1 },
1896  { ISD::UADDSAT, MVT::v32i16, 1 },
1897  { ISD::UADDSAT, MVT::v64i8, 1 },
1898  { ISD::USUBSAT, MVT::v32i16, 1 },
1899  { ISD::USUBSAT, MVT::v64i8, 1 },
1900  };
1901  static const CostTblEntry AVX512CostTbl[] = {
1902  { ISD::BITREVERSE, MVT::v8i64, 36 },
1903  { ISD::BITREVERSE, MVT::v16i32, 24 },
1904  { ISD::CTLZ, MVT::v8i64, 29 },
1905  { ISD::CTLZ, MVT::v16i32, 35 },
1906  { ISD::CTPOP, MVT::v8i64, 16 },
1907  { ISD::CTPOP, MVT::v16i32, 24 },
1908  { ISD::CTTZ, MVT::v8i64, 20 },
1909  { ISD::CTTZ, MVT::v16i32, 28 },
1910  { ISD::USUBSAT, MVT::v16i32, 2 }, // pmaxud + psubd
1911  { ISD::USUBSAT, MVT::v2i64, 2 }, // pmaxuq + psubq
1912  { ISD::USUBSAT, MVT::v4i64, 2 }, // pmaxuq + psubq
1913  { ISD::USUBSAT, MVT::v8i64, 2 }, // pmaxuq + psubq
1914  { ISD::UADDSAT, MVT::v16i32, 3 }, // not + pminud + paddd
1915  { ISD::UADDSAT, MVT::v2i64, 3 }, // not + pminuq + paddq
1916  { ISD::UADDSAT, MVT::v4i64, 3 }, // not + pminuq + paddq
1917  { ISD::UADDSAT, MVT::v8i64, 3 }, // not + pminuq + paddq
1918  };
1919  static const CostTblEntry XOPCostTbl[] = {
1920  { ISD::BITREVERSE, MVT::v4i64, 4 },
1921  { ISD::BITREVERSE, MVT::v8i32, 4 },
1922  { ISD::BITREVERSE, MVT::v16i16, 4 },
1923  { ISD::BITREVERSE, MVT::v32i8, 4 },
1924  { ISD::BITREVERSE, MVT::v2i64, 1 },
1925  { ISD::BITREVERSE, MVT::v4i32, 1 },
1926  { ISD::BITREVERSE, MVT::v8i16, 1 },
1927  { ISD::BITREVERSE, MVT::v16i8, 1 },
1928  { ISD::BITREVERSE, MVT::i64, 3 },
1929  { ISD::BITREVERSE, MVT::i32, 3 },
1930  { ISD::BITREVERSE, MVT::i16, 3 },
1931  { ISD::BITREVERSE, MVT::i8, 3 }
1932  };
1933  static const CostTblEntry AVX2CostTbl[] = {
1934  { ISD::BITREVERSE, MVT::v4i64, 5 },
1935  { ISD::BITREVERSE, MVT::v8i32, 5 },
1936  { ISD::BITREVERSE, MVT::v16i16, 5 },
1937  { ISD::BITREVERSE, MVT::v32i8, 5 },
1938  { ISD::BSWAP, MVT::v4i64, 1 },
1939  { ISD::BSWAP, MVT::v8i32, 1 },
1940  { ISD::BSWAP, MVT::v16i16, 1 },
1941  { ISD::CTLZ, MVT::v4i64, 23 },
1942  { ISD::CTLZ, MVT::v8i32, 18 },
1943  { ISD::CTLZ, MVT::v16i16, 14 },
1944  { ISD::CTLZ, MVT::v32i8, 9 },
1945  { ISD::CTPOP, MVT::v4i64, 7 },
1946  { ISD::CTPOP, MVT::v8i32, 11 },
1947  { ISD::CTPOP, MVT::v16i16, 9 },
1948  { ISD::CTPOP, MVT::v32i8, 6 },
1949  { ISD::CTTZ, MVT::v4i64, 10 },
1950  { ISD::CTTZ, MVT::v8i32, 14 },
1951  { ISD::CTTZ, MVT::v16i16, 12 },
1952  { ISD::CTTZ, MVT::v32i8, 9 },
1953  { ISD::SADDSAT, MVT::v16i16, 1 },
1954  { ISD::SADDSAT, MVT::v32i8, 1 },
1955  { ISD::SSUBSAT, MVT::v16i16, 1 },
1956  { ISD::SSUBSAT, MVT::v32i8, 1 },
1957  { ISD::UADDSAT, MVT::v16i16, 1 },
1958  { ISD::UADDSAT, MVT::v32i8, 1 },
1959  { ISD::UADDSAT, MVT::v8i32, 3 }, // not + pminud + paddd
1960  { ISD::USUBSAT, MVT::v16i16, 1 },
1961  { ISD::USUBSAT, MVT::v32i8, 1 },
1962  { ISD::USUBSAT, MVT::v8i32, 2 }, // pmaxud + psubd
1963  { ISD::FSQRT, MVT::f32, 7 }, // Haswell from http://www.agner.org/
1964  { ISD::FSQRT, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
1965  { ISD::FSQRT, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
1966  { ISD::FSQRT, MVT::f64, 14 }, // Haswell from http://www.agner.org/
1967  { ISD::FSQRT, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
1968  { ISD::FSQRT, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
1969  };
1970  static const CostTblEntry AVX1CostTbl[] = {
1971  { ISD::BITREVERSE, MVT::v4i64, 12 }, // 2 x 128-bit Op + extract/insert
1972  { ISD::BITREVERSE, MVT::v8i32, 12 }, // 2 x 128-bit Op + extract/insert
1973  { ISD::BITREVERSE, MVT::v16i16, 12 }, // 2 x 128-bit Op + extract/insert
1974  { ISD::BITREVERSE, MVT::v32i8, 12 }, // 2 x 128-bit Op + extract/insert
1975  { ISD::BSWAP, MVT::v4i64, 4 },
1976  { ISD::BSWAP, MVT::v8i32, 4 },
1977  { ISD::BSWAP, MVT::v16i16, 4 },
1978  { ISD::CTLZ, MVT::v4i64, 48 }, // 2 x 128-bit Op + extract/insert
1979  { ISD::CTLZ, MVT::v8i32, 38 }, // 2 x 128-bit Op + extract/insert
1980  { ISD::CTLZ, MVT::v16i16, 30 }, // 2 x 128-bit Op + extract/insert
1981  { ISD::CTLZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
1982  { ISD::CTPOP, MVT::v4i64, 16 }, // 2 x 128-bit Op + extract/insert
1983  { ISD::CTPOP, MVT::v8i32, 24 }, // 2 x 128-bit Op + extract/insert
1984  { ISD::CTPOP, MVT::v16i16, 20 }, // 2 x 128-bit Op + extract/insert
1985  { ISD::CTPOP, MVT::v32i8, 14 }, // 2 x 128-bit Op + extract/insert
1986  { ISD::CTTZ, MVT::v4i64, 22 }, // 2 x 128-bit Op + extract/insert
1987  { ISD::CTTZ, MVT::v8i32, 30 }, // 2 x 128-bit Op + extract/insert
1988  { ISD::CTTZ, MVT::v16i16, 26 }, // 2 x 128-bit Op + extract/insert
1989  { ISD::CTTZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
1990  { ISD::SADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
1991  { ISD::SADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
1992  { ISD::SSUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
1993  { ISD::SSUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
1994  { ISD::UADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
1995  { ISD::UADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
1996  { ISD::UADDSAT, MVT::v8i32, 8 }, // 2 x 128-bit Op + extract/insert
1997  { ISD::USUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
1998  { ISD::USUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
1999  { ISD::USUBSAT, MVT::v8i32, 6 }, // 2 x 128-bit Op + extract/insert
2000  { ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/
2001  { ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
2002  { ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
2003  { ISD::FSQRT, MVT::f64, 21 }, // SNB from http://www.agner.org/
2004  { ISD::FSQRT, MVT::v2f64, 21 }, // SNB from http://www.agner.org/
2005  { ISD::FSQRT, MVT::v4f64, 43 }, // SNB from http://www.agner.org/
2006  };
2007  static const CostTblEntry GLMCostTbl[] = {
2008  { ISD::FSQRT, MVT::f32, 19 }, // sqrtss
2009  { ISD::FSQRT, MVT::v4f32, 37 }, // sqrtps
2010  { ISD::FSQRT, MVT::f64, 34 }, // sqrtsd
2011  { ISD::FSQRT, MVT::v2f64, 67 }, // sqrtpd
2012  };
2013  static const CostTblEntry SLMCostTbl[] = {
2014  { ISD::FSQRT, MVT::f32, 20 }, // sqrtss
2015  { ISD::FSQRT, MVT::v4f32, 40 }, // sqrtps
2016  { ISD::FSQRT, MVT::f64, 35 }, // sqrtsd
2017  { ISD::FSQRT, MVT::v2f64, 70 }, // sqrtpd
2018  };
2019  static const CostTblEntry SSE42CostTbl[] = {
2020  { ISD::USUBSAT, MVT::v4i32, 2 }, // pmaxud + psubd
2021  { ISD::UADDSAT, MVT::v4i32, 3 }, // not + pminud + paddd
2022  { ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/
2023  { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
2024  };
2025  static const CostTblEntry SSSE3CostTbl[] = {
2026  { ISD::BITREVERSE, MVT::v2i64, 5 },
2027  { ISD::BITREVERSE, MVT::v4i32, 5 },
2028  { ISD::BITREVERSE, MVT::v8i16, 5 },
2029  { ISD::BITREVERSE, MVT::v16i8, 5 },
2030  { ISD::BSWAP, MVT::v2i64, 1 },
2031  { ISD::BSWAP, MVT::v4i32, 1 },
2032  { ISD::BSWAP, MVT::v8i16, 1 },
2033  { ISD::CTLZ, MVT::v2i64, 23 },
2034  { ISD::CTLZ, MVT::v4i32, 18 },
2035  { ISD::CTLZ, MVT::v8i16, 14 },
2036  { ISD::CTLZ, MVT::v16i8, 9 },
2037  { ISD::CTPOP, MVT::v2i64, 7 },
2038  { ISD::CTPOP, MVT::v4i32, 11 },
2039  { ISD::CTPOP, MVT::v8i16, 9 },
2040  { ISD::CTPOP, MVT::v16i8, 6 },
2041  { ISD::CTTZ, MVT::v2i64, 10 },
2042  { ISD::CTTZ, MVT::v4i32, 14 },
2043  { ISD::CTTZ, MVT::v8i16, 12 },
2044  { ISD::CTTZ, MVT::v16i8, 9 }
2045  };
2046  static const CostTblEntry SSE2CostTbl[] = {
2047  { ISD::BITREVERSE, MVT::v2i64, 29 },
2048  { ISD::BITREVERSE, MVT::v4i32, 27 },
2049  { ISD::BITREVERSE, MVT::v8i16, 27 },
2050  { ISD::BITREVERSE, MVT::v16i8, 20 },
2051  { ISD::BSWAP, MVT::v2i64, 7 },
2052  { ISD::BSWAP, MVT::v4i32, 7 },
2053  { ISD::BSWAP, MVT::v8i16, 7 },
2054  { ISD::CTLZ, MVT::v2i64, 25 },
2055  { ISD::CTLZ, MVT::v4i32, 26 },
2056  { ISD::CTLZ, MVT::v8i16, 20 },
2057  { ISD::CTLZ, MVT::v16i8, 17 },
2058  { ISD::CTPOP, MVT::v2i64, 12 },
2059  { ISD::CTPOP, MVT::v4i32, 15 },
2060  { ISD::CTPOP, MVT::v8i16, 13 },
2061  { ISD::CTPOP, MVT::v16i8, 10 },
2062  { ISD::CTTZ, MVT::v2i64, 14 },
2063  { ISD::CTTZ, MVT::v4i32, 18 },
2064  { ISD::CTTZ, MVT::v8i16, 16 },
2065  { ISD::CTTZ, MVT::v16i8, 13 },
2066  { ISD::SADDSAT, MVT::v8i16, 1 },
2067  { ISD::SADDSAT, MVT::v16i8, 1 },
2068  { ISD::SSUBSAT, MVT::v8i16, 1 },
2069  { ISD::SSUBSAT, MVT::v16i8, 1 },
2070  { ISD::UADDSAT, MVT::v8i16, 1 },
2071  { ISD::UADDSAT, MVT::v16i8, 1 },
2072  { ISD::USUBSAT, MVT::v8i16, 1 },
2073  { ISD::USUBSAT, MVT::v16i8, 1 },
2074  { ISD::FSQRT, MVT::f64, 32 }, // Nehalem from http://www.agner.org/
2075  { ISD::FSQRT, MVT::v2f64, 32 }, // Nehalem from http://www.agner.org/
2076  };
2077  static const CostTblEntry SSE1CostTbl[] = {
2078  { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/
2079  { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
2080  };
2081  static const CostTblEntry X64CostTbl[] = { // 64-bit targets
2082  { ISD::BITREVERSE, MVT::i64, 14 },
2083  { ISD::SADDO, MVT::i64, 1 },
2084  { ISD::UADDO, MVT::i64, 1 },
2085  };
2086  static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
2087  { ISD::BITREVERSE, MVT::i32, 14 },
2088  { ISD::BITREVERSE, MVT::i16, 14 },
2089  { ISD::BITREVERSE, MVT::i8, 11 },
2090  { ISD::SADDO, MVT::i32, 1 },
2091  { ISD::SADDO, MVT::i16, 1 },
2092  { ISD::SADDO, MVT::i8, 1 },
2093  { ISD::UADDO, MVT::i32, 1 },
2094  { ISD::UADDO, MVT::i16, 1 },
2095  { ISD::UADDO, MVT::i8, 1 },
2096  };
2097 
2098  Type *OpTy = RetTy;
2099  unsigned ISD = ISD::DELETED_NODE;
2100  switch (IID) {
2101  default:
2102  break;
2103  case Intrinsic::bitreverse:
2104  ISD = ISD::BITREVERSE;
2105  break;
2106  case Intrinsic::bswap:
2107  ISD = ISD::BSWAP;
2108  break;
2109  case Intrinsic::ctlz:
2110  ISD = ISD::CTLZ;
2111  break;
2112  case Intrinsic::ctpop:
2113  ISD = ISD::CTPOP;
2114  break;
2115  case Intrinsic::cttz:
2116  ISD = ISD::CTTZ;
2117  break;
2118  case Intrinsic::sadd_sat:
2119  ISD = ISD::SADDSAT;
2120  break;
2121  case Intrinsic::ssub_sat:
2122  ISD = ISD::SSUBSAT;
2123  break;
2124  case Intrinsic::uadd_sat:
2125  ISD = ISD::UADDSAT;
2126  break;
2127  case Intrinsic::usub_sat:
2128  ISD = ISD::USUBSAT;
2129  break;
2130  case Intrinsic::sqrt:
2131  ISD = ISD::FSQRT;
2132  break;
2133  case Intrinsic::sadd_with_overflow:
2134  case Intrinsic::ssub_with_overflow:
2135  // SSUBO has the same costs, so don't duplicate.
2136  ISD = ISD::SADDO;
2137  OpTy = RetTy->getContainedType(0);
2138  break;
2139  case Intrinsic::uadd_with_overflow:
2140  case Intrinsic::usub_with_overflow:
2141  // USUBO has the same costs, so don't duplicate.
2142  ISD = ISD::UADDO;
2143  OpTy = RetTy->getContainedType(0);
2144  break;
2145  }
2146 
2147  if (ISD != ISD::DELETED_NODE) {
2148  // Legalize the type.
2149  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, OpTy);
2150  MVT MTy = LT.second;
2151 
2152  // Attempt to lookup cost.
2153  if (ST->isGLM())
2154  if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
2155  return LT.first * Entry->Cost;
2156 
2157  if (ST->isSLM())
2158  if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
2159  return LT.first * Entry->Cost;
2160 
2161  if (ST->hasCDI())
2162  if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
2163  return LT.first * Entry->Cost;
2164 
2165  if (ST->hasBWI())
2166  if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
2167  return LT.first * Entry->Cost;
2168 
2169  if (ST->hasAVX512())
2170  if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
2171  return LT.first * Entry->Cost;
2172 
2173  if (ST->hasXOP())
2174  if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
2175  return LT.first * Entry->Cost;
2176 
2177  if (ST->hasAVX2())
2178  if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
2179  return LT.first * Entry->Cost;
2180 
2181  if (ST->hasAVX())
2182  if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
2183  return LT.first * Entry->Cost;
2184 
2185  if (ST->hasSSE42())
2186  if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
2187  return LT.first * Entry->Cost;
2188 
2189  if (ST->hasSSSE3())
2190  if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
2191  return LT.first * Entry->Cost;
2192 
2193  if (ST->hasSSE2())
2194  if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
2195  return LT.first * Entry->Cost;
2196 
2197  if (ST->hasSSE1())
2198  if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
2199  return LT.first * Entry->Cost;
2200 
2201  if (ST->is64Bit())
2202  if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
2203  return LT.first * Entry->Cost;
2204 
2205  if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
2206  return LT.first * Entry->Cost;
2207  }
2208 
2209  return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF, ScalarizationCostPassed);
2210 }
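// Worked example (editorial sketch, not part of the original file): the cost
// of llvm.sqrt.v4f32 on a plain AVX2 (Haswell-class) subtarget. The intrinsic
// maps to ISD::FSQRT, <4 x float> legalizes to v4f32 with LT.first = 1, and
// the dispatch above falls through to AVX2CostTbl, returning 1 * 7. On an
// SSE4.2-only (Nehalem-class) subtarget the SSE42CostTbl entry makes the same
// query cost 18.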
2211 
2214 int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) {
2215  static const CostTblEntry AVX512CostTbl[] = {
2216  { ISD::ROTL, MVT::v8i64, 1 },
2217  { ISD::ROTL, MVT::v4i64, 1 },
2218  { ISD::ROTL, MVT::v2i64, 1 },
2219  { ISD::ROTL, MVT::v16i32, 1 },
2220  { ISD::ROTL, MVT::v8i32, 1 },
2221  { ISD::ROTL, MVT::v4i32, 1 },
2222  { ISD::ROTR, MVT::v8i64, 1 },
2223  { ISD::ROTR, MVT::v4i64, 1 },
2224  { ISD::ROTR, MVT::v2i64, 1 },
2225  { ISD::ROTR, MVT::v16i32, 1 },
2226  { ISD::ROTR, MVT::v8i32, 1 },
2227  { ISD::ROTR, MVT::v4i32, 1 }
2228  };
2229  // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
2230  static const CostTblEntry XOPCostTbl[] = {
2231  { ISD::ROTL, MVT::v4i64, 4 },
2232  { ISD::ROTL, MVT::v8i32, 4 },
2233  { ISD::ROTL, MVT::v16i16, 4 },
2234  { ISD::ROTL, MVT::v32i8, 4 },
2235  { ISD::ROTL, MVT::v2i64, 1 },
2236  { ISD::ROTL, MVT::v4i32, 1 },
2237  { ISD::ROTL, MVT::v8i16, 1 },
2238  { ISD::ROTL, MVT::v16i8, 1 },
2239  { ISD::ROTR, MVT::v4i64, 6 },
2240  { ISD::ROTR, MVT::v8i32, 6 },
2241  { ISD::ROTR, MVT::v16i16, 6 },
2242  { ISD::ROTR, MVT::v32i8, 6 },
2243  { ISD::ROTR, MVT::v2i64, 2 },
2244  { ISD::ROTR, MVT::v4i32, 2 },
2245  { ISD::ROTR, MVT::v8i16, 2 },
2246  { ISD::ROTR, MVT::v16i8, 2 }
2247  };
2248  static const CostTblEntry X64CostTbl[] = { // 64-bit targets
2249  { ISD::ROTL, MVT::i64, 1 },
2250  { ISD::ROTR, MVT::i64, 1 },
2251  { ISD::FSHL, MVT::i64, 4 }
2252  };
2253  static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
2254  { ISD::ROTL, MVT::i32, 1 },
2255  { ISD::ROTL, MVT::i16, 1 },
2256  { ISD::ROTL, MVT::i8, 1 },
2257  { ISD::ROTR, MVT::i32, 1 },
2258  { ISD::ROTR, MVT::i16, 1 },
2259  { ISD::ROTR, MVT::i8, 1 },
2260  { ISD::FSHL, MVT::i32, 4 },
2261  { ISD::FSHL, MVT::i16, 4 },
2262  { ISD::FSHL, MVT::i8, 4 }
2263  };
2264 
2265  unsigned ISD = ISD::DELETED_NODE;
2266  switch (IID) {
2267  default:
2268  break;
2269  case Intrinsic::fshl:
2270  ISD = ISD::FSHL;
2271  if (Args[0] == Args[1])
2272  ISD = ISD::ROTL;
2273  break;
2274  case Intrinsic::fshr:
2275  // FSHR has the same costs, so don't duplicate.
2276  ISD = ISD::FSHL;
2277  if (Args[0] == Args[1])
2278  ISD = ISD::ROTR;
2279  break;
2280  }
2281 
2282  if (ISD != ISD::DELETED_NODE) {
2283  // Legalize the type.
2284  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
2285  MVT MTy = LT.second;
2286 
2287  // Attempt to lookup cost.
2288  if (ST->hasAVX512())
2289  if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
2290  return LT.first * Entry->Cost;
2291 
2292  if (ST->hasXOP())
2293  if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
2294  return LT.first * Entry->Cost;
2295 
2296  if (ST->is64Bit())
2297  if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
2298  return LT.first * Entry->Cost;
2299 
2300  if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
2301  return LT.first * Entry->Cost;
2302  }
2303 
2304  return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF, VF);
2305 }
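// Worked example (editorial sketch, not part of the original file): for
// llvm.fshl.i64(%x, %x, %s) the two value operands are the same SSA value, so
// the code above treats the call as a rotate (ISD::ROTL) and the X64CostTbl
// entry prices it at 1. A genuine funnel shift with distinct operands stays
// as ISD::FSHL and costs 4 (X64CostTbl for i64, X86CostTbl for i32/i16/i8).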
2306 
2307 int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
2308  assert(Val->isVectorTy() && "This must be a vector type");
2309 
2310  Type *ScalarType = Val->getScalarType();
2311 
2312  if (Index != -1U) {
2313  // Legalize the type.
2314  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
2315 
2316  // This type is legalized to a scalar type.
2317  if (!LT.second.isVector())
2318  return 0;
2319 
2320  // The type may be split. Normalize the index to the new type.
2321  unsigned Width = LT.second.getVectorNumElements();
2322  Index = Index % Width;
2323 
2324  // Floating point scalars are already located in index #0.
2325  if (ScalarType->isFloatingPointTy() && Index == 0)
2326  return 0;
2327  }
2328 
2329  // Add to the base cost if we know that the extracted element of a vector is
2330  // destined to be moved to and used in the integer register file.
2331  int RegisterFileMoveCost = 0;
2332  if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy())
2333  RegisterFileMoveCost = 1;
2334 
2335  return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
2336 }
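// Worked example (editorial sketch, not part of the original file):
// extracting element 8 of a <16 x float> on an SSE-only subtarget. The type
// legalizes to four v4f32 parts, so Width = 4 and the index normalizes to
// 8 % 4 = 0; a floating point element at index 0 is free and the query
// returns 0. Element 9 normalizes to 1 and falls through to the base
// implementation instead.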
2337 
2338 int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
2339  unsigned AddressSpace, const Instruction *I) {
2340  // Handle non-power-of-two vectors such as <3 x float>
2341  if (VectorType *VTy = dyn_cast<VectorType>(Src)) {
2342  unsigned NumElem = VTy->getVectorNumElements();
2343 
2344  // Handle a few common cases:
2345  // <3 x float>
2346  if (NumElem == 3 && VTy->getScalarSizeInBits() == 32)
2347  // Cost = 64 bit store + extract + 32 bit store.
2348  return 3;
2349 
2350  // <3 x double>
2351  if (NumElem == 3 && VTy->getScalarSizeInBits() == 64)
2352  // Cost = 128 bit store + unpack + 64 bit store.
2353  return 3;
2354 
2355  // Assume that all other non-power-of-two numbers are scalarized.
2356  if (!isPowerOf2_32(NumElem)) {
2357  int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment,
2358  AddressSpace);
2359  int SplitCost = getScalarizationOverhead(Src, Opcode == Instruction::Load,
2360  Opcode == Instruction::Store);
2361  return NumElem * Cost + SplitCost;
2362  }
2363  }
2364 
2365  // Legalize the type.
2366  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
2367  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
2368  "Invalid Opcode");
2369 
2370  // Each load/store unit costs 1.
2371  int Cost = LT.first * 1;
2372 
2373  // This isn't exactly right. We're using slow unaligned 32-byte accesses as a
2374  // proxy for a double-pumped AVX memory interface such as on Sandybridge.
2375  if (LT.second.getStoreSize() == 32 && ST->isUnalignedMem32Slow())
2376  Cost *= 2;
2377 
2378  return Cost;
2379 }
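// Usage sketch (editorial addition, not part of the original file), assuming
// a hypothetical X86TTIImpl `TTI` and LLVMContext `Ctx`:
//   Type *V3F32 = VectorType::get(Type::getFloatTy(Ctx), 3);
//   int C = TTI.getMemoryOpCost(Instruction::Store, V3F32, /*Alignment=*/4,
//                               /*AddressSpace=*/0);
// C is 3 via the <3 x float> special case above (64-bit store + extract +
// 32-bit store). For legal power-of-two vectors the cost is LT.first, doubled
// for 32-byte accesses when unaligned 32-byte memory operations are slow.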
2380 
2381 int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
2382  unsigned Alignment,
2383  unsigned AddressSpace) {
2384  bool IsLoad = (Instruction::Load == Opcode);
2385  bool IsStore = (Instruction::Store == Opcode);
2386 
2387  VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy);
2388  if (!SrcVTy)
2389  // For a scalar, take the regular memory op cost without a mask.
2390  return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace);
2391 
2392  unsigned NumElem = SrcVTy->getVectorNumElements();
2393  VectorType *MaskTy =
2394  VectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
2395  if ((IsLoad && !isLegalMaskedLoad(SrcVTy)) ||
2396  (IsStore && !isLegalMaskedStore(SrcVTy)) || !isPowerOf2_32(NumElem)) {
2397  // Scalarization
2398  int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true);
2399  int ScalarCompareCost = getCmpSelInstrCost(
2400  Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr);
2401  int BranchCost = getCFInstrCost(Instruction::Br);
2402  int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
2403 
2404  int ValueSplitCost = getScalarizationOverhead(SrcVTy, IsLoad, IsStore);
2405  int MemopCost =
2406  NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
2407  Alignment, AddressSpace);
2408  return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
2409  }
2410 
2411  // Legalize the type.
2412  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy);
2413  auto VT = TLI->getValueType(DL, SrcVTy);
2414  int Cost = 0;
2415  if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
2416  LT.second.getVectorNumElements() == NumElem)
2417  // Promotion requires expand/truncate for data and a shuffle for mask.
2418  Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, 0, nullptr) +
2419  getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, 0, nullptr);
2420 
2421  else if (LT.second.getVectorNumElements() > NumElem) {
2422  VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(),
2423  LT.second.getVectorNumElements());
2424  // Expanding requires filling the mask with zeroes.
2425  Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy);
2426  }
2427 
2428  // Pre-AVX512 - each maskmov load costs 2 + store costs ~8.
2429  if (!ST->hasAVX512())
2430  return Cost + LT.first * (IsLoad ? 2 : 8);
2431 
2432  // AVX-512 masked load/store is cheaper.
2433  return Cost + LT.first;
2434 }
2435 
2437 int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr) {
2438  // Address computations in vectorized code with non-consecutive addresses will
2439  // likely result in more instructions compared to scalar code where the
2440  // computation can more often be merged into the index mode. The resulting
2441  // extra micro-ops can significantly decrease throughput.
2442  const unsigned NumVectorInstToHideOverhead = 10;
2443 
2444  // Cost modeling of strided access computation is hidden by the indexing
2445  // modes of X86 regardless of the stride value. We don't believe that there
2446  // is a difference between constant strided access in general and a constant
2447  // stride value which is less than or equal to 64.
2448  // Even in the case of (loop invariant) stride whose value is not known at
2449  // compile time, the address computation will not incur more than one extra
2450  // ADD instruction.
2451  if (Ty->isVectorTy() && SE) {
2452  if (!BaseT::isStridedAccess(Ptr))
2453  return NumVectorInstToHideOverhead;
2454  if (!BaseT::getConstantStrideStep(SE, Ptr))
2455  return 1;
2456  }
2457 
2458  return BaseT::getAddressComputationCost(Ty, SE, Ptr);
2459 }
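// Worked example (editorial sketch, not part of the original file): in
// vectorized code a non-strided (gather-like) address returns 10 extra units
// here to model the scalarized address arithmetic; a strided access whose
// stride is loop invariant but unknown at compile time returns 1 (one extra
// ADD); a compile-time-constant stride falls through to the base
// implementation.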
2460 
2461 int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
2462  bool IsPairwise) {
2463  // We use the Intel Architecture Code Analyzer (IACA) to measure the
2464  // throughput and use that as the cost.
2465 
2466  static const CostTblEntry SSE42CostTblPairWise[] = {
2467  { ISD::FADD, MVT::v2f64, 2 },
2468  { ISD::FADD, MVT::v4f32, 4 },
2469  { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
2470  { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32.
2471  { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5".
2472  { ISD::ADD, MVT::v2i16, 3 }, // FIXME: chosen to be less than v4i16
2473  { ISD::ADD, MVT::v4i16, 4 }, // FIXME: chosen to be less than v8i16
2474  { ISD::ADD, MVT::v8i16, 5 },
2475  };
2476 
2477  static const CostTblEntry AVX1CostTblPairWise[] = {
2478  { ISD::FADD, MVT::v4f32, 4 },
2479  { ISD::FADD, MVT::v4f64, 5 },
2480  { ISD::FADD, MVT::v8f32, 7 },
2481  { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
2482  { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
2483  { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5".
2484  { ISD::ADD, MVT::v4i64, 5 }, // The data reported by the IACA tool is "4.8".
2485  { ISD::ADD, MVT::v2i16, 3 }, // FIXME: chosen to be less than v4i16
2486  { ISD::ADD, MVT::v4i16, 4 }, // FIXME: chosen to be less than v8i16
2487  { ISD::ADD, MVT::v8i16, 5 },
2488  { ISD::ADD, MVT::v8i32, 5 },
2489  };
2490 
2491  static const CostTblEntry SSE42CostTblNoPairWise[] = {
2492  { ISD::FADD, MVT::v2f64, 2 },
2493  { ISD::FADD, MVT::v4f32, 4 },
2494  { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
2495  { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
2496  { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
2497  { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3".
2498  { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3".
2499  { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
2500  };
2501 
2502  static const CostTblEntry AVX1CostTblNoPairWise[] = {
2503  { ISD::FADD, MVT::v4f32, 3 },
2504  { ISD::FADD, MVT::v4f64, 3 },
2505  { ISD::FADD, MVT::v8f32, 4 },
2506  { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
2507  { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
2508  { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "2.8".
2509  { ISD::ADD, MVT::v4i64, 3 },
2510  { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3".
2511  { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3".
2512  { ISD::ADD, MVT::v8i16, 4 },
2513  { ISD::ADD, MVT::v8i32, 5 },
2514  };
2515 
2516  int ISD = TLI->InstructionOpcodeToISD(Opcode);
2517  assert(ISD && "Invalid opcode");
2518 
2519  // Before legalizing the type, give a chance to look up illegal narrow types
2520  // in the table.
2521  // FIXME: Is there a better way to do this?
2522  EVT VT = TLI->getValueType(DL, ValTy);
2523  if (VT.isSimple()) {
2524  MVT MTy = VT.getSimpleVT();
2525  if (IsPairwise) {
2526  if (ST->hasAVX())
2527  if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
2528  return Entry->Cost;
2529 
2530  if (ST->hasSSE42())
2531  if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
2532  return Entry->Cost;
2533  } else {
2534  if (ST->hasAVX())
2535  if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
2536  return Entry->Cost;
2537 
2538  if (ST->hasSSE42())
2539  if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
2540  return Entry->Cost;
2541  }
2542  }
2543 
2544  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
2545 
2546  MVT MTy = LT.second;
2547 
2548  if (IsPairwise) {
2549  if (ST->hasAVX())
2550  if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
2551  return LT.first * Entry->Cost;
2552 
2553  if (ST->hasSSE42())
2554  if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
2555  return LT.first * Entry->Cost;
2556  } else {
2557  if (ST->hasAVX())
2558  if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
2559  return LT.first * Entry->Cost;
2560 
2561  if (ST->hasSSE42())
2562  if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
2563  return LT.first * Entry->Cost;
2564  }
2565 
2566  static const CostTblEntry AVX2BoolReduction[] = {
2567  { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp
2568  { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp
2569  { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp
2570  { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp
2571  };
2572 
2573  static const CostTblEntry AVX1BoolReduction[] = {
2574  { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp
2575  { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp
2576  { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
2577  { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
2578  { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp
2579  { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp
2580  { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
2581  { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
2582  };
2583 
2584  static const CostTblEntry SSE2BoolReduction[] = {
2585  { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp
2586  { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp
2587  { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp
2588  { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp
2589  { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp
2590  { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp
2591  { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp
2592  { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp
2593  };
2594 
2595  // Handle bool allof/anyof patterns.
2596  if (ValTy->getVectorElementType()->isIntegerTy(1)) {
2597  if (ST->hasAVX2())
2598  if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
2599  return LT.first * Entry->Cost;
2600  if (ST->hasAVX())
2601  if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
2602  return LT.first * Entry->Cost;
2603  if (ST->hasSSE2())
2604  if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
2605  return LT.first * Entry->Cost;
2606  }
2607 
2608  return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise);
2609 }
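// Worked example (editorial sketch, not part of the original file): an
// all-of reduction (vector AND over <32 x i1>) on an AVX2 subtarget. Assuming
// the i1 vector legalizes to v32i8, the AVX2BoolReduction table prices it at
// 2 (vpmovmskb + cmp); the same reduction on AVX1 needs an extra
// vextractf128 + vpand and costs 4.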
2610 
2612 int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy, bool IsPairwise, bool IsUnsigned) {
2613  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
2614 
2615  MVT MTy = LT.second;
2616 
2617  int ISD;
2618  if (ValTy->isIntOrIntVectorTy()) {
2619  ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN;
2620  } else {
2621  assert(ValTy->isFPOrFPVectorTy() &&
2622  "Expected float point or integer vector type.");
2623  ISD = ISD::FMINNUM;
2624  }
2625 
2626  // We use the Intel Architecture Code Analyzer (IACA) to measure the
2627  // throughput and use that as the cost.
2628 
2629  static const CostTblEntry SSE1CostTblPairWise[] = {
2630  {ISD::FMINNUM, MVT::v4f32, 4},
2631  };
2632 
2633  static const CostTblEntry SSE2CostTblPairWise[] = {
2634  {ISD::FMINNUM, MVT::v2f64, 3},
2635  {ISD::SMIN, MVT::v2i64, 6},
2636  {ISD::UMIN, MVT::v2i64, 8},
2637  {ISD::SMIN, MVT::v4i32, 6},
2638  {ISD::UMIN, MVT::v4i32, 8},
2639  {ISD::SMIN, MVT::v8i16, 4},
2640  {ISD::UMIN, MVT::v8i16, 6},
2641  {ISD::SMIN, MVT::v16i8, 8},
2642  {ISD::UMIN, MVT::v16i8, 6},
2643  };
2644 
2645  static const CostTblEntry SSE41CostTblPairWise[] = {
2646  {ISD::FMINNUM, MVT::v4f32, 2},
2647  {ISD::SMIN, MVT::v2i64, 9},
2648  {ISD::UMIN, MVT::v2i64,10},
2649  {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5"
2650  {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8"
2651  {ISD::SMIN, MVT::v8i16, 2},
2652  {ISD::UMIN, MVT::v8i16, 2},
2653  {ISD::SMIN, MVT::v16i8, 3},
2654  {ISD::UMIN, MVT::v16i8, 3},
2655  };
2656 
2657  static const CostTblEntry SSE42CostTblPairWise[] = {
2658  {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8"
2659  {ISD::UMIN, MVT::v2i64, 8}, // The data reported by the IACA is "8.6"
2660  };
2661 
2662  static const CostTblEntry AVX1CostTblPairWise[] = {
2663  {ISD::FMINNUM, MVT::v4f32, 1},
2664  {ISD::FMINNUM, MVT::v4f64, 1},
2665  {ISD::FMINNUM, MVT::v8f32, 2},
2666  {ISD::SMIN, MVT::v2i64, 3},
2667  {ISD::UMIN, MVT::v2i64, 3},
2668  {ISD::SMIN, MVT::v4i32, 1},
2669  {ISD::UMIN, MVT::v4i32, 1},
2670  {ISD::SMIN, MVT::v8i16, 1},
2671  {ISD::UMIN, MVT::v8i16, 1},
2672  {ISD::SMIN, MVT::v16i8, 2},
2673  {ISD::UMIN, MVT::v16i8, 2},
2674  {ISD::SMIN, MVT::v4i64, 7},
2675  {ISD::UMIN, MVT::v4i64, 7},
2676  {ISD::SMIN, MVT::v8i32, 3},
2677  {ISD::UMIN, MVT::v8i32, 3},
2678  {ISD::SMIN, MVT::v16i16, 3},
2679  {ISD::UMIN, MVT::v16i16, 3},
2680  {ISD::SMIN, MVT::v32i8, 3},
2681  {ISD::UMIN, MVT::v32i8, 3},
2682  };
2683 
2684  static const CostTblEntry AVX2CostTblPairWise[] = {
2685  {ISD::SMIN, MVT::v4i64, 2},
2686  {ISD::UMIN, MVT::v4i64, 2},
2687  {ISD::SMIN, MVT::v8i32, 1},
2688  {ISD::UMIN, MVT::v8i32, 1},
2689  {ISD::SMIN, MVT::v16i16, 1},
2690  {ISD::UMIN, MVT::v16i16, 1},
2691  {ISD::SMIN, MVT::v32i8, 2},
2692  {ISD::UMIN, MVT::v32i8, 2},
2693  };
2694 
2695  static const CostTblEntry AVX512CostTblPairWise[] = {
2696  {ISD::FMINNUM, MVT::v8f64, 1},
2697  {ISD::FMINNUM, MVT::v16f32, 2},
2698  {ISD::SMIN, MVT::v8i64, 2},
2699  {ISD::UMIN, MVT::v8i64, 2},
2700  {ISD::SMIN, MVT::v16i32, 1},
2701  {ISD::UMIN, MVT::v16i32, 1},
2702  };
2703 
2704  static const CostTblEntry SSE1CostTblNoPairWise[] = {
2705  {ISD::FMINNUM, MVT::v4f32, 4},
2706  };
2707 
2708  static const CostTblEntry SSE2CostTblNoPairWise[] = {
2709  {ISD::FMINNUM, MVT::v2f64, 3},
2710  {ISD::SMIN, MVT::v2i64, 6},
2711  {ISD::UMIN, MVT::v2i64, 8},
2712  {ISD::SMIN, MVT::v4i32, 6},
2713  {ISD::UMIN, MVT::v4i32, 8},
2714  {ISD::SMIN, MVT::v8i16, 4},
2715  {ISD::UMIN, MVT::v8i16, 6},
2716  {ISD::SMIN, MVT::v16i8, 8},
2717  {ISD::UMIN, MVT::v16i8, 6},
2718  };
2719 
2720  static const CostTblEntry SSE41CostTblNoPairWise[] = {
2721  {ISD::FMINNUM, MVT::v4f32, 3},
2722  {ISD::SMIN, MVT::v2i64, 9},
2723  {ISD::UMIN, MVT::v2i64,11},
2724  {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5"
2725  {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8"
2726  {ISD::SMIN, MVT::v8i16, 1}, // The data reported by the IACA is "1.5"
2727  {ISD::UMIN, MVT::v8i16, 2}, // The data reported by the IACA is "1.8"
2728  {ISD::SMIN, MVT::v16i8, 3},
2729  {ISD::UMIN, MVT::v16i8, 3},
2730  };
2731 
2732  static const CostTblEntry SSE42CostTblNoPairWise[] = {
2733  {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8"
2734  {ISD::UMIN, MVT::v2i64, 9}, // The data reported by the IACA is "8.6"
2735  };
2736 
2737  static const CostTblEntry AVX1CostTblNoPairWise[] = {
2738  {ISD::FMINNUM, MVT::v4f32, 1},
2739  {ISD::FMINNUM, MVT::v4f64, 1},
2740  {ISD::FMINNUM, MVT::v8f32, 1},
2741  {ISD::SMIN, MVT::v2i64, 3},
2742  {ISD::UMIN, MVT::v2i64, 3},
2743  {ISD::SMIN, MVT::v4i32, 1},
2744  {ISD::UMIN, MVT::v4i32, 1},
2745  {ISD::SMIN, MVT::v8i16, 1},
2746  {ISD::UMIN, MVT::v8i16, 1},
2747  {ISD::SMIN, MVT::v16i8, 2},
2748  {ISD::UMIN, MVT::v16i8, 2},
2749  {ISD::SMIN, MVT::v4i64, 7},
2750  {ISD::UMIN, MVT::v4i64, 7},
2751  {ISD::SMIN, MVT::v8i32, 2},
2752  {ISD::UMIN, MVT::v8i32, 2},
2753  {ISD::SMIN, MVT::v16i16, 2},
2754  {ISD::UMIN, MVT::v16i16, 2},
2755  {ISD::SMIN, MVT::v32i8, 2},
2756  {ISD::UMIN, MVT::v32i8, 2},
2757  };
2758 
2759  static const CostTblEntry AVX2CostTblNoPairWise[] = {
2760  {ISD::SMIN, MVT::v4i64, 1},
2761  {ISD::UMIN, MVT::v4i64, 1},
2762  {ISD::SMIN, MVT::v8i32, 1},
2763  {ISD::UMIN, MVT::v8i32, 1},
2764  {ISD::SMIN, MVT::v16i16, 1},
2765  {ISD::UMIN, MVT::v16i16, 1},
2766  {ISD::SMIN, MVT::v32i8, 1},
2767  {ISD::UMIN, MVT::v32i8, 1},
2768  };
2769 
2770  static const CostTblEntry AVX512CostTblNoPairWise[] = {
2771  {ISD::FMINNUM, MVT::v8f64, 1},
2772  {ISD::FMINNUM, MVT::v16f32, 2},
2773  {ISD::SMIN, MVT::v8i64, 1},
2774  {ISD::UMIN, MVT::v8i64, 1},
2775  {ISD::SMIN, MVT::v16i32, 1},
2776  {ISD::UMIN, MVT::v16i32, 1},
2777  };
2778 
2779  if (IsPairwise) {
2780  if (ST->hasAVX512())
2781  if (const auto *Entry = CostTableLookup(AVX512CostTblPairWise, ISD, MTy))
2782  return LT.first * Entry->Cost;
2783 
2784  if (ST->hasAVX2())
2785  if (const auto *Entry = CostTableLookup(AVX2CostTblPairWise, ISD, MTy))
2786  return LT.first * Entry->Cost;
2787 
2788  if (ST->hasAVX())
2789  if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
2790  return LT.first * Entry->Cost;
2791 
2792  if (ST->hasSSE42())
2793  if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
2794  return LT.first * Entry->Cost;
2795 
2796  if (ST->hasSSE41())
2797  if (const auto *Entry = CostTableLookup(SSE41CostTblPairWise, ISD, MTy))
2798  return LT.first * Entry->Cost;
2799 
2800  if (ST->hasSSE2())
2801  if (const auto *Entry = CostTableLookup(SSE2CostTblPairWise, ISD, MTy))
2802  return LT.first * Entry->Cost;
2803 
2804  if (ST->hasSSE1())
2805  if (const auto *Entry = CostTableLookup(SSE1CostTblPairWise, ISD, MTy))
2806  return LT.first * Entry->Cost;
2807  } else {
2808  if (ST->hasAVX512())
2809  if (const auto *Entry =
2810  CostTableLookup(AVX512CostTblNoPairWise, ISD, MTy))
2811  return LT.first * Entry->Cost;
2812 
2813  if (ST->hasAVX2())
2814  if (const auto *Entry = CostTableLookup(AVX2CostTblNoPairWise, ISD, MTy))
2815  return LT.first * Entry->Cost;
2816 
2817  if (ST->hasAVX())
2818  if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
2819  return LT.first * Entry->Cost;
2820 
2821  if (ST->hasSSE42())
2822  if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
2823  return LT.first * Entry->Cost;
2824 
2825  if (ST->hasSSE41())
2826  if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy))
2827  return LT.first * Entry->Cost;
2828 
2829  if (ST->hasSSE2())
2830  if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
2831  return LT.first * Entry->Cost;
2832 
2833  if (ST->hasSSE1())
2834  if (const auto *Entry = CostTableLookup(SSE1CostTblNoPairWise, ISD, MTy))
2835  return LT.first * Entry->Cost;
2836  }
2837 
2838  return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned);
2839 }
2840 
2841 /// Calculate the cost of materializing a 64-bit value. This helper
2842 /// method might only calculate a fraction of a larger immediate. Therefore it
2843 /// is valid to return a cost of ZERO.
2844 int X86TTIImpl::getIntImmCost(int64_t Val) {
2845  if (Val == 0)
2846  return TTI::TCC_Free;
2847 
2848  if (isInt<32>(Val))
2849  return TTI::TCC_Basic;
2850 
2851  return 2 * TTI::TCC_Basic;
2852 }
2853 
2854 int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
2855  assert(Ty->isIntegerTy());
2856 
2857  unsigned BitSize = Ty->getPrimitiveSizeInBits();
2858  if (BitSize == 0)
2859  return ~0U;
2860 
2861  // Never hoist constants larger than 128bit, because this might lead to
2862  // incorrect code generation or assertions in codegen.
2863  // Fixme: Create a cost model for types larger than i128 once the codegen
2864  // issues have been fixed.
2865  if (BitSize > 128)
2866  return TTI::TCC_Free;
2867 
2868  if (Imm == 0)
2869  return TTI::TCC_Free;
2870 
2871  // Sign-extend all constants to a multiple of 64-bit.
2872  APInt ImmVal = Imm;
2873  if (BitSize % 64 != 0)
2874  ImmVal = Imm.sext(alignTo(BitSize, 64));
2875 
2876  // Split the constant into 64-bit chunks and calculate the cost for each
2877  // chunk.
2878  int Cost = 0;
2879  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
2880  APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
2881  int64_t Val = Tmp.getSExtValue();
2882  Cost += getIntImmCost(Val);
2883  }
2884  // We need at least one instruction to materialize the constant.
2885  return std::max(1, Cost);
2886 }
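// Worked example (editorial sketch, not part of the original file):
// materializing the 128-bit constant (1 << 96). The value is split into two
// 64-bit chunks: the low chunk is 0 and therefore free, while the high chunk
// (1 << 32) does not fit in a signed 32-bit immediate and costs
// 2 * TCC_Basic, so the query returns 2.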
2887 
2888 int X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
2889  Type *Ty) {
2890  assert(Ty->isIntegerTy());
2891 
2892  unsigned BitSize = Ty->getPrimitiveSizeInBits();
2893  // There is no cost model for constants with a bit size of 0. Return TCC_Free
2894  // here, so that constant hoisting will ignore this constant.
2895  if (BitSize == 0)
2896  return TTI::TCC_Free;
2897 
2898  unsigned ImmIdx = ~0U;
2899  switch (Opcode) {
2900  default:
2901  return TTI::TCC_Free;
2902  case Instruction::GetElementPtr:
2903  // Always hoist the base address of a GetElementPtr. This prevents the
2904  // creation of new constants for every base constant that gets constant
2905  // folded with the offset.
2906  if (Idx == 0)
2907  return 2 * TTI::TCC_Basic;
2908  return TTI::TCC_Free;
2909  case Instruction::Store:
2910  ImmIdx = 0;
2911  break;
2912  case Instruction::ICmp:
2913  // This is an imperfect hack to prevent constant hoisting of
2914  // compares that might be trying to check if a 64-bit value fits in
2915  // 32-bits. The backend can optimize these cases using a right shift by 32.
2916  // Ideally we would check the compare predicate here. There are also other
2917  // similar immediates the backend can use shifts for.
2918  if (Idx == 1 && Imm.getBitWidth() == 64) {
2919  uint64_t ImmVal = Imm.getZExtValue();
2920  if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
2921  return TTI::TCC_Free;
2922  }
2923  ImmIdx = 1;
2924  break;
2925  case Instruction::And:
2926  // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
2927  // by using a 32-bit operation with implicit zero extension. Detect such
2928  // immediates here as the normal path expects bit 31 to be sign extended.
2929  if (Idx == 1 && Imm.getBitWidth() == 64 && isUInt<32>(Imm.getZExtValue()))
2930  return TTI::TCC_Free;
2931  ImmIdx = 1;
2932  break;
2933  case Instruction::Add:
2934  case Instruction::Sub:
2935  // For add/sub, we can use the opposite instruction for INT32_MIN.
2936  if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.getZExtValue() == 0x80000000)
2937  return TTI::TCC_Free;
2938  ImmIdx = 1;
2939  break;
2940  case Instruction::UDiv:
2941  case Instruction::SDiv:
2942  case Instruction::URem:
2943  case Instruction::SRem:
2944  // Division by constant is typically expanded later into a different
2945  // instruction sequence. This completely changes the constants.
2946  // Report them as "free" to stop ConstantHoist from marking them as opaque.
2947  return TTI::TCC_Free;
2948  case Instruction::Mul:
2949  case Instruction::Or:
2950  case Instruction::Xor:
2951  ImmIdx = 1;
2952  break;
2953  // Always return TCC_Free for the shift value of a shift instruction.
2954  case Instruction::Shl:
2955  case Instruction::LShr:
2956  case Instruction::AShr:
2957  if (Idx == 1)
2958  return TTI::TCC_Free;
2959  break;
2960  case Instruction::Trunc:
2961  case Instruction::ZExt:
2962  case Instruction::SExt:
2963  case Instruction::IntToPtr:
2964  case Instruction::PtrToInt:
2965  case Instruction::BitCast:
2966  case Instruction::PHI:
2967  case Instruction::Call:
2968  case Instruction::Select:
2969  case Instruction::Ret:
2970  case Instruction::Load:
2971  break;
2972  }
2973 
2974  if (Idx == ImmIdx) {
2975  int NumConstants = divideCeil(BitSize, 64);
2976  int Cost = X86TTIImpl::getIntImmCost(Imm, Ty);
2977  return (Cost <= NumConstants * TTI::TCC_Basic)
2978  ? static_cast<int>(TTI::TCC_Free)
2979  : Cost;
2980  }
2981 
2982  return X86TTIImpl::getIntImmCost(Imm, Ty);
2983 }
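// Worked example (editorial sketch, not part of the original file): for
// `and i64 %x, 4294967295` the immediate is operand 1, is 64 bits wide, and
// fits in an unsigned 32-bit value, so the code above reports it as TCC_Free:
// the backend emits a 32-bit AND with implicit zero extension instead of
// materializing the constant. Shift amounts (operand 1 of shl/lshr/ashr) are
// likewise always free.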
2984 
2985 int X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
2986  Type *Ty) {
2987  assert(Ty->isIntegerTy());
2988 
2989  unsigned BitSize = Ty->getPrimitiveSizeInBits();
2990  // There is no cost model for constants with a bit size of 0. Return TCC_Free
2991  // here, so that constant hoisting will ignore this constant.
2992  if (BitSize == 0)
2993  return TTI::TCC_Free;
2994 
2995  switch (IID) {
2996  default:
2997  return TTI::TCC_Free;
2998  case Intrinsic::sadd_with_overflow:
2999  case Intrinsic::uadd_with_overflow:
3000  case Intrinsic::ssub_with_overflow:
3001  case Intrinsic::usub_with_overflow:
3002  case Intrinsic::smul_with_overflow:
3003  case Intrinsic::umul_with_overflow:
3004  if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
3005  return TTI::TCC_Free;
3006  break;
3007  case Intrinsic::experimental_stackmap:
3008  if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
3009  return TTI::TCC_Free;
3010  break;
3011  case Intrinsic::experimental_patchpoint_void:
3012  case Intrinsic::experimental_patchpoint_i64:
3013  if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
3014  return TTI::TCC_Free;
3015  break;
3016  }
3017  return X86TTIImpl::getIntImmCost(Imm, Ty);
3018 }
3019 
3020 unsigned X86TTIImpl::getUserCost(const User *U,
3021  ArrayRef<const Value *> Operands) {
3022  if (isa<StoreInst>(U)) {
3023  Value *Ptr = U->getOperand(1);
3024  // Store instruction with index and scale costs 2 Uops.
3025  // Check the preceding GEP to identify non-const indices.
3026  if (auto GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
3027  if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
3028  return TTI::TCC_Basic * 2;
3029  }
3030  return TTI::TCC_Basic;
3031  }
3032  return BaseT::getUserCost(U, Operands);
3033 }
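// Worked example (editorial sketch, not part of the original file): a store
// whose address is produced by a GEP with a non-constant index, e.g.
//   %p = getelementptr i32, i32* %base, i64 %i
//   store i32 %v, i32* %p
// is reported as 2 * TCC_Basic because the indexed-with-scale addressing mode
// costs two uops; if every GEP index is a constant the store stays at
// TCC_Basic.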
3034 
3035 // Return an average cost of Gather / Scatter instruction, maybe improved later
3036 int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
3037  unsigned Alignment, unsigned AddressSpace) {
3038 
3039  assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
3040  unsigned VF = SrcVTy->getVectorNumElements();
3041 
3042  // Try to reduce the index size from 64 bits (the default for GEP) to 32 bits.
3043  // This is essential for VF 16. If the index can't be reduced to 32 bits, the
3044  // operation will use 16 x 64-bit indices, which do not fit in a zmm register
3045  // and need to be split. Also check that the base pointer is the same for all lanes,
3046  // and that there's at most one variable index.
3047  auto getIndexSizeInBits = [](Value *Ptr, const DataLayout& DL) {
3048  unsigned IndexSize = DL.getPointerSizeInBits();
3050  if (IndexSize < 64 || !GEP)
3051  return IndexSize;
3052 
3053  unsigned NumOfVarIndices = 0;
3054  Value *Ptrs = GEP->getPointerOperand();
3055  if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
3056  return IndexSize;
3057  for (unsigned i = 1; i < GEP->getNumOperands(); ++i) {
3058  if (isa<Constant>(GEP->getOperand(i)))
3059  continue;
3060  Type *IndxTy = GEP->getOperand(i)->getType();
3061  if (IndxTy->isVectorTy())
3062  IndxTy = IndxTy->getVectorElementType();
3063  if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
3064  !isa<SExtInst>(GEP->getOperand(i))) ||
3065  ++NumOfVarIndices > 1)
3066  return IndexSize; // 64
3067  }
3068  return (unsigned)32;
3069  };
3070 
3071 
3072  // Trying to reduce IndexSize to 32 bits for vector 16.
3073  // By default the IndexSize is equal to pointer size.
3074  unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
3075  ? getIndexSizeInBits(Ptr, DL)
3077 
3078  Type *IndexVTy = VectorType::get(IntegerType::get(SrcVTy->getContext(),
3079  IndexSize), VF);
3080  std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy);
3081  std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy);
3082  int SplitFactor = std::max(IdxsLT.first, SrcLT.first);
3083  if (SplitFactor > 1) {
3084  // Handle splitting of vector of pointers
3085  Type *SplitSrcTy = VectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
3086  return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment,
3087  AddressSpace);
3088  }
3089 
3090  // The gather / scatter cost is given by Intel architects. It is a rough
3091  // number since we are looking at one instruction at a time.
3092  const int GSOverhead = (Opcode == Instruction::Load)
3093  ? ST->getGatherOverhead()
3094  : ST->getScatterOverhead();
3095  return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
3096  Alignment, AddressSpace);
3097 }
3098 
3099 /// Return the cost of full scalarization of gather / scatter operation.
3100 ///
3101 /// Opcode - Load or Store instruction.
3102 /// SrcVTy - The type of the data vector that should be gathered or scattered.
3103 /// VariableMask - The mask is non-constant at compile time.
3104 /// Alignment - Alignment for one element.
3105 /// AddressSpace - pointer[s] address space.
3106 ///
3107 int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
3108  bool VariableMask, unsigned Alignment,
3109  unsigned AddressSpace) {
3110  unsigned VF = SrcVTy->getVectorNumElements();
3111 
3112  int MaskUnpackCost = 0;
3113  if (VariableMask) {
3114  VectorType *MaskTy =
3115  VectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
3116  MaskUnpackCost = getScalarizationOverhead(MaskTy, false, true);
3117  int ScalarCompareCost =
3118  getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()),
3119  nullptr);
3120  int BranchCost = getCFInstrCost(Instruction::Br);
3121  MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
3122  }
3123 
3124  // The cost of the scalar loads/stores.
3125  int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
3126  Alignment, AddressSpace);
3127 
3128  int InsertExtractCost = 0;
3129  if (Opcode == Instruction::Load)
3130  for (unsigned i = 0; i < VF; ++i)
3131  // Add the cost of inserting each scalar load into the vector
3132  InsertExtractCost +=
3133  getVectorInstrCost(Instruction::InsertElement, SrcVTy, i);
3134  else
3135  for (unsigned i = 0; i < VF; ++i)
3136  // Add the cost of extracting each element out of the data vector
3137  InsertExtractCost +=
3138  getVectorInstrCost(Instruction::ExtractElement, SrcVTy, i);
3139 
3140  return MemoryOpCost + MaskUnpackCost + InsertExtractCost;
3141 }
3142 
3143 /// Calculate the cost of Gather / Scatter operation
3144 int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
3145  Value *Ptr, bool VariableMask,
3146  unsigned Alignment) {
3147  assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
3148  unsigned VF = SrcVTy->getVectorNumElements();
3149  PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
3150  if (!PtrTy && Ptr->getType()->isVectorTy())
3151  PtrTy = dyn_cast<PointerType>(Ptr->getType()->getVectorElementType());
3152  assert(PtrTy && "Unexpected type for Ptr argument");
3153  unsigned AddressSpace = PtrTy->getAddressSpace();
3154 
3155  bool Scalarize = false;
3156  if ((Opcode == Instruction::Load && !isLegalMaskedGather(SrcVTy)) ||
3157  (Opcode == Instruction::Store && !isLegalMaskedScatter(SrcVTy)))
3158  Scalarize = true;
3159  // Gather / Scatter for vector 2 is not profitable on KNL / SKX.
3160  // A vector-4 gather/scatter instruction does not exist on KNL.
3161  // We can extend it to 8 elements, but zeroing upper bits of
3162  // the mask vector will add more instructions. Right now we give the scalar
3163  // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction
3164  // is better in the VariableMask case.
3165  if (ST->hasAVX512() && (VF == 2 || (VF == 4 && !ST->hasVLX())))
3166  Scalarize = true;
3167 
3168  if (Scalarize)
3169  return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment,
3170  AddressSpace);
3171 
3172  return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
3173 }
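// For example, on a KNL-like target (AVX-512 without VLX) a <4 x float>
// gather is costed through getGSScalarCost above, while on an SKX-like
// target (with VLX) the same gather is costed through getGSVectorCost;
// 2-element gathers and scatters are scalarized on both.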
3174 
3175 bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
3176                                TargetTransformInfo::LSRCost &C2) {
3177  // X86-specific here: the instruction count gets first priority.
3178  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
3179  C1.NumIVMuls, C1.NumBaseAdds,
3180  C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
3181  std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
3182  C2.NumIVMuls, C2.NumBaseAdds,
3183  C2.ScaleCost, C2.ImmCost, C2.SetupCost);
3184 }
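// For example, an LSR candidate with {Insns = 3, NumRegs = 5} is considered
// cheaper than one with {Insns = 4, NumRegs = 2}: the comparison is
// lexicographic and the instruction count is examined first.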
3185 
3186 bool X86TTIImpl::canMacroFuseCmp() {
3187  return ST->hasMacroFusion() || ST->hasBranchFusion();
3188 }
3189 
3190 bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
3191  if (!ST->hasAVX())
3192  return false;
3193 
3194  // The backend can't handle a single element vector.
3195  if (isa<VectorType>(DataTy) && DataTy->getVectorNumElements() == 1)
3196  return false;
3197  Type *ScalarTy = DataTy->getScalarType();
3198 
3199  if (ScalarTy->isPointerTy())
3200  return true;
3201 
3202  if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
3203  return true;
3204 
3205  if (!ScalarTy->isIntegerTy())
3206  return false;
3207 
3208  unsigned IntWidth = ScalarTy->getIntegerBitWidth();
3209  return IntWidth == 32 || IntWidth == 64 ||
3210  ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
3211 }
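// For example, a masked load of <8 x float> is legal on any AVX target,
// a masked load of <16 x i16> additionally requires AVX512BW for the 16-bit
// elements, and a single-element vector such as <1 x double> is rejected.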
3212 
3213 bool X86TTIImpl::isLegalMaskedStore(Type *DataType) {
3214  return isLegalMaskedLoad(DataType);
3215 }
3216 
3217 bool X86TTIImpl::isLegalNTLoad(Type *DataType, unsigned Alignment) {
3218  unsigned DataSize = DL.getTypeStoreSize(DataType);
3219  // The only supported nontemporal loads are for aligned vectors of 16 or 32
3220  // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2
3221  // (the equivalent stores only require AVX).
3222  if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
3223  return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2();
3224 
3225  return false;
3226 }
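// For example, a 16-byte-aligned nontemporal load of <4 x float> only needs
// SSE1, a 32-byte-aligned nontemporal load of <8 x float> needs AVX2, and
// any under-aligned or differently sized nontemporal load is rejected.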
3227 
3228 bool X86TTIImpl::isLegalNTStore(Type *DataType, unsigned Alignment) {
3229  unsigned DataSize = DL.getTypeStoreSize(DataType);
3230 
3231  // SSE4A supports nontemporal stores of float and double at arbitrary
3232  // alignment.
3233  if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
3234  return true;
3235 
3236  // Besides the SSE4A subtarget exception above, only aligned stores are
3237  // available nontemporally on any other subtarget, and only stores whose
3238  // size is a power of 2 between 4 and 32 bytes are permitted.
3239  if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
3240  !isPowerOf2_32(DataSize))
3241  return false;
3242 
3243  // 32-byte vector nontemporal stores are supported by AVX (the equivalent
3244  // loads require AVX2).
3245  if (DataSize == 32)
3246  return ST->hasAVX();
3247  else if (DataSize == 16)
3248  return ST->hasSSE1();
3249  return true;
3250 }
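// For example, on an SSE4A target a scalar float or double nontemporal store
// is accepted at any alignment; otherwise an aligned 32-byte store needs AVX,
// an aligned 16-byte store needs SSE1, and aligned 4- or 8-byte stores are
// accepted unconditionally.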
3251 
3252 bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) {
3253  if (!isa<VectorType>(DataTy))
3254  return false;
3255 
3256  if (!ST->hasAVX512())
3257  return false;
3258 
3259  // The backend can't handle a single element vector.
3260  if (DataTy->getVectorNumElements() == 1)
3261  return false;
3262 
3263  Type *ScalarTy = DataTy->getVectorElementType();
3264 
3265  if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
3266  return true;
3267 
3268  if (!ScalarTy->isIntegerTy())
3269  return false;
3270 
3271  unsigned IntWidth = ScalarTy->getIntegerBitWidth();
3272  return IntWidth == 32 || IntWidth == 64 ||
3273  ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
3274 }
3275 
3276 bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy) {
3277  return isLegalMaskedExpandLoad(DataTy);
3278 }
3279 
3280 bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
3281  // Some CPUs have better gather performance than others.
3282  // TODO: Remove the explicit ST->hasAVX512()? That would mean we would only
3283  // enable gather with a -march.
3284  if (!(ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2())))
3285  return false;
3286 
3287  // This function is called now in two cases: from the Loop Vectorizer
3288  // and from the Scalarizer.
3289  // When the Loop Vectorizer asks about legality of the feature,
3290  // the vectorization factor is not calculated yet. The Loop Vectorizer
3291  // sends a scalar type and the decision is based on the width of the
3292  // scalar element.
3293  // Later on, the cost model will estimate usage of this intrinsic based on
3294  // the vector type.
3295  // The Scalarizer asks again about legality. It sends a vector type.
3296  // In this case we can reject non-power-of-2 vectors.
3297  // We also reject single element vectors as the type legalizer can't
3298  // scalarize it.
3299  if (isa<VectorType>(DataTy)) {
3300  unsigned NumElts = DataTy->getVectorNumElements();
3301  if (NumElts == 1 || !isPowerOf2_32(NumElts))
3302  return false;
3303  }
3304  Type *ScalarTy = DataTy->getScalarType();
3305  if (ScalarTy->isPointerTy())
3306  return true;
3307 
3308  if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
3309  return true;
3310 
3311  if (!ScalarTy->isIntegerTy())
3312  return false;
3313 
3314  unsigned IntWidth = ScalarTy->getIntegerBitWidth();
3315  return IntWidth == 32 || IntWidth == 64;
3316 }
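// For example, <3 x i32> is rejected (not a power-of-2 element count),
// <1 x i64> is rejected (single element), and <8 x i32> is accepted on an
// AVX-512 target or on an AVX2 target with the fast-gather tuning flag.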
3317 
3318 bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) {
3319  // AVX2 doesn't support scatter.
3320  if (!ST->hasAVX512())
3321  return false;
3322  return isLegalMaskedGather(DataType);
3323 }
3324 
3325 bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
3326  EVT VT = TLI->getValueType(DL, DataType);
3327  return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
3328 }
3329 
3330 bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
3331  return false;
3332 }
3333 
3334 bool X86TTIImpl::areInlineCompatible(const Function *Caller,
3335  const Function *Callee) const {
3336  const TargetMachine &TM = getTLI()->getTargetMachine();
3337 
3338  // Treat this as a subset check on the subtarget features.
3339  const FeatureBitset &CallerBits =
3340  TM.getSubtargetImpl(*Caller)->getFeatureBits();
3341  const FeatureBitset &CalleeBits =
3342  TM.getSubtargetImpl(*Callee)->getFeatureBits();
3343 
3344  FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
3345  FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
3346  return (RealCallerBits & RealCalleeBits) == RealCalleeBits;
3347 }
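// For example, a callee built with "+avx2" cannot be inlined into a caller
// built with only "+avx", because the callee's (non-ignored) feature bits are
// not a subset of the caller's; inlining in the opposite direction is fine.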
3348 
3349 bool X86TTIImpl::areFunctionArgsABICompatible(
3350  const Function *Caller, const Function *Callee,
3351  SmallPtrSetImpl<Argument *> &Args) const {
3352  if (!BaseT::areFunctionArgsABICompatible(Caller, Callee, Args))
3353  return false;
3354 
3355  // If we get here, we know the target features match. If one function
3356  // considers 512-bit vectors legal and the other does not, consider them
3357  // incompatible.
3358  // FIXME Look at the arguments and only consider 512 bit or larger vectors?
3359  const TargetMachine &TM = getTLI()->getTargetMachine();
3360 
3361  return TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
3362  TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs();
3363 }
3364 
3365 X86TTIImpl::TTI::MemCmpExpansionOptions
3366 X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
3367  TTI::MemCmpExpansionOptions Options;
3368  Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
3369  Options.NumLoadsPerBlock = 2;
3370  if (IsZeroCmp) {
3371  // Only enable vector loads for equality comparison. Right now the vector
3372  // version is not as fast for three-way compare (see #33329).
3373  // TODO: enable AVX512 when the DAG is ready.
3374  // if (ST->hasAVX512()) Options.LoadSizes.push_back(64);
3375  const unsigned PreferredWidth = ST->getPreferVectorWidth();
3376  if (PreferredWidth >= 256 && ST->hasAVX2()) Options.LoadSizes.push_back(32);
3377  if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
3378  // All GPR and vector loads can be unaligned. SIMD compare requires integer
3379  // vectors (SSE2/AVX2).
3380  Options.AllowOverlappingLoads = true;
3381  }
3382  if (ST->is64Bit()) {
3383  Options.LoadSizes.push_back(8);
3384  }
3385  Options.LoadSizes.push_back(4);
3386  Options.LoadSizes.push_back(2);
3387  Options.LoadSizes.push_back(1);
3388  return Options;
3389 }
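// For example, for an equality-only memcmp on a 64-bit AVX2 target with a
// preferred vector width of at least 256 bits, LoadSizes becomes
// {32, 16, 8, 4, 2, 1} with overlapping loads allowed; a three-way memcmp on
// the same target only gets the GPR sizes {8, 4, 2, 1}.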
3390 
3391 bool X86TTIImpl::enableInterleavedAccessVectorization() {
3392  // TODO: We expect this to be beneficial regardless of arch,
3393  // but there are currently some unexplained performance artifacts on Atom.
3394  // As a temporary solution, disable on Atom.
3395  return !(ST->isAtom());
3396 }
3397 
3398 // Get estimation for interleaved load/store operations for AVX2.
3399 // \p Factor is the interleaved-access factor (stride) - number of
3400 // (interleaved) elements in the group.
3401 // \p Indices contains the indices for a strided load: when the
3402 // interleaved load has gaps they indicate which elements are used.
3403 // If Indices is empty (or if the number of indices is equal to the size
3404 // of the interleaved-access as given in \p Factor) the access has no gaps.
3405 //
3406 // Unlike AVX-512, AVX2 does not have generic shuffles that would allow
3407 // computing the cost with a generic formula as a function of shuffle costs.
3408 // We therefore use a lookup table instead, filled according to
3409 // the instruction sequences that codegen currently generates.
3410 int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
3411  unsigned Factor,
3412  ArrayRef<unsigned> Indices,
3413  unsigned Alignment,
3414  unsigned AddressSpace,
3415  bool UseMaskForCond,
3416  bool UseMaskForGaps) {
3417 
3418  if (UseMaskForCond || UseMaskForGaps)
3419  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3420  Alignment, AddressSpace,
3421  UseMaskForCond, UseMaskForGaps);
3422 
3423  // We currently support only fully-interleaved groups, with no gaps.
3424  // TODO: Support also strided loads (interleaved-groups with gaps).
3425  if (Indices.size() && Indices.size() != Factor)
3426  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3427  Alignment, AddressSpace);
3428 
3429  // VecTy for interleave memop is <VF*Factor x Elt>.
3430  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
3431  // VecTy = <12 x i32>.
3432  MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
3433 
3434  // This function can be called with VecTy=<6xi128>, Factor=3, in which case
3435  // the VF=2, while v2i128 is an unsupported MVT vector type
3436  // (see MachineValueType.h::getVectorVT()).
3437  if (!LegalVT.isVector())
3438  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3439  Alignment, AddressSpace);
3440 
3441  unsigned VF = VecTy->getVectorNumElements() / Factor;
3442  Type *ScalarTy = VecTy->getVectorElementType();
3443 
3444  // Calculate the number of memory operations (NumOfMemOps), required
3445  // for load/store the VecTy.
3446  unsigned VecTySize = DL.getTypeStoreSize(VecTy);
3447  unsigned LegalVTSize = LegalVT.getStoreSize();
3448  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
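  // For example, if VecTy = <12 x i32> legalizes to v8i32 on an AVX2 target,
  // then VecTySize = 48 bytes, LegalVTSize = 32 bytes and NumOfMemOps = 2.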
3449 
3450  // Get the cost of one memory operation.
3451  Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
3452  LegalVT.getVectorNumElements());
3453  unsigned MemOpCost =
3454  getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
3455 
3456  VectorType *VT = VectorType::get(ScalarTy, VF);
3457  EVT ETy = TLI->getValueType(DL, VT);
3458  if (!ETy.isSimple())
3459  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3460  Alignment, AddressSpace);
3461 
3462  // TODO: Complete for other data-types and strides.
3463  // Each combination of Stride, ElementTy and VF results in a different
3464  // sequence; the cost tables are therefore accessed with
3465  // Factor (stride) and VectorType=VFxElemType.
3466  // The Cost accounts only for the shuffle sequence;
3467  // the cost of the loads/stores is accounted for separately.
3468  //
3469  static const CostTblEntry AVX2InterleavedLoadTbl[] = {
3470  { 2, MVT::v4i64, 6 }, //(load 8i64 and) deinterleave into 2 x 4i64
3471  { 2, MVT::v4f64, 6 }, //(load 8f64 and) deinterleave into 2 x 4f64
3472 
3473  { 3, MVT::v2i8, 10 }, //(load 6i8 and) deinterleave into 3 x 2i8
3474  { 3, MVT::v4i8, 4 }, //(load 12i8 and) deinterleave into 3 x 4i8
3475  { 3, MVT::v8i8, 9 }, //(load 24i8 and) deinterleave into 3 x 8i8
3476  { 3, MVT::v16i8, 11}, //(load 48i8 and) deinterleave into 3 x 16i8
3477  { 3, MVT::v32i8, 13}, //(load 96i8 and) deinterleave into 3 x 32i8
3478  { 3, MVT::v8f32, 17 }, //(load 24f32 and) deinterleave into 3 x 8f32
3479 
3480  { 4, MVT::v2i8, 12 }, //(load 8i8 and) deinterleave into 4 x 2i8
3481  { 4, MVT::v4i8, 4 }, //(load 16i8 and) deinterleave into 4 x 4i8
3482  { 4, MVT::v8i8, 20 }, //(load 32i8 and) deinterleave into 4 x 8i8
3483  { 4, MVT::v16i8, 39 }, //(load 64i8 and) deinterleave into 4 x 16i8
3484  { 4, MVT::v32i8, 80 }, //(load 128i8 and) deinterleave into 4 x 32i8
3485 
3486  { 8, MVT::v8f32, 40 } //(load 64f32 and) deinterleave into 8 x 8f32
3487  };
3488 
3489  static const CostTblEntry AVX2InterleavedStoreTbl[] = {
3490  { 2, MVT::v4i64, 6 }, //interleave into 2 x 4i64 into 8i64 (and store)
3491  { 2, MVT::v4f64, 6 }, //interleave into 2 x 4f64 into 8f64 (and store)
3492 
3493  { 3, MVT::v2i8, 7 }, //interleave 3 x 2i8 into 6i8 (and store)
3494  { 3, MVT::v4i8, 8 }, //interleave 3 x 4i8 into 12i8 (and store)
3495  { 3, MVT::v8i8, 11 }, //interleave 3 x 8i8 into 24i8 (and store)
3496  { 3, MVT::v16i8, 11 }, //interleave 3 x 16i8 into 48i8 (and store)
3497  { 3, MVT::v32i8, 13 }, //interleave 3 x 32i8 into 96i8 (and store)
3498 
3499  { 4, MVT::v2i8, 12 }, //interleave 4 x 2i8 into 8i8 (and store)
3500  { 4, MVT::v4i8, 9 }, //interleave 4 x 4i8 into 16i8 (and store)
3501  { 4, MVT::v8i8, 10 }, //interleave 4 x 8i8 into 32i8 (and store)
3502  { 4, MVT::v16i8, 10 }, //interleave 4 x 16i8 into 64i8 (and store)
3503  { 4, MVT::v32i8, 12 } //interleave 4 x 32i8 into 128i8 (and store)
3504  };
3505 
3506  if (Opcode == Instruction::Load) {
3507  if (const auto *Entry =
3508  CostTableLookup(AVX2InterleavedLoadTbl, Factor, ETy.getSimpleVT()))
3509  return NumOfMemOps * MemOpCost + Entry->Cost;
3510  } else {
3511  assert(Opcode == Instruction::Store &&
3512  "Expected Store Instruction at this point");
3513  if (const auto *Entry =
3514  CostTableLookup(AVX2InterleavedStoreTbl, Factor, ETy.getSimpleVT()))
3515  return NumOfMemOps * MemOpCost + Entry->Cost;
3516  }
3517 
3518  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3519  Alignment, AddressSpace);
3520 }
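// For example, a stride-3 interleaved load of <24 x i8> (VF = 8, ETy = v8i8)
// hits the {3, v8i8, 9} entry above; if the 24-byte vector legalizes to a
// single 32-byte memory operation, the returned cost is MemOpCost + 9.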
3521 
3522 // Get estimation for interleaved load/store operations and strided load.
3523 // \p Indices contains indices for strided load.
3524 // \p Factor - the factor of interleaving.
3525 // AVX-512 provides 3-src shuffles that significantly reduce the cost.
3526 int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
3527  unsigned Factor,
3528  ArrayRef<unsigned> Indices,
3529  unsigned Alignment,
3530  unsigned AddressSpace,
3531  bool UseMaskForCond,
3532  bool UseMaskForGaps) {
3533 
3534  if (UseMaskForCond || UseMaskForGaps)
3535  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3536  Alignment, AddressSpace,
3537  UseMaskForCond, UseMaskForGaps);
3538 
3539  // VecTy for interleave memop is <VF*Factor x Elt>.
3540  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
3541  // VecTy = <12 x i32>.
3542 
3543  // Calculate the number of memory operations (NumOfMemOps), required
3544  // for load/store the VecTy.
3545  MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
3546  unsigned VecTySize = DL.getTypeStoreSize(VecTy);
3547  unsigned LegalVTSize = LegalVT.getStoreSize();
3548  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
3549 
3550  // Get the cost of one memory operation.
3551  Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
3552  LegalVT.getVectorNumElements());
3553  unsigned MemOpCost =
3554  getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
3555 
3556  unsigned VF = VecTy->getVectorNumElements() / Factor;
3557  MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);
3558 
3559  if (Opcode == Instruction::Load) {
3560  // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
3561  // contain the cost of the optimized shuffle sequence that the
3562  // X86InterleavedAccess pass will generate.
3563  // The cost of loads and stores are computed separately from the table.
3564 
3565  // X86InterleavedAccess supports only the following interleaved-access groups.
3566  static const CostTblEntry AVX512InterleavedLoadTbl[] = {
3567  {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8
3568  {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8
3569  {3, MVT::v64i8, 22}, //(load 192i8 and) deinterleave into 3 x 64i8
3570  };
3571 
3572  if (const auto *Entry =
3573  CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
3574  return NumOfMemOps * MemOpCost + Entry->Cost;
3575  // If an entry does not exist, fall back to the default implementation.
3576 
3577  // The kind of shuffle depends on the number of loaded values.
3578  // If we load the entire data in one register, we can use a 1-src shuffle.
3579  // Otherwise, we'll merge 2 sources in each operation.
3580  TTI::ShuffleKind ShuffleKind =
3581  (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
3582 
3583  unsigned ShuffleCost =
3584  getShuffleCost(ShuffleKind, SingleMemOpTy, 0, nullptr);
3585 
3586  unsigned NumOfLoadsInInterleaveGrp =
3587  Indices.size() ? Indices.size() : Factor;
3588  Type *ResultTy = VectorType::get(VecTy->getVectorElementType(),
3589  VecTy->getVectorNumElements() / Factor);
3590  unsigned NumOfResults =
3591  getTLI()->getTypeLegalizationCost(DL, ResultTy).first *
3592  NumOfLoadsInInterleaveGrp;
3593 
3594  // About half of the loads may be folded into shuffles when we have only
3595  // one result. If we have more than one result, we do not fold loads at all.
3596  unsigned NumOfUnfoldedLoads =
3597  NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
3598 
3599  // Get a number of shuffle operations per result.
3600  unsigned NumOfShufflesPerResult =
3601  std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
3602 
3603  // The SK_PermuteTwoSrc shuffle clobbers one of its source operands.
3604  // When we have more than one destination, we need additional instructions
3605  // to preserve the sources.
3606  unsigned NumOfMoves = 0;
3607  if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
3608  NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
3609 
3610  int Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
3611  NumOfUnfoldedLoads * MemOpCost + NumOfMoves;
3612 
3613  return Cost;
3614  }
3615 
3616  // Store.
3617  assert(Opcode == Instruction::Store &&
3618  "Expected Store Instruction at this point");
3619  // X86InterleavedAccess supports only the following interleaved-access groups.
3620  static const CostTblEntry AVX512InterleavedStoreTbl[] = {
3621  {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
3622  {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
3623  {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 192i8 (and store)
3624 
3625  {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store)
3626  {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
3627  {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
3628  {4, MVT::v64i8, 24} // interleave 4 x 64i8 into 256i8 (and store)
3629  };
3630 
3631  if (const auto *Entry =
3632  CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
3633  return NumOfMemOps * MemOpCost + Entry->Cost;
3634  // If an entry does not exist, fall back to the default implementation.
3635 
3636  // There are no strided stores at the moment, and a store can't be folded
3637  // into a shuffle.
3638  unsigned NumOfSources = Factor; // The number of values to be merged.
3639  unsigned ShuffleCost =
3640  getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, 0, nullptr);
3641  unsigned NumOfShufflesPerStore = NumOfSources - 1;
3642 
3643  // The SK_PermuteTwoSrc shuffle clobbers one of its source operands.
3644  // We need additional instructions to preserve the sources.
3645  unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
3646  int Cost = NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
3647  NumOfMoves;
3648  return Cost;
3649 }
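// For example, in the generic store fallback above, Factor = 4 gives
// NumOfShufflesPerStore = 3; with NumOfMemOps = 2 this yields
// NumOfMoves = 2 * 3 / 2 = 3 and Cost = 2 * (MemOpCost + 3 * ShuffleCost) + 3.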
3650 
3651 int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
3652  unsigned Factor,
3653  ArrayRef<unsigned> Indices,
3654  unsigned Alignment,
3655  unsigned AddressSpace,
3656  bool UseMaskForCond,
3657  bool UseMaskForGaps) {
3658  auto isSupportedOnAVX512 = [](Type *VecTy, bool HasBW) {
3659  Type *EltTy = VecTy->getVectorElementType();
3660  if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
3661  EltTy->isIntegerTy(32) || EltTy->isPointerTy())
3662  return true;
3663  if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8))
3664  return HasBW;
3665  return false;
3666  };
3667  if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI()))
3668  return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices,
3669  Alignment, AddressSpace,
3670  UseMaskForCond, UseMaskForGaps);
3671  if (ST->hasAVX2())
3672  return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices,
3673  Alignment, AddressSpace,
3674  UseMaskForCond, UseMaskForGaps);
3675 
3676  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3677  Alignment, AddressSpace,
3678  UseMaskForCond, UseMaskForGaps);
3679 }
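// For example, an interleaved group of i8 elements is routed to the AVX-512
// path only when the target also has BWI; on an AVX-512 target without BWI
// the same query falls through to the AVX2 path (AVX-512 implies AVX2).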