1 //===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements a TargetTransformInfo analysis pass specific to the
10 /// X86 target machine. It uses the target's detailed information to provide
11 /// more precise answers to certain TTI queries, while letting the target
12 /// independent and default TTI implementations handle the rest.
13 ///
14 //===----------------------------------------------------------------------===//
15 /// A note about the Cost Model numbers used below: they correspond to some
16 /// "generic" X86 CPU rather than to a concrete CPU model. Usually the numbers
17 /// correspond to the CPU where the feature first appeared. For example, if we
18 /// use Subtarget.hasSSE42() in the lookups below, the cost is based on Nehalem,
19 /// as that was the first CPU to support that feature level and thus most
20 /// likely has the worst-case cost.
21 /// Some examples of other technologies/CPUs:
22 /// SSE 3 - Pentium4 / Athlon64
23 /// SSE 4.1 - Penryn
24 /// SSE 4.2 - Nehalem
25 /// AVX - Sandy Bridge
26 /// AVX2 - Haswell
27 /// AVX-512 - Xeon Phi / Skylake
28 /// And some examples of instruction target dependent costs (latency)
29 ///                   divss   sqrtss   rsqrtss
30 ///   AMD K7          11-16   19       3
31 ///   Piledriver      9-24    13-15    5
32 ///   Jaguar          14      16       2
33 ///   Pentium II,III  18      30       2
34 ///   Nehalem         7-14    7-18     3
35 ///   Haswell         10-13   11       5
36 /// TODO: Develop and implement the target dependent cost model and
37 /// specialize cost numbers for different Cost Model Targets such as throughput,
38 /// code size, latency and uop count.
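/// As a rough illustration of how the tables below are consumed: a query such
/// as getArithmeticInstrCost(Mul, <8 x i32>) first legalizes the type (on
/// AVX2, v8i32 is already legal, so LT.first == 1), then looks the opcode/type
/// pair up in the most specific table the subtarget supports and returns
/// LT.first * Entry->Cost; types that must be split pay that split factor
/// through LT.first.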
39 //===----------------------------------------------------------------------===//
40 
41 #include "X86TargetTransformInfo.h"
42 #include "llvm/Analysis/TargetTransformInfo.h"
43 #include "llvm/CodeGen/BasicTTIImpl.h"
44 #include "llvm/CodeGen/CostTable.h"
45 #include "llvm/CodeGen/TargetLowering.h"
46 #include "llvm/IR/IntrinsicInst.h"
47 #include "llvm/Support/Debug.h"
48 
49 using namespace llvm;
50 
51 #define DEBUG_TYPE "x86tti"
52 
54 
55 //===----------------------------------------------------------------------===//
56 //
57 // X86 cost model.
58 //
59 //===----------------------------------------------------------------------===//
60 
61 TargetTransformInfo::PopcntSupportKind
62 X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
63  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
64  // TODO: Currently the __builtin_popcount() implementation using SSE3
65  // instructions is inefficient. Once the problem is fixed, we should
66  // call ST->hasSSE3() instead of ST->hasPOPCNT().
67  return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
68 }
69 
70 llvm::Optional<unsigned> X86TTIImpl::getCacheSize(
71   TargetTransformInfo::CacheLevel Level) const {
72  switch (Level) {
73  case TargetTransformInfo::CacheLevel::L1D:
74  // - Penryn
75  // - Nehalem
76  // - Westmere
77  // - Sandy Bridge
78  // - Ivy Bridge
79  // - Haswell
80  // - Broadwell
81  // - Skylake
82  // - Kabylake
83  return 32 * 1024; // 32 KByte
84  case TargetTransformInfo::CacheLevel::L2D:
85  // - Penryn
86  // - Nehalem
87  // - Westmere
88  // - Sandy Bridge
89  // - Ivy Bridge
90  // - Haswell
91  // - Broadwell
92  // - Skylake
93  // - Kabylake
94  return 256 * 1024; // 256 KByte
95  }
96 
97  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
98 }
99 
100 llvm::Optional<unsigned> X86TTIImpl::getCacheAssociativity(
101   TargetTransformInfo::CacheLevel Level) const {
102  // - Penryn
103  // - Nehalem
104  // - Westmere
105  // - Sandy Bridge
106  // - Ivy Bridge
107  // - Haswell
108  // - Broadwell
109  // - Skylake
110  // - Kabylake
111  switch (Level) {
112  case TargetTransformInfo::CacheLevel::L1D:
113   LLVM_FALLTHROUGH;
114  case TargetTransformInfo::CacheLevel::L2D:
115  return 8;
116  }
117 
118  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
119 }
120 
121 unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) {
122  if (Vector && !ST->hasSSE1())
123  return 0;
124 
125  if (ST->is64Bit()) {
126  if (Vector && ST->hasAVX512())
127  return 32;
128  return 16;
129  }
130  return 8;
131 }
132 
133 unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) const {
134  unsigned PreferVectorWidth = ST->getPreferVectorWidth();
135  if (Vector) {
136  if (ST->hasAVX512() && PreferVectorWidth >= 512)
137  return 512;
138  if (ST->hasAVX() && PreferVectorWidth >= 256)
139  return 256;
140  if (ST->hasSSE1() && PreferVectorWidth >= 128)
141  return 128;
142  return 0;
143  }
144 
145  if (ST->is64Bit())
146  return 64;
147 
148  return 32;
149 }
150 
151 unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
152  return getRegisterBitWidth(true);
153 }
154 
155 unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
156  // If the loop will not be vectorized, don't interleave the loop.
157  // Let the regular unroller handle the loop instead, which saves the
158  // overflow check and memory check cost.
159  if (VF == 1)
160  return 1;
161 
162  if (ST->isAtom())
163  return 1;
164 
165  // Sandybridge and Haswell have multiple execution ports and pipelined
166  // vector units.
167  if (ST->hasAVX())
168  return 4;
169 
170  return 2;
171 }
172 
173 int X86TTIImpl::getArithmeticInstrCost(
174  unsigned Opcode, Type *Ty,
175  TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
176  TTI::OperandValueProperties Opd1PropInfo,
177  TTI::OperandValueProperties Opd2PropInfo,
178  ArrayRef<const Value *> Args) {
179  // Legalize the type.
180  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
181 
182  int ISD = TLI->InstructionOpcodeToISD(Opcode);
183  assert(ISD && "Invalid opcode");
184 
185  static const CostTblEntry GLMCostTable[] = {
186  { ISD::FDIV, MVT::f32, 18 }, // divss
187  { ISD::FDIV, MVT::v4f32, 35 }, // divps
188  { ISD::FDIV, MVT::f64, 33 }, // divsd
189  { ISD::FDIV, MVT::v2f64, 65 }, // divpd
190  };
191 
192  if (ST->isGLM())
193  if (const auto *Entry = CostTableLookup(GLMCostTable, ISD,
194  LT.second))
195  return LT.first * Entry->Cost;
196 
197  static const CostTblEntry SLMCostTable[] = {
198  { ISD::MUL, MVT::v4i32, 11 }, // pmulld
199  { ISD::MUL, MVT::v8i16, 2 }, // pmullw
200  { ISD::MUL, MVT::v16i8, 14 }, // extend/pmullw/trunc sequence.
201  { ISD::FMUL, MVT::f64, 2 }, // mulsd
202  { ISD::FMUL, MVT::v2f64, 4 }, // mulpd
203  { ISD::FMUL, MVT::v4f32, 2 }, // mulps
204  { ISD::FDIV, MVT::f32, 17 }, // divss
205  { ISD::FDIV, MVT::v4f32, 39 }, // divps
206  { ISD::FDIV, MVT::f64, 32 }, // divsd
207  { ISD::FDIV, MVT::v2f64, 69 }, // divpd
208  { ISD::FADD, MVT::v2f64, 2 }, // addpd
209  { ISD::FSUB, MVT::v2f64, 2 }, // subpd
210  // v2i64/v4i64 mul is custom lowered as a series of long
211  // multiplies(3), shifts(3) and adds(2).
212  // slm muldq version throughput is 2 and addq throughput is 4,
213  // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
214  // 2X4 (addq throughput) = 17
215  { ISD::MUL, MVT::v2i64, 17 },
216  // slm addq\subq throughput is 4
217  { ISD::ADD, MVT::v2i64, 4 },
218  { ISD::SUB, MVT::v2i64, 4 },
219  };
220 
221  if (ST->isSLM()) {
222  if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) {
223  // Check if the operands can be shrunk into a smaller datatype.
224  bool Op1Signed = false;
225  unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
226  bool Op2Signed = false;
227  unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
228 
229  bool signedMode = Op1Signed | Op2Signed;
230  unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
231 
232  if (OpMinSize <= 7)
233  return LT.first * 3; // pmullw/sext
234  if (!signedMode && OpMinSize <= 8)
235  return LT.first * 3; // pmullw/zext
236  if (OpMinSize <= 15)
237  return LT.first * 5; // pmullw/pmulhw/pshuf
238  if (!signedMode && OpMinSize <= 16)
239  return LT.first * 5; // pmullw/pmulhw/pshuf
240  }
241 
242  if (const auto *Entry = CostTableLookup(SLMCostTable, ISD,
243  LT.second)) {
244  return LT.first * Entry->Cost;
245  }
246  }
247 
248  if ((ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
249  ISD == ISD::UREM) &&
250  (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
251  Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
252  Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
253  if (ISD == ISD::SDIV || ISD == ISD::SREM) {
254  // On X86, vector signed division by constants power-of-two are
255  // normally expanded to the sequence SRA + SRL + ADD + SRA.
256  // The OperandValue properties may not be the same as that of the previous
257  // operation; conservatively assume OP_None.
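  // For example, the SRA + SRL + ADD + SRA expansion for a 32-bit element
  // dividing by 8 is roughly:
  //   t = X >>s 31; t = t >>u 29; t = X + t; result = t >>s 3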
258  int Cost =
259  2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info, Op2Info,
260  TargetTransformInfo::OP_None,
261  TargetTransformInfo::OP_None);
262  Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
263  TargetTransformInfo::OP_None,
264  TargetTransformInfo::OP_None);
265  Cost += getArithmeticInstrCost(Instruction::Add, Ty, Op1Info, Op2Info,
266  TargetTransformInfo::OP_None,
267  TargetTransformInfo::OP_None);
268 
269  if (ISD == ISD::SREM) {
270  // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
271  Cost += getArithmeticInstrCost(Instruction::Mul, Ty, Op1Info, Op2Info);
272  Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Op1Info, Op2Info);
273  }
274 
275  return Cost;
276  }
277 
278  // Vector unsigned division/remainder will be simplified to shifts/masks.
279  if (ISD == ISD::UDIV)
280  return getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
281  TargetTransformInfo::OP_None,
282  TargetTransformInfo::OP_None);
283 
284  if (ISD == ISD::UREM)
285  return getArithmeticInstrCost(Instruction::And, Ty, Op1Info, Op2Info,
286  TargetTransformInfo::OP_None,
287  TargetTransformInfo::OP_None);
288  }
289 
290  static const CostTblEntry AVX512BWUniformConstCostTable[] = {
291  { ISD::SHL, MVT::v64i8, 2 }, // psllw + pand.
292  { ISD::SRL, MVT::v64i8, 2 }, // psrlw + pand.
293  { ISD::SRA, MVT::v64i8, 4 }, // psrlw, pand, pxor, psubb.
294  };
295 
296  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
297  ST->hasBWI()) {
298  if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD,
299  LT.second))
300  return LT.first * Entry->Cost;
301  }
302 
303  static const CostTblEntry AVX512UniformConstCostTable[] = {
304  { ISD::SRA, MVT::v2i64, 1 },
305  { ISD::SRA, MVT::v4i64, 1 },
306  { ISD::SRA, MVT::v8i64, 1 },
307  };
308 
309  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
310  ST->hasAVX512()) {
311  if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD,
312  LT.second))
313  return LT.first * Entry->Cost;
314  }
315 
316  static const CostTblEntry AVX2UniformConstCostTable[] = {
317  { ISD::SHL, MVT::v32i8, 2 }, // psllw + pand.
318  { ISD::SRL, MVT::v32i8, 2 }, // psrlw + pand.
319  { ISD::SRA, MVT::v32i8, 4 }, // psrlw, pand, pxor, psubb.
320 
321  { ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle.
322  };
323 
324  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
325  ST->hasAVX2()) {
326  if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD,
327  LT.second))
328  return LT.first * Entry->Cost;
329  }
330 
331  static const CostTblEntry SSE2UniformConstCostTable[] = {
332  { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand.
333  { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand.
334  { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb.
335 
336  { ISD::SHL, MVT::v32i8, 4+2 }, // 2*(psllw + pand) + split.
337  { ISD::SRL, MVT::v32i8, 4+2 }, // 2*(psrlw + pand) + split.
338  { ISD::SRA, MVT::v32i8, 8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.
339  };
340 
341  // XOP has faster vXi8 shifts.
342  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
343  ST->hasSSE2() && !ST->hasXOP()) {
344  if (const auto *Entry =
345  CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
346  return LT.first * Entry->Cost;
347  }
348 
349  static const CostTblEntry AVX512BWConstCostTable[] = {
350  { ISD::SDIV, MVT::v64i8, 14 }, // 2*ext+2*pmulhw sequence
351  { ISD::SREM, MVT::v64i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
352  { ISD::UDIV, MVT::v64i8, 14 }, // 2*ext+2*pmulhw sequence
353  { ISD::UREM, MVT::v64i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
354  { ISD::SDIV, MVT::v32i16, 6 }, // vpmulhw sequence
355  { ISD::SREM, MVT::v32i16, 8 }, // vpmulhw+mul+sub sequence
356  { ISD::UDIV, MVT::v32i16, 6 }, // vpmulhuw sequence
357  { ISD::UREM, MVT::v32i16, 8 }, // vpmulhuw+mul+sub sequence
358  };
359 
360  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
361  Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
362  ST->hasBWI()) {
363  if (const auto *Entry =
364  CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
365  return LT.first * Entry->Cost;
366  }
367 
368  static const CostTblEntry AVX512ConstCostTable[] = {
369  { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
370  { ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence
371  { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
372  { ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence
373  };
374 
375  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
376  Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
377  ST->hasAVX512()) {
378  if (const auto *Entry =
379  CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
380  return LT.first * Entry->Cost;
381  }
382 
383  static const CostTblEntry AVX2ConstCostTable[] = {
384  { ISD::SDIV, MVT::v32i8, 14 }, // 2*ext+2*pmulhw sequence
385  { ISD::SREM, MVT::v32i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
386  { ISD::UDIV, MVT::v32i8, 14 }, // 2*ext+2*pmulhw sequence
387  { ISD::UREM, MVT::v32i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
388  { ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence
389  { ISD::SREM, MVT::v16i16, 8 }, // vpmulhw+mul+sub sequence
390  { ISD::UDIV, MVT::v16i16, 6 }, // vpmulhuw sequence
391  { ISD::UREM, MVT::v16i16, 8 }, // vpmulhuw+mul+sub sequence
392  { ISD::SDIV, MVT::v8i32, 15 }, // vpmuldq sequence
393  { ISD::SREM, MVT::v8i32, 19 }, // vpmuldq+mul+sub sequence
394  { ISD::UDIV, MVT::v8i32, 15 }, // vpmuludq sequence
395  { ISD::UREM, MVT::v8i32, 19 }, // vpmuludq+mul+sub sequence
396  };
397 
398  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
399  Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
400  ST->hasAVX2()) {
401  if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
402  return LT.first * Entry->Cost;
403  }
404 
405  static const CostTblEntry SSE2ConstCostTable[] = {
406  { ISD::SDIV, MVT::v32i8, 28+2 }, // 4*ext+4*pmulhw sequence + split.
407  { ISD::SREM, MVT::v32i8, 32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
408  { ISD::SDIV, MVT::v16i8, 14 }, // 2*ext+2*pmulhw sequence
409  { ISD::SREM, MVT::v16i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
410  { ISD::UDIV, MVT::v32i8, 28+2 }, // 4*ext+4*pmulhw sequence + split.
411  { ISD::UREM, MVT::v32i8, 32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
412  { ISD::UDIV, MVT::v16i8, 14 }, // 2*ext+2*pmulhw sequence
413  { ISD::UREM, MVT::v16i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
414  { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
415  { ISD::SREM, MVT::v16i16, 16+2 }, // 2*pmulhw+mul+sub sequence + split.
416  { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence
417  { ISD::SREM, MVT::v8i16, 8 }, // pmulhw+mul+sub sequence
418  { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split.
419  { ISD::UREM, MVT::v16i16, 16+2 }, // 2*pmulhuw+mul+sub sequence + split.
420  { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence
421  { ISD::UREM, MVT::v8i16, 8 }, // pmulhuw+mul+sub sequence
422  { ISD::SDIV, MVT::v8i32, 38+2 }, // 2*pmuludq sequence + split.
423  { ISD::SREM, MVT::v8i32, 48+2 }, // 2*pmuludq+mul+sub sequence + split.
424  { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence
425  { ISD::SREM, MVT::v4i32, 24 }, // pmuludq+mul+sub sequence
426  { ISD::UDIV, MVT::v8i32, 30+2 }, // 2*pmuludq sequence + split.
427  { ISD::UREM, MVT::v8i32, 40+2 }, // 2*pmuludq+mul+sub sequence + split.
428  { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence
429  { ISD::UREM, MVT::v4i32, 20 }, // pmuludq+mul+sub sequence
430  };
431 
432  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
433  Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
434  ST->hasSSE2()) {
435  // pmuldq sequence.
436  if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
437  return LT.first * 32;
438  if (ISD == ISD::SREM && LT.second == MVT::v8i32 && ST->hasAVX())
439  return LT.first * 38;
440  if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
441  return LT.first * 15;
442  if (ISD == ISD::SREM && LT.second == MVT::v4i32 && ST->hasSSE41())
443  return LT.first * 20;
444 
445  if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
446  return LT.first * Entry->Cost;
447  }
448 
449  static const CostTblEntry AVX2UniformCostTable[] = {
450  // Uniform splats are cheaper for the following instructions.
451  { ISD::SHL, MVT::v16i16, 1 }, // psllw.
452  { ISD::SRL, MVT::v16i16, 1 }, // psrlw.
453  { ISD::SRA, MVT::v16i16, 1 }, // psraw.
454  };
455 
456  if (ST->hasAVX2() &&
457  ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
458  (Op2Info == TargetTransformInfo::OK_UniformValue))) {
459  if (const auto *Entry =
460  CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
461  return LT.first * Entry->Cost;
462  }
463 
464  static const CostTblEntry SSE2UniformCostTable[] = {
465  // Uniform splats are cheaper for the following instructions.
466  { ISD::SHL, MVT::v8i16, 1 }, // psllw.
467  { ISD::SHL, MVT::v4i32, 1 }, // pslld
468  { ISD::SHL, MVT::v2i64, 1 }, // psllq.
469 
470  { ISD::SRL, MVT::v8i16, 1 }, // psrlw.
471  { ISD::SRL, MVT::v4i32, 1 }, // psrld.
472  { ISD::SRL, MVT::v2i64, 1 }, // psrlq.
473 
474  { ISD::SRA, MVT::v8i16, 1 }, // psraw.
475  { ISD::SRA, MVT::v4i32, 1 }, // psrad.
476  };
477 
478  if (ST->hasSSE2() &&
479  ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
480  (Op2Info == TargetTransformInfo::OK_UniformValue))) {
481  if (const auto *Entry =
482  CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
483  return LT.first * Entry->Cost;
484  }
485 
486  static const CostTblEntry AVX512DQCostTable[] = {
487  { ISD::MUL, MVT::v2i64, 1 },
488  { ISD::MUL, MVT::v4i64, 1 },
489  { ISD::MUL, MVT::v8i64, 1 }
490  };
491 
492  // Look for AVX512DQ lowering tricks for custom cases.
493  if (ST->hasDQI())
494  if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
495  return LT.first * Entry->Cost;
496 
497  static const CostTblEntry AVX512BWCostTable[] = {
498  { ISD::SHL, MVT::v8i16, 1 }, // vpsllvw
499  { ISD::SRL, MVT::v8i16, 1 }, // vpsrlvw
500  { ISD::SRA, MVT::v8i16, 1 }, // vpsravw
501 
502  { ISD::SHL, MVT::v16i16, 1 }, // vpsllvw
503  { ISD::SRL, MVT::v16i16, 1 }, // vpsrlvw
504  { ISD::SRA, MVT::v16i16, 1 }, // vpsravw
505 
506  { ISD::SHL, MVT::v32i16, 1 }, // vpsllvw
507  { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw
508  { ISD::SRA, MVT::v32i16, 1 }, // vpsravw
509 
510  { ISD::SHL, MVT::v64i8, 11 }, // vpblendvb sequence.
511  { ISD::SRL, MVT::v64i8, 11 }, // vpblendvb sequence.
512  { ISD::SRA, MVT::v64i8, 24 }, // vpblendvb sequence.
513 
514  { ISD::MUL, MVT::v64i8, 11 }, // extend/pmullw/trunc sequence.
515  { ISD::MUL, MVT::v32i8, 4 }, // extend/pmullw/trunc sequence.
516  { ISD::MUL, MVT::v16i8, 4 }, // extend/pmullw/trunc sequence.
517  };
518 
519  // Look for AVX512BW lowering tricks for custom cases.
520  if (ST->hasBWI())
521  if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
522  return LT.first * Entry->Cost;
523 
524  static const CostTblEntry AVX512CostTable[] = {
525  { ISD::SHL, MVT::v16i32, 1 },
526  { ISD::SRL, MVT::v16i32, 1 },
527  { ISD::SRA, MVT::v16i32, 1 },
528 
529  { ISD::SHL, MVT::v8i64, 1 },
530  { ISD::SRL, MVT::v8i64, 1 },
531 
532  { ISD::SRA, MVT::v2i64, 1 },
533  { ISD::SRA, MVT::v4i64, 1 },
534  { ISD::SRA, MVT::v8i64, 1 },
535 
536  { ISD::MUL, MVT::v32i8, 13 }, // extend/pmullw/trunc sequence.
537  { ISD::MUL, MVT::v16i8, 5 }, // extend/pmullw/trunc sequence.
538  { ISD::MUL, MVT::v16i32, 1 }, // pmulld (Skylake from agner.org)
539  { ISD::MUL, MVT::v8i32, 1 }, // pmulld (Skylake from agner.org)
540  { ISD::MUL, MVT::v4i32, 1 }, // pmulld (Skylake from agner.org)
541  { ISD::MUL, MVT::v8i64, 8 }, // 3*pmuludq/3*shift/2*add
542 
543  { ISD::FADD, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
544  { ISD::FSUB, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
545  { ISD::FMUL, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
546 
547  { ISD::FADD, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
548  { ISD::FSUB, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
549  { ISD::FMUL, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
550  };
551 
552  if (ST->hasAVX512())
553  if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
554  return LT.first * Entry->Cost;
555 
556  static const CostTblEntry AVX2ShiftCostTable[] = {
557  // Shifts on v4i64/v8i32 are legal on AVX2 even though we declare them as
558  // custom in order to detect the cases where the shift amount is a scalar.
559  { ISD::SHL, MVT::v4i32, 1 },
560  { ISD::SRL, MVT::v4i32, 1 },
561  { ISD::SRA, MVT::v4i32, 1 },
562  { ISD::SHL, MVT::v8i32, 1 },
563  { ISD::SRL, MVT::v8i32, 1 },
564  { ISD::SRA, MVT::v8i32, 1 },
565  { ISD::SHL, MVT::v2i64, 1 },
566  { ISD::SRL, MVT::v2i64, 1 },
567  { ISD::SHL, MVT::v4i64, 1 },
568  { ISD::SRL, MVT::v4i64, 1 },
569  };
570 
571  // Look for AVX2 lowering tricks.
572  if (ST->hasAVX2()) {
573  if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
574  (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
575  Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
576  // On AVX2, a packed v16i16 shift left by a constant build_vector
577  // is lowered into a vector multiply (vpmullw).
578  return getArithmeticInstrCost(Instruction::Mul, Ty, Op1Info, Op2Info,
579  TargetTransformInfo::OP_None,
580  TargetTransformInfo::OP_None);
581 
582  if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
583  return LT.first * Entry->Cost;
584  }
585 
586  static const CostTblEntry XOPShiftCostTable[] = {
587  // 128bit shifts take 1cy, but right shifts require negation beforehand.
588  { ISD::SHL, MVT::v16i8, 1 },
589  { ISD::SRL, MVT::v16i8, 2 },
590  { ISD::SRA, MVT::v16i8, 2 },
591  { ISD::SHL, MVT::v8i16, 1 },
592  { ISD::SRL, MVT::v8i16, 2 },
593  { ISD::SRA, MVT::v8i16, 2 },
594  { ISD::SHL, MVT::v4i32, 1 },
595  { ISD::SRL, MVT::v4i32, 2 },
596  { ISD::SRA, MVT::v4i32, 2 },
597  { ISD::SHL, MVT::v2i64, 1 },
598  { ISD::SRL, MVT::v2i64, 2 },
599  { ISD::SRA, MVT::v2i64, 2 },
600  // 256bit shifts require splitting if AVX2 didn't catch them above.
601  { ISD::SHL, MVT::v32i8, 2+2 },
602  { ISD::SRL, MVT::v32i8, 4+2 },
603  { ISD::SRA, MVT::v32i8, 4+2 },
604  { ISD::SHL, MVT::v16i16, 2+2 },
605  { ISD::SRL, MVT::v16i16, 4+2 },
606  { ISD::SRA, MVT::v16i16, 4+2 },
607  { ISD::SHL, MVT::v8i32, 2+2 },
608  { ISD::SRL, MVT::v8i32, 4+2 },
609  { ISD::SRA, MVT::v8i32, 4+2 },
610  { ISD::SHL, MVT::v4i64, 2+2 },
611  { ISD::SRL, MVT::v4i64, 4+2 },
612  { ISD::SRA, MVT::v4i64, 4+2 },
613  };
614 
615  // Look for XOP lowering tricks.
616  if (ST->hasXOP()) {
617  // If the right shift is constant then we'll fold the negation so
618  // it's as cheap as a left shift.
619  int ShiftISD = ISD;
620  if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) &&
621  (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
622  Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
623  ShiftISD = ISD::SHL;
624  if (const auto *Entry =
625  CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
626  return LT.first * Entry->Cost;
627  }
628 
629  static const CostTblEntry SSE2UniformShiftCostTable[] = {
630  // Uniform splats are cheaper for the following instructions.
631  { ISD::SHL, MVT::v16i16, 2+2 }, // 2*psllw + split.
632  { ISD::SHL, MVT::v8i32, 2+2 }, // 2*pslld + split.
633  { ISD::SHL, MVT::v4i64, 2+2 }, // 2*psllq + split.
634 
635  { ISD::SRL, MVT::v16i16, 2+2 }, // 2*psrlw + split.
636  { ISD::SRL, MVT::v8i32, 2+2 }, // 2*psrld + split.
637  { ISD::SRL, MVT::v4i64, 2+2 }, // 2*psrlq + split.
638 
639  { ISD::SRA, MVT::v16i16, 2+2 }, // 2*psraw + split.
640  { ISD::SRA, MVT::v8i32, 2+2 }, // 2*psrad + split.
641  { ISD::SRA, MVT::v2i64, 4 }, // 2*psrad + shuffle.
642  { ISD::SRA, MVT::v4i64, 8+2 }, // 2*(2*psrad + shuffle) + split.
643  };
644 
645  if (ST->hasSSE2() &&
646  ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
647  (Op2Info == TargetTransformInfo::OK_UniformValue))) {
648 
649  // Handle AVX2 uniform v4i64 ISD::SRA, it's not worth a table.
650  if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2())
651  return LT.first * 4; // 2*psrad + shuffle.
652 
653  if (const auto *Entry =
654  CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second))
655  return LT.first * Entry->Cost;
656  }
657 
658  if (ISD == ISD::SHL &&
659  Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
660  MVT VT = LT.second;
661  // Vector shift left by a non-uniform constant can be lowered
662  // into a vector multiply.
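  // For example, shl <4 x i32> %x, <i32 1, i32 2, i32 3, i32 4> can be costed
  // as mul %x, <i32 2, i32 4, i32 8, i32 16>, i.e. a single pmulld/vpmulld.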
663  if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
664  ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
665  ISD = ISD::MUL;
666  }
667 
668  static const CostTblEntry AVX2CostTable[] = {
669  { ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence.
670  { ISD::SHL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
671 
672  { ISD::SRL, MVT::v32i8, 11 }, // vpblendvb sequence.
673  { ISD::SRL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
674 
675  { ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence.
676  { ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence.
677  { ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence.
678  { ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence.
679 
680  { ISD::SUB, MVT::v32i8, 1 }, // psubb
681  { ISD::ADD, MVT::v32i8, 1 }, // paddb
682  { ISD::SUB, MVT::v16i16, 1 }, // psubw
683  { ISD::ADD, MVT::v16i16, 1 }, // paddw
684  { ISD::SUB, MVT::v8i32, 1 }, // psubd
685  { ISD::ADD, MVT::v8i32, 1 }, // paddd
686  { ISD::SUB, MVT::v4i64, 1 }, // psubq
687  { ISD::ADD, MVT::v4i64, 1 }, // paddq
688 
689  { ISD::MUL, MVT::v32i8, 17 }, // extend/pmullw/trunc sequence.
690  { ISD::MUL, MVT::v16i8, 7 }, // extend/pmullw/trunc sequence.
691  { ISD::MUL, MVT::v16i16, 1 }, // pmullw
692  { ISD::MUL, MVT::v8i32, 2 }, // pmulld (Haswell from agner.org)
693  { ISD::MUL, MVT::v4i64, 8 }, // 3*pmuludq/3*shift/2*add
694 
695  { ISD::FADD, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
696  { ISD::FADD, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
697  { ISD::FSUB, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
698  { ISD::FSUB, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
699  { ISD::FMUL, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
700  { ISD::FMUL, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
701 
702  { ISD::FDIV, MVT::f32, 7 }, // Haswell from http://www.agner.org/
703  { ISD::FDIV, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
704  { ISD::FDIV, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
705  { ISD::FDIV, MVT::f64, 14 }, // Haswell from http://www.agner.org/
706  { ISD::FDIV, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
707  { ISD::FDIV, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
708  };
709 
710  // Look for AVX2 lowering tricks for custom cases.
711  if (ST->hasAVX2())
712  if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
713  return LT.first * Entry->Cost;
714 
715  static const CostTblEntry AVX1CostTable[] = {
716  // We don't have to scalarize unsupported ops. We can issue two half-sized
717  // operations and we only need to extract the upper YMM half.
718  // Two ops + 1 extract + 1 insert = 4.
719  { ISD::MUL, MVT::v16i16, 4 },
720  { ISD::MUL, MVT::v8i32, 4 },
721  { ISD::SUB, MVT::v32i8, 4 },
722  { ISD::ADD, MVT::v32i8, 4 },
723  { ISD::SUB, MVT::v16i16, 4 },
724  { ISD::ADD, MVT::v16i16, 4 },
725  { ISD::SUB, MVT::v8i32, 4 },
726  { ISD::ADD, MVT::v8i32, 4 },
727  { ISD::SUB, MVT::v4i64, 4 },
728  { ISD::ADD, MVT::v4i64, 4 },
729 
730  // A v4i64 multiply is custom lowered as two split v2i64 vectors that then
731  // are lowered as a series of long multiplies(3), shifts(3) and adds(2)
732  // Because we believe v4i64 to be a legal type, we must also include the
733  // extract+insert in the cost table. Therefore, the cost here is 18
734  // instead of 8.
735  { ISD::MUL, MVT::v4i64, 18 },
736 
737  { ISD::MUL, MVT::v32i8, 26 }, // extend/pmullw/trunc sequence.
738 
739  { ISD::FDIV, MVT::f32, 14 }, // SNB from http://www.agner.org/
740  { ISD::FDIV, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
741  { ISD::FDIV, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
742  { ISD::FDIV, MVT::f64, 22 }, // SNB from http://www.agner.org/
743  { ISD::FDIV, MVT::v2f64, 22 }, // SNB from http://www.agner.org/
744  { ISD::FDIV, MVT::v4f64, 44 }, // SNB from http://www.agner.org/
745  };
746 
747  if (ST->hasAVX())
748  if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
749  return LT.first * Entry->Cost;
750 
751  static const CostTblEntry SSE42CostTable[] = {
752  { ISD::FADD, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
753  { ISD::FADD, MVT::f32, 1 }, // Nehalem from http://www.agner.org/
754  { ISD::FADD, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
755  { ISD::FADD, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
756 
757  { ISD::FSUB, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
758  { ISD::FSUB, MVT::f32 , 1 }, // Nehalem from http://www.agner.org/
759  { ISD::FSUB, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
760  { ISD::FSUB, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
761 
762  { ISD::FMUL, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
763  { ISD::FMUL, MVT::f32, 1 }, // Nehalem from http://www.agner.org/
764  { ISD::FMUL, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
765  { ISD::FMUL, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
766 
767  { ISD::FDIV, MVT::f32, 14 }, // Nehalem from http://www.agner.org/
768  { ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
769  { ISD::FDIV, MVT::f64, 22 }, // Nehalem from http://www.agner.org/
770  { ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/
771  };
772 
773  if (ST->hasSSE42())
774  if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
775  return LT.first * Entry->Cost;
776 
777  static const CostTblEntry SSE41CostTable[] = {
778  { ISD::SHL, MVT::v16i8, 11 }, // pblendvb sequence.
779  { ISD::SHL, MVT::v32i8, 2*11+2 }, // pblendvb sequence + split.
780  { ISD::SHL, MVT::v8i16, 14 }, // pblendvb sequence.
781  { ISD::SHL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
782  { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld
783  { ISD::SHL, MVT::v8i32, 2*4+2 }, // pslld/paddd/cvttps2dq/pmulld + split
784 
785  { ISD::SRL, MVT::v16i8, 12 }, // pblendvb sequence.
786  { ISD::SRL, MVT::v32i8, 2*12+2 }, // pblendvb sequence + split.
787  { ISD::SRL, MVT::v8i16, 14 }, // pblendvb sequence.
788  { ISD::SRL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
789  { ISD::SRL, MVT::v4i32, 11 }, // Shift each lane + blend.
790  { ISD::SRL, MVT::v8i32, 2*11+2 }, // Shift each lane + blend + split.
791 
792  { ISD::SRA, MVT::v16i8, 24 }, // pblendvb sequence.
793  { ISD::SRA, MVT::v32i8, 2*24+2 }, // pblendvb sequence + split.
794  { ISD::SRA, MVT::v8i16, 14 }, // pblendvb sequence.
795  { ISD::SRA, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
796  { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend.
797  { ISD::SRA, MVT::v8i32, 2*12+2 }, // Shift each lane + blend + split.
798 
799  { ISD::MUL, MVT::v4i32, 2 } // pmulld (Nehalem from agner.org)
800  };
801 
802  if (ST->hasSSE41())
803  if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
804  return LT.first * Entry->Cost;
805 
806  static const CostTblEntry SSE2CostTable[] = {
807  // We don't correctly identify costs of casts because they are marked as
808  // custom.
809  { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence.
810  { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence.
811  { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul.
812  { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence.
813  { ISD::SHL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split.
814 
815  { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence.
816  { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence.
817  { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend.
818  { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence.
819  { ISD::SRL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split.
820 
821  { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence.
822  { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence.
823  { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend.
824  { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence.
825  { ISD::SRA, MVT::v4i64, 2*12+2 }, // srl/xor/sub sequence+split.
826 
827  { ISD::MUL, MVT::v16i8, 12 }, // extend/pmullw/trunc sequence.
828  { ISD::MUL, MVT::v8i16, 1 }, // pmullw
829  { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle
830  { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add
831 
832  { ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/
833  { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/
834  { ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/
835  { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/
836 
837  { ISD::FADD, MVT::f32, 2 }, // Pentium IV from http://www.agner.org/
838  { ISD::FADD, MVT::f64, 2 }, // Pentium IV from http://www.agner.org/
839 
840  { ISD::FSUB, MVT::f32, 2 }, // Pentium IV from http://www.agner.org/
841  { ISD::FSUB, MVT::f64, 2 }, // Pentium IV from http://www.agner.org/
842  };
843 
844  if (ST->hasSSE2())
845  if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
846  return LT.first * Entry->Cost;
847 
848  static const CostTblEntry SSE1CostTable[] = {
849  { ISD::FDIV, MVT::f32, 17 }, // Pentium III from http://www.agner.org/
850  { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/
851 
852  { ISD::FADD, MVT::f32, 1 }, // Pentium III from http://www.agner.org/
853  { ISD::FADD, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/
854 
855  { ISD::FSUB, MVT::f32, 1 }, // Pentium III from http://www.agner.org/
856  { ISD::FSUB, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/
857 
858  { ISD::ADD, MVT::i8, 1 }, // Pentium III from http://www.agner.org/
859  { ISD::ADD, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
860  { ISD::ADD, MVT::i32, 1 }, // Pentium III from http://www.agner.org/
861 
862  { ISD::SUB, MVT::i8, 1 }, // Pentium III from http://www.agner.org/
863  { ISD::SUB, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
864  { ISD::SUB, MVT::i32, 1 }, // Pentium III from http://www.agner.org/
865  };
866 
867  if (ST->hasSSE1())
868  if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
869  return LT.first * Entry->Cost;
870 
871  // It is not a good idea to vectorize division. We have to scalarize it and
872  // in the process we will often end up having to spill regular
873  // registers. The overhead of division is going to dominate most kernels
874  // anyway, so try hard to prevent vectorization of division - it is
875  // generally a bad idea. Assume somewhat arbitrarily that we have to be able
876  // to hide "20 cycles" for each lane.
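  // For example, under this heuristic a <4 x i32> sdiv on SSE2 is charged
  // 20 * 1 (legal type) * 4 (lanes) * ScalarCost, which is large enough to
  // steer the vectorizers away from division-heavy loops.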
877  if (LT.second.isVector() && (ISD == ISD::SDIV || ISD == ISD::SREM ||
878  ISD == ISD::UDIV || ISD == ISD::UREM)) {
879  int ScalarCost = getArithmeticInstrCost(
880  Opcode, Ty->getScalarType(), Op1Info, Op2Info,
881  TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
882  return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
883  }
884 
885  // Fallback to the default implementation.
886  return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info);
887 }
888 
889 int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
890  Type *SubTp) {
891  // 64-bit packed float vectors (v2f32) are widened to type v4f32.
892  // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
893  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
894 
895  // Treat Transpose as 2-op shuffles - there's no difference in lowering.
896  if (Kind == TTI::SK_Transpose)
897  Kind = TTI::SK_PermuteTwoSrc;
898 
899  // For Broadcasts we are splatting the first element from the first input
900  // register, so we only need to reference that input; all of the output
901  // registers are the same.
902  if (Kind == TTI::SK_Broadcast)
903  LT.first = 1;
904 
905  // Subvector extractions are free if they start at the beginning of a
906  // vector and cheap if the subvectors are aligned.
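  // For example, extracting the low <4 x float> half of a <8 x float> vector
  // is free, while on AVX extracting the aligned upper half costs only the
  // subvector's legalization factor (SubLT.first, usually 1).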
907  if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
908  int NumElts = LT.second.getVectorNumElements();
909  if ((Index % NumElts) == 0)
910  return 0;
911  std::pair<int, MVT> SubLT = TLI->getTypeLegalizationCost(DL, SubTp);
912  if (SubLT.second.isVector()) {
913  int NumSubElts = SubLT.second.getVectorNumElements();
914  if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
915  return SubLT.first;
916  // Handle some cases for widening legalization. For now we only handle
917  // cases where the original subvector was naturally aligned and evenly
918  // fit in its legalized subvector type.
919  // FIXME: Remove some of the alignment restrictions.
920  // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
921  // vectors.
922  int OrigSubElts = SubTp->getVectorNumElements();
923  if (ExperimentalVectorWideningLegalization &&
924  NumSubElts > OrigSubElts &&
925  (Index % OrigSubElts) == 0 && (NumSubElts % OrigSubElts) == 0 &&
926  LT.second.getVectorElementType() ==
927  SubLT.second.getVectorElementType() &&
928  LT.second.getVectorElementType().getSizeInBits() ==
929  SubTp->getVectorElementType()->getPrimitiveSizeInBits()) {
930  assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
931  "Unexpected number of elements!");
932  Type *VecTy = VectorType::get(Tp->getVectorElementType(),
933  LT.second.getVectorNumElements());
934  Type *SubTy = VectorType::get(Tp->getVectorElementType(),
935  SubLT.second.getVectorNumElements());
936  int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
937  int ExtractCost = getShuffleCost(TTI::SK_ExtractSubvector, VecTy,
938  ExtractIndex, SubTy);
939 
940  // If the original size is 32-bits or more, we can use pshufd. Otherwise
941  // if we have SSSE3 we can use pshufb.
942  if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
943  return ExtractCost + 1; // pshufd or pshufb
944 
945  assert(SubTp->getPrimitiveSizeInBits() == 16 &&
946  "Unexpected vector size");
947 
948  return ExtractCost + 2; // worst case pshufhw + pshufd
949  }
950  }
951  }
952 
953  // We are going to permute multiple sources and the result will be in multiple
954  // destinations. We provide an accurate cost only for splits where the element
955  // type remains the same.
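  // For example, a single-source v32i8 permute on an SSE-only target
  // legalizes to two v16i8 registers (NumOfSrcs == 2, NumOfDests == 2), so
  // the cost below becomes (2 - 1) * 2 two-input v16i8 shuffles.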
956  if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
957  MVT LegalVT = LT.second;
958  if (LegalVT.isVector() &&
959  LegalVT.getVectorElementType().getSizeInBits() ==
960  Tp->getVectorElementType()->getPrimitiveSizeInBits() &&
961  LegalVT.getVectorNumElements() < Tp->getVectorNumElements()) {
962 
963  unsigned VecTySize = DL.getTypeStoreSize(Tp);
964  unsigned LegalVTSize = LegalVT.getStoreSize();
965  // Number of source vectors after legalization:
966  unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
967  // Number of destination vectors after legalization:
968  unsigned NumOfDests = LT.first;
969 
970  Type *SingleOpTy = VectorType::get(Tp->getVectorElementType(),
971  LegalVT.getVectorNumElements());
972 
973  unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
974  return NumOfShuffles *
975  getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr);
976  }
977 
978  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
979  }
980 
981  // For 2-input shuffles, we must account for splitting the 2 inputs into many.
982  if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
983  // We assume that source and destination have the same vector type.
984  int NumOfDests = LT.first;
985  int NumOfShufflesPerDest = LT.first * 2 - 1;
986  LT.first = NumOfDests * NumOfShufflesPerDest;
987  }
988 
989  static const CostTblEntry AVX512VBMIShuffleTbl[] = {
990  {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
991  {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb
992 
993  {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
994  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb
995 
996  {TTI::SK_PermuteTwoSrc, MVT::v64i8, 1}, // vpermt2b
997  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 1}, // vpermt2b
998  {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1} // vpermt2b
999  };
1000 
1001  if (ST->hasVBMI())
1002  if (const auto *Entry =
1003  CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
1004  return LT.first * Entry->Cost;
1005 
1006  static const CostTblEntry AVX512BWShuffleTbl[] = {
1007  {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
1008  {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
1009 
1010  {TTI::SK_Reverse, MVT::v32i16, 1}, // vpermw
1011  {TTI::SK_Reverse, MVT::v16i16, 1}, // vpermw
1012  {TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2
1013 
1014  {TTI::SK_PermuteSingleSrc, MVT::v32i16, 1}, // vpermw
1015  {TTI::SK_PermuteSingleSrc, MVT::v16i16, 1}, // vpermw
1016  {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // vpermw
1017  {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16
1018  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 3}, // vpermw + zext/trunc
1019 
1020  {TTI::SK_PermuteTwoSrc, MVT::v32i16, 1}, // vpermt2w
1021  {TTI::SK_PermuteTwoSrc, MVT::v16i16, 1}, // vpermt2w
1022  {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpermt2w
1023  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 3}, // zext + vpermt2w + trunc
1024  {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
1025  {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3} // zext + vpermt2w + trunc
1026  };
1027 
1028  if (ST->hasBWI())
1029  if (const auto *Entry =
1030  CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
1031  return LT.first * Entry->Cost;
1032 
1033  static const CostTblEntry AVX512ShuffleTbl[] = {
1034  {TTI::SK_Broadcast, MVT::v8f64, 1}, // vbroadcastpd
1035  {TTI::SK_Broadcast, MVT::v16f32, 1}, // vbroadcastps
1036  {TTI::SK_Broadcast, MVT::v8i64, 1}, // vpbroadcastq
1037  {TTI::SK_Broadcast, MVT::v16i32, 1}, // vpbroadcastd
1038 
1039  {TTI::SK_Reverse, MVT::v8f64, 1}, // vpermpd
1040  {TTI::SK_Reverse, MVT::v16f32, 1}, // vpermps
1041  {TTI::SK_Reverse, MVT::v8i64, 1}, // vpermq
1042  {TTI::SK_Reverse, MVT::v16i32, 1}, // vpermd
1043 
1044  {TTI::SK_PermuteSingleSrc, MVT::v8f64, 1}, // vpermpd
1045  {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
1046  {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // vpermpd
1047  {TTI::SK_PermuteSingleSrc, MVT::v16f32, 1}, // vpermps
1048  {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
1049  {TTI::SK_PermuteSingleSrc, MVT::v4f32, 1}, // vpermps
1050  {TTI::SK_PermuteSingleSrc, MVT::v8i64, 1}, // vpermq
1051  {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
1052  {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // vpermq
1053  {TTI::SK_PermuteSingleSrc, MVT::v16i32, 1}, // vpermd
1054  {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
1055  {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // vpermd
1056  {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
1057 
1058  {TTI::SK_PermuteTwoSrc, MVT::v8f64, 1}, // vpermt2pd
1059  {TTI::SK_PermuteTwoSrc, MVT::v16f32, 1}, // vpermt2ps
1060  {TTI::SK_PermuteTwoSrc, MVT::v8i64, 1}, // vpermt2q
1061  {TTI::SK_PermuteTwoSrc, MVT::v16i32, 1}, // vpermt2d
1062  {TTI::SK_PermuteTwoSrc, MVT::v4f64, 1}, // vpermt2pd
1063  {TTI::SK_PermuteTwoSrc, MVT::v8f32, 1}, // vpermt2ps
1064  {TTI::SK_PermuteTwoSrc, MVT::v4i64, 1}, // vpermt2q
1065  {TTI::SK_PermuteTwoSrc, MVT::v8i32, 1}, // vpermt2d
1066  {TTI::SK_PermuteTwoSrc, MVT::v2f64, 1}, // vpermt2pd
1067  {TTI::SK_PermuteTwoSrc, MVT::v4f32, 1}, // vpermt2ps
1068  {TTI::SK_PermuteTwoSrc, MVT::v2i64, 1}, // vpermt2q
1069  {TTI::SK_PermuteTwoSrc, MVT::v4i32, 1} // vpermt2d
1070  };
1071 
1072  if (ST->hasAVX512())
1073  if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
1074  return LT.first * Entry->Cost;
1075 
1076  static const CostTblEntry AVX2ShuffleTbl[] = {
1077  {TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastpd
1078  {TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastps
1079  {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq
1080  {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd
1081  {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
1082  {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb
1083 
1084  {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd
1085  {TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps
1086  {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq
1087  {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd
1088  {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
1089  {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb
1090 
1091  {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
1092  {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb
1093 
1094  {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
1095  {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
1096  {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
1097  {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
1098  {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
1099  // + vpblendvb
1100  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb
1101  // + vpblendvb
1102 
1103  {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd
1104  {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3}, // 2*vpermps + vblendps
1105  {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd
1106  {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd
1107  {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
1108  // + vpblendvb
1109  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb
1110  // + vpblendvb
1111  };
1112 
1113  if (ST->hasAVX2())
1114  if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
1115  return LT.first * Entry->Cost;
1116 
1117  static const CostTblEntry XOPShuffleTbl[] = {
1118  {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd
1119  {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps
1120  {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd
1121  {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps
1122  {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
1123  // + vinsertf128
1124  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm
1125  // + vinsertf128
1126 
1127  {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
1128  // + vinsertf128
1129  {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm
1130  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm
1131  // + vinsertf128
1132  {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm
1133  };
1134 
1135  if (ST->hasXOP())
1136  if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
1137  return LT.first * Entry->Cost;
1138 
1139  static const CostTblEntry AVX1ShuffleTbl[] = {
1140  {TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1141  {TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1142  {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1143  {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1144  {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
1145  {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128
1146 
1147  {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1148  {TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1149  {TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1150  {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1151  {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
1152  // + vinsertf128
1153  {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb
1154  // + vinsertf128
1155 
1156  {TTI::SK_Select, MVT::v4i64, 1}, // vblendpd
1157  {TTI::SK_Select, MVT::v4f64, 1}, // vblendpd
1158  {TTI::SK_Select, MVT::v8i32, 1}, // vblendps
1159  {TTI::SK_Select, MVT::v8f32, 1}, // vblendps
1160  {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
1161  {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor
1162 
1163  {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd
1164  {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd
1165  {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1166  {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1167  {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
1168  // + 2*por + vinsertf128
1169  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb
1170  // + 2*por + vinsertf128
1171 
1172  {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd
1173  {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd
1174  {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1175  {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1176  {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
1177  // + 4*por + vinsertf128
1178  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb
1179  // + 4*por + vinsertf128
1180  };
1181 
1182  if (ST->hasAVX())
1183  if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
1184  return LT.first * Entry->Cost;
1185 
1186  static const CostTblEntry SSE41ShuffleTbl[] = {
1187  {TTI::SK_Select, MVT::v2i64, 1}, // pblendw
1188  {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1189  {TTI::SK_Select, MVT::v4i32, 1}, // pblendw
1190  {TTI::SK_Select, MVT::v4f32, 1}, // blendps
1191  {TTI::SK_Select, MVT::v8i16, 1}, // pblendw
1192  {TTI::SK_Select, MVT::v16i8, 1} // pblendvb
1193  };
1194 
1195  if (ST->hasSSE41())
1196  if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
1197  return LT.first * Entry->Cost;
1198 
1199  static const CostTblEntry SSSE3ShuffleTbl[] = {
1200  {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
1201  {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb
1202 
1203  {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
1204  {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb
1205 
1206  {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
1207  {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por
1208 
1209  {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
1210  {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
1211 
1212  {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
1213  {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
1214  };
1215 
1216  if (ST->hasSSSE3())
1217  if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
1218  return LT.first * Entry->Cost;
1219 
1220  static const CostTblEntry SSE2ShuffleTbl[] = {
1221  {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd
1222  {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
1223  {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
1224  {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
1225  {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd
1226 
1227  {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
1228  {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
1229  {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
1230  {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
1231  {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
1232  // + 2*pshufd + 2*unpck + packus
1233 
1234  {TTI::SK_Select, MVT::v2i64, 1}, // movsd
1235  {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1236  {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
1237  {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
1238  {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por
1239 
1240  {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
1241  {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd
1242  {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
1243  {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
1244  // + pshufd/unpck
1245  { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
1246  // + 2*pshufd + 2*unpck + 2*packus
1247 
1248  { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd
1249  { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd
1250  { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd}
1251  { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute
1252  { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
1253  };
1254 
1255  if (ST->hasSSE2())
1256  if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
1257  return LT.first * Entry->Cost;
1258 
1259  static const CostTblEntry SSE1ShuffleTbl[] = {
1260  { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
1261  { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
1262  { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps
1263  { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
1264  { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
1265  };
1266 
1267  if (ST->hasSSE1())
1268  if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
1269  return LT.first * Entry->Cost;
1270 
1271  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
1272 }
1273 
1274 int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
1275  const Instruction *I) {
1276  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1277  assert(ISD && "Invalid opcode");
1278 
1279  // FIXME: Need a better design of the cost table to handle non-simple types of
1280  // potential massive combinations (elem_num x src_type x dst_type).
1281 
1282  static const TypeConversionCostTblEntry AVX512BWConversionTbl[] {
1285 
1286  // Mask sign extend has an instruction.
1293 
1294  // Mask zero extend is a load + broadcast.
1301  };
1302 
1303  static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
1310 
1317 
1324 
1331  };
1332 
1333  // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
1334  // 256-bit wide vectors.
1335 
1336  // Used with widening legalization
1337  static const TypeConversionCostTblEntry AVX512FConversionTblWide[] = {
1340  };
1341 
1342  static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
1346 
1351 
1352  // v16i1 -> v16i32 - load + broadcast
1363 
1372 
1397 
1401 
1411  };
1412 
1413  static const TypeConversionCostTblEntry AVX2ConversionTblWide[] = {
1420  };
1421 
1422  static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
1439 
1446 
1449 
1451  };
1452 
1453  static const TypeConversionCostTblEntry AVXConversionTblWide[] = {
1457  };
1458 
1459  static const TypeConversionCostTblEntry AVXConversionTbl[] = {
1476 
1484 
1497 
1513  // The generic code to compute the scalar overhead is currently broken.
1514  // Workaround this limitation by estimating the scalarization overhead
1515  // here. We have roughly 10 instructions per scalar element.
1516  // Multiply that by the vector width.
1517  // FIXME: remove that when PR19268 is fixed.
1520 
1523  // This node is expanded into scalarized operations but BasicTTI is overly
1524  // optimistic estimating its cost. It computes 3 per element (one
1525  // vector-extract, one scalar conversion and one vector-insert). The
1526  // problem is that the inserts form a read-modify-write chain so latency
1527  // should be factored in too. Inflating the cost per element by 1.
1530 
1533  };
1534 
1535  static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
1542 
1561 
1569 
1571  };
1572 
1573  static const TypeConversionCostTblEntry SSE2ConversionTblWide[] = {
1576  };
1577 
1578  static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
1579  // These are somewhat magic numbers justified by looking at the output of
1580  // Intel's IACA, running some kernels and making sure when we take
1581  // legalization into account the throughput will be overestimated.
1583  { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
1590 
1591  { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
1599 
1601 
1605 
1630 
1640  };
1641 
1642  std::pair<int, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
1643  std::pair<int, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst);
1644 
1645  if (ST->hasSSE2() && !ST->hasAVX() &&
1646  ExperimentalVectorWideningLegalization) {
1647  if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTblWide, ISD,
1648  LTDest.second, LTSrc.second))
1649  return LTSrc.first * Entry->Cost;
1650  }
1651 
1652  if (ST->hasSSE2() && !ST->hasAVX()) {
1653  if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
1654  LTDest.second, LTSrc.second))
1655  return LTSrc.first * Entry->Cost;
1656  }
1657 
1658  EVT SrcTy = TLI->getValueType(DL, Src);
1659  EVT DstTy = TLI->getValueType(DL, Dst);
1660 
1661  // The function getSimpleVT only handles simple value types.
1662  if (!SrcTy.isSimple() || !DstTy.isSimple())
1663  return BaseT::getCastInstrCost(Opcode, Dst, Src);
1664 
1665  MVT SimpleSrcTy = SrcTy.getSimpleVT();
1666  MVT SimpleDstTy = DstTy.getSimpleVT();
1667 
1668  // Make sure that neither type is going to be split before using the
1669  // AVX512 tables. This handles -mprefer-vector-width=256
1670  // with -min-legal-vector-width<=256
1671  if (TLI->getTypeAction(SimpleSrcTy) != TargetLowering::TypeSplitVector &&
1672  TLI->getTypeAction(SimpleDstTy) != TargetLowering::TypeSplitVector) {
1673  if (ST->hasBWI())
1674  if (const auto *Entry = ConvertCostTableLookup(AVX512BWConversionTbl, ISD,
1675  SimpleDstTy, SimpleSrcTy))
1676  return Entry->Cost;
1677 
1678  if (ST->hasDQI())
1679  if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD,
1680  SimpleDstTy, SimpleSrcTy))
1681  return Entry->Cost;
1682 
1683  if (ST->hasAVX512() && ExperimentalVectorWideningLegalization)
1684  if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTblWide, ISD,
1685  SimpleDstTy, SimpleSrcTy))
1686  return Entry->Cost;
1687 
1688  if (ST->hasAVX512())
1689  if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD,
1690  SimpleDstTy, SimpleSrcTy))
1691  return Entry->Cost;
1692  }
1693 
1694  if (ST->hasAVX2() && ExperimentalVectorWideningLegalization) {
1695  if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTblWide, ISD,
1696  SimpleDstTy, SimpleSrcTy))
1697  return Entry->Cost;
1698  }
1699 
1700  if (ST->hasAVX2()) {
1701  if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
1702  SimpleDstTy, SimpleSrcTy))
1703  return Entry->Cost;
1704  }
1705 
1706  if (ST->hasAVX() && ExperimentalVectorWideningLegalization) {
1707  if (const auto *Entry = ConvertCostTableLookup(AVXConversionTblWide, ISD,
1708  SimpleDstTy, SimpleSrcTy))
1709  return Entry->Cost;
1710  }
1711 
1712  if (ST->hasAVX()) {
1713  if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
1714  SimpleDstTy, SimpleSrcTy))
1715  return Entry->Cost;
1716  }
1717 
1718  if (ST->hasSSE41()) {
1719  if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
1720  SimpleDstTy, SimpleSrcTy))
1721  return Entry->Cost;
1722  }
1723 
1725  if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTblWide, ISD,
1726  SimpleDstTy, SimpleSrcTy))
1727  return Entry->Cost;
1728  }
1729 
1730  if (ST->hasSSE2()) {
1731  if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
1732  SimpleDstTy, SimpleSrcTy))
1733  return Entry->Cost;
1734  }
1735 
1736  return BaseT::getCastInstrCost(Opcode, Dst, Src, I);
1737 }
1738 
1739 int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
1740  const Instruction *I) {
1741  // Legalize the type.
1742  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
1743 
1744  MVT MTy = LT.second;
1745 
1746  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1747  assert(ISD && "Invalid opcode");
1748 
1749  unsigned ExtraCost = 0;
1750  if (I && (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp)) {
1751  // Some vector comparison predicates cost extra instructions.
1752  if (MTy.isVector() &&
1753  !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
1754  (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
1755  ST->hasBWI())) {
1756  switch (cast<CmpInst>(I)->getPredicate()) {
1757  case CmpInst::Predicate::ICMP_NE:
1758  // xor(cmpeq(x,y),-1)
1759  ExtraCost = 1;
1760  break;
1761  case CmpInst::Predicate::ICMP_SGE:
1762  case CmpInst::Predicate::ICMP_SLE:
1763  // xor(cmpgt(x,y),-1)
1764  ExtraCost = 1;
1765  break;
1766  case CmpInst::Predicate::ICMP_ULT:
1767  case CmpInst::Predicate::ICMP_UGT:
1768  // cmpgt(xor(x,signbit),xor(y,signbit))
1769  // xor(cmpeq(pmaxu(x,y),x),-1)
1770  ExtraCost = 2;
1771  break;
1772  case CmpInst::Predicate::ICMP_ULE:
1773  case CmpInst::Predicate::ICMP_UGE:
1774  if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
1775  (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
1776  // cmpeq(psubus(x,y),0)
1777  // cmpeq(pminu(x,y),x)
1778  ExtraCost = 1;
1779  } else {
1780  // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
1781  ExtraCost = 3;
1782  }
1783  break;
1784  default:
1785  break;
1786  }
1787  }
1788  }
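  // For example, on an SSE2-only target an ICMP_UGT on <4 x i32> needs a
  // sign-bit flip of both operands before the signed pcmpgtd, so ExtraCost is
  // 2 on top of the SETCC cost looked up in the tables below.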
1789 
1790  static const CostTblEntry AVX512BWCostTbl[] = {
1791  { ISD::SETCC, MVT::v32i16, 1 },
1792  { ISD::SETCC, MVT::v64i8, 1 },
1793 
1794  { ISD::SELECT, MVT::v32i16, 1 },
1795  { ISD::SELECT, MVT::v64i8, 1 },
1796  };
1797 
1798  static const CostTblEntry AVX512CostTbl[] = {
1799  { ISD::SETCC, MVT::v8i64, 1 },
1800  { ISD::SETCC, MVT::v16i32, 1 },
1801  { ISD::SETCC, MVT::v8f64, 1 },
1802  { ISD::SETCC, MVT::v16f32, 1 },
1803 
1804  { ISD::SELECT, MVT::v8i64, 1 },
1805  { ISD::SELECT, MVT::v16i32, 1 },
1806  { ISD::SELECT, MVT::v8f64, 1 },
1807  { ISD::SELECT, MVT::v16f32, 1 },
1808  };
1809 
1810  static const CostTblEntry AVX2CostTbl[] = {
1811  { ISD::SETCC, MVT::v4i64, 1 },
1812  { ISD::SETCC, MVT::v8i32, 1 },
1813  { ISD::SETCC, MVT::v16i16, 1 },
1814  { ISD::SETCC, MVT::v32i8, 1 },
1815 
1816  { ISD::SELECT, MVT::v4i64, 1 }, // pblendvb
1817  { ISD::SELECT, MVT::v8i32, 1 }, // pblendvb
1818  { ISD::SELECT, MVT::v16i16, 1 }, // pblendvb
1819  { ISD::SELECT, MVT::v32i8, 1 }, // pblendvb
1820  };
1821 
1822  static const CostTblEntry AVX1CostTbl[] = {
1823  { ISD::SETCC, MVT::v4f64, 1 },
1824  { ISD::SETCC, MVT::v8f32, 1 },
1825  // AVX1 does not support 8-wide integer compare.
1826  { ISD::SETCC, MVT::v4i64, 4 },
1827  { ISD::SETCC, MVT::v8i32, 4 },
1828  { ISD::SETCC, MVT::v16i16, 4 },
1829  { ISD::SETCC, MVT::v32i8, 4 },
1830 
1831  { ISD::SELECT, MVT::v4f64, 1 }, // vblendvpd
1832  { ISD::SELECT, MVT::v8f32, 1 }, // vblendvps
1833  { ISD::SELECT, MVT::v4i64, 1 }, // vblendvpd
1834  { ISD::SELECT, MVT::v8i32, 1 }, // vblendvps
1835  { ISD::SELECT, MVT::v16i16, 3 }, // vandps + vandnps + vorps
1836  { ISD::SELECT, MVT::v32i8, 3 }, // vandps + vandnps + vorps
1837  };
1838 
1839  static const CostTblEntry SSE42CostTbl[] = {
1840  { ISD::SETCC, MVT::v2f64, 1 },
1841  { ISD::SETCC, MVT::v4f32, 1 },
1842  { ISD::SETCC, MVT::v2i64, 1 },
1843  };
1844 
1845  static const CostTblEntry SSE41CostTbl[] = {
1846  { ISD::SELECT, MVT::v2f64, 1 }, // blendvpd
1847  { ISD::SELECT, MVT::v4f32, 1 }, // blendvps
1848  { ISD::SELECT, MVT::v2i64, 1 }, // pblendvb
1849  { ISD::SELECT, MVT::v4i32, 1 }, // pblendvb
1850  { ISD::SELECT, MVT::v8i16, 1 }, // pblendvb
1851  { ISD::SELECT, MVT::v16i8, 1 }, // pblendvb
1852  };
1853 
1854  static const CostTblEntry SSE2CostTbl[] = {
1855  { ISD::SETCC, MVT::v2f64, 2 },
1856  { ISD::SETCC, MVT::f64, 1 },
1857  { ISD::SETCC, MVT::v2i64, 8 },
1858  { ISD::SETCC, MVT::v4i32, 1 },
1859  { ISD::SETCC, MVT::v8i16, 1 },
1860  { ISD::SETCC, MVT::v16i8, 1 },
1861 
1862  { ISD::SELECT, MVT::v2f64, 3 }, // andpd + andnpd + orpd
1863  { ISD::SELECT, MVT::v2i64, 3 }, // pand + pandn + por
1864  { ISD::SELECT, MVT::v4i32, 3 }, // pand + pandn + por
1865  { ISD::SELECT, MVT::v8i16, 3 }, // pand + pandn + por
1866  { ISD::SELECT, MVT::v16i8, 3 }, // pand + pandn + por
1867  };
1868 
1869  static const CostTblEntry SSE1CostTbl[] = {
1870  { ISD::SETCC, MVT::v4f32, 2 },
1871  { ISD::SETCC, MVT::f32, 1 },
1872 
1873  { ISD::SELECT, MVT::v4f32, 3 }, // andps + andnps + orps
1874  };
1875 
1876  if (ST->hasBWI())
1877  if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
1878  return LT.first * (ExtraCost + Entry->Cost);
1879 
1880  if (ST->hasAVX512())
1881  if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
1882  return LT.first * (ExtraCost + Entry->Cost);
1883 
1884  if (ST->hasAVX2())
1885  if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
1886  return LT.first * (ExtraCost + Entry->Cost);
1887 
1888  if (ST->hasAVX())
1889  if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
1890  return LT.first * (ExtraCost + Entry->Cost);
1891 
1892  if (ST->hasSSE42())
1893  if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
1894  return LT.first * (ExtraCost + Entry->Cost);
1895 
1896  if (ST->hasSSE41())
1897  if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
1898  return LT.first * (ExtraCost + Entry->Cost);
1899 
1900  if (ST->hasSSE2())
1901  if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
1902  return LT.first * (ExtraCost + Entry->Cost);
1903 
1904  if (ST->hasSSE1())
1905  if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
1906  return LT.first * (ExtraCost + Entry->Cost);
1907 
1908  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
1909 }
1910 
1912 
1913 int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
1914  ArrayRef<Type *> Tys, FastMathFlags FMF,
1915  unsigned ScalarizationCostPassed) {
1916  // Costs should match the codegen from:
1917  // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
1918  // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
1919  // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
1920  // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
1921  // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
1922  static const CostTblEntry AVX512CDCostTbl[] = {
1923  { ISD::CTLZ, MVT::v8i64, 1 },
1924  { ISD::CTLZ, MVT::v16i32, 1 },
1925  { ISD::CTLZ, MVT::v32i16, 8 },
1926  { ISD::CTLZ, MVT::v64i8, 20 },
1927  { ISD::CTLZ, MVT::v4i64, 1 },
1928  { ISD::CTLZ, MVT::v8i32, 1 },
1929  { ISD::CTLZ, MVT::v16i16, 4 },
1930  { ISD::CTLZ, MVT::v32i8, 10 },
1931  { ISD::CTLZ, MVT::v2i64, 1 },
1932  { ISD::CTLZ, MVT::v4i32, 1 },
1933  { ISD::CTLZ, MVT::v8i16, 4 },
1934  { ISD::CTLZ, MVT::v16i8, 4 },
1935  };
1936  static const CostTblEntry AVX512BWCostTbl[] = {
1937  { ISD::BITREVERSE, MVT::v8i64, 5 },
1938  { ISD::BITREVERSE, MVT::v16i32, 5 },
1939  { ISD::BITREVERSE, MVT::v32i16, 5 },
1940  { ISD::BITREVERSE, MVT::v64i8, 5 },
1941  { ISD::CTLZ, MVT::v8i64, 23 },
1942  { ISD::CTLZ, MVT::v16i32, 22 },
1943  { ISD::CTLZ, MVT::v32i16, 18 },
1944  { ISD::CTLZ, MVT::v64i8, 17 },
1945  { ISD::CTPOP, MVT::v8i64, 7 },
1946  { ISD::CTPOP, MVT::v16i32, 11 },
1947  { ISD::CTPOP, MVT::v32i16, 9 },
1948  { ISD::CTPOP, MVT::v64i8, 6 },
1949  { ISD::CTTZ, MVT::v8i64, 10 },
1950  { ISD::CTTZ, MVT::v16i32, 14 },
1951  { ISD::CTTZ, MVT::v32i16, 12 },
1952  { ISD::CTTZ, MVT::v64i8, 9 },
1953  { ISD::SADDSAT, MVT::v32i16, 1 },
1954  { ISD::SADDSAT, MVT::v64i8, 1 },
1955  { ISD::SSUBSAT, MVT::v32i16, 1 },
1956  { ISD::SSUBSAT, MVT::v64i8, 1 },
1957  { ISD::UADDSAT, MVT::v32i16, 1 },
1958  { ISD::UADDSAT, MVT::v64i8, 1 },
1959  { ISD::USUBSAT, MVT::v32i16, 1 },
1960  { ISD::USUBSAT, MVT::v64i8, 1 },
1961  };
1962  static const CostTblEntry AVX512CostTbl[] = {
1963  { ISD::BITREVERSE, MVT::v8i64, 36 },
1964  { ISD::BITREVERSE, MVT::v16i32, 24 },
1965  { ISD::CTLZ, MVT::v8i64, 29 },
1966  { ISD::CTLZ, MVT::v16i32, 35 },
1967  { ISD::CTPOP, MVT::v8i64, 16 },
1968  { ISD::CTPOP, MVT::v16i32, 24 },
1969  { ISD::CTTZ, MVT::v8i64, 20 },
1970  { ISD::CTTZ, MVT::v16i32, 28 },
1971  { ISD::USUBSAT, MVT::v16i32, 2 }, // pmaxud + psubd
1972  { ISD::USUBSAT, MVT::v2i64, 2 }, // pmaxuq + psubq
1973  { ISD::USUBSAT, MVT::v4i64, 2 }, // pmaxuq + psubq
1974  { ISD::USUBSAT, MVT::v8i64, 2 }, // pmaxuq + psubq
1975  { ISD::UADDSAT, MVT::v16i32, 3 }, // not + pminud + paddd
1976  { ISD::UADDSAT, MVT::v2i64, 3 }, // not + pminuq + paddq
1977  { ISD::UADDSAT, MVT::v4i64, 3 }, // not + pminuq + paddq
1978  { ISD::UADDSAT, MVT::v8i64, 3 }, // not + pminuq + paddq
1979  };
1980  static const CostTblEntry XOPCostTbl[] = {
1981  { ISD::BITREVERSE, MVT::v4i64, 4 },
1982  { ISD::BITREVERSE, MVT::v8i32, 4 },
1983  { ISD::BITREVERSE, MVT::v16i16, 4 },
1984  { ISD::BITREVERSE, MVT::v32i8, 4 },
1985  { ISD::BITREVERSE, MVT::v2i64, 1 },
1986  { ISD::BITREVERSE, MVT::v4i32, 1 },
1987  { ISD::BITREVERSE, MVT::v8i16, 1 },
1988  { ISD::BITREVERSE, MVT::v16i8, 1 },
1989  { ISD::BITREVERSE, MVT::i64, 3 },
1990  { ISD::BITREVERSE, MVT::i32, 3 },
1991  { ISD::BITREVERSE, MVT::i16, 3 },
1992  { ISD::BITREVERSE, MVT::i8, 3 }
1993  };
1994  static const CostTblEntry AVX2CostTbl[] = {
1995  { ISD::BITREVERSE, MVT::v4i64, 5 },
1996  { ISD::BITREVERSE, MVT::v8i32, 5 },
1997  { ISD::BITREVERSE, MVT::v16i16, 5 },
1998  { ISD::BITREVERSE, MVT::v32i8, 5 },
1999  { ISD::BSWAP, MVT::v4i64, 1 },
2000  { ISD::BSWAP, MVT::v8i32, 1 },
2001  { ISD::BSWAP, MVT::v16i16, 1 },
2002  { ISD::CTLZ, MVT::v4i64, 23 },
2003  { ISD::CTLZ, MVT::v8i32, 18 },
2004  { ISD::CTLZ, MVT::v16i16, 14 },
2005  { ISD::CTLZ, MVT::v32i8, 9 },
2006  { ISD::CTPOP, MVT::v4i64, 7 },
2007  { ISD::CTPOP, MVT::v8i32, 11 },
2008  { ISD::CTPOP, MVT::v16i16, 9 },
2009  { ISD::CTPOP, MVT::v32i8, 6 },
2010  { ISD::CTTZ, MVT::v4i64, 10 },
2011  { ISD::CTTZ, MVT::v8i32, 14 },
2012  { ISD::CTTZ, MVT::v16i16, 12 },
2013  { ISD::CTTZ, MVT::v32i8, 9 },
2014  { ISD::SADDSAT, MVT::v16i16, 1 },
2015  { ISD::SADDSAT, MVT::v32i8, 1 },
2016  { ISD::SSUBSAT, MVT::v16i16, 1 },
2017  { ISD::SSUBSAT, MVT::v32i8, 1 },
2018  { ISD::UADDSAT, MVT::v16i16, 1 },
2019  { ISD::UADDSAT, MVT::v32i8, 1 },
2020  { ISD::UADDSAT, MVT::v8i32, 3 }, // not + pminud + paddd
2021  { ISD::USUBSAT, MVT::v16i16, 1 },
2022  { ISD::USUBSAT, MVT::v32i8, 1 },
2023  { ISD::USUBSAT, MVT::v8i32, 2 }, // pmaxud + psubd
2024  { ISD::FSQRT, MVT::f32, 7 }, // Haswell from http://www.agner.org/
2025  { ISD::FSQRT, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
2026  { ISD::FSQRT, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
2027  { ISD::FSQRT, MVT::f64, 14 }, // Haswell from http://www.agner.org/
2028  { ISD::FSQRT, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
2029  { ISD::FSQRT, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
2030  };
2031  static const CostTblEntry AVX1CostTbl[] = {
2032  { ISD::BITREVERSE, MVT::v4i64, 12 }, // 2 x 128-bit Op + extract/insert
2033  { ISD::BITREVERSE, MVT::v8i32, 12 }, // 2 x 128-bit Op + extract/insert
2034  { ISD::BITREVERSE, MVT::v16i16, 12 }, // 2 x 128-bit Op + extract/insert
2035  { ISD::BITREVERSE, MVT::v32i8, 12 }, // 2 x 128-bit Op + extract/insert
2036  { ISD::BSWAP, MVT::v4i64, 4 },
2037  { ISD::BSWAP, MVT::v8i32, 4 },
2038  { ISD::BSWAP, MVT::v16i16, 4 },
2039  { ISD::CTLZ, MVT::v4i64, 48 }, // 2 x 128-bit Op + extract/insert
2040  { ISD::CTLZ, MVT::v8i32, 38 }, // 2 x 128-bit Op + extract/insert
2041  { ISD::CTLZ, MVT::v16i16, 30 }, // 2 x 128-bit Op + extract/insert
2042  { ISD::CTLZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
2043  { ISD::CTPOP, MVT::v4i64, 16 }, // 2 x 128-bit Op + extract/insert
2044  { ISD::CTPOP, MVT::v8i32, 24 }, // 2 x 128-bit Op + extract/insert
2045  { ISD::CTPOP, MVT::v16i16, 20 }, // 2 x 128-bit Op + extract/insert
2046  { ISD::CTPOP, MVT::v32i8, 14 }, // 2 x 128-bit Op + extract/insert
2047  { ISD::CTTZ, MVT::v4i64, 22 }, // 2 x 128-bit Op + extract/insert
2048  { ISD::CTTZ, MVT::v8i32, 30 }, // 2 x 128-bit Op + extract/insert
2049  { ISD::CTTZ, MVT::v16i16, 26 }, // 2 x 128-bit Op + extract/insert
2050  { ISD::CTTZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
2051  { ISD::SADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2052  { ISD::SADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2053  { ISD::SSUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2054  { ISD::SSUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2055  { ISD::UADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2056  { ISD::UADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2057  { ISD::UADDSAT, MVT::v8i32, 8 }, // 2 x 128-bit Op + extract/insert
2058  { ISD::USUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2059  { ISD::USUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2060  { ISD::USUBSAT, MVT::v8i32, 6 }, // 2 x 128-bit Op + extract/insert
2061  { ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/
2062  { ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
2063  { ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
2064  { ISD::FSQRT, MVT::f64, 21 }, // SNB from http://www.agner.org/
2065  { ISD::FSQRT, MVT::v2f64, 21 }, // SNB from http://www.agner.org/
2066  { ISD::FSQRT, MVT::v4f64, 43 }, // SNB from http://www.agner.org/
2067  };
2068  static const CostTblEntry GLMCostTbl[] = {
2069  { ISD::FSQRT, MVT::f32, 19 }, // sqrtss
2070  { ISD::FSQRT, MVT::v4f32, 37 }, // sqrtps
2071  { ISD::FSQRT, MVT::f64, 34 }, // sqrtsd
2072  { ISD::FSQRT, MVT::v2f64, 67 }, // sqrtpd
2073  };
2074  static const CostTblEntry SLMCostTbl[] = {
2075  { ISD::FSQRT, MVT::f32, 20 }, // sqrtss
2076  { ISD::FSQRT, MVT::v4f32, 40 }, // sqrtps
2077  { ISD::FSQRT, MVT::f64, 35 }, // sqrtsd
2078  { ISD::FSQRT, MVT::v2f64, 70 }, // sqrtpd
2079  };
2080  static const CostTblEntry SSE42CostTbl[] = {
2081  { ISD::USUBSAT, MVT::v4i32, 2 }, // pmaxud + psubd
2082  { ISD::UADDSAT, MVT::v4i32, 3 }, // not + pminud + paddd
2083  { ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/
2084  { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
2085  };
2086  static const CostTblEntry SSSE3CostTbl[] = {
2087  { ISD::BITREVERSE, MVT::v2i64, 5 },
2088  { ISD::BITREVERSE, MVT::v4i32, 5 },
2089  { ISD::BITREVERSE, MVT::v8i16, 5 },
2090  { ISD::BITREVERSE, MVT::v16i8, 5 },
2091  { ISD::BSWAP, MVT::v2i64, 1 },
2092  { ISD::BSWAP, MVT::v4i32, 1 },
2093  { ISD::BSWAP, MVT::v8i16, 1 },
2094  { ISD::CTLZ, MVT::v2i64, 23 },
2095  { ISD::CTLZ, MVT::v4i32, 18 },
2096  { ISD::CTLZ, MVT::v8i16, 14 },
2097  { ISD::CTLZ, MVT::v16i8, 9 },
2098  { ISD::CTPOP, MVT::v2i64, 7 },
2099  { ISD::CTPOP, MVT::v4i32, 11 },
2100  { ISD::CTPOP, MVT::v8i16, 9 },
2101  { ISD::CTPOP, MVT::v16i8, 6 },
2102  { ISD::CTTZ, MVT::v2i64, 10 },
2103  { ISD::CTTZ, MVT::v4i32, 14 },
2104  { ISD::CTTZ, MVT::v8i16, 12 },
2105  { ISD::CTTZ, MVT::v16i8, 9 }
2106  };
2107  static const CostTblEntry SSE2CostTbl[] = {
2108  { ISD::BITREVERSE, MVT::v2i64, 29 },
2109  { ISD::BITREVERSE, MVT::v4i32, 27 },
2110  { ISD::BITREVERSE, MVT::v8i16, 27 },
2111  { ISD::BITREVERSE, MVT::v16i8, 20 },
2112  { ISD::BSWAP, MVT::v2i64, 7 },
2113  { ISD::BSWAP, MVT::v4i32, 7 },
2114  { ISD::BSWAP, MVT::v8i16, 7 },
2115  { ISD::CTLZ, MVT::v2i64, 25 },
2116  { ISD::CTLZ, MVT::v4i32, 26 },
2117  { ISD::CTLZ, MVT::v8i16, 20 },
2118  { ISD::CTLZ, MVT::v16i8, 17 },
2119  { ISD::CTPOP, MVT::v2i64, 12 },
2120  { ISD::CTPOP, MVT::v4i32, 15 },
2121  { ISD::CTPOP, MVT::v8i16, 13 },
2122  { ISD::CTPOP, MVT::v16i8, 10 },
2123  { ISD::CTTZ, MVT::v2i64, 14 },
2124  { ISD::CTTZ, MVT::v4i32, 18 },
2125  { ISD::CTTZ, MVT::v8i16, 16 },
2126  { ISD::CTTZ, MVT::v16i8, 13 },
2127  { ISD::SADDSAT, MVT::v8i16, 1 },
2128  { ISD::SADDSAT, MVT::v16i8, 1 },
2129  { ISD::SSUBSAT, MVT::v8i16, 1 },
2130  { ISD::SSUBSAT, MVT::v16i8, 1 },
2131  { ISD::UADDSAT, MVT::v8i16, 1 },
2132  { ISD::UADDSAT, MVT::v16i8, 1 },
2133  { ISD::USUBSAT, MVT::v8i16, 1 },
2134  { ISD::USUBSAT, MVT::v16i8, 1 },
2135  { ISD::FSQRT, MVT::f64, 32 }, // Nehalem from http://www.agner.org/
2136  { ISD::FSQRT, MVT::v2f64, 32 }, // Nehalem from http://www.agner.org/
2137  };
2138  static const CostTblEntry SSE1CostTbl[] = {
2139  { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/
2140  { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
2141  };
2142  static const CostTblEntry X64CostTbl[] = { // 64-bit targets
2143  { ISD::BITREVERSE, MVT::i64, 14 },
2144  { ISD::SADDO, MVT::i64, 1 },
2145  { ISD::UADDO, MVT::i64, 1 },
2146  };
2147  static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
2148  { ISD::BITREVERSE, MVT::i32, 14 },
2149  { ISD::BITREVERSE, MVT::i16, 14 },
2150  { ISD::BITREVERSE, MVT::i8, 11 },
2151  { ISD::SADDO, MVT::i32, 1 },
2152  { ISD::SADDO, MVT::i16, 1 },
2153  { ISD::SADDO, MVT::i8, 1 },
2154  { ISD::UADDO, MVT::i32, 1 },
2155  { ISD::UADDO, MVT::i16, 1 },
2156  { ISD::UADDO, MVT::i8, 1 },
2157  };
2158 
2159  Type *OpTy = RetTy;
2160  unsigned ISD = ISD::DELETED_NODE;
2161  switch (IID) {
2162  default:
2163  break;
2164  case Intrinsic::bitreverse:
2165  ISD = ISD::BITREVERSE;
2166  break;
2167  case Intrinsic::bswap:
2168  ISD = ISD::BSWAP;
2169  break;
2170  case Intrinsic::ctlz:
2171  ISD = ISD::CTLZ;
2172  break;
2173  case Intrinsic::ctpop:
2174  ISD = ISD::CTPOP;
2175  break;
2176  case Intrinsic::cttz:
2177  ISD = ISD::CTTZ;
2178  break;
2179  case Intrinsic::sadd_sat:
2180  ISD = ISD::SADDSAT;
2181  break;
2182  case Intrinsic::ssub_sat:
2183  ISD = ISD::SSUBSAT;
2184  break;
2185  case Intrinsic::uadd_sat:
2186  ISD = ISD::UADDSAT;
2187  break;
2188  case Intrinsic::usub_sat:
2189  ISD = ISD::USUBSAT;
2190  break;
2191  case Intrinsic::sqrt:
2192  ISD = ISD::FSQRT;
2193  break;
2194  case Intrinsic::sadd_with_overflow:
2195  case Intrinsic::ssub_with_overflow:
2196  // SSUBO has the same costs, so don't duplicate.
2197  ISD = ISD::SADDO;
2198  OpTy = RetTy->getContainedType(0);
2199  break;
2200  case Intrinsic::uadd_with_overflow:
2201  case Intrinsic::usub_with_overflow:
2202  // USUBO has the same costs, so don't duplicate.
2203  ISD = ISD::UADDO;
2204  OpTy = RetTy->getContainedType(0);
2205  break;
2206  }
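  // The overflow intrinsics return a {iN, i1} struct, so the table lookup is
  // done on the first (value) element type rather than on the aggregate type.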
2207 
2208  if (ISD != ISD::DELETED_NODE) {
2209  // Legalize the type.
2210  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, OpTy);
2211  MVT MTy = LT.second;
2212 
2213  // Attempt to lookup cost.
2214  if (ST->isGLM())
2215  if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
2216  return LT.first * Entry->Cost;
2217 
2218  if (ST->isSLM())
2219  if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
2220  return LT.first * Entry->Cost;
2221 
2222  if (ST->hasCDI())
2223  if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
2224  return LT.first * Entry->Cost;
2225 
2226  if (ST->hasBWI())
2227  if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
2228  return LT.first * Entry->Cost;
2229 
2230  if (ST->hasAVX512())
2231  if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
2232  return LT.first * Entry->Cost;
2233 
2234  if (ST->hasXOP())
2235  if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
2236  return LT.first * Entry->Cost;
2237 
2238  if (ST->hasAVX2())
2239  if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
2240  return LT.first * Entry->Cost;
2241 
2242  if (ST->hasAVX())
2243  if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
2244  return LT.first * Entry->Cost;
2245 
2246  if (ST->hasSSE42())
2247  if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
2248  return LT.first * Entry->Cost;
2249 
2250  if (ST->hasSSSE3())
2251  if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
2252  return LT.first * Entry->Cost;
2253 
2254  if (ST->hasSSE2())
2255  if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
2256  return LT.first * Entry->Cost;
2257 
2258  if (ST->hasSSE1())
2259  if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
2260  return LT.first * Entry->Cost;
2261 
2262  if (ST->is64Bit())
2263  if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
2264  return LT.first * Entry->Cost;
2265 
2266  if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
2267  return LT.first * Entry->Cost;
2268  }
2269 
2270  return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF, ScalarizationCostPassed);
2271 }
2272 
2273 int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
2274  ArrayRef<Value *> Args, FastMathFlags FMF,
2275  unsigned VF) {
2276  static const CostTblEntry AVX512CostTbl[] = {
2277  { ISD::ROTL, MVT::v8i64, 1 },
2278  { ISD::ROTL, MVT::v4i64, 1 },
2279  { ISD::ROTL, MVT::v2i64, 1 },
2280  { ISD::ROTL, MVT::v16i32, 1 },
2281  { ISD::ROTL, MVT::v8i32, 1 },
2282  { ISD::ROTL, MVT::v4i32, 1 },
2283  { ISD::ROTR, MVT::v8i64, 1 },
2284  { ISD::ROTR, MVT::v4i64, 1 },
2285  { ISD::ROTR, MVT::v2i64, 1 },
2286  { ISD::ROTR, MVT::v16i32, 1 },
2287  { ISD::ROTR, MVT::v8i32, 1 },
2288  { ISD::ROTR, MVT::v4i32, 1 }
2289  };
2290  // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
2291  static const CostTblEntry XOPCostTbl[] = {
2292  { ISD::ROTL, MVT::v4i64, 4 },
2293  { ISD::ROTL, MVT::v8i32, 4 },
2294  { ISD::ROTL, MVT::v16i16, 4 },
2295  { ISD::ROTL, MVT::v32i8, 4 },
2296  { ISD::ROTL, MVT::v2i64, 1 },
2297  { ISD::ROTL, MVT::v4i32, 1 },
2298  { ISD::ROTL, MVT::v8i16, 1 },
2299  { ISD::ROTL, MVT::v16i8, 1 },
2300  { ISD::ROTR, MVT::v4i64, 6 },
2301  { ISD::ROTR, MVT::v8i32, 6 },
2302  { ISD::ROTR, MVT::v16i16, 6 },
2303  { ISD::ROTR, MVT::v32i8, 6 },
2304  { ISD::ROTR, MVT::v2i64, 2 },
2305  { ISD::ROTR, MVT::v4i32, 2 },
2306  { ISD::ROTR, MVT::v8i16, 2 },
2307  { ISD::ROTR, MVT::v16i8, 2 }
2308  };
2309  static const CostTblEntry X64CostTbl[] = { // 64-bit targets
2310  { ISD::ROTL, MVT::i64, 1 },
2311  { ISD::ROTR, MVT::i64, 1 },
2312  { ISD::FSHL, MVT::i64, 4 }
2313  };
2314  static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
2315  { ISD::ROTL, MVT::i32, 1 },
2316  { ISD::ROTL, MVT::i16, 1 },
2317  { ISD::ROTL, MVT::i8, 1 },
2318  { ISD::ROTR, MVT::i32, 1 },
2319  { ISD::ROTR, MVT::i16, 1 },
2320  { ISD::ROTR, MVT::i8, 1 },
2321  { ISD::FSHL, MVT::i32, 4 },
2322  { ISD::FSHL, MVT::i16, 4 },
2323  { ISD::FSHL, MVT::i8, 4 }
2324  };
2325 
2326  unsigned ISD = ISD::DELETED_NODE;
2327  switch (IID) {
2328  default:
2329  break;
2330  case Intrinsic::fshl:
2331  ISD = ISD::FSHL;
2332  if (Args[0] == Args[1])
2333  ISD = ISD::ROTL;
2334  break;
2335  case Intrinsic::fshr:
2336  // FSHR has the same costs, so don't duplicate.
2337  ISD = ISD::FSHL;
2338  if (Args[0] == Args[1])
2339  ISD = ISD::ROTR;
2340  break;
2341  }
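  // A funnel shift whose two value operands are the same is just a rotate,
  // e.g. fshl(x, x, c) == rotl(x, c), which is why the cheaper rotate entries
  // above are used in that case.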
2342 
2343  if (ISD != ISD::DELETED_NODE) {
2344  // Legalize the type.
2345  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
2346  MVT MTy = LT.second;
2347 
2348  // Attempt to lookup cost.
2349  if (ST->hasAVX512())
2350  if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
2351  return LT.first * Entry->Cost;
2352 
2353  if (ST->hasXOP())
2354  if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
2355  return LT.first * Entry->Cost;
2356 
2357  if (ST->is64Bit())
2358  if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
2359  return LT.first * Entry->Cost;
2360 
2361  if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
2362  return LT.first * Entry->Cost;
2363  }
2364 
2365  return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF, VF);
2366 }
2367 
2368 int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
2369  assert(Val->isVectorTy() && "This must be a vector type");
2370 
2371  Type *ScalarType = Val->getScalarType();
2372 
2373  if (Index != -1U) {
2374  // Legalize the type.
2375  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
2376 
2377  // This type is legalized to a scalar type.
2378  if (!LT.second.isVector())
2379  return 0;
2380 
2381  // The type may be split. Normalize the index to the new type.
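  // For example, on a target where <16 x i32> legalizes to two <8 x i32>
  // registers, extracting element 9 is treated as extracting element 1 of one
  // of the halves.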
2382  unsigned Width = LT.second.getVectorNumElements();
2383  Index = Index % Width;
2384 
2385  // Floating point scalars are already located in index #0.
2386  if (ScalarType->isFloatingPointTy() && Index == 0)
2387  return 0;
2388  }
2389 
2390  // Add to the base cost if we know that the extracted element of a vector is
2391  // destined to be moved to and used in the integer register file.
2392  int RegisterFileMoveCost = 0;
2393  if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy())
2394  RegisterFileMoveCost = 1;
2395 
2396  return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
2397 }
2398 
2399 int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
2400  unsigned AddressSpace, const Instruction *I) {
2401  // Handle non-power-of-two vectors such as <3 x float>
2402  if (VectorType *VTy = dyn_cast<VectorType>(Src)) {
2403  unsigned NumElem = VTy->getVectorNumElements();
2404 
2405  // Handle a few common cases:
2406  // <3 x float>
2407  if (NumElem == 3 && VTy->getScalarSizeInBits() == 32)
2408  // Cost = 64 bit store + extract + 32 bit store.
2409  return 3;
2410 
2411  // <3 x double>
2412  if (NumElem == 3 && VTy->getScalarSizeInBits() == 64)
2413  // Cost = 128 bit store + unpack + 64 bit store.
2414  return 3;
2415 
2416  // Assume that all other non-power-of-two numbers are scalarized.
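  // For instance, a <5 x float> store is costed as five scalar stores plus
  // the overhead of extracting each of the five elements from the vector.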
2417  if (!isPowerOf2_32(NumElem)) {
2418  int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment,
2419  AddressSpace);
2420  int SplitCost = getScalarizationOverhead(Src, Opcode == Instruction::Load,
2421  Opcode == Instruction::Store);
2422  return NumElem * Cost + SplitCost;
2423  }
2424  }
2425 
2426  // Legalize the type.
2427  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
2428  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
2429  "Invalid Opcode");
2430 
2431  // Each load/store unit costs 1.
2432  int Cost = LT.first * 1;
2433 
2434  // This isn't exactly right. We're using slow unaligned 32-byte accesses as a
2435  // proxy for a double-pumped AVX memory interface such as on Sandybridge.
2436  if (LT.second.getStoreSize() == 32 && ST->isUnalignedMem32Slow())
2437  Cost *= 2;
2438 
2439  return Cost;
2440 }
2441 
2442 int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
2443  unsigned Alignment,
2444  unsigned AddressSpace) {
2445  bool IsLoad = (Instruction::Load == Opcode);
2446  bool IsStore = (Instruction::Store == Opcode);
2447 
2448  VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy);
2449  if (!SrcVTy)
2450  // For a scalar type, take the regular cost without a mask.
2451  return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace);
2452 
2453  unsigned NumElem = SrcVTy->getVectorNumElements();
2454  VectorType *MaskTy =
2455  VectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
2456  if ((IsLoad && !isLegalMaskedLoad(SrcVTy)) ||
2457  (IsStore && !isLegalMaskedStore(SrcVTy)) || !isPowerOf2_32(NumElem)) {
2458  // Scalarization
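  // (Illustration: a masked load of <8 x i32> that is not legal pays for
  // unpacking the mask, eight compare + branch pairs, eight scalar loads and
  // the re-insertion of each loaded element into the result vector.)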
2459  int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true);
2460  int ScalarCompareCost = getCmpSelInstrCost(
2461  Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr);
2462  int BranchCost = getCFInstrCost(Instruction::Br);
2463  int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
2464 
2465  int ValueSplitCost = getScalarizationOverhead(SrcVTy, IsLoad, IsStore);
2466  int MemopCost =
2467  NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
2468  Alignment, AddressSpace);
2469  return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
2470  }
2471 
2472  // Legalize the type.
2473  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy);
2474  auto VT = TLI->getValueType(DL, SrcVTy);
2475  int Cost = 0;
2476  if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
2477  LT.second.getVectorNumElements() == NumElem)
2478  // Promotion requires expand/truncate for data and a shuffle for mask.
2479  Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, 0, nullptr) +
2480  getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, 0, nullptr);
2481 
2482  else if (LT.second.getVectorNumElements() > NumElem) {
2483  VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(),
2484  LT.second.getVectorNumElements());
2485  // Expanding requires filling the mask with zeroes.
2486  Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy);
2487  }
2488 
2489  // Pre-AVX512 - each maskmov load costs 2 and each maskmov store costs ~8.
2490  if (!ST->hasAVX512())
2491  return Cost + LT.first * (IsLoad ? 2 : 8);
2492 
2493  // AVX-512 masked load/store is cheaper.
2494  return Cost + LT.first;
2495 }
2496 
2497 int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
2498  const SCEV *Ptr) {
2499  // Address computations in vectorized code with non-consecutive addresses will
2500  // likely result in more instructions compared to scalar code where the
2501  // computation can more often be merged into the index mode. The resulting
2502  // extra micro-ops can significantly decrease throughput.
2503  const unsigned NumVectorInstToHideOverhead = 10;
2504 
2505  // The cost of computing a strided access is hidden by the indexing
2506  // modes of X86 regardless of the stride value. We don't believe that there
2507  // is a difference between a constant strided access in general and a
2508  // constant stride whose value is less than or equal to 64.
2509  // Even in the case of (loop invariant) stride whose value is not known at
2510  // compile time, the address computation will not incur more than one extra
2511  // ADD instruction.
2512  if (Ty->isVectorTy() && SE) {
2513  if (!BaseT::isStridedAccess(Ptr))
2514  return NumVectorInstToHideOverhead;
2515  if (!BaseT::getConstantStrideStep(SE, Ptr))
2516  return 1;
2517  }
2518 
2519  return BaseT::getAddressComputationCost(Ty, SE, Ptr);
2520 }
2521 
2522 int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
2523  bool IsPairwise) {
2524  // We use the Intel Architecture Code Analyzer (IACA) to measure the
2525  // throughput and use it as the cost.
2526 
2527  static const CostTblEntry SSE42CostTblPairWise[] = {
2528  { ISD::FADD, MVT::v2f64, 2 },
2529  { ISD::FADD, MVT::v4f32, 4 },
2530  { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
2531  { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32.
2532  { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5".
2533  { ISD::ADD, MVT::v2i16, 3 }, // FIXME: chosen to be less than v4i16
2534  { ISD::ADD, MVT::v4i16, 4 }, // FIXME: chosen to be less than v8i16
2535  { ISD::ADD, MVT::v8i16, 5 },
2536  };
2537 
2538  static const CostTblEntry AVX1CostTblPairWise[] = {
2539  { ISD::FADD, MVT::v4f32, 4 },
2540  { ISD::FADD, MVT::v4f64, 5 },
2541  { ISD::FADD, MVT::v8f32, 7 },
2542  { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
2543  { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
2544  { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5".
2545  { ISD::ADD, MVT::v4i64, 5 }, // The data reported by the IACA tool is "4.8".
2546  { ISD::ADD, MVT::v2i16, 3 }, // FIXME: chosen to be less than v4i16
2547  { ISD::ADD, MVT::v4i16, 4 }, // FIXME: chosen to be less than v8i16
2548  { ISD::ADD, MVT::v8i16, 5 },
2549  { ISD::ADD, MVT::v8i32, 5 },
2550  };
2551 
2552  static const CostTblEntry SSE42CostTblNoPairWise[] = {
2553  { ISD::FADD, MVT::v2f64, 2 },
2554  { ISD::FADD, MVT::v4f32, 4 },
2555  { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
2556  { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
2557  { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
2558  { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3".
2559  { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3".
2560  { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
2561  };
2562 
2563  static const CostTblEntry AVX1CostTblNoPairWise[] = {
2564  { ISD::FADD, MVT::v4f32, 3 },
2565  { ISD::FADD, MVT::v4f64, 3 },
2566  { ISD::FADD, MVT::v8f32, 4 },
2567  { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
2568  { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
2569  { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "2.8".
2570  { ISD::ADD, MVT::v4i64, 3 },
2571  { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3".
2572  { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3".
2573  { ISD::ADD, MVT::v8i16, 4 },
2574  { ISD::ADD, MVT::v8i32, 5 },
2575  };
2576 
2577  int ISD = TLI->InstructionOpcodeToISD(Opcode);
2578  assert(ISD && "Invalid opcode");
2579 
2580  // Before legalizing the type, give a chance to look up illegal narrow types
2581  // in the table.
2582  // FIXME: Is there a better way to do this?
2583  EVT VT = TLI->getValueType(DL, ValTy);
2584  if (VT.isSimple()) {
2585  MVT MTy = VT.getSimpleVT();
2586  if (IsPairwise) {
2587  if (ST->hasAVX())
2588  if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
2589  return Entry->Cost;
2590 
2591  if (ST->hasSSE42())
2592  if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
2593  return Entry->Cost;
2594  } else {
2595  if (ST->hasAVX())
2596  if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
2597  return Entry->Cost;
2598 
2599  if (ST->hasSSE42())
2600  if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
2601  return Entry->Cost;
2602  }
2603  }
2604 
2605  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
2606 
2607  MVT MTy = LT.second;
2608 
2609  if (IsPairwise) {
2610  if (ST->hasAVX())
2611  if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
2612  return LT.first * Entry->Cost;
2613 
2614  if (ST->hasSSE42())
2615  if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
2616  return LT.first * Entry->Cost;
2617  } else {
2618  if (ST->hasAVX())
2619  if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
2620  return LT.first * Entry->Cost;
2621 
2622  if (ST->hasSSE42())
2623  if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
2624  return LT.first * Entry->Cost;
2625  }
2626 
2627  static const CostTblEntry AVX2BoolReduction[] = {
2628  { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp
2629  { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp
2630  { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp
2631  { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp
2632  };
2633 
2634  static const CostTblEntry AVX1BoolReduction[] = {
2635  { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp
2636  { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp
2637  { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
2638  { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
2639  { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp
2640  { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp
2641  { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
2642  { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
2643  };
2644 
2645  static const CostTblEntry SSE2BoolReduction[] = {
2646  { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp
2647  { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp
2648  { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp
2649  { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp
2650  { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp
2651  { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp
2652  { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp
2653  { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp
2654  };
2655 
2656  // Handle bool allof/anyof patterns.
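  // For example, an AND (all-of) reduction of a <16 x i1> mask legalizes to
  // v16i8 and lowers to pmovmskb + cmp, which the SSE2 table above costs as 2.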
2657  if (ValTy->getVectorElementType()->isIntegerTy(1)) {
2658  if (ST->hasAVX2())
2659  if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
2660  return LT.first * Entry->Cost;
2661  if (ST->hasAVX())
2662  if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
2663  return LT.first * Entry->Cost;
2664  if (ST->hasSSE2())
2665  if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
2666  return LT.first * Entry->Cost;
2667  }
2668 
2669  return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise);
2670 }
2671 
2672 int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy,
2673  bool IsPairwise, bool IsUnsigned) {
2674  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
2675 
2676  MVT MTy = LT.second;
2677 
2678  int ISD;
2679  if (ValTy->isIntOrIntVectorTy()) {
2680  ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN;
2681  } else {
2682  assert(ValTy->isFPOrFPVectorTy() &&
2683  "Expected float point or integer vector type.");
2684  ISD = ISD::FMINNUM;
2685  }
2686 
2687  // We use the Intel Architecture Code Analyzer (IACA) to measure the
2688  // throughput and use it as the cost.
2689 
2690  static const CostTblEntry SSE1CostTblPairWise[] = {
2691  {ISD::FMINNUM, MVT::v4f32, 4},
2692  };
2693 
2694  static const CostTblEntry SSE2CostTblPairWise[] = {
2695  {ISD::FMINNUM, MVT::v2f64, 3},
2696  {ISD::SMIN, MVT::v2i64, 6},
2697  {ISD::UMIN, MVT::v2i64, 8},
2698  {ISD::SMIN, MVT::v4i32, 6},
2699  {ISD::UMIN, MVT::v4i32, 8},
2700  {ISD::SMIN, MVT::v8i16, 4},
2701  {ISD::UMIN, MVT::v8i16, 6},
2702  {ISD::SMIN, MVT::v16i8, 8},
2703  {ISD::UMIN, MVT::v16i8, 6},
2704  };
2705 
2706  static const CostTblEntry SSE41CostTblPairWise[] = {
2707  {ISD::FMINNUM, MVT::v4f32, 2},
2708  {ISD::SMIN, MVT::v2i64, 9},
2709  {ISD::UMIN, MVT::v2i64,10},
2710  {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5"
2711  {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8"
2712  {ISD::SMIN, MVT::v8i16, 2},
2713  {ISD::UMIN, MVT::v8i16, 2},
2714  {ISD::SMIN, MVT::v16i8, 3},
2715  {ISD::UMIN, MVT::v16i8, 3},
2716  };
2717 
2718  static const CostTblEntry SSE42CostTblPairWise[] = {
2719  {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8"
2720  {ISD::UMIN, MVT::v2i64, 8}, // The data reported by the IACA is "8.6"
2721  };
2722 
2723  static const CostTblEntry AVX1CostTblPairWise[] = {
2724  {ISD::FMINNUM, MVT::v4f32, 1},
2725  {ISD::FMINNUM, MVT::v4f64, 1},
2726  {ISD::FMINNUM, MVT::v8f32, 2},
2727  {ISD::SMIN, MVT::v2i64, 3},
2728  {ISD::UMIN, MVT::v2i64, 3},
2729  {ISD::SMIN, MVT::v4i32, 1},
2730  {ISD::UMIN, MVT::v4i32, 1},
2731  {ISD::SMIN, MVT::v8i16, 1},
2732  {ISD::UMIN, MVT::v8i16, 1},
2733  {ISD::SMIN, MVT::v16i8, 2},
2734  {ISD::UMIN, MVT::v16i8, 2},
2735  {ISD::SMIN, MVT::v4i64, 7},
2736  {ISD::UMIN, MVT::v4i64, 7},
2737  {ISD::SMIN, MVT::v8i32, 3},
2738  {ISD::UMIN, MVT::v8i32, 3},
2739  {ISD::SMIN, MVT::v16i16, 3},
2740  {ISD::UMIN, MVT::v16i16, 3},
2741  {ISD::SMIN, MVT::v32i8, 3},
2742  {ISD::UMIN, MVT::v32i8, 3},
2743  };
2744 
2745  static const CostTblEntry AVX2CostTblPairWise[] = {
2746  {ISD::SMIN, MVT::v4i64, 2},
2747  {ISD::UMIN, MVT::v4i64, 2},
2748  {ISD::SMIN, MVT::v8i32, 1},
2749  {ISD::UMIN, MVT::v8i32, 1},
2750  {ISD::SMIN, MVT::v16i16, 1},
2751  {ISD::UMIN, MVT::v16i16, 1},
2752  {ISD::SMIN, MVT::v32i8, 2},
2753  {ISD::UMIN, MVT::v32i8, 2},
2754  };
2755 
2756  static const CostTblEntry AVX512CostTblPairWise[] = {
2757  {ISD::FMINNUM, MVT::v8f64, 1},
2758  {ISD::FMINNUM, MVT::v16f32, 2},
2759  {ISD::SMIN, MVT::v8i64, 2},
2760  {ISD::UMIN, MVT::v8i64, 2},
2761  {ISD::SMIN, MVT::v16i32, 1},
2762  {ISD::UMIN, MVT::v16i32, 1},
2763  };
2764 
2765  static const CostTblEntry SSE1CostTblNoPairWise[] = {
2766  {ISD::FMINNUM, MVT::v4f32, 4},
2767  };
2768 
2769  static const CostTblEntry SSE2CostTblNoPairWise[] = {
2770  {ISD::FMINNUM, MVT::v2f64, 3},
2771  {ISD::SMIN, MVT::v2i64, 6},
2772  {ISD::UMIN, MVT::v2i64, 8},
2773  {ISD::SMIN, MVT::v4i32, 6},
2774  {ISD::UMIN, MVT::v4i32, 8},
2775  {ISD::SMIN, MVT::v8i16, 4},
2776  {ISD::UMIN, MVT::v8i16, 6},
2777  {ISD::SMIN, MVT::v16i8, 8},
2778  {ISD::UMIN, MVT::v16i8, 6},
2779  };
2780 
2781  static const CostTblEntry SSE41CostTblNoPairWise[] = {
2782  {ISD::FMINNUM, MVT::v4f32, 3},
2783  {ISD::SMIN, MVT::v2i64, 9},
2784  {ISD::UMIN, MVT::v2i64,11},
2785  {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5"
2786  {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8"
2787  {ISD::SMIN, MVT::v8i16, 1}, // The data reported by the IACA is "1.5"
2788  {ISD::UMIN, MVT::v8i16, 2}, // The data reported by the IACA is "1.8"
2789  {ISD::SMIN, MVT::v16i8, 3},
2790  {ISD::UMIN, MVT::v16i8, 3},
2791  };
2792 
2793  static const CostTblEntry SSE42CostTblNoPairWise[] = {
2794  {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8"
2795  {ISD::UMIN, MVT::v2i64, 9}, // The data reported by the IACA is "8.6"
2796  };
2797 
2798  static const CostTblEntry AVX1CostTblNoPairWise[] = {
2799  {ISD::FMINNUM, MVT::v4f32, 1},
2800  {ISD::FMINNUM, MVT::v4f64, 1},
2801  {ISD::FMINNUM, MVT::v8f32, 1},
2802  {ISD::SMIN, MVT::v2i64, 3},
2803  {ISD::UMIN, MVT::v2i64, 3},
2804  {ISD::SMIN, MVT::v4i32, 1},
2805  {ISD::UMIN, MVT::v4i32, 1},
2806  {ISD::SMIN, MVT::v8i16, 1},
2807  {ISD::UMIN, MVT::v8i16, 1},
2808  {ISD::SMIN, MVT::v16i8, 2},
2809  {ISD::UMIN, MVT::v16i8, 2},
2810  {ISD::SMIN, MVT::v4i64, 7},
2811  {ISD::UMIN, MVT::v4i64, 7},
2812  {ISD::SMIN, MVT::v8i32, 2},
2813  {ISD::UMIN, MVT::v8i32, 2},
2814  {ISD::SMIN, MVT::v16i16, 2},
2815  {ISD::UMIN, MVT::v16i16, 2},
2816  {ISD::SMIN, MVT::v32i8, 2},
2817  {ISD::UMIN, MVT::v32i8, 2},
2818  };
2819 
2820  static const CostTblEntry AVX2CostTblNoPairWise[] = {
2821  {ISD::SMIN, MVT::v4i64, 1},
2822  {ISD::UMIN, MVT::v4i64, 1},
2823  {ISD::SMIN, MVT::v8i32, 1},
2824  {ISD::UMIN, MVT::v8i32, 1},
2825  {ISD::SMIN, MVT::v16i16, 1},
2826  {ISD::UMIN, MVT::v16i16, 1},
2827  {ISD::SMIN, MVT::v32i8, 1},
2828  {ISD::UMIN, MVT::v32i8, 1},
2829  };
2830 
2831  static const CostTblEntry AVX512CostTblNoPairWise[] = {
2832  {ISD::FMINNUM, MVT::v8f64, 1},
2833  {ISD::FMINNUM, MVT::v16f32, 2},
2834  {ISD::SMIN, MVT::v8i64, 1},
2835  {ISD::UMIN, MVT::v8i64, 1},
2836  {ISD::SMIN, MVT::v16i32, 1},
2837  {ISD::UMIN, MVT::v16i32, 1},
2838  };
2839 
2840  if (IsPairwise) {
2841  if (ST->hasAVX512())
2842  if (const auto *Entry = CostTableLookup(AVX512CostTblPairWise, ISD, MTy))
2843  return LT.first * Entry->Cost;
2844 
2845  if (ST->hasAVX2())
2846  if (const auto *Entry = CostTableLookup(AVX2CostTblPairWise, ISD, MTy))
2847  return LT.first * Entry->Cost;
2848 
2849  if (ST->hasAVX())
2850  if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
2851  return LT.first * Entry->Cost;
2852 
2853  if (ST->hasSSE42())
2854  if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
2855  return LT.first * Entry->Cost;
2856 
2857  if (ST->hasSSE41())
2858  if (const auto *Entry = CostTableLookup(SSE41CostTblPairWise, ISD, MTy))
2859  return LT.first * Entry->Cost;
2860 
2861  if (ST->hasSSE2())
2862  if (const auto *Entry = CostTableLookup(SSE2CostTblPairWise, ISD, MTy))
2863  return LT.first * Entry->Cost;
2864 
2865  if (ST->hasSSE1())
2866  if (const auto *Entry = CostTableLookup(SSE1CostTblPairWise, ISD, MTy))
2867  return LT.first * Entry->Cost;
2868  } else {
2869  if (ST->hasAVX512())
2870  if (const auto *Entry =
2871  CostTableLookup(AVX512CostTblNoPairWise, ISD, MTy))
2872  return LT.first * Entry->Cost;
2873 
2874  if (ST->hasAVX2())
2875  if (const auto *Entry = CostTableLookup(AVX2CostTblNoPairWise, ISD, MTy))
2876  return LT.first * Entry->Cost;
2877 
2878  if (ST->hasAVX())
2879  if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
2880  return LT.first * Entry->Cost;
2881 
2882  if (ST->hasSSE42())
2883  if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
2884  return LT.first * Entry->Cost;
2885 
2886  if (ST->hasSSE41())
2887  if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy))
2888  return LT.first * Entry->Cost;
2889 
2890  if (ST->hasSSE2())
2891  if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
2892  return LT.first * Entry->Cost;
2893 
2894  if (ST->hasSSE1())
2895  if (const auto *Entry = CostTableLookup(SSE1CostTblNoPairWise, ISD, MTy))
2896  return LT.first * Entry->Cost;
2897  }
2898 
2899  return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned);
2900 }
2901 
2902 /// Calculate the cost of materializing a 64-bit value. This helper
2903 /// method might only calculate a fraction of a larger immediate. Therefore it
2904 /// is valid to return a cost of ZERO.
2905 int X86TTIImpl::getIntImmCost(int64_t Val) {
2906  if (Val == 0)
2907  return TTI::TCC_Free;
2908 
2909  if (isInt<32>(Val))
2910  return TTI::TCC_Basic;
2911 
2912  return 2 * TTI::TCC_Basic;
2913 }
2914 
2915 int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
2916  assert(Ty->isIntegerTy());
2917 
2918  unsigned BitSize = Ty->getPrimitiveSizeInBits();
2919  if (BitSize == 0)
2920  return ~0U;
2921 
2922  // Never hoist constants larger than 128 bits, because this might lead to
2923  // incorrect code generation or assertions in codegen.
2924  // FIXME: Create a cost model for types larger than i128 once the codegen
2925  // issues have been fixed.
2926  if (BitSize > 128)
2927  return TTI::TCC_Free;
2928 
2929  if (Imm == 0)
2930  return TTI::TCC_Free;
2931 
2932  // Sign-extend all constants to a multiple of 64-bit.
2933  APInt ImmVal = Imm;
2934  if (BitSize % 64 != 0)
2935  ImmVal = Imm.sext(alignTo(BitSize, 64));
2936 
2937  // Split the constant into 64-bit chunks and calculate the cost for each
2938  // chunk.
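  // For example, an i128 constant whose two 64-bit halves both fall outside
  // the signed 32-bit range costs 2 * TCC_Basic per chunk, i.e. 4 in total.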
2939  int Cost = 0;
2940  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
2941  APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
2942  int64_t Val = Tmp.getSExtValue();
2943  Cost += getIntImmCost(Val);
2944  }
2945  // We need at least one instruction to materialize the constant.
2946  return std::max(1, Cost);
2947 }
2948 
2949 int X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
2950  Type *Ty) {
2951  assert(Ty->isIntegerTy());
2952 
2953  unsigned BitSize = Ty->getPrimitiveSizeInBits();
2954  // There is no cost model for constants with a bit size of 0. Return TCC_Free
2955  // here, so that constant hoisting will ignore this constant.
2956  if (BitSize == 0)
2957  return TTI::TCC_Free;
2958 
2959  unsigned ImmIdx = ~0U;
2960  switch (Opcode) {
2961  default:
2962  return TTI::TCC_Free;
2963  case Instruction::GetElementPtr:
2964  // Always hoist the base address of a GetElementPtr. This prevents the
2965  // creation of new constants for every base constant that gets constant
2966  // folded with the offset.
2967  if (Idx == 0)
2968  return 2 * TTI::TCC_Basic;
2969  return TTI::TCC_Free;
2970  case Instruction::Store:
2971  ImmIdx = 0;
2972  break;
2973  case Instruction::ICmp:
2974  // This is an imperfect hack to prevent constant hoisting of
2975  // compares that might be trying to check if a 64-bit value fits in
2976  // 32-bits. The backend can optimize these cases using a right shift by 32.
2977  // Ideally we would check the compare predicate here. There are also other
2978  // similar immediates the backend can use shifts for.
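  // For example, "icmp ult i64 %x, 4294967296" can be lowered as a right
  // shift by 32 and a test, so the 0x100000000 immediate is reported as free.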
2979  if (Idx == 1 && Imm.getBitWidth() == 64) {
2980  uint64_t ImmVal = Imm.getZExtValue();
2981  if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
2982  return TTI::TCC_Free;
2983  }
2984  ImmIdx = 1;
2985  break;
2986  case Instruction::And:
2987  // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
2988  // by using a 32-bit operation with implicit zero extension. Detect such
2989  // immediates here as the normal path expects bit 31 to be sign extended.
2990  if (Idx == 1 && Imm.getBitWidth() == 64 && isUInt<32>(Imm.getZExtValue()))
2991  return TTI::TCC_Free;
2992  ImmIdx = 1;
2993  break;
2994  case Instruction::Add:
2995  case Instruction::Sub:
2996  // For add/sub, we can use the opposite instruction for INT32_MIN.
2997  if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.getZExtValue() == 0x80000000)
2998  return TTI::TCC_Free;
2999  ImmIdx = 1;
3000  break;
3001  case Instruction::UDiv:
3002  case Instruction::SDiv:
3003  case Instruction::URem:
3004  case Instruction::SRem:
3005  // Division by constant is typically expanded later into a different
3006  // instruction sequence. This completely changes the constants.
3007  // Report them as "free" to stop ConstantHoist from marking them as opaque.
3008  return TTI::TCC_Free;
3009  case Instruction::Mul:
3010  case Instruction::Or:
3011  case Instruction::Xor:
3012  ImmIdx = 1;
3013  break;
3014  // Always return TCC_Free for the shift value of a shift instruction.
3015  case Instruction::Shl:
3016  case Instruction::LShr:
3017  case Instruction::AShr:
3018  if (Idx == 1)
3019  return TTI::TCC_Free;
3020  break;
3021  case Instruction::Trunc:
3022  case Instruction::ZExt:
3023  case Instruction::SExt:
3024  case Instruction::IntToPtr:
3025  case Instruction::PtrToInt:
3026  case Instruction::BitCast:
3027  case Instruction::PHI:
3028  case Instruction::Call:
3029  case Instruction::Select:
3030  case Instruction::Ret:
3031  case Instruction::Load:
3032  break;
3033  }
3034 
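  // An immediate that sits in the operand slot the instruction can encode
  // directly is free as long as materializing it would cost no more than one
  // basic op per 64-bit chunk; otherwise report the real materialization cost
  // so that constant hoisting can kick in.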
3035  if (Idx == ImmIdx) {
3036  int NumConstants = divideCeil(BitSize, 64);
3037  int Cost = X86TTIImpl::getIntImmCost(Imm, Ty);
3038  return (Cost <= NumConstants * TTI::TCC_Basic)
3039  ? static_cast<int>(TTI::TCC_Free)
3040  : Cost;
3041  }
3042 
3043  return X86TTIImpl::getIntImmCost(Imm, Ty);
3044 }
3045 
3046 int X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
3047  Type *Ty) {
3048  assert(Ty->isIntegerTy());
3049 
3050  unsigned BitSize = Ty->getPrimitiveSizeInBits();
3051  // There is no cost model for constants with a bit size of 0. Return TCC_Free
3052  // here, so that constant hoisting will ignore this constant.
3053  if (BitSize == 0)
3054  return TTI::TCC_Free;
3055 
3056  switch (IID) {
3057  default:
3058  return TTI::TCC_Free;
3059  case Intrinsic::sadd_with_overflow:
3060  case Intrinsic::uadd_with_overflow:
3061  case Intrinsic::ssub_with_overflow:
3062  case Intrinsic::usub_with_overflow:
3063  case Intrinsic::smul_with_overflow:
3064  case Intrinsic::umul_with_overflow:
3065  if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
3066  return TTI::TCC_Free;
3067  break;
3068  case Intrinsic::experimental_stackmap:
3069  if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
3070  return TTI::TCC_Free;
3071  break;
3072  case Intrinsic::experimental_patchpoint_void:
3073  case Intrinsic::experimental_patchpoint_i64:
3074  if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
3075  return TTI::TCC_Free;
3076  break;
3077  }
3078  return X86TTIImpl::getIntImmCost(Imm, Ty);
3079 }
3080 
3081 unsigned X86TTIImpl::getUserCost(const User *U,
3082  ArrayRef<const Value *> Operands) {
3083  if (isa<StoreInst>(U)) {
3084  Value *Ptr = U->getOperand(1);
3085  // A store instruction with index and scale addressing costs 2 uops.
3086  // Check the preceding GEP to identify non-constant indices.
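  // For example, a store whose address comes from a GEP with a variable
  // index uses an indexed addressing mode and is counted as 2 uops, while a
  // store through an all-constant GEP stays at TCC_Basic.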
3087  if (auto GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
3088  if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
3089  return TTI::TCC_Basic * 2;
3090  }
3091  return TTI::TCC_Basic;
3092  }
3093  return BaseT::getUserCost(U, Operands);
3094 }
3095 
3096 // Return the average cost of a Gather / Scatter instruction; may be improved later.
3097 int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
3098  unsigned Alignment, unsigned AddressSpace) {
3099 
3100  assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
3101  unsigned VF = SrcVTy->getVectorNumElements();
3102 
3103  // Try to reduce the index size from 64 bits (the default for GEP) to 32
3104  // bits. This is essential for VF 16. If the index can't be reduced to 32
3105  // bits, the operation will use 16 x 64-bit indices, which do not fit in a
3106  // zmm register and need to be split. Also check that the base pointer is
3107  // the same for all lanes, and that there's at most one variable index.
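  // For instance, a VF=16 gather of i32 whose only variable GEP index is a
  // sign-extended 32-bit value can keep its indices in a single 16 x i32
  // vector instead of being split into two 8 x i64 index vectors.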
3108  auto getIndexSizeInBits = [](Value *Ptr, const DataLayout& DL) {
3109  unsigned IndexSize = DL.getPointerSizeInBits();
3110  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
3111  if (IndexSize < 64 || !GEP)
3112  return IndexSize;
3113 
3114  unsigned NumOfVarIndices = 0;
3115  Value *Ptrs = GEP->getPointerOperand();
3116  if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
3117  return IndexSize;
3118  for (unsigned i = 1; i < GEP->getNumOperands(); ++i) {
3119  if (isa<Constant>(GEP->getOperand(i)))
3120  continue;
3121  Type *IndxTy = GEP->getOperand(i)->getType();
3122  if (IndxTy->isVectorTy())
3123  IndxTy = IndxTy->getVectorElementType();
3124  if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
3125  !isa<SExtInst>(GEP->getOperand(i))) ||
3126  ++NumOfVarIndices > 1)
3127  return IndexSize; // 64
3128  }
3129  return (unsigned)32;
3130  };
3131 
3132 
3133  // Try to reduce IndexSize to 32 bits for 16-element vectors.
3134  // By default the IndexSize is equal to the pointer size.
3135  unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
3136  ? getIndexSizeInBits(Ptr, DL)
3137  : DL.getPointerSizeInBits();
3138 
3139  Type *IndexVTy = VectorType::get(IntegerType::get(SrcVTy->getContext(),
3140  IndexSize), VF);
3141  std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy);
3142  std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy);
3143  int SplitFactor = std::max(IdxsLT.first, SrcLT.first);
3144  if (SplitFactor > 1) {
3145  // Handle splitting of vector of pointers
3146  Type *SplitSrcTy = VectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
3147  return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment,
3148  AddressSpace);
3149  }
3150 
3151  // The gather / scatter cost is given by Intel architects. It is a rough
3152  // number since we are looking at one instruction at a time.
3153  const int GSOverhead = (Opcode == Instruction::Load)
3154  ? ST->getGatherOverhead()
3155  : ST->getScatterOverhead();
3156  return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
3157  Alignment, AddressSpace);
3158 }
3159 
3160 /// Return the cost of full scalarization of gather / scatter operation.
3161 ///
3162 /// Opcode - Load or Store instruction.
3163 /// SrcVTy - The type of the data vector that should be gathered or scattered.
3164 /// VariableMask - The mask is non-constant at compile time.
3165 /// Alignment - Alignment for one element.
3166 /// AddressSpace - the address space of the pointer(s).
3167 ///
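/// The returned cost is the sum of the scalar memory operations, the mask
/// unpacking (when the mask is variable) and the per-element insert/extract
/// traffic between vector and scalar registers.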
3168 int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
3169  bool VariableMask, unsigned Alignment,
3170  unsigned AddressSpace) {
3171  unsigned VF = SrcVTy->getVectorNumElements();
3172 
3173  int MaskUnpackCost = 0;
3174  if (VariableMask) {
3175  VectorType *MaskTy =
3176  VectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
3177  MaskUnpackCost = getScalarizationOverhead(MaskTy, false, true);
3178  int ScalarCompareCost =
3179  getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()),
3180  nullptr);
3181  int BranchCost = getCFInstrCost(Instruction::Br);
3182  MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
3183  }
3184 
3185  // The cost of the scalar loads/stores.
3186  int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
3187  Alignment, AddressSpace);
3188 
3189  int InsertExtractCost = 0;
3190  if (Opcode == Instruction::Load)
3191  for (unsigned i = 0; i < VF; ++i)
3192  // Add the cost of inserting each scalar load into the vector
3193  InsertExtractCost +=
3194  getVectorInstrCost(Instruction::InsertElement, SrcVTy, i);
3195  else
3196  for (unsigned i = 0; i < VF; ++i)
3197  // Add the cost of extracting each element out of the data vector
3198  InsertExtractCost +=
3199  getVectorInstrCost(Instruction::ExtractElement, SrcVTy, i);
3200 
3201  return MemoryOpCost + MaskUnpackCost + InsertExtractCost;
3202 }
3203 
3204 /// Calculate the cost of Gather / Scatter operation
3205 int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
3206  Value *Ptr, bool VariableMask,
3207  unsigned Alignment) {
3208  assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
3209  unsigned VF = SrcVTy->getVectorNumElements();
3210  PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
3211  if (!PtrTy && Ptr->getType()->isVectorTy())
3212  PtrTy = dyn_cast<PointerType>(Ptr->getType()->getVectorElementType());
3213  assert(PtrTy && "Unexpected type for Ptr argument");
3214  unsigned AddressSpace = PtrTy->getAddressSpace();
3215 
3216  bool Scalarize = false;
3217  if ((Opcode == Instruction::Load && !isLegalMaskedGather(SrcVTy)) ||
3218  (Opcode == Instruction::Store && !isLegalMaskedScatter(SrcVTy)))
3219  Scalarize = true;
3220  // Gather / Scatter for vector 2 is not profitable on KNL / SKX
3221  // A vector-4 gather/scatter instruction does not exist on KNL.
3222  // We can extend it to 8 elements, but zeroing upper bits of
3223  // the mask vector will add more instructions. Right now we give the scalar
3224  // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction
3225  // is better in the VariableMask case.
3226  if (ST->hasAVX512() && (VF == 2 || (VF == 4 && !ST->hasVLX())))
3227  Scalarize = true;
3228 
3229  if (Scalarize)
3230  return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment,
3231  AddressSpace);
3232 
3233  return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
3234 }
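
// Illustrative sketch, not part of X86TargetTransformInfo.cpp: the
// scalarize-or-not decision above, restated as a standalone predicate over
// plain booleans standing in for the subtarget feature checks.
#include <cstdio>

static bool shouldScalarizeGatherScatter(bool LegalMaskedOp, unsigned VF,
                                         bool HasAVX512, bool HasVLX) {
  if (!LegalMaskedOp)
    return true;
  // VF == 2 is never profitable; VF == 4 has no native instruction on KNL
  // (i.e. AVX512 without VLX).
  return HasAVX512 && (VF == 2 || (VF == 4 && !HasVLX));
}

int main() {
  std::printf("%d\n", shouldScalarizeGatherScatter(true, 4, true, false)); // 1
  std::printf("%d\n", shouldScalarizeGatherScatter(true, 8, true, false)); // 0
  return 0;
}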
3235 
3236 bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
3237  TargetTransformInfo::LSRCost &C2) {
3238  // X86-specific ordering: the instruction count gets first priority.
3239  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
3240  C1.NumIVMuls, C1.NumBaseAdds,
3241  C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
3242  std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
3243  C2.NumIVMuls, C2.NumBaseAdds,
3244  C2.ScaleCost, C2.ImmCost, C2.SetupCost);
3245 }
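
// Illustrative sketch, not part of X86TargetTransformInfo.cpp: std::tie builds
// tuples that compare lexicographically, so the instruction count (Insns)
// dominates every later component. The two-field example below uses stand-in
// names rather than the real LSRCost members.
#include <cstdio>
#include <tuple>

int main() {
  unsigned AInsns = 3, ARegs = 9; // fewer instructions, more registers
  unsigned BInsns = 4, BRegs = 2; // more instructions, fewer registers
  bool APreferred = std::tie(AInsns, ARegs) < std::tie(BInsns, BRegs);
  std::printf("%s\n", APreferred ? "A preferred" : "B preferred"); // A preferred
  return 0;
}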
3246 
3247 bool X86TTIImpl::canMacroFuseCmp() {
3248  return ST->hasMacroFusion() || ST->hasBranchFusion();
3249 }
3250 
3251 bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
3252  if (!ST->hasAVX())
3253  return false;
3254 
3255  // The backend can't handle a single element vector.
3256  if (isa<VectorType>(DataTy) && DataTy->getVectorNumElements() == 1)
3257  return false;
3258  Type *ScalarTy = DataTy->getScalarType();
3259 
3260  if (ScalarTy->isPointerTy())
3261  return true;
3262 
3263  if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
3264  return true;
3265 
3266  if (!ScalarTy->isIntegerTy())
3267  return false;
3268 
3269  unsigned IntWidth = ScalarTy->getIntegerBitWidth();
3270  return IntWidth == 32 || IntWidth == 64 ||
3271  ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
3272 }
3273 
3274 bool X86TTIImpl::isLegalMaskedStore(Type *DataType) {
3275  return isLegalMaskedLoad(DataType);
3276 }
3277 
3278 bool X86TTIImpl::isLegalNTLoad(Type *DataType, llvm::Align Alignment) {
3279  unsigned DataSize = DL.getTypeStoreSize(DataType);
3280  // The only supported nontemporal loads are for aligned vectors of 16 or 32
3281  // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2
3282  // (the equivalent stores only require AVX).
3283  if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
3284  return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2();
3285 
3286  return false;
3287 }
3288 
3289 bool X86TTIImpl::isLegalNTStore(Type *DataType, llvm::Align Alignment) {
3290  unsigned DataSize = DL.getTypeStoreSize(DataType);
3291 
3292  // SSE4A supports nontemporal stores of float and double at arbitrary
3293  // alignment.
3294  if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
3295  return true;
3296 
3297  // Besides the SSE4A subtarget exception above, only aligned stores are
3298  // available nontemporally on any other subtarget, and only stores with a
3299  // size of 4..32 bytes (powers of 2, only) are permitted.
3300  if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
3301  !isPowerOf2_32(DataSize))
3302  return false;
3303 
3304  // 32-byte vector nontemporal stores are supported by AVX (the equivalent
3305  // loads require AVX2).
3306  if (DataSize == 32)
3307  return ST->hasAVX();
3308  else if (DataSize == 16)
3309  return ST->hasSSE1();
3310  return true;
3311 }
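
// Illustrative sketch, not part of X86TargetTransformInfo.cpp: the
// nontemporal-store rules above as a standalone predicate. The subtarget
// features are plain booleans; the size and alignment checks match the code.
#include <cstdio>

static bool isPow2(unsigned X) { return X && !(X & (X - 1)); }

static bool ntStoreLegal(unsigned DataSize, unsigned Alignment, bool IsScalarFP,
                         bool HasSSE4A, bool HasSSE1, bool HasAVX) {
  if (HasSSE4A && IsScalarFP)
    return true; // scalar float/double at arbitrary alignment
  if (Alignment < DataSize || DataSize < 4 || DataSize > 32 || !isPow2(DataSize))
    return false;
  if (DataSize == 32)
    return HasAVX;  // 32-byte vector store
  if (DataSize == 16)
    return HasSSE1; // 16-byte vector store
  return true;      // 4- or 8-byte store
}

int main() {
  std::printf("%d\n", ntStoreLegal(32, 32, false, false, true, true)); // 1
  std::printf("%d\n", ntStoreLegal(16, 8, false, false, true, true));  // 0: underaligned
  return 0;
}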
3312 
3313 bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) {
3314  if (!isa<VectorType>(DataTy))
3315  return false;
3316 
3317  if (!ST->hasAVX512())
3318  return false;
3319 
3320  // The backend can't handle a single element vector.
3321  if (DataTy->getVectorNumElements() == 1)
3322  return false;
3323 
3324  Type *ScalarTy = DataTy->getVectorElementType();
3325 
3326  if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
3327  return true;
3328 
3329  if (!ScalarTy->isIntegerTy())
3330  return false;
3331 
3332  unsigned IntWidth = ScalarTy->getIntegerBitWidth();
3333  return IntWidth == 32 || IntWidth == 64 ||
3334  ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
3335 }
3336 
3337 bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy) {
3338  return isLegalMaskedExpandLoad(DataTy);
3339 }
3340 
3341 bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
3342  // Some CPUs have better gather performance than others.
3343  // TODO: Remove the explicit ST->hasAVX512()? That would mean we would only
3344  // enable gather with a -march.
3345  if (!(ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2())))
3346  return false;
3347 
3348  // This function is called now in two cases: from the Loop Vectorizer
3349  // and from the Scalarizer.
3350  // When the Loop Vectorizer asks about legality of the feature,
3351  // the vectorization factor is not calculated yet. The Loop Vectorizer
3352  // sends a scalar type and the decision is based on the width of the
3353  // scalar element.
3354  // Later on, the cost model will estimate usage of this intrinsic based on
3355  // the vector type.
3356  // The Scalarizer asks again about legality. It sends a vector type.
3357  // In this case we can reject non-power-of-2 vectors.
3358  // We also reject single element vectors as the type legalizer can't
3359  // scalarize them.
3360  if (isa<VectorType>(DataTy)) {
3361  unsigned NumElts = DataTy->getVectorNumElements();
3362  if (NumElts == 1 || !isPowerOf2_32(NumElts))
3363  return false;
3364  }
3365  Type *ScalarTy = DataTy->getScalarType();
3366  if (ScalarTy->isPointerTy())
3367  return true;
3368 
3369  if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
3370  return true;
3371 
3372  if (!ScalarTy->isIntegerTy())
3373  return false;
3374 
3375  unsigned IntWidth = ScalarTy->getIntegerBitWidth();
3376  return IntWidth == 32 || IntWidth == 64;
3377 }
3378 
3379 bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) {
3380  // AVX2 doesn't support scatter.
3381  if (!ST->hasAVX512())
3382  return false;
3383  return isLegalMaskedGather(DataType);
3384 }
3385 
3386 bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
3387  EVT VT = TLI->getValueType(DL, DataType);
3388  return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
3389 }
3390 
3391 bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
3392  return false;
3393 }
3394 
3395 bool X86TTIImpl::areInlineCompatible(const Function *Caller,
3396  const Function *Callee) const {
3397  const TargetMachine &TM = getTLI()->getTargetMachine();
3398 
3399  // Treat this as a subset check over subtarget features.
3400  const FeatureBitset &CallerBits =
3401  TM.getSubtargetImpl(*Caller)->getFeatureBits();
3402  const FeatureBitset &CalleeBits =
3403  TM.getSubtargetImpl(*Callee)->getFeatureBits();
3404 
3405  FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
3406  FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
3407  return (RealCallerBits & RealCalleeBits) == RealCalleeBits;
3408 }
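
// Illustrative sketch, not part of X86TargetTransformInfo.cpp: the feature
// compatibility test above is a subset check. With std::bitset standing in for
// FeatureBitset, every feature the callee was compiled with must also be
// available in the caller.
#include <bitset>
#include <cstdio>

int main() {
  std::bitset<8> CallerBits("11110000"); // caller has a superset of features
  std::bitset<8> CalleeBits("11000000"); // callee needs only some of them
  bool Compatible = (CallerBits & CalleeBits) == CalleeBits;
  std::printf("%s\n", Compatible ? "inlinable" : "not inlinable"); // inlinable
  return 0;
}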
3409 
3410 bool X86TTIImpl::areFunctionArgsABICompatible(
3411  const Function *Caller, const Function *Callee,
3412  SmallPtrSetImpl<Argument *> &Args) const {
3413  if (!BaseT::areFunctionArgsABICompatible(Caller, Callee, Args))
3414  return false;
3415 
3416  // If we get here, we know the target features match. If one function
3417  // considers 512-bit vectors legal and the other does not, consider them
3418  // incompatible.
3419  // FIXME Look at the arguments and only consider 512 bit or larger vectors?
3420  const TargetMachine &TM = getTLI()->getTargetMachine();
3421 
3422  return TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
3423  TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs();
3424 }
3425 
3426 TTI::MemCmpExpansionOptions
3427 X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
3428  TTI::MemCmpExpansionOptions Options;
3429  Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
3430  Options.NumLoadsPerBlock = 2;
3431  if (IsZeroCmp) {
3432  // Only enable vector loads for equality comparison. Right now the vector
3433  // version is not as fast for three way compare (see #33329).
3434  // TODO: enable AVX512 when the DAG is ready.
3435  // if (ST->hasAVX512()) Options.LoadSizes.push_back(64);
3436  const unsigned PreferredWidth = ST->getPreferVectorWidth();
3437  if (PreferredWidth >= 256 && ST->hasAVX2()) Options.LoadSizes.push_back(32);
3438  if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
3439  // All GPR and vector loads can be unaligned. SIMD compare requires integer
3440  // vectors (SSE2/AVX2).
3441  Options.AllowOverlappingLoads = true;
3442  }
3443  if (ST->is64Bit()) {
3444  Options.LoadSizes.push_back(8);
3445  }
3446  Options.LoadSizes.push_back(4);
3447  Options.LoadSizes.push_back(2);
3448  Options.LoadSizes.push_back(1);
3449  return Options;
3450 }
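
// Illustrative sketch, not part of X86TargetTransformInfo.cpp: how the
// LoadSizes list above is populated for a hypothetical 64-bit AVX2 subtarget
// with a preferred vector width of 256, in the equality-comparison case.
#include <cstdio>
#include <vector>

int main() {
  const bool IsZeroCmp = true, HasAVX2 = true, HasSSE2 = true, Is64Bit = true;
  const unsigned PreferredWidth = 256;

  std::vector<unsigned> LoadSizes;
  if (IsZeroCmp) {
    if (PreferredWidth >= 256 && HasAVX2) LoadSizes.push_back(32);
    if (PreferredWidth >= 128 && HasSSE2) LoadSizes.push_back(16);
  }
  if (Is64Bit) LoadSizes.push_back(8);
  LoadSizes.push_back(4);
  LoadSizes.push_back(2);
  LoadSizes.push_back(1);

  for (unsigned Size : LoadSizes)
    std::printf("%u ", Size); // prints: 32 16 8 4 2 1
  std::printf("\n");
  return 0;
}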
3451 
3452 bool X86TTIImpl::enableInterleavedAccessVectorization() {
3453  // TODO: We expect this to be beneficial regardless of arch,
3454  // but there are currently some unexplained performance artifacts on Atom.
3455  // As a temporary solution, disable on Atom.
3456  return !(ST->isAtom());
3457 }
3458 
3459 // Get estimation for interleaved load/store operations for AVX2.
3460 // \p Factor is the interleaved-access factor (stride) - number of
3461 // (interleaved) elements in the group.
3462 // \p Indices contains the indices for a strided load: when the
3463 // interleaved load has gaps they indicate which elements are used.
3464 // If Indices is empty (or if the number of indices is equal to the size
3465 // of the interleaved-access as given in \p Factor) the access has no gaps.
3466 //
3467 // As opposed to AVX-512, AVX2 does not have generic shuffles that allow
3468 // computing the cost using a generic formula as a function of generic
3469 // shuffles. We therefore use a lookup table instead, filled according to
3470 // the instruction sequences that codegen currently generates.
3471 int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
3472  unsigned Factor,
3473  ArrayRef<unsigned> Indices,
3474  unsigned Alignment,
3475  unsigned AddressSpace,
3476  bool UseMaskForCond,
3477  bool UseMaskForGaps) {
3478 
3479  if (UseMaskForCond || UseMaskForGaps)
3480  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3481  Alignment, AddressSpace,
3482  UseMaskForCond, UseMaskForGaps);
3483 
3484  // We currently support only fully-interleaved groups, with no gaps.
3485  // TODO: Support also strided loads (interleaved-groups with gaps).
3486  if (Indices.size() && Indices.size() != Factor)
3487  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3488  Alignment, AddressSpace);
3489 
3490  // VecTy for interleave memop is <VF*Factor x Elt>.
3491  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
3492  // VecTy = <12 x i32>.
3493  MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
3494 
3495  // This function can be called with VecTy=<6xi128>, Factor=3, in which case
3496  // the VF=2, while v2i128 is an unsupported MVT vector type
3497  // (see MachineValueType.h::getVectorVT()).
3498  if (!LegalVT.isVector())
3499  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3500  Alignment, AddressSpace);
3501 
3502  unsigned VF = VecTy->getVectorNumElements() / Factor;
3503  Type *ScalarTy = VecTy->getVectorElementType();
3504 
3505  // Calculate the number of memory operations (NumOfMemOps), required
3506  // for load/store the VecTy.
3507  unsigned VecTySize = DL.getTypeStoreSize(VecTy);
3508  unsigned LegalVTSize = LegalVT.getStoreSize();
3509  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
3510 
3511  // Get the cost of one memory operation.
3512  Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
3513  LegalVT.getVectorNumElements());
3514  unsigned MemOpCost =
3515  getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
3516 
3517  VectorType *VT = VectorType::get(ScalarTy, VF);
3518  EVT ETy = TLI->getValueType(DL, VT);
3519  if (!ETy.isSimple())
3520  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3521  Alignment, AddressSpace);
3522 
3523  // TODO: Complete for other data-types and strides.
3524  // Each combination of Stride, ElementTy and VF results in a different
3525  // sequence; the cost tables are therefore accessed with:
3526  // Factor (stride) and VectorType=VFxElemType.
3527  // The Cost accounts only for the shuffle sequence;
3528  // The cost of the loads/stores is accounted for separately.
3529  //
3530  static const CostTblEntry AVX2InterleavedLoadTbl[] = {
3531  { 2, MVT::v4i64, 6 }, //(load 8i64 and) deinterleave into 2 x 4i64
3532  { 2, MVT::v4f64, 6 }, //(load 8f64 and) deinterleave into 2 x 4f64
3533 
3534  { 3, MVT::v2i8, 10 }, //(load 6i8 and) deinterleave into 3 x 2i8
3535  { 3, MVT::v4i8, 4 }, //(load 12i8 and) deinterleave into 3 x 4i8
3536  { 3, MVT::v8i8, 9 }, //(load 24i8 and) deinterleave into 3 x 8i8
3537  { 3, MVT::v16i8, 11}, //(load 48i8 and) deinterleave into 3 x 16i8
3538  { 3, MVT::v32i8, 13}, //(load 96i8 and) deinterleave into 3 x 32i8
3539  { 3, MVT::v8f32, 17 }, //(load 24f32 and) deinterleave into 3 x 8f32
3540 
3541  { 4, MVT::v2i8, 12 }, //(load 8i8 and) deinterleave into 4 x 2i8
3542  { 4, MVT::v4i8, 4 }, //(load 16i8 and) deinterleave into 4 x 4i8
3543  { 4, MVT::v8i8, 20 }, //(load 32i8 and) deinterleave into 4 x 8i8
3544  { 4, MVT::v16i8, 39 }, //(load 64i8 and) deinterleave into 4 x 16i8
3545  { 4, MVT::v32i8, 80 }, //(load 128i8 and) deinterleave into 4 x 32i8
3546 
3547  { 8, MVT::v8f32, 40 } //(load 64f32 and) deinterleave into 8 x 8f32
3548  };
3549 
3550  static const CostTblEntry AVX2InterleavedStoreTbl[] = {
3551  { 2, MVT::v4i64, 6 }, //interleave into 2 x 4i64 into 8i64 (and store)
3552  { 2, MVT::v4f64, 6 }, //interleave into 2 x 4f64 into 8f64 (and store)
3553 
3554  { 3, MVT::v2i8, 7 }, //interleave 3 x 2i8 into 6i8 (and store)
3555  { 3, MVT::v4i8, 8 }, //interleave 3 x 4i8 into 12i8 (and store)
3556  { 3, MVT::v8i8, 11 }, //interleave 3 x 8i8 into 24i8 (and store)
3557  { 3, MVT::v16i8, 11 }, //interleave 3 x 16i8 into 48i8 (and store)
3558  { 3, MVT::v32i8, 13 }, //interleave 3 x 32i8 into 96i8 (and store)
3559 
3560  { 4, MVT::v2i8, 12 }, //interleave 4 x 2i8 into 8i8 (and store)
3561  { 4, MVT::v4i8, 9 }, //interleave 4 x 4i8 into 16i8 (and store)
3562  { 4, MVT::v8i8, 10 }, //interleave 4 x 8i8 into 32i8 (and store)
3563  { 4, MVT::v16i8, 10 }, //interleave 4 x 16i8 into 64i8 (and store)
3564  { 4, MVT::v32i8, 12 } //interleave 4 x 32i8 into 128i8 (and store)
3565  };
3566 
3567  if (Opcode == Instruction::Load) {
3568  if (const auto *Entry =
3569  CostTableLookup(AVX2InterleavedLoadTbl, Factor, ETy.getSimpleVT()))
3570  return NumOfMemOps * MemOpCost + Entry->Cost;
3571  } else {
3572  assert(Opcode == Instruction::Store &&
3573  "Expected Store Instruction at this point");
3574  if (const auto *Entry =
3575  CostTableLookup(AVX2InterleavedStoreTbl, Factor, ETy.getSimpleVT()))
3576  return NumOfMemOps * MemOpCost + Entry->Cost;
3577  }
3578 
3579  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3580  Alignment, AddressSpace);
3581 }
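
// Illustrative sketch, not part of X86TargetTransformInfo.cpp: a worked
// instance of the "NumOfMemOps * MemOpCost + Entry->Cost" formula above for a
// factor-3 load of <24 x i8> (so VF = 8 and the lookup key is v8i8). The
// legalized 16-byte operation size and the unit memory-op cost are assumptions
// made for the example.
#include <cstdio>

int main() {
  const unsigned VecTySize = 24;   // store size of <24 x i8> in bytes
  const unsigned LegalVTSize = 16; // assumed size of one legalized memory op
  const unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize; // 2
  const unsigned MemOpCost = 1;    // assumed cost of one legal vector load
  const unsigned TableCost = 9;    // AVX2InterleavedLoadTbl entry {3, v8i8, 9}
  std::printf("%u\n", NumOfMemOps * MemOpCost + TableCost); // 11
  return 0;
}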
3582 
3583 // Get estimation for interleaved load/store operations and strided load.
3584 // \p Indices contains indices for strided load.
3585 // \p Factor - the factor of interleaving.
3586 // AVX-512 provides 3-src shuffles that significantly reduce the cost.
3587 int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
3588  unsigned Factor,
3589  ArrayRef<unsigned> Indices,
3590  unsigned Alignment,
3591  unsigned AddressSpace,
3592  bool UseMaskForCond,
3593  bool UseMaskForGaps) {
3594 
3595  if (UseMaskForCond || UseMaskForGaps)
3596  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3597  Alignment, AddressSpace,
3598  UseMaskForCond, UseMaskForGaps);
3599 
3600  // VecTy for interleave memop is <VF*Factor x Elt>.
3601  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
3602  // VecTy = <12 x i32>.
3603 
3604  // Calculate the number of memory operations (NumOfMemOps), required
3605  // for load/store the VecTy.
3606  MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
3607  unsigned VecTySize = DL.getTypeStoreSize(VecTy);
3608  unsigned LegalVTSize = LegalVT.getStoreSize();
3609  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
3610 
3611  // Get the cost of one memory operation.
3612  Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
3613  LegalVT.getVectorNumElements());
3614  unsigned MemOpCost =
3615  getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
3616 
3617  unsigned VF = VecTy->getVectorNumElements() / Factor;
3618  MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);
3619 
3620  if (Opcode == Instruction::Load) {
3621  // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
3622  // contain the cost of the optimized shuffle sequence that the
3623  // X86InterleavedAccess pass will generate.
3624  // The cost of loads and stores are computed separately from the table.
3625 
3626  // X86InterleavedAccess supports only the following interleaved-access groups.
3627  static const CostTblEntry AVX512InterleavedLoadTbl[] = {
3628  {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8
3629  {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8
3630  {3, MVT::v64i8, 22}, //(load 192i8 and) deinterleave into 3 x 64i8
3631  };
3632 
3633  if (const auto *Entry =
3634  CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
3635  return NumOfMemOps * MemOpCost + Entry->Cost;
3636  // If an entry does not exist, fall back to the default implementation.
3637 
3638  // The kind of shuffle depends on the number of loaded values.
3639  // If we load the entire data in one register, we can use a 1-src shuffle.
3640  // Otherwise, we'll merge 2 sources in each operation.
3641  TTI::ShuffleKind ShuffleKind =
3642  (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
3643 
3644  unsigned ShuffleCost =
3645  getShuffleCost(ShuffleKind, SingleMemOpTy, 0, nullptr);
3646 
3647  unsigned NumOfLoadsInInterleaveGrp =
3648  Indices.size() ? Indices.size() : Factor;
3649  Type *ResultTy = VectorType::get(VecTy->getVectorElementType(),
3650  VecTy->getVectorNumElements() / Factor);
3651  unsigned NumOfResults =
3652  getTLI()->getTypeLegalizationCost(DL, ResultTy).first *
3653  NumOfLoadsInInterleaveGrp;
3654 
3655  // About half of the loads may be folded into shuffles when we have only
3656  // one result. If we have more than one result, we do not fold loads at all.
3657  unsigned NumOfUnfoldedLoads =
3658  NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
3659 
3660  // Get the number of shuffle operations per result.
3661  unsigned NumOfShufflesPerResult =
3662  std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
3663 
3664  // The SK_PermuteTwoSrc shuffle clobbers one of the source operands.
3665  // When we have more than one destination, we need additional instructions
3666  // to keep sources.
3667  unsigned NumOfMoves = 0;
3668  if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
3669  NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
3670 
3671  int Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
3672  NumOfUnfoldedLoads * MemOpCost + NumOfMoves;
3673 
3674  return Cost;
3675  }
3676 
3677  // Store.
3678  assert(Opcode == Instruction::Store &&
3679  "Expected Store Instruction at this point");
3680  // X86InterleavedAccess supports only the following interleaved-access groups.
3681  static const CostTblEntry AVX512InterleavedStoreTbl[] = {
3682  {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
3683  {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
3684  {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 192i8 (and store)
3685 
3686  {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store)
3687  {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
3688  {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
3689  {4, MVT::v64i8, 24} // interleave 4 x 64i8 into 256i8 (and store)
3690  };
3691 
3692  if (const auto *Entry =
3693  CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
3694  return NumOfMemOps * MemOpCost + Entry->Cost;
3695  // If an entry does not exist, fall back to the default implementation.
3696 
3697  // There are no strided stores at the moment, and a store can't be folded
3698  // into a shuffle.
3699  unsigned NumOfSources = Factor; // The number of values to be merged.
3700  unsigned ShuffleCost =
3701  getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, 0, nullptr);
3702  unsigned NumOfShufflesPerStore = NumOfSources - 1;
3703 
3704  // The SK_PermuteTwoSrc shuffle clobbers one of the source operands.
3705  // We need additional instructions to keep sources.
3706  unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
3707  int Cost = NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
3708  NumOfMoves;
3709  return Cost;
3710 }
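
// Illustrative sketch, not part of X86TargetTransformInfo.cpp: the store
// fallback formula above, evaluated for an assumed factor-4 group that
// legalizes into two memory operations with unit memory and shuffle costs.
#include <cstdio>

int main() {
  const unsigned Factor = 4;
  const unsigned NumOfMemOps = 2;  // assumed after type legalization
  const unsigned MemOpCost = 1;    // assumed cost of one legal vector store
  const unsigned ShuffleCost = 1;  // assumed cost of one two-source shuffle
  const unsigned NumOfShufflesPerStore = Factor - 1;                   // 3
  const unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2; // 3
  // 2 * (1 + 3 * 1) + 3 = 11
  std::printf("%u\n",
              NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
                  NumOfMoves);
  return 0;
}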
3711 
3712 int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
3713  unsigned Factor,
3714  ArrayRef<unsigned> Indices,
3715  unsigned Alignment,
3716  unsigned AddressSpace,
3717  bool UseMaskForCond,
3718  bool UseMaskForGaps) {
3719  auto isSupportedOnAVX512 = [](Type *VecTy, bool HasBW) {
3720  Type *EltTy = VecTy->getVectorElementType();
3721  if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
3722  EltTy->isIntegerTy(32) || EltTy->isPointerTy())
3723  return true;
3724  if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8))
3725  return HasBW;
3726  return false;
3727  };
3728  if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI()))
3729  return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices,
3730  Alignment, AddressSpace,
3731  UseMaskForCond, UseMaskForGaps);
3732  if (ST->hasAVX2())
3733  return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices,
3734  Alignment, AddressSpace,
3735  UseMaskForCond, UseMaskForGaps);
3736 
3737  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3738  Alignment, AddressSpace,
3739  UseMaskForCond, UseMaskForGaps);
3740 }