Bug Summary

File: build/llvm-toolchain-snapshot-16~++20220819100721+9e51cbac9ef9/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
Warning: line 3741, column 15
Division by zero

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name X86TargetTransformInfo.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-16~++20220819100721+9e51cbac9ef9/build-llvm -resource-dir /usr/lib/llvm-16/lib/clang/16.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I lib/Target/X86 -I /build/llvm-toolchain-snapshot-16~++20220819100721+9e51cbac9ef9/llvm/lib/Target/X86 -I include -I /build/llvm-toolchain-snapshot-16~++20220819100721+9e51cbac9ef9/llvm/include -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-16/lib/clang/16.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/llvm-toolchain-snapshot-16~++20220819100721+9e51cbac9ef9/build-llvm=build-llvm -fmacro-prefix-map=/build/llvm-toolchain-snapshot-16~++20220819100721+9e51cbac9ef9/= -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-16~++20220819100721+9e51cbac9ef9/build-llvm=build-llvm -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-16~++20220819100721+9e51cbac9ef9/= -O3 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -std=c++17 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-16~++20220819100721+9e51cbac9ef9/build-llvm -fdebug-prefix-map=/build/llvm-toolchain-snapshot-16~++20220819100721+9e51cbac9ef9/build-llvm=build-llvm -fdebug-prefix-map=/build/llvm-toolchain-snapshot-16~++20220819100721+9e51cbac9ef9/= -ferror-limit 19 -fvisibility hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2022-08-19-233747-121207-1 -x c++ /build/llvm-toolchain-snapshot-16~++20220819100721+9e51cbac9ef9/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
1//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements a TargetTransformInfo analysis pass specific to the
10/// X86 target machine. It uses the target's detailed information to provide
11/// more precise answers to certain TTI queries, while letting the target
12/// independent and default TTI implementations handle the rest.
13///
14//===----------------------------------------------------------------------===//
15/// About the Cost Model numbers used below it's necessary to say the following:
16/// the numbers correspond to some "generic" X86 CPU instead of a concrete
17/// CPU model. Usually the numbers correspond to the CPU where the feature
18/// appeared for the first time. For example, if we do Subtarget.hasSSE42() in
19/// the lookups below the cost is based on Nehalem as that was the first CPU
20/// to support that feature level and thus has most likely the worst case cost.
21/// Some examples of other technologies/CPUs:
22/// SSE 3 - Pentium4 / Athlon64
23/// SSE 4.1 - Penryn
24/// SSE 4.2 - Nehalem
25/// AVX - Sandy Bridge
26/// AVX2 - Haswell
27/// AVX-512 - Xeon Phi / Skylake
28/// And some examples of instruction target dependent costs (latency)
29/// divss sqrtss rsqrtss
30/// AMD K7 11-16 19 3
31/// Piledriver 9-24 13-15 5
32/// Jaguar 14 16 2
33/// Pentium II,III 18 30 2
34/// Nehalem 7-14 7-18 3
35/// Haswell 10-13 11 5
36/// TODO: Develop and implement the target dependent cost model and
37/// specialize cost numbers for different Cost Model Targets such as throughput,
38/// code size, latency and uop count.
39//===----------------------------------------------------------------------===//
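// A minimal sketch of how a client pass might query the cost model implemented
// below through the generic TargetTransformInfo interface (it assumes a
// Function &F and a FunctionAnalysisManager &FAM are available in the caller;
// exact signatures may differ slightly between LLVM versions):
//
//   const TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
//   Type *V4F32 = FixedVectorType::get(Type::getFloatTy(F.getContext()), 4);
//   InstructionCost Cost = TTI.getArithmeticInstrCost(
//       Instruction::FDiv, V4F32, TargetTransformInfo::TCK_RecipThroughput);
//
// On an AVX2 (Haswell-class) subtarget this query should hit the AVX2CostTable
// below and report a reciprocal-throughput cost of 7 for the v4f32 fdiv.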
40
41#include "X86TargetTransformInfo.h"
42#include "llvm/Analysis/TargetTransformInfo.h"
43#include "llvm/CodeGen/BasicTTIImpl.h"
44#include "llvm/CodeGen/CostTable.h"
45#include "llvm/CodeGen/TargetLowering.h"
46#include "llvm/IR/InstIterator.h"
47#include "llvm/IR/IntrinsicInst.h"
48#include "llvm/Support/Debug.h"
49
50using namespace llvm;
51
52#define DEBUG_TYPE "x86tti"
53
54//===----------------------------------------------------------------------===//
55//
56// X86 cost model.
57//
58//===----------------------------------------------------------------------===//
59
60TargetTransformInfo::PopcntSupportKind
61X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
62 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
63 // TODO: Currently the __builtin_popcount() implementation using SSE3
64 // instructions is inefficient. Once the problem is fixed, we should
65 // call ST->hasSSE3() instead of ST->hasPOPCNT().
66 return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
67}
68
69llvm::Optional<unsigned> X86TTIImpl::getCacheSize(
70 TargetTransformInfo::CacheLevel Level) const {
71 switch (Level) {
72 case TargetTransformInfo::CacheLevel::L1D:
73 // - Penryn
74 // - Nehalem
75 // - Westmere
76 // - Sandy Bridge
77 // - Ivy Bridge
78 // - Haswell
79 // - Broadwell
80 // - Skylake
81 // - Kabylake
82 return 32 * 1024; // 32 KByte
83 case TargetTransformInfo::CacheLevel::L2D:
84 // - Penryn
85 // - Nehalem
86 // - Westmere
87 // - Sandy Bridge
88 // - Ivy Bridge
89 // - Haswell
90 // - Broadwell
91 // - Skylake
92 // - Kabylake
93 return 256 * 1024; // 256 KByte
94 }
95
96 llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
97}
98
99llvm::Optional<unsigned> X86TTIImpl::getCacheAssociativity(
100 TargetTransformInfo::CacheLevel Level) const {
101 // - Penryn
102 // - Nehalem
103 // - Westmere
104 // - Sandy Bridge
105 // - Ivy Bridge
106 // - Haswell
107 // - Broadwell
108 // - Skylake
109 // - Kabylake
110 switch (Level) {
111 case TargetTransformInfo::CacheLevel::L1D:
112 [[fallthrough]];
113 case TargetTransformInfo::CacheLevel::L2D:
114 return 8;
115 }
116
117 llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
118}
119
120unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
121 bool Vector = (ClassID == 1);
122 if (Vector && !ST->hasSSE1())
123 return 0;
124
125 if (ST->is64Bit()) {
126 if (Vector && ST->hasAVX512())
127 return 32;
128 return 16;
129 }
130 return 8;
131}
132
133TypeSize
134X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
135 unsigned PreferVectorWidth = ST->getPreferVectorWidth();
136 switch (K) {
137 case TargetTransformInfo::RGK_Scalar:
138 return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
139 case TargetTransformInfo::RGK_FixedWidthVector:
140 if (ST->hasAVX512() && PreferVectorWidth >= 512)
141 return TypeSize::getFixed(512);
142 if (ST->hasAVX() && PreferVectorWidth >= 256)
143 return TypeSize::getFixed(256);
144 if (ST->hasSSE1() && PreferVectorWidth >= 128)
145 return TypeSize::getFixed(128);
146 return TypeSize::getFixed(0);
147 case TargetTransformInfo::RGK_ScalableVector:
148 return TypeSize::getScalable(0);
149 }
150
151 llvm_unreachable("Unsupported register kind");
152}
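// For example, on a subtarget that has AVX-512 but whose preferred vector
// width is tuned down to 256, the RGK_FixedWidthVector query above returns
// 256 bits, so the vectorizers will default to 256-bit vectors even though
// 512-bit registers are available.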
153
154unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
155 return getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
156 .getFixedSize();
157}
158
159unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
160 // If the loop will not be vectorized, don't interleave the loop.
161 // Let the regular unroller unroll the loop, which saves the overflow
162 // check and memory check cost.
163 if (VF == 1)
164 return 1;
165
166 if (ST->isAtom())
167 return 1;
168
169 // Sandybridge and Haswell have multiple execution ports and pipelined
170 // vector units.
171 if (ST->hasAVX())
172 return 4;
173
174 return 2;
175}
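// For example, an AVX-capable (non-Atom) core reports an interleave factor of
// 4 here, so the loop vectorizer may run up to four interleaved copies of a
// vectorized loop body to keep the multiple vector execution ports busy.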
176
177InstructionCost X86TTIImpl::getArithmeticInstrCost(
178 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
179 TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
180 TTI::OperandValueProperties Opd1PropInfo,
181 TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
182 const Instruction *CxtI) {
183 // vXi8 multiplications are always promoted to vXi16.
184 if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
185 Ty->getScalarSizeInBits() == 8) {
186 Type *WideVecTy =
187 VectorType::getExtendedElementVectorType(cast<VectorType>(Ty));
188 return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
189 TargetTransformInfo::CastContextHint::None,
190 CostKind) +
191 getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
192 TargetTransformInfo::CastContextHint::None,
193 CostKind) +
194 getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info,
195 Opd1PropInfo, Opd2PropInfo);
196 }
197
198 // Legalize the type.
199 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
200
201 int ISD = TLI->InstructionOpcodeToISD(Opcode);
202 assert(ISD && "Invalid opcode");
203
204 if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() &&
205 LT.second.getScalarType() == MVT::i32) {
206 // Check if the operands can be represented as a smaller datatype.
207 bool Op1Signed = false, Op2Signed = false;
208 unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
209 unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
210 unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
211
212 // If both are representable as i15 and at least one is constant,
213 // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we
214 // can treat this as PMADDWD which has the same costs as a vXi16 multiply.
215 if (OpMinSize <= 15 && !ST->isPMADDWDSlow()) {
216 bool Op1Constant =
217 isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]);
218 bool Op2Constant =
219 isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]);
220 bool Op1Sext = isa<SExtInst>(Args[0]) &&
221 (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41()));
222 bool Op2Sext = isa<SExtInst>(Args[1]) &&
223 (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41()));
224
225 bool IsZeroExtended = !Op1Signed || !Op2Signed;
226 bool IsConstant = Op1Constant || Op2Constant;
227 bool IsSext = Op1Sext || Op2Sext;
228 if (IsConstant || IsZeroExtended || IsSext)
229 LT.second =
230 MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements());
231 }
232 }
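// For example, a v4i32 multiply whose operands are both zero-extended from
// v4i8 has OpMinSize <= 15, so LT.second is rewritten to v8i16 above and the
// multiply is then priced like a vXi16 multiply (the PMADDWD form) by the
// cost tables that follow.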
233
234 // Vector multiply by pow2 will be simplified to shifts.
235 if (ISD == ISD::MUL &&
236 (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
237 Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
238 Opd2PropInfo == TargetTransformInfo::OP_PowerOf2)
239 return getArithmeticInstrCost(Instruction::Shl, Ty, CostKind, Op1Info,
240 Op2Info, TargetTransformInfo::OP_None,
241 TargetTransformInfo::OP_None);
242
243 // On X86, vector signed division by constants power-of-two are
244 // normally expanded to the sequence SRA + SRL + ADD + SRA.
245 // The OperandValue properties may not be the same as that of the previous
246 // operation; conservatively assume OP_None.
247 if ((ISD == ISD::SDIV || ISD == ISD::SREM) &&
248 (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
249 Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
250 Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
251 InstructionCost Cost =
252 2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, Op1Info,
253 Op2Info, TargetTransformInfo::OP_None,
254 TargetTransformInfo::OP_None);
255 Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info,
256 Op2Info, TargetTransformInfo::OP_None,
257 TargetTransformInfo::OP_None);
258 Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind, Op1Info,
259 Op2Info, TargetTransformInfo::OP_None,
260 TargetTransformInfo::OP_None);
261
262 if (ISD == ISD::SREM) {
263 // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
264 Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info,
265 Op2Info);
266 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info,
267 Op2Info);
268 }
269
270 return Cost;
271 }
272
273 // Vector unsigned division/remainder will be simplified to shifts/masks.
274 if ((ISD == ISD::UDIV || ISD == ISD::UREM) &&
275 (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
276 Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
277 Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
278 if (ISD == ISD::UDIV)
279 return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info,
280 Op2Info, TargetTransformInfo::OP_None,
281 TargetTransformInfo::OP_None);
282 // UREM
283 return getArithmeticInstrCost(Instruction::And, Ty, CostKind, Op1Info,
284 Op2Info, TargetTransformInfo::OP_None,
285 TargetTransformInfo::OP_None);
286 }
287
288 // TODO: Handle more cost kinds.
289 if (CostKind != TTI::TCK_RecipThroughput)
290 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
291 Opd1PropInfo, Opd2PropInfo, Args,
292 CxtI);
293
294 static const CostTblEntry GLMCostTable[] = {
295 { ISD::FDIV, MVT::f32, 18 }, // divss
296 { ISD::FDIV, MVT::v4f32, 35 }, // divps
297 { ISD::FDIV, MVT::f64, 33 }, // divsd
298 { ISD::FDIV, MVT::v2f64, 65 }, // divpd
299 };
300
301 if (ST->useGLMDivSqrtCosts())
302 if (const auto *Entry = CostTableLookup(GLMCostTable, ISD,
303 LT.second))
304 return LT.first * Entry->Cost;
305
306 static const CostTblEntry SLMCostTable[] = {
307 { ISD::MUL, MVT::v4i32, 11 }, // pmulld
308 { ISD::MUL, MVT::v8i16, 2 }, // pmullw
309 { ISD::FMUL, MVT::f64, 2 }, // mulsd
310 { ISD::FMUL, MVT::v2f64, 4 }, // mulpd
311 { ISD::FMUL, MVT::v4f32, 2 }, // mulps
312 { ISD::FDIV, MVT::f32, 17 }, // divss
313 { ISD::FDIV, MVT::v4f32, 39 }, // divps
314 { ISD::FDIV, MVT::f64, 32 }, // divsd
315 { ISD::FDIV, MVT::v2f64, 69 }, // divpd
316 { ISD::FADD, MVT::v2f64, 2 }, // addpd
317 { ISD::FSUB, MVT::v2f64, 2 }, // subpd
318 // v2i64/v4i64 mul is custom lowered as a series of long:
319 // multiplies(3), shifts(3) and adds(2)
320 // slm muldq version throughput is 2 and addq throughput 4
321 // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
322 // 2X4 (addq throughput) = 17
323 { ISD::MUL, MVT::v2i64, 17 },
324 // slm addq\subq throughput is 4
325 { ISD::ADD, MVT::v2i64, 4 },
326 { ISD::SUB, MVT::v2i64, 4 },
327 };
328
329 if (ST->useSLMArithCosts()) {
330 if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) {
331 // Check if the operands can be shrunk into a smaller datatype.
332 // TODO: Merge this into generic vXi32 MUL patterns above.
333 bool Op1Signed = false;
334 unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
335 bool Op2Signed = false;
336 unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
337
338 bool SignedMode = Op1Signed || Op2Signed;
339 unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
340
341 if (OpMinSize <= 7)
342 return LT.first * 3; // pmullw/sext
343 if (!SignedMode && OpMinSize <= 8)
344 return LT.first * 3; // pmullw/zext
345 if (OpMinSize <= 15)
346 return LT.first * 5; // pmullw/pmulhw/pshuf
347 if (!SignedMode && OpMinSize <= 16)
348 return LT.first * 5; // pmullw/pmulhw/pshuf
349 }
350
351 if (const auto *Entry = CostTableLookup(SLMCostTable, ISD,
352 LT.second)) {
353 return LT.first * Entry->Cost;
354 }
355 }
356
357 static const CostTblEntry AVX512BWUniformConstCostTable[] = {
358 { ISD::SHL, MVT::v64i8, 2 }, // psllw + pand.
359 { ISD::SRL, MVT::v64i8, 2 }, // psrlw + pand.
360 { ISD::SRA, MVT::v64i8, 4 }, // psrlw, pand, pxor, psubb.
361 };
362
363 if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
364 ST->hasBWI()) {
365 if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD,
366 LT.second))
367 return LT.first * Entry->Cost;
368 }
369
370 static const CostTblEntry AVX512UniformConstCostTable[] = {
371 { ISD::SRA, MVT::v2i64, 1 },
372 { ISD::SRA, MVT::v4i64, 1 },
373 { ISD::SRA, MVT::v8i64, 1 },
374
375 { ISD::SHL, MVT::v64i8, 4 }, // psllw + pand.
376 { ISD::SRL, MVT::v64i8, 4 }, // psrlw + pand.
377 { ISD::SRA, MVT::v64i8, 8 }, // psrlw, pand, pxor, psubb.
378
379 { ISD::SDIV, MVT::v16i32, 6 }, // pmuludq sequence
380 { ISD::SREM, MVT::v16i32, 8 }, // pmuludq+mul+sub sequence
381 { ISD::UDIV, MVT::v16i32, 5 }, // pmuludq sequence
382 { ISD::UREM, MVT::v16i32, 7 }, // pmuludq+mul+sub sequence
383 };
384
385 if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
386 ST->hasAVX512()) {
387 if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD,
388 LT.second))
389 return LT.first * Entry->Cost;
390 }
391
392 static const CostTblEntry AVX2UniformConstCostTable[] = {
393 { ISD::SHL, MVT::v32i8, 2 }, // psllw + pand.
394 { ISD::SRL, MVT::v32i8, 2 }, // psrlw + pand.
395 { ISD::SRA, MVT::v32i8, 4 }, // psrlw, pand, pxor, psubb.
396
397 { ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle.
398
399 { ISD::SDIV, MVT::v8i32, 6 }, // pmuludq sequence
400 { ISD::SREM, MVT::v8i32, 8 }, // pmuludq+mul+sub sequence
401 { ISD::UDIV, MVT::v8i32, 5 }, // pmuludq sequence
402 { ISD::UREM, MVT::v8i32, 7 }, // pmuludq+mul+sub sequence
403 };
404
405 if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
406 ST->hasAVX2()) {
407 if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD,
408 LT.second))
409 return LT.first * Entry->Cost;
410 }
411
412 static const CostTblEntry SSE2UniformConstCostTable[] = {
413 { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand.
414 { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand.
415 { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb.
416
417 { ISD::SHL, MVT::v32i8, 4+2 }, // 2*(psllw + pand) + split.
418 { ISD::SRL, MVT::v32i8, 4+2 }, // 2*(psrlw + pand) + split.
419 { ISD::SRA, MVT::v32i8, 8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.
420
421 { ISD::SDIV, MVT::v8i32, 12+2 }, // 2*pmuludq sequence + split.
422 { ISD::SREM, MVT::v8i32, 16+2 }, // 2*pmuludq+mul+sub sequence + split.
423 { ISD::SDIV, MVT::v4i32, 6 }, // pmuludq sequence
424 { ISD::SREM, MVT::v4i32, 8 }, // pmuludq+mul+sub sequence
425 { ISD::UDIV, MVT::v8i32, 10+2 }, // 2*pmuludq sequence + split.
426 { ISD::UREM, MVT::v8i32, 14+2 }, // 2*pmuludq+mul+sub sequence + split.
427 { ISD::UDIV, MVT::v4i32, 5 }, // pmuludq sequence
428 { ISD::UREM, MVT::v4i32, 7 }, // pmuludq+mul+sub sequence
429 };
430
431 // XOP has faster vXi8 shifts.
432 if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
433 ST->hasSSE2() && !ST->hasXOP()) {
434 if (const auto *Entry =
435 CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
436 return LT.first * Entry->Cost;
437 }
438
439 static const CostTblEntry AVX512BWConstCostTable[] = {
440 { ISD::SDIV, MVT::v64i8, 14 }, // 2*ext+2*pmulhw sequence
441 { ISD::SREM, MVT::v64i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
442 { ISD::UDIV, MVT::v64i8, 14 }, // 2*ext+2*pmulhw sequence
443 { ISD::UREM, MVT::v64i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
444 { ISD::SDIV, MVT::v32i16, 6 }, // vpmulhw sequence
445 { ISD::SREM, MVT::v32i16, 8 }, // vpmulhw+mul+sub sequence
446 { ISD::UDIV, MVT::v32i16, 6 }, // vpmulhuw sequence
447 { ISD::UREM, MVT::v32i16, 8 }, // vpmulhuw+mul+sub sequence
448 };
449
450 if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
451 Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
452 ST->hasBWI()) {
453 if (const auto *Entry =
454 CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
455 return LT.first * Entry->Cost;
456 }
457
458 static const CostTblEntry AVX512ConstCostTable[] = {
459 { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
460 { ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence
461 { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
462 { ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence
463 { ISD::SDIV, MVT::v64i8, 28 }, // 4*ext+4*pmulhw sequence
464 { ISD::SREM, MVT::v64i8, 32 }, // 4*ext+4*pmulhw+mul+sub sequence
465 { ISD::UDIV, MVT::v64i8, 28 }, // 4*ext+4*pmulhw sequence
466 { ISD::UREM, MVT::v64i8, 32 }, // 4*ext+4*pmulhw+mul+sub sequence
467 { ISD::SDIV, MVT::v32i16, 12 }, // 2*vpmulhw sequence
468 { ISD::SREM, MVT::v32i16, 16 }, // 2*vpmulhw+mul+sub sequence
469 { ISD::UDIV, MVT::v32i16, 12 }, // 2*vpmulhuw sequence
470 { ISD::UREM, MVT::v32i16, 16 }, // 2*vpmulhuw+mul+sub sequence
471 };
472
473 if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
474 Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
475 ST->hasAVX512()) {
476 if (const auto *Entry =
477 CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
478 return LT.first * Entry->Cost;
479 }
480
481 static const CostTblEntry AVX2ConstCostTable[] = {
482 { ISD::SDIV, MVT::v32i8, 14 }, // 2*ext+2*pmulhw sequence
483 { ISD::SREM, MVT::v32i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
484 { ISD::UDIV, MVT::v32i8, 14 }, // 2*ext+2*pmulhw sequence
485 { ISD::UREM, MVT::v32i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
486 { ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence
487 { ISD::SREM, MVT::v16i16, 8 }, // vpmulhw+mul+sub sequence
488 { ISD::UDIV, MVT::v16i16, 6 }, // vpmulhuw sequence
489 { ISD::UREM, MVT::v16i16, 8 }, // vpmulhuw+mul+sub sequence
490 { ISD::SDIV, MVT::v8i32, 15 }, // vpmuldq sequence
491 { ISD::SREM, MVT::v8i32, 19 }, // vpmuldq+mul+sub sequence
492 { ISD::UDIV, MVT::v8i32, 15 }, // vpmuludq sequence
493 { ISD::UREM, MVT::v8i32, 19 }, // vpmuludq+mul+sub sequence
494 };
495
496 if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
497 Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
498 ST->hasAVX2()) {
499 if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
500 return LT.first * Entry->Cost;
501 }
502
503 static const CostTblEntry SSE2ConstCostTable[] = {
504 { ISD::SDIV, MVT::v32i8, 28+2 }, // 4*ext+4*pmulhw sequence + split.
505 { ISD::SREM, MVT::v32i8, 32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
506 { ISD::SDIV, MVT::v16i8, 14 }, // 2*ext+2*pmulhw sequence
507 { ISD::SREM, MVT::v16i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
508 { ISD::UDIV, MVT::v32i8, 28+2 }, // 4*ext+4*pmulhw sequence + split.
509 { ISD::UREM, MVT::v32i8, 32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
510 { ISD::UDIV, MVT::v16i8, 14 }, // 2*ext+2*pmulhw sequence
511 { ISD::UREM, MVT::v16i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
512 { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
513 { ISD::SREM, MVT::v16i16, 16+2 }, // 2*pmulhw+mul+sub sequence + split.
514 { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence
515 { ISD::SREM, MVT::v8i16, 8 }, // pmulhw+mul+sub sequence
516 { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split.
517 { ISD::UREM, MVT::v16i16, 16+2 }, // 2*pmulhuw+mul+sub sequence + split.
518 { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence
519 { ISD::UREM, MVT::v8i16, 8 }, // pmulhuw+mul+sub sequence
520 { ISD::SDIV, MVT::v8i32, 38+2 }, // 2*pmuludq sequence + split.
521 { ISD::SREM, MVT::v8i32, 48+2 }, // 2*pmuludq+mul+sub sequence + split.
522 { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence
523 { ISD::SREM, MVT::v4i32, 24 }, // pmuludq+mul+sub sequence
524 { ISD::UDIV, MVT::v8i32, 30+2 }, // 2*pmuludq sequence + split.
525 { ISD::UREM, MVT::v8i32, 40+2 }, // 2*pmuludq+mul+sub sequence + split.
526 { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence
527 { ISD::UREM, MVT::v4i32, 20 }, // pmuludq+mul+sub sequence
528 };
529
530 if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
531 Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
532 ST->hasSSE2()) {
533 // pmuldq sequence.
534 if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
535 return LT.first * 32;
536 if (ISD == ISD::SREM && LT.second == MVT::v8i32 && ST->hasAVX())
537 return LT.first * 38;
538 if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
539 return LT.first * 15;
540 if (ISD == ISD::SREM && LT.second == MVT::v4i32 && ST->hasSSE41())
541 return LT.first * 20;
542
543 if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
544 return LT.first * Entry->Cost;
545 }
546
547 static const CostTblEntry AVX512BWShiftCostTable[] = {
548 { ISD::SHL, MVT::v16i8, 4 }, // extend/vpsllvw/pack sequence.
549 { ISD::SRL, MVT::v16i8, 4 }, // extend/vpsrlvw/pack sequence.
550 { ISD::SRA, MVT::v16i8, 4 }, // extend/vpsravw/pack sequence.
551 { ISD::SHL, MVT::v32i8, 4 }, // extend/vpsllvw/pack sequence.
552 { ISD::SRL, MVT::v32i8, 4 }, // extend/vpsrlvw/pack sequence.
553 { ISD::SRA, MVT::v32i8, 6 }, // extend/vpsravw/pack sequence.
554 { ISD::SHL, MVT::v64i8, 6 }, // extend/vpsllvw/pack sequence.
555 { ISD::SRL, MVT::v64i8, 7 }, // extend/vpsrlvw/pack sequence.
556 { ISD::SRA, MVT::v64i8, 15 }, // extend/vpsravw/pack sequence.
557
558 { ISD::SHL, MVT::v8i16, 1 }, // vpsllvw
559 { ISD::SRL, MVT::v8i16, 1 }, // vpsrlvw
560 { ISD::SRA, MVT::v8i16, 1 }, // vpsravw
561 { ISD::SHL, MVT::v16i16, 1 }, // vpsllvw
562 { ISD::SRL, MVT::v16i16, 1 }, // vpsrlvw
563 { ISD::SRA, MVT::v16i16, 1 }, // vpsravw
564 { ISD::SHL, MVT::v32i16, 1 }, // vpsllvw
565 { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw
566 { ISD::SRA, MVT::v32i16, 1 }, // vpsravw
567 };
568
569 if (ST->hasBWI())
570 if (const auto *Entry = CostTableLookup(AVX512BWShiftCostTable, ISD, LT.second))
571 return LT.first * Entry->Cost;
572
573 static const CostTblEntry AVX2UniformCostTable[] = {
574 // Uniform splats are cheaper for the following instructions.
575 { ISD::SHL, MVT::v16i16, 1 }, // psllw.
576 { ISD::SRL, MVT::v16i16, 1 }, // psrlw.
577 { ISD::SRA, MVT::v16i16, 1 }, // psraw.
578 { ISD::SHL, MVT::v32i16, 2 }, // 2*psllw.
579 { ISD::SRL, MVT::v32i16, 2 }, // 2*psrlw.
580 { ISD::SRA, MVT::v32i16, 2 }, // 2*psraw.
581
582 { ISD::SHL, MVT::v8i32, 1 }, // pslld
583 { ISD::SRL, MVT::v8i32, 1 }, // psrld
584 { ISD::SRA, MVT::v8i32, 1 }, // psrad
585 { ISD::SHL, MVT::v4i64, 1 }, // psllq
586 { ISD::SRL, MVT::v4i64, 1 }, // psrlq
587 };
588
589 if (ST->hasAVX2() &&
590 ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
591 (Op2Info == TargetTransformInfo::OK_UniformValue))) {
592 if (const auto *Entry =
593 CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
594 return LT.first * Entry->Cost;
595 }
596
597 static const CostTblEntry SSE2UniformCostTable[] = {
598 // Uniform splats are cheaper for the following instructions.
599 { ISD::SHL, MVT::v8i16, 1 }, // psllw.
600 { ISD::SHL, MVT::v4i32, 1 }, // pslld
601 { ISD::SHL, MVT::v2i64, 1 }, // psllq.
602
603 { ISD::SRL, MVT::v8i16, 1 }, // psrlw.
604 { ISD::SRL, MVT::v4i32, 1 }, // psrld.
605 { ISD::SRL, MVT::v2i64, 1 }, // psrlq.
606
607 { ISD::SRA, MVT::v8i16, 1 }, // psraw.
608 { ISD::SRA, MVT::v4i32, 1 }, // psrad.
609 };
610
611 if (ST->hasSSE2() &&
612 ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
613 (Op2Info == TargetTransformInfo::OK_UniformValue))) {
614 if (const auto *Entry =
615 CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
616 return LT.first * Entry->Cost;
617 }
618
619 static const CostTblEntry AVX512DQCostTable[] = {
620 { ISD::MUL, MVT::v2i64, 2 }, // pmullq
621 { ISD::MUL, MVT::v4i64, 2 }, // pmullq
622 { ISD::MUL, MVT::v8i64, 2 } // pmullq
623 };
624
625 // Look for AVX512DQ lowering tricks for custom cases.
626 if (ST->hasDQI())
627 if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
628 return LT.first * Entry->Cost;
629
630 static const CostTblEntry AVX512BWCostTable[] = {
631 { ISD::SHL, MVT::v64i8, 11 }, // vpblendvb sequence.
632 { ISD::SRL, MVT::v64i8, 11 }, // vpblendvb sequence.
633 { ISD::SRA, MVT::v64i8, 24 }, // vpblendvb sequence.
634 };
635
636 // Look for AVX512BW lowering tricks for custom cases.
637 if (ST->hasBWI())
638 if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
639 return LT.first * Entry->Cost;
640
641 static const CostTblEntry AVX512CostTable[] = {
642 { ISD::SHL, MVT::v4i32, 1 },
643 { ISD::SRL, MVT::v4i32, 1 },
644 { ISD::SRA, MVT::v4i32, 1 },
645 { ISD::SHL, MVT::v8i32, 1 },
646 { ISD::SRL, MVT::v8i32, 1 },
647 { ISD::SRA, MVT::v8i32, 1 },
648 { ISD::SHL, MVT::v16i32, 1 },
649 { ISD::SRL, MVT::v16i32, 1 },
650 { ISD::SRA, MVT::v16i32, 1 },
651
652 { ISD::SHL, MVT::v2i64, 1 },
653 { ISD::SRL, MVT::v2i64, 1 },
654 { ISD::SHL, MVT::v4i64, 1 },
655 { ISD::SRL, MVT::v4i64, 1 },
656 { ISD::SHL, MVT::v8i64, 1 },
657 { ISD::SRL, MVT::v8i64, 1 },
658
659 { ISD::SRA, MVT::v2i64, 1 },
660 { ISD::SRA, MVT::v4i64, 1 },
661 { ISD::SRA, MVT::v8i64, 1 },
662
663 { ISD::MUL, MVT::v16i32, 1 }, // pmulld (Skylake from agner.org)
664 { ISD::MUL, MVT::v8i32, 1 }, // pmulld (Skylake from agner.org)
665 { ISD::MUL, MVT::v4i32, 1 }, // pmulld (Skylake from agner.org)
666 { ISD::MUL, MVT::v8i64, 6 }, // 3*pmuludq/3*shift/2*add
667 { ISD::MUL, MVT::i64, 1 }, // Skylake from http://www.agner.org/
668
669 { ISD::FNEG, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
670 { ISD::FADD, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
671 { ISD::FSUB, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
672 { ISD::FMUL, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
673 { ISD::FDIV, MVT::f64, 4 }, // Skylake from http://www.agner.org/
674 { ISD::FDIV, MVT::v2f64, 4 }, // Skylake from http://www.agner.org/
675 { ISD::FDIV, MVT::v4f64, 8 }, // Skylake from http://www.agner.org/
676 { ISD::FDIV, MVT::v8f64, 16 }, // Skylake from http://www.agner.org/
677
678 { ISD::FNEG, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
679 { ISD::FADD, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
680 { ISD::FSUB, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
681 { ISD::FMUL, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
682 { ISD::FDIV, MVT::f32, 3 }, // Skylake from http://www.agner.org/
683 { ISD::FDIV, MVT::v4f32, 3 }, // Skylake from http://www.agner.org/
684 { ISD::FDIV, MVT::v8f32, 5 }, // Skylake from http://www.agner.org/
685 { ISD::FDIV, MVT::v16f32, 10 }, // Skylake from http://www.agner.org/
686 };
687
688 if (ST->hasAVX512())
689 if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
690 return LT.first * Entry->Cost;
691
692 static const CostTblEntry AVX2ShiftCostTable[] = {
693 // Shifts on vXi64/vXi32 on AVX2 are legal even though we declare them
694 // custom in order to detect the cases where the shift amount is a scalar.
695 { ISD::SHL, MVT::v4i32, 2 }, // vpsllvd (Haswell from agner.org)
696 { ISD::SRL, MVT::v4i32, 2 }, // vpsrlvd (Haswell from agner.org)
697 { ISD::SRA, MVT::v4i32, 2 }, // vpsravd (Haswell from agner.org)
698 { ISD::SHL, MVT::v8i32, 2 }, // vpsllvd (Haswell from agner.org)
699 { ISD::SRL, MVT::v8i32, 2 }, // vpsrlvd (Haswell from agner.org)
700 { ISD::SRA, MVT::v8i32, 2 }, // vpsravd (Haswell from agner.org)
701 { ISD::SHL, MVT::v2i64, 1 }, // vpsllvq (Haswell from agner.org)
702 { ISD::SRL, MVT::v2i64, 1 }, // vpsrlvq (Haswell from agner.org)
703 { ISD::SHL, MVT::v4i64, 1 }, // vpsllvq (Haswell from agner.org)
704 { ISD::SRL, MVT::v4i64, 1 }, // vpsrlvq (Haswell from agner.org)
705 };
706
707 if (ST->hasAVX512()) {
708 if (ISD == ISD::SHL && LT.second == MVT::v32i16 &&
709 (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
710 Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
711 // On AVX512, a packed v32i16 shift left by a constant build_vector
712 // is lowered into a vector multiply (vpmullw).
713 return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
714 Op1Info, Op2Info,
715 TargetTransformInfo::OP_None,
716 TargetTransformInfo::OP_None);
717 }
718
719 // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
720 if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
721 if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
722 (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
723 Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
724 // On AVX2, a packed v16i16 shift left by a constant build_vector
725 // is lowered into a vector multiply (vpmullw).
726 return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
727 Op1Info, Op2Info,
728 TargetTransformInfo::OP_None,
729 TargetTransformInfo::OP_None);
730
731 if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
732 return LT.first * Entry->Cost;
733 }
734
735 static const CostTblEntry XOPShiftCostTable[] = {
736 // 128bit shifts take 1cy, but right shifts require negation beforehand.
737 { ISD::SHL, MVT::v16i8, 1 },
738 { ISD::SRL, MVT::v16i8, 2 },
739 { ISD::SRA, MVT::v16i8, 2 },
740 { ISD::SHL, MVT::v8i16, 1 },
741 { ISD::SRL, MVT::v8i16, 2 },
742 { ISD::SRA, MVT::v8i16, 2 },
743 { ISD::SHL, MVT::v4i32, 1 },
744 { ISD::SRL, MVT::v4i32, 2 },
745 { ISD::SRA, MVT::v4i32, 2 },
746 { ISD::SHL, MVT::v2i64, 1 },
747 { ISD::SRL, MVT::v2i64, 2 },
748 { ISD::SRA, MVT::v2i64, 2 },
749 // 256bit shifts require splitting if AVX2 didn't catch them above.
750 { ISD::SHL, MVT::v32i8, 2+2 },
751 { ISD::SRL, MVT::v32i8, 4+2 },
752 { ISD::SRA, MVT::v32i8, 4+2 },
753 { ISD::SHL, MVT::v16i16, 2+2 },
754 { ISD::SRL, MVT::v16i16, 4+2 },
755 { ISD::SRA, MVT::v16i16, 4+2 },
756 { ISD::SHL, MVT::v8i32, 2+2 },
757 { ISD::SRL, MVT::v8i32, 4+2 },
758 { ISD::SRA, MVT::v8i32, 4+2 },
759 { ISD::SHL, MVT::v4i64, 2+2 },
760 { ISD::SRL, MVT::v4i64, 4+2 },
761 { ISD::SRA, MVT::v4i64, 4+2 },
762 };
763
764 // Look for XOP lowering tricks.
765 if (ST->hasXOP()) {
766 // If the right shift is constant then we'll fold the negation so
767 // it's as cheap as a left shift.
768 int ShiftISD = ISD;
769 if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) &&
770 (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
771 Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
772 ShiftISD = ISD::SHL;
773 if (const auto *Entry =
774 CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
775 return LT.first * Entry->Cost;
776 }
777
778 static const CostTblEntry SSE2UniformShiftCostTable[] = {
779 // Uniform splats are cheaper for the following instructions.
780 { ISD::SHL, MVT::v16i16, 2+2 }, // 2*psllw + split.
781 { ISD::SHL, MVT::v8i32, 2+2 }, // 2*pslld + split.
782 { ISD::SHL, MVT::v4i64, 2+2 }, // 2*psllq + split.
783
784 { ISD::SRL, MVT::v16i16, 2+2 }, // 2*psrlw + split.
785 { ISD::SRL, MVT::v8i32, 2+2 }, // 2*psrld + split.
786 { ISD::SRL, MVT::v4i64, 2+2 }, // 2*psrlq + split.
787
788 { ISD::SRA, MVT::v16i16, 2+2 }, // 2*psraw + split.
789 { ISD::SRA, MVT::v8i32, 2+2 }, // 2*psrad + split.
790 { ISD::SRA, MVT::v2i64, 4 }, // 2*psrad + shuffle.
791 { ISD::SRA, MVT::v4i64, 8+2 }, // 2*(2*psrad + shuffle) + split.
792 };
793
794 if (ST->hasSSE2() &&
795 ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
796 (Op2Info == TargetTransformInfo::OK_UniformValue))) {
797
798 // Handle AVX2 uniform v4i64 ISD::SRA, it's not worth a table.
799 if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2())
800 return LT.first * 4; // 2*psrad + shuffle.
801
802 if (const auto *Entry =
803 CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second))
804 return LT.first * Entry->Cost;
805 }
806
807 if (ISD == ISD::SHL &&
808 Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
809 MVT VT = LT.second;
810 // Vector shift left by non uniform constant can be lowered
811 // into vector multiply.
812 if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
813 ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
814 ISD = ISD::MUL;
815 }
816
817 static const CostTblEntry AVX2CostTable[] = {
818 { ISD::SHL, MVT::v16i8, 6 }, // vpblendvb sequence.
819 { ISD::SHL, MVT::v32i8, 6 }, // vpblendvb sequence.
820 { ISD::SHL, MVT::v64i8, 12 }, // 2*vpblendvb sequence.
821 { ISD::SHL, MVT::v8i16, 5 }, // extend/vpsrlvd/pack sequence.
822 { ISD::SHL, MVT::v16i16, 7 }, // extend/vpsrlvd/pack sequence.
823 { ISD::SHL, MVT::v32i16, 14 }, // 2*extend/vpsrlvd/pack sequence.
824
825 { ISD::SRL, MVT::v16i8, 6 }, // vpblendvb sequence.
826 { ISD::SRL, MVT::v32i8, 6 }, // vpblendvb sequence.
827 { ISD::SRL, MVT::v64i8, 12 }, // 2*vpblendvb sequence.
828 { ISD::SRL, MVT::v8i16, 5 }, // extend/vpsrlvd/pack sequence.
829 { ISD::SRL, MVT::v16i16, 7 }, // extend/vpsrlvd/pack sequence.
830 { ISD::SRL, MVT::v32i16, 14 }, // 2*extend/vpsrlvd/pack sequence.
831
832 { ISD::SRA, MVT::v16i8, 17 }, // vpblendvb sequence.
833 { ISD::SRA, MVT::v32i8, 17 }, // vpblendvb sequence.
834 { ISD::SRA, MVT::v64i8, 34 }, // 2*vpblendvb sequence.
835 { ISD::SRA, MVT::v8i16, 5 }, // extend/vpsravd/pack sequence.
836 { ISD::SRA, MVT::v16i16, 7 }, // extend/vpsravd/pack sequence.
837 { ISD::SRA, MVT::v32i16, 14 }, // 2*extend/vpsravd/pack sequence.
838 { ISD::SRA, MVT::v2i64, 2 }, // srl/xor/sub sequence.
839 { ISD::SRA, MVT::v4i64, 2 }, // srl/xor/sub sequence.
840
841 { ISD::SUB, MVT::v32i8, 1 }, // psubb
842 { ISD::ADD, MVT::v32i8, 1 }, // paddb
843 { ISD::SUB, MVT::v16i16, 1 }, // psubw
844 { ISD::ADD, MVT::v16i16, 1 }, // paddw
845 { ISD::SUB, MVT::v8i32, 1 }, // psubd
846 { ISD::ADD, MVT::v8i32, 1 }, // paddd
847 { ISD::SUB, MVT::v4i64, 1 }, // psubq
848 { ISD::ADD, MVT::v4i64, 1 }, // paddq
849
850 { ISD::MUL, MVT::v16i16, 1 }, // pmullw
851 { ISD::MUL, MVT::v8i32, 2 }, // pmulld (Haswell from agner.org)
852 { ISD::MUL, MVT::v4i64, 6 }, // 3*pmuludq/3*shift/2*add
853
854 { ISD::FNEG, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
855 { ISD::FNEG, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
856 { ISD::FADD, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
857 { ISD::FADD, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
858 { ISD::FSUB, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
859 { ISD::FSUB, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
860 { ISD::FMUL, MVT::f64, 1 }, // Haswell from http://www.agner.org/
861 { ISD::FMUL, MVT::v2f64, 1 }, // Haswell from http://www.agner.org/
862 { ISD::FMUL, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
863 { ISD::FMUL, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
864
865 { ISD::FDIV, MVT::f32, 7 }, // Haswell from http://www.agner.org/
866 { ISD::FDIV, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
867 { ISD::FDIV, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
868 { ISD::FDIV, MVT::f64, 14 }, // Haswell from http://www.agner.org/
869 { ISD::FDIV, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
870 { ISD::FDIV, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
871 };
872
873 // Look for AVX2 lowering tricks for custom cases.
874 if (ST->hasAVX2())
875 if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
876 return LT.first * Entry->Cost;
877
878 static const CostTblEntry AVX1CostTable[] = {
879 // We don't have to scalarize unsupported ops. We can issue two half-sized
880 // operations and we only need to extract the upper YMM half.
881 // Two ops + 1 extract + 1 insert = 4.
882 { ISD::MUL, MVT::v16i16, 4 },
883 { ISD::MUL, MVT::v8i32, 5 }, // BTVER2 from http://www.agner.org/
884 { ISD::MUL, MVT::v4i64, 12 },
885
886 { ISD::SUB, MVT::v32i8, 4 },
887 { ISD::ADD, MVT::v32i8, 4 },
888 { ISD::SUB, MVT::v16i16, 4 },
889 { ISD::ADD, MVT::v16i16, 4 },
890 { ISD::SUB, MVT::v8i32, 4 },
891 { ISD::ADD, MVT::v8i32, 4 },
892 { ISD::SUB, MVT::v4i64, 4 },
893 { ISD::ADD, MVT::v4i64, 4 },
894
895 { ISD::SHL, MVT::v32i8, 22 }, // pblendvb sequence + split.
896 { ISD::SHL, MVT::v8i16, 6 }, // pblendvb sequence.
897 { ISD::SHL, MVT::v16i16, 13 }, // pblendvb sequence + split.
898 { ISD::SHL, MVT::v4i32, 3 }, // pslld/paddd/cvttps2dq/pmulld
899 { ISD::SHL, MVT::v8i32, 9 }, // pslld/paddd/cvttps2dq/pmulld + split
900 { ISD::SHL, MVT::v2i64, 2 }, // Shift each lane + blend.
901 { ISD::SHL, MVT::v4i64, 6 }, // Shift each lane + blend + split.
902
903 { ISD::SRL, MVT::v32i8, 23 }, // pblendvb sequence + split.
904 { ISD::SRL, MVT::v16i16, 28 }, // pblendvb sequence + split.
905 { ISD::SRL, MVT::v4i32, 6 }, // Shift each lane + blend.
906 { ISD::SRL, MVT::v8i32, 14 }, // Shift each lane + blend + split.
907 { ISD::SRL, MVT::v2i64, 2 }, // Shift each lane + blend.
908 { ISD::SRL, MVT::v4i64, 6 }, // Shift each lane + blend + split.
909
910 { ISD::SRA, MVT::v32i8, 44 }, // pblendvb sequence + split.
911 { ISD::SRA, MVT::v16i16, 28 }, // pblendvb sequence + split.
912 { ISD::SRA, MVT::v4i32, 6 }, // Shift each lane + blend.
913 { ISD::SRA, MVT::v8i32, 14 }, // Shift each lane + blend + split.
914 { ISD::SRA, MVT::v2i64, 5 }, // Shift each lane + blend.
915 { ISD::SRA, MVT::v4i64, 12 }, // Shift each lane + blend + split.
916
917 { ISD::FNEG, MVT::v4f64, 2 }, // BTVER2 from http://www.agner.org/
918 { ISD::FNEG, MVT::v8f32, 2 }, // BTVER2 from http://www.agner.org/
919
920 { ISD::FMUL, MVT::f64, 2 }, // BTVER2 from http://www.agner.org/
921 { ISD::FMUL, MVT::v2f64, 2 }, // BTVER2 from http://www.agner.org/
922 { ISD::FMUL, MVT::v4f64, 4 }, // BTVER2 from http://www.agner.org/
923
924 { ISD::FDIV, MVT::f32, 14 }, // SNB from http://www.agner.org/
925 { ISD::FDIV, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
926 { ISD::FDIV, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
927 { ISD::FDIV, MVT::f64, 22 }, // SNB from http://www.agner.org/
928 { ISD::FDIV, MVT::v2f64, 22 }, // SNB from http://www.agner.org/
929 { ISD::FDIV, MVT::v4f64, 44 }, // SNB from http://www.agner.org/
930 };
931
932 if (ST->hasAVX())
933 if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
934 return LT.first * Entry->Cost;
935
936 static const CostTblEntry SSE42CostTable[] = {
937 { ISD::FADD, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
938 { ISD::FADD, MVT::f32, 1 }, // Nehalem from http://www.agner.org/
939 { ISD::FADD, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
940 { ISD::FADD, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
941
942 { ISD::FSUB, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
943 { ISD::FSUB, MVT::f32 , 1 }, // Nehalem from http://www.agner.org/
944 { ISD::FSUB, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
945 { ISD::FSUB, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
946
947 { ISD::FMUL, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
948 { ISD::FMUL, MVT::f32, 1 }, // Nehalem from http://www.agner.org/
949 { ISD::FMUL, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
950 { ISD::FMUL, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
951
952 { ISD::FDIV, MVT::f32, 14 }, // Nehalem from http://www.agner.org/
953 { ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
954 { ISD::FDIV, MVT::f64, 22 }, // Nehalem from http://www.agner.org/
955 { ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/
956
957 { ISD::MUL, MVT::v2i64, 6 } // 3*pmuludq/3*shift/2*add
958 };
959
960 if (ST->hasSSE42())
961 if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
962 return LT.first * Entry->Cost;
963
964 static const CostTblEntry SSE41CostTable[] = {
965 { ISD::SHL, MVT::v16i8, 10 }, // pblendvb sequence.
966 { ISD::SHL, MVT::v8i16, 11 }, // pblendvb sequence.
967 { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld
968
969 { ISD::SRL, MVT::v16i8, 11 }, // pblendvb sequence.
970 { ISD::SRL, MVT::v8i16, 13 }, // pblendvb sequence.
971 { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend.
972
973 { ISD::SRA, MVT::v16i8, 21 }, // pblendvb sequence.
974 { ISD::SRA, MVT::v8i16, 13 }, // pblendvb sequence.
975
976 { ISD::MUL, MVT::v4i32, 2 } // pmulld (Nehalem from agner.org)
977 };
978
979 if (ST->hasSSE41())
980 if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
981 return LT.first * Entry->Cost;
982
983 static const CostTblEntry SSE2CostTable[] = {
984 // We don't correctly identify costs of casts because they are marked as
985 // custom.
986 { ISD::SHL, MVT::v16i8, 13 }, // cmpgtb sequence.
987 { ISD::SHL, MVT::v8i16, 25 }, // cmpgtw sequence.
988 { ISD::SHL, MVT::v4i32, 16 }, // pslld/paddd/cvttps2dq/pmuludq.
989 { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence.
990
991 { ISD::SRL, MVT::v16i8, 14 }, // cmpgtb sequence.
992 { ISD::SRL, MVT::v8i16, 16 }, // cmpgtw sequence.
993 { ISD::SRL, MVT::v4i32, 12 }, // Shift each lane + blend.
994 { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence.
995
996 { ISD::SRA, MVT::v16i8, 27 }, // unpacked cmpgtb sequence.
997 { ISD::SRA, MVT::v8i16, 16 }, // cmpgtw sequence.
998 { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend.
999 { ISD::SRA, MVT::v2i64, 8 }, // srl/xor/sub splat+shuffle sequence.
1000
1001 { ISD::MUL, MVT::v8i16, 1 }, // pmullw
1002 { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle
1003 { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add
1004
1005 { ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/
1006 { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/
1007 { ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/
1008 { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/
1009
1010 { ISD::FNEG, MVT::f32, 1 }, // Pentium IV from http://www.agner.org/
1011 { ISD::FNEG, MVT::f64, 1 }, // Pentium IV from http://www.agner.org/
1012 { ISD::FNEG, MVT::v4f32, 1 }, // Pentium IV from http://www.agner.org/
1013 { ISD::FNEG, MVT::v2f64, 1 }, // Pentium IV from http://www.agner.org/
1014
1015 { ISD::FADD, MVT::f32, 2 }, // Pentium IV from http://www.agner.org/
1016 { ISD::FADD, MVT::f64, 2 }, // Pentium IV from http://www.agner.org/
1017
1018 { ISD::FSUB, MVT::f32, 2 }, // Pentium IV from http://www.agner.org/
1019 { ISD::FSUB, MVT::f64, 2 }, // Pentium IV from http://www.agner.org/
1020 };
1021
1022 if (ST->hasSSE2())
1023 if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
1024 return LT.first * Entry->Cost;
1025
1026 static const CostTblEntry SSE1CostTable[] = {
1027 { ISD::FDIV, MVT::f32, 17 }, // Pentium III from http://www.agner.org/
1028 { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/
1029
1030 { ISD::FNEG, MVT::f32, 2 }, // Pentium III from http://www.agner.org/
1031 { ISD::FNEG, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/
1032
1033 { ISD::FADD, MVT::f32, 1 }, // Pentium III from http://www.agner.org/
1034 { ISD::FADD, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/
1035
1036 { ISD::FSUB, MVT::f32, 1 }, // Pentium III from http://www.agner.org/
1037 { ISD::FSUB, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/
1038 };
1039
1040 if (ST->hasSSE1())
1041 if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
1042 return LT.first * Entry->Cost;
1043
1044 static const CostTblEntry X64CostTbl[] = { // 64-bit targets
1045 { ISD::ADD, MVT::i64, 1 }, // Core (Merom) from http://www.agner.org/
1046 { ISD::SUB, MVT::i64, 1 }, // Core (Merom) from http://www.agner.org/
1047 { ISD::MUL, MVT::i64, 2 }, // Nehalem from http://www.agner.org/
1048 };
1049
1050 if (ST->is64Bit())
1051 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
1052 return LT.first * Entry->Cost;
1053
1054 static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
1055 { ISD::ADD, MVT::i8, 1 }, // Pentium III from http://www.agner.org/
1056 { ISD::ADD, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
1057 { ISD::ADD, MVT::i32, 1 }, // Pentium III from http://www.agner.org/
1058
1059 { ISD::SUB, MVT::i8, 1 }, // Pentium III from http://www.agner.org/
1060 { ISD::SUB, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
1061 { ISD::SUB, MVT::i32, 1 }, // Pentium III from http://www.agner.org/
1062 };
1063
1064 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
1065 return LT.first * Entry->Cost;
1066
1067 // It is not a good idea to vectorize division. We have to scalarize it and
1068 // in the process we will often end up having to spill regular
1069 // registers. The overhead of division is going to dominate most kernels
1070 // anyway, so try hard to prevent vectorization of division - it is
1071 // generally a bad idea. Assume somewhat arbitrarily that we have to be able
1072 // to hide "20 cycles" for each lane.
1073 if (LT.second.isVector() && (ISD == ISD::SDIV || ISD == ISD::SREM ||
1074 ISD == ISD::UDIV || ISD == ISD::UREM)) {
1075 InstructionCost ScalarCost = getArithmeticInstrCost(
1076 Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info,
1077 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
1078 return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
1079 }
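// Worked example of the formula above: a v4i32 sdiv by a non-constant divisor
// legalizes to a single v4i32 register (LT.first == 1), so the returned cost
// is 20 * 1 * 4 * ScalarCost, i.e. 80 if the scalar i32 sdiv were costed at 1.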
1080
1081 // Fallback to the default implementation.
1082 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1083 Opd1PropInfo, Opd2PropInfo, Args, CxtI);
1084}
1085
1086InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1087 VectorType *BaseTp,
1088 ArrayRef<int> Mask, int Index,
1089 VectorType *SubTp,
1090 ArrayRef<const Value *> Args) {
1091 // 64-bit packed float vectors (v2f32) are widened to type v4f32.
1092 // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
1093 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(BaseTp);
1094
1095 Kind = improveShuffleKindFromMask(Kind, Mask);
1096 // Treat Transpose as 2-op shuffles - there's no difference in lowering.
1097 if (Kind == TTI::SK_Transpose)
1098 Kind = TTI::SK_PermuteTwoSrc;
1099
1100 // For Broadcasts we are splatting the first element from the first input
1101 // register, so we only need to reference that input and all the output
1102 // registers are the same.
1103 if (Kind == TTI::SK_Broadcast)
1104 LT.first = 1;
1105
1106 // Subvector extractions are free if they start at the beginning of a
1107 // vector and cheap if the subvectors are aligned.
1108 if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
1109 int NumElts = LT.second.getVectorNumElements();
1110 if ((Index % NumElts) == 0)
1111 return 0;
1112 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1113 if (SubLT.second.isVector()) {
1114 int NumSubElts = SubLT.second.getVectorNumElements();
1115 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1116 return SubLT.first;
1117 // Handle some cases for widening legalization. For now we only handle
1118 // cases where the original subvector was naturally aligned and evenly
1119 // fit in its legalized subvector type.
1120 // FIXME: Remove some of the alignment restrictions.
1121 // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
1122 // vectors.
1123 int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
1124 if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
1125 (NumSubElts % OrigSubElts) == 0 &&
1126 LT.second.getVectorElementType() ==
1127 SubLT.second.getVectorElementType() &&
1128 LT.second.getVectorElementType().getSizeInBits() ==
1129 BaseTp->getElementType()->getPrimitiveSizeInBits()) {
1130 assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
1131        "Unexpected number of elements!");
1132 auto *VecTy = FixedVectorType::get(BaseTp->getElementType(),
1133 LT.second.getVectorNumElements());
1134 auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
1135 SubLT.second.getVectorNumElements());
1136 int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
1137 InstructionCost ExtractCost = getShuffleCost(
1138 TTI::SK_ExtractSubvector, VecTy, None, ExtractIndex, SubTy);
1139
1140 // If the original size is 32-bits or more, we can use pshufd. Otherwise
1141 // if we have SSSE3 we can use pshufb.
1142 if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
1143 return ExtractCost + 1; // pshufd or pshufb
1144
1145 assert(SubTp->getPrimitiveSizeInBits() == 16 &&
1146        "Unexpected vector size");
1147
1148 return ExtractCost + 2; // worst case pshufhw + pshufd
1149 }
1150 }
1151 }
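// For example, under the rules above extracting a subvector that starts at
// element 0 of the legalized vector is free (cost 0), while extracting an
// aligned upper half, such as the high v4f32 of a legal v8f32 at Index 4,
// costs just SubLT.first (one extract per legalized subvector register).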
1152
1153 // Subvector insertions are cheap if the subvectors are aligned.
1154 // Note that in general, the insertion starting at the beginning of a vector
1155 // isn't free, because we need to preserve the rest of the wide vector.
1156 if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
1157 int NumElts = LT.second.getVectorNumElements();
1158 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1159 if (SubLT.second.isVector()) {
1160 int NumSubElts = SubLT.second.getVectorNumElements();
1161 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1162 return SubLT.first;
1163 }
1164
1165 // If the insertion isn't aligned, treat it like a 2-op shuffle.
1166 Kind = TTI::SK_PermuteTwoSrc;
1167 }
1168
1169 // Handle some common (illegal) sub-vector types as they are often very cheap
1170 // to shuffle even on targets without PSHUFB.
1171 EVT VT = TLI->getValueType(DL, BaseTp);
1172 if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
1173 !ST->hasSSSE3()) {
1174 static const CostTblEntry SSE2SubVectorShuffleTbl[] = {
1175 {TTI::SK_Broadcast, MVT::v4i16, 1}, // pshuflw
1176 {TTI::SK_Broadcast, MVT::v2i16, 1}, // pshuflw
1177 {TTI::SK_Broadcast, MVT::v8i8, 2}, // punpck/pshuflw
1178 {TTI::SK_Broadcast, MVT::v4i8, 2}, // punpck/pshuflw
1179 {TTI::SK_Broadcast, MVT::v2i8, 1}, // punpck
1180
1181 {TTI::SK_Reverse, MVT::v4i16, 1}, // pshuflw
1182 {TTI::SK_Reverse, MVT::v2i16, 1}, // pshuflw
1183 {TTI::SK_Reverse, MVT::v4i8, 3}, // punpck/pshuflw/packus
1184 {TTI::SK_Reverse, MVT::v2i8, 1}, // punpck
1185
1186 {TTI::SK_PermuteTwoSrc, MVT::v4i16, 2}, // punpck/pshuflw
1187 {TTI::SK_PermuteTwoSrc, MVT::v2i16, 2}, // punpck/pshuflw
1188 {TTI::SK_PermuteTwoSrc, MVT::v8i8, 7}, // punpck/pshuflw
1189 {TTI::SK_PermuteTwoSrc, MVT::v4i8, 4}, // punpck/pshuflw
1190 {TTI::SK_PermuteTwoSrc, MVT::v2i8, 2}, // punpck
1191
1192 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw
1193 {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw
1194 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 5}, // punpck/pshuflw
1195 {TTI::SK_PermuteSingleSrc, MVT::v4i8, 3}, // punpck/pshuflw
1196 {TTI::SK_PermuteSingleSrc, MVT::v2i8, 1}, // punpck
1197 };
1198
1199 if (ST->hasSSE2())
1200 if (const auto *Entry =
1201 CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
1202 return Entry->Cost;
1203 }
1204
1205 // We are going to permute multiple sources and the result will be in multiple
1206 // destinations. We provide an accurate cost only for splits where the element
1207 // type remains the same.
1208 if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
1209 MVT LegalVT = LT.second;
1210 if (LegalVT.isVector() &&
1211 LegalVT.getVectorElementType().getSizeInBits() ==
1212 BaseTp->getElementType()->getPrimitiveSizeInBits() &&
1213 LegalVT.getVectorNumElements() <
1214 cast<FixedVectorType>(BaseTp)->getNumElements()) {
1215
1216 unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
1217 unsigned LegalVTSize = LegalVT.getStoreSize();
1218 // Number of source vectors after legalization:
1219 unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
1220 // Number of destination vectors after legalization:
1221 InstructionCost NumOfDests = LT.first;
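// Illustrative example: a v16i32 shuffle on an AVX2 target legalizes to v8i32
// registers, so VecTySize = 64 bytes, LegalVTSize = 32 bytes, NumOfSrcs = 2
// and NumOfDests = LT.first = 2.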
1222
1223 auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
1224 LegalVT.getVectorNumElements());
1225
1226 if (!Mask.empty() && NumOfDests.isValid()) {
1227 // Try to perform better estimation of the permutation.
1228 // 1. Split the source/destination vectors into real registers.
1229 // 2. Do the mask analysis to identify which real registers are
1230 // permuted. If more than 1 source registers are used for the
1231 // destination register building, the cost for this destination register
1232 // is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one
1233 // source register is used, build mask and calculate the cost as a cost
1234 // of PermuteSingleSrc.
1235 // Also, for the single register permute we try to identify if the
1236 // destination register is just a copy of the source register or the
1237 // copy of the previous destination register (the cost is
1238 // TTI::TCC_Basic). If the source register is just reused, the cost for
1239 // this operation is 0.
1240 unsigned E = *NumOfDests.getValue();
1241 unsigned NormalizedVF =
1242 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
1243 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
1244 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
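// Continuing the illustrative v16i32-on-AVX2 example: E = 2 and
// NormalizedVF = 8 * max(2, 2) = 16, so NumOfSrcRegs = NumOfDestRegs = 2 and
// the mask is analyzed in two 8-element per-register chunks.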
1245 SmallVector<int> NormalizedMask(NormalizedVF, UndefMaskElem);
1246 copy(Mask, NormalizedMask.begin());
1247 unsigned PrevSrcReg = 0;
1248 ArrayRef<int> PrevRegMask;
1249 InstructionCost Cost = 0;
1250 processShuffleMasks(
1251 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
1252 [this, SingleOpTy, &PrevSrcReg, &PrevRegMask,
1253 &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
1254 if (!ShuffleVectorInst::isIdentityMask(RegMask)) {
1255 // Check if the previous register can be just copied to the next
1256 // one.
1257 if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
1258 PrevRegMask != RegMask)
1259 Cost += getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
1260 RegMask, 0, nullptr);
1261 else
1262 // Just a copy of previous destination register.
1263 Cost += TTI::TCC_Basic;
1264 return;
1265 }
1266 if (SrcReg != DestReg &&
1267 any_of(RegMask, [](int I) { return I != UndefMaskElem; })) {
1268 // Just a copy of the source register.
1269 Cost += TTI::TCC_Basic;
1270 }
1271 PrevSrcReg = SrcReg;
1272 PrevRegMask = RegMask;
1273 },
1274 [this, SingleOpTy, &Cost](ArrayRef<int> RegMask,
1275 unsigned /*Unused*/,
1276 unsigned /*Unused*/) {
1277 Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask,
1278 0, nullptr);
1279 });
1280 return Cost;
1281 }
1282
1283 InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
1284 return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
1285 None, 0, nullptr);
1286 }
1287
1288 return BaseT::getShuffleCost(Kind, BaseTp, Mask, Index, SubTp);
1289 }
1290
1291 // For 2-input shuffles, we must account for splitting the 2 inputs into many.
1292 if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
1293 // We assume that source and destination have the same vector type.
1294 InstructionCost NumOfDests = LT.first;
1295 InstructionCost NumOfShufflesPerDest = LT.first * 2 - 1;
1296 LT.first = NumOfDests * NumOfShufflesPerDest;
1297 }
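// Illustrative example: a two-source shuffle that legalizes with LT.first = 2
// gets NumOfDests = 2 and NumOfShufflesPerDest = 2 * 2 - 1 = 3, so the table
// costs below are scaled by LT.first = 6.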
1298
1299 static const CostTblEntry AVX512VBMIShuffleTbl[] = {
1300 {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
1301 {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb
1302
1303 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
1304 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb
1305
1306 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b
1307 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b
1308 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 2} // vpermt2b
1309 };
1310
1311 if (ST->hasVBMI())
1312 if (const auto *Entry =
1313 CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
1314 return LT.first * Entry->Cost;
1315
1316 static const CostTblEntry AVX512BWShuffleTbl[] = {
1317 {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
1318 {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw
1319 {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
1320
1321 {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw
1322 {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw
1323 {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw
1324 {TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2
1325
1326 {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw
1327 {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw
1328 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw
1329 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw
1330 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16
1331
1332 {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w
1333 {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w
1334 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w
1335 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w
1336 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
1337
1338 {TTI::SK_Select, MVT::v32i16, 1}, // vblendmw
1339 {TTI::SK_Select, MVT::v64i8, 1}, // vblendmb
1340 };
1341
1342 if (ST->hasBWI())
1343 if (const auto *Entry =
1344 CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
1345 return LT.first * Entry->Cost;
1346
1347 static const CostTblEntry AVX512ShuffleTbl[] = {
1348 {TTI::SK_Broadcast, MVT::v8f64, 1}, // vbroadcastpd
1349 {TTI::SK_Broadcast, MVT::v16f32, 1}, // vbroadcastps
1350 {TTI::SK_Broadcast, MVT::v8i64, 1}, // vpbroadcastq
1351 {TTI::SK_Broadcast, MVT::v16i32, 1}, // vpbroadcastd
1352 {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
1353 {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw
1354 {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
1355
1356 {TTI::SK_Reverse, MVT::v8f64, 1}, // vpermpd
1357 {TTI::SK_Reverse, MVT::v16f32, 1}, // vpermps
1358 {TTI::SK_Reverse, MVT::v8i64, 1}, // vpermq
1359 {TTI::SK_Reverse, MVT::v16i32, 1}, // vpermd
1360 {TTI::SK_Reverse, MVT::v32i16, 7}, // per mca
1361 {TTI::SK_Reverse, MVT::v32f16, 7}, // per mca
1362 {TTI::SK_Reverse, MVT::v64i8, 7}, // per mca
1363
1364 {TTI::SK_PermuteSingleSrc, MVT::v8f64, 1}, // vpermpd
1365 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
1366 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // vpermpd
1367 {TTI::SK_PermuteSingleSrc, MVT::v16f32, 1}, // vpermps
1368 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
1369 {TTI::SK_PermuteSingleSrc, MVT::v4f32, 1}, // vpermps
1370 {TTI::SK_PermuteSingleSrc, MVT::v8i64, 1}, // vpermq
1371 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
1372 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // vpermq
1373 {TTI::SK_PermuteSingleSrc, MVT::v16i32, 1}, // vpermd
1374 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
1375 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // vpermd
1376 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
1377
1378 {TTI::SK_PermuteTwoSrc, MVT::v8f64, 1}, // vpermt2pd
1379 {TTI::SK_PermuteTwoSrc, MVT::v16f32, 1}, // vpermt2ps
1380 {TTI::SK_PermuteTwoSrc, MVT::v8i64, 1}, // vpermt2q
1381 {TTI::SK_PermuteTwoSrc, MVT::v16i32, 1}, // vpermt2d
1382 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 1}, // vpermt2pd
1383 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 1}, // vpermt2ps
1384 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 1}, // vpermt2q
1385 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 1}, // vpermt2d
1386 {TTI::SK_PermuteTwoSrc, MVT::v2f64, 1}, // vpermt2pd
1387 {TTI::SK_PermuteTwoSrc, MVT::v4f32, 1}, // vpermt2ps
1388 {TTI::SK_PermuteTwoSrc, MVT::v2i64, 1}, // vpermt2q
1389 {TTI::SK_PermuteTwoSrc, MVT::v4i32, 1}, // vpermt2d
1390
1391 // FIXME: This just applies the type legalization cost rules above
1392 // assuming these completely split.
1393 {TTI::SK_PermuteSingleSrc, MVT::v32i16, 14},
1394 {TTI::SK_PermuteSingleSrc, MVT::v32f16, 14},
1395 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 14},
1396 {TTI::SK_PermuteTwoSrc, MVT::v32i16, 42},
1397 {TTI::SK_PermuteTwoSrc, MVT::v32f16, 42},
1398 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 42},
1399
1400 {TTI::SK_Select, MVT::v32i16, 1}, // vpternlogq
1401 {TTI::SK_Select, MVT::v32f16, 1}, // vpternlogq
1402 {TTI::SK_Select, MVT::v64i8, 1}, // vpternlogq
1403 {TTI::SK_Select, MVT::v8f64, 1}, // vblendmpd
1404 {TTI::SK_Select, MVT::v16f32, 1}, // vblendmps
1405 {TTI::SK_Select, MVT::v8i64, 1}, // vblendmq
1406 {TTI::SK_Select, MVT::v16i32, 1}, // vblendmd
1407 };
1408
1409 if (ST->hasAVX512())
1410 if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
1411 return LT.first * Entry->Cost;
1412
1413 static const CostTblEntry AVX2ShuffleTbl[] = {
1414 {TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastpd
1415 {TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastps
1416 {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq
1417 {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd
1418 {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
1419 {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw
1420 {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb
1421
1422 {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd
1423 {TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps
1424 {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq
1425 {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd
1426 {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
1427 {TTI::SK_Reverse, MVT::v16f16, 2}, // vperm2i128 + pshufb
1428 {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb
1429
1430 {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
1431 {TTI::SK_Select, MVT::v16f16, 1}, // vpblendvb
1432 {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb
1433
1434 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
1435 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
1436 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
1437 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
1438 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
1439 // + vpblendvb
1440 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 4}, // vperm2i128 + 2*vpshufb
1441 // + vpblendvb
1442 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb
1443 // + vpblendvb
1444
1445 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd
1446 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3}, // 2*vpermps + vblendps
1447 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd
1448 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd
1449 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
1450 // + vpblendvb
1451 {TTI::SK_PermuteTwoSrc, MVT::v16f16, 7}, // 2*vperm2i128 + 4*vpshufb
1452 // + vpblendvb
1453 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb
1454 // + vpblendvb
1455 };
1456
1457 if (ST->hasAVX2())
1458 if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
1459 return LT.first * Entry->Cost;
1460
1461 static const CostTblEntry XOPShuffleTbl[] = {
1462 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd
1463 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps
1464 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd
1465 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps
1466 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
1467 // + vinsertf128
1468 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm
1469 // + vinsertf128
1470
1471 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
1472 // + vinsertf128
1473 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm
1474 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm
1475 // + vinsertf128
1476 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm
1477 };
1478
1479 if (ST->hasXOP())
1480 if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
1481 return LT.first * Entry->Cost;
1482
1483 static const CostTblEntry AVX1ShuffleTbl[] = {
1484 {TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1485 {TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1486 {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1487 {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1488 {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
1489 {TTI::SK_Broadcast, MVT::v16f16, 3}, // vpshuflw + vpshufd + vinsertf128
1490 {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128
1491
1492 {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1493 {TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1494 {TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1495 {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1496 {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
1497 // + vinsertf128
1498 {TTI::SK_Reverse, MVT::v16f16, 4}, // vextractf128 + 2*pshufb
1499 // + vinsertf128
1500 {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb
1501 // + vinsertf128
1502
1503 {TTI::SK_Select, MVT::v4i64, 1}, // vblendpd
1504 {TTI::SK_Select, MVT::v4f64, 1}, // vblendpd
1505 {TTI::SK_Select, MVT::v8i32, 1}, // vblendps
1506 {TTI::SK_Select, MVT::v8f32, 1}, // vblendps
1507 {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
1508 {TTI::SK_Select, MVT::v16f16, 3}, // vpand + vpandn + vpor
1509 {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor
1510
1511 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd
1512 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd
1513 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1514 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1515 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
1516 // + 2*por + vinsertf128
1517 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 8}, // vextractf128 + 4*pshufb
1518 // + 2*por + vinsertf128
1519 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb
1520 // + 2*por + vinsertf128
1521
1522 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd
1523 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd
1524 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1525 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1526 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
1527 // + 4*por + vinsertf128
1528 {TTI::SK_PermuteTwoSrc, MVT::v16f16, 15}, // 2*vextractf128 + 8*pshufb
1529 // + 4*por + vinsertf128
1530 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb
1531 // + 4*por + vinsertf128
1532 };
1533
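// Illustrative lookup: reversing a v8f32 on an AVX1-only target matches the
// {TTI::SK_Reverse, MVT::v8f32, 2} entry above, and with LT.first = 1 the
// returned cost is 2 (vperm2f128 + vpermilps).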
1534 if (ST->hasAVX())
1535 if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
1536 return LT.first * Entry->Cost;
1537
1538 static const CostTblEntry SSE41ShuffleTbl[] = {
1539 {TTI::SK_Select, MVT::v2i64, 1}, // pblendw
1540 {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1541 {TTI::SK_Select, MVT::v4i32, 1}, // pblendw
1542 {TTI::SK_Select, MVT::v4f32, 1}, // blendps
1543 {TTI::SK_Select, MVT::v8i16, 1}, // pblendw
1544 {TTI::SK_Select, MVT::v8f16, 1}, // pblendw
1545 {TTI::SK_Select, MVT::v16i8, 1} // pblendvb
1546 };
1547
1548 if (ST->hasSSE41())
1549 if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
1550 return LT.first * Entry->Cost;
1551
1552 static const CostTblEntry SSSE3ShuffleTbl[] = {
1553 {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
1554 {TTI::SK_Broadcast, MVT::v8f16, 1}, // pshufb
1555 {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb
1556
1557 {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
1558 {TTI::SK_Reverse, MVT::v8f16, 1}, // pshufb
1559 {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb
1560
1561 {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
1562 {TTI::SK_Select, MVT::v8f16, 3}, // 2*pshufb + por
1563 {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por
1564
1565 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
1566 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // pshufb
1567 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
1568
1569 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
1570 {TTI::SK_PermuteTwoSrc, MVT::v8f16, 3}, // 2*pshufb + por
1571 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
1572 };
1573
1574 if (ST->hasSSSE3())
1575 if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
1576 return LT.first * Entry->Cost;
1577
1578 static const CostTblEntry SSE2ShuffleTbl[] = {
1579 {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd
1580 {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
1581 {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
1582 {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
1583 {TTI::SK_Broadcast, MVT::v8f16, 2}, // pshuflw + pshufd
1584 {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd
1585
1586 {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
1587 {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
1588 {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
1589 {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
1590 {TTI::SK_Reverse, MVT::v8f16, 3}, // pshuflw + pshufhw + pshufd
1591 {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
1592 // + 2*pshufd + 2*unpck + packus
1593
1594 {TTI::SK_Select, MVT::v2i64, 1}, // movsd
1595 {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1596 {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
1597 {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
1598 {TTI::SK_Select, MVT::v8f16, 3}, // pand + pandn + por
1599 {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por
1600
1601 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
1602 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd
1603 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
1604 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
1605 // + pshufd/unpck
1606 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 5}, // 2*pshuflw + 2*pshufhw
1607 // + pshufd/unpck
1608 { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
1609 // + 2*pshufd + 2*unpck + 2*packus
1610
1611 { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd
1612 { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd
1613 { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd}
1614 { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute
1615 { TTI::SK_PermuteTwoSrc, MVT::v8f16, 8 }, // blend+permute
1616 { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
1617 };
1618
1619 static const CostTblEntry SSE3BroadcastLoadTbl[] = {
1620 {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup
1621 };
1622
1623 if (ST->hasSSE2()) {
1624 bool IsLoad =
1625 llvm::any_of(Args, [](const auto &V) { return isa<LoadInst>(V); });
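// A broadcast fed directly by a load is free on SSE3 because movddup can fold
// the 64-bit load, which is what the zero-cost table entry above encodes.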
1626 if (ST->hasSSE3() && IsLoad)
1627 if (const auto *Entry =
1628 CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) {
1629 assert(isLegalBroadcastLoad(BaseTp->getElementType(),
1630 LT.second.getVectorElementCount()) &&
1631 "Table entry missing from isLegalBroadcastLoad()");
1632 return LT.first * Entry->Cost;
1633 }
1634
1635 if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
1636 return LT.first * Entry->Cost;
1637 }
1638
1639 static const CostTblEntry SSE1ShuffleTbl[] = {
1640 { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
1641 { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
1642 { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps
1643 { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
1644 { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
1645 };
1646
1647 if (ST->hasSSE1())
1648 if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
1649 return LT.first * Entry->Cost;
1650
1651 return BaseT::getShuffleCost(Kind, BaseTp, Mask, Index, SubTp);
1652}
1653
1654InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
1655 Type *Src,
1656 TTI::CastContextHint CCH,
1657 TTI::TargetCostKind CostKind,
1658 const Instruction *I) {
1659 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1660 assert(ISD && "Invalid opcode");
1661
1662 // TODO: Allow non-throughput costs that aren't binary.
1663 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
1664 if (CostKind != TTI::TCK_RecipThroughput)
1665 return Cost == 0 ? 0 : 1;
1666 return Cost;
1667 };
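// In effect, a size- or latency-oriented query (any CostKind other than
// TCK_RecipThroughput) collapses a non-zero throughput cost from the tables
// below to 1, while a zero cost stays free.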
1668
1669 // The cost tables include both specific, custom (non-legal) src/dst type
1670 // conversions and generic, legalized types. We check for custom conversions
1671 // first, before falling back to legalization.
1672 // FIXME: Need a better design of the cost table to handle non-simple types of
1673 // potential massive combinations (elem_num x src_type x dst_type).
1674 static const TypeConversionCostTblEntry AVX512BWConversionTbl[] {
1675 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
1676 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
1677
1678 // Mask sign extend has an instruction.
1679 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 },
1680 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, 1 },
1681 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 },
1682 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, 1 },
1683 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 },
1684 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, 1 },
1685 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 },
1686 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, 1 },
1687 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 },
1688 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, 1 },
1689 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 },
1690 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 },
1691 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
1692 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 },
1693 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, 1 },
1694 { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, 1 },
1695 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, 1 },
1696
1697 // Mask zero extend is a sext + shift.
1698 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 },
1699 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, 2 },
1700 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 },
1701 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, 2 },
1702 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 },
1703 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, 2 },
1704 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 },
1705 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, 2 },
1706 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 },
1707 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, 2 },
1708 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 },
1709 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 },
1710 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
1711 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 },
1712 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, 2 },
1713 { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, 2 },
1714 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, 2 },
1715
1716 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 },
1717 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, 2 },
1718 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 },
1719 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, 2 },
1720 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 },
1721 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, 2 },
1722 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 },
1723 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, 2 },
1724 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 },
1725 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 },
1726 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 },
1727 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 },
1728 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 },
1729 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 },
1730 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, 2 },
1731 { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, 2 },
1732 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, 2 },
1733
1734 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 2 },
1735 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // widen to zmm
1736 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // vpmovwb
1737 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // vpmovwb
1738 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // vpmovwb
1739 };
1740
1741 static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
1742 // Mask sign extend has an instruction.
1743 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 },
1744 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, 1 },
1745 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 },
1746 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 },
1747 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 },
1748 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i1, 1 },
1749 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 },
1750 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 },
1751
1752 // Mask zero extend is a sext + shift.
1753 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 },
1754 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, 2 },
1755 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 },
1756 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 },
1757 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 },
1758 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i1, 2 },
1759 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 },
1760 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 },
1761
1762 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 },
1763 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, 2 },
1764 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 },
1765 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 },
1766 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },
1767 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 },
1768 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 },
1769 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i64, 2 },
1770
1771 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
1772 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },
1773
1774 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
1775 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },
1776
1777 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, 1 },
1778 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, 1 },
1779
1780 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 },
1781 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 },
1782 };
1783
1784 // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
1785 // 256-bit wide vectors.
1786
1787 static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
1788 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 },
1789 { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 },
1790 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 },
1791
1792 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
1793 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
1794 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
1795 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 3 }, // sext+vpslld+vptestmd
1796 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
1797 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
1798 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
1799 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 3 }, // sext+vpslld+vptestmd
1800 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // zmm vpslld+vptestmd
1801 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // zmm vpslld+vptestmd
1802 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // zmm vpslld+vptestmd
1803 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 }, // vpslld+vptestmd
1804 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // zmm vpsllq+vptestmq
1805 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // zmm vpsllq+vptestmq
1806 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, // vpsllq+vptestmq
1807 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 2 }, // vpmovdb
1808 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 2 }, // vpmovdb
1809 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 2 }, // vpmovdb
1810 { ISD::TRUNCATE, MVT::v32i8, MVT::v16i32, 2 }, // vpmovdb
1811 { ISD::TRUNCATE, MVT::v64i8, MVT::v16i32, 2 }, // vpmovdb
1812 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2 }, // vpmovdw
1813 { ISD::TRUNCATE, MVT::v32i16, MVT::v16i32, 2 }, // vpmovdw
1814 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 2 }, // vpmovqb
1815 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1 }, // vpshufb
1816 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 2 }, // vpmovqb
1817 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i64, 2 }, // vpmovqb
1818 { ISD::TRUNCATE, MVT::v32i8, MVT::v8i64, 2 }, // vpmovqb
1819 { ISD::TRUNCATE, MVT::v64i8, MVT::v8i64, 2 }, // vpmovqb
1820 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 2 }, // vpmovqw
1821 { ISD::TRUNCATE, MVT::v16i16, MVT::v8i64, 2 }, // vpmovqw
1822 { ISD::TRUNCATE, MVT::v32i16, MVT::v8i64, 2 }, // vpmovqw
1823 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 }, // vpmovqd
1824 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // zmm vpmovqd
1825 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 5 },// 2*vpmovqd+concat+vpmovdb
1826
1827 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, // extend to v16i32
1828 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 8 },
1829 { ISD::TRUNCATE, MVT::v64i8, MVT::v32i16, 8 },
1830
1831 // Sign extend is zmm vpternlogd+vptruncdb.
1832 // Zero extend is zmm broadcast load+vptruncdw.
1833 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 3 },
1834 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 4 },
1835 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 3 },
1836 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 4 },
1837 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 3 },
1838 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 4 },
1839 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 3 },
1840 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 4 },
1841
1842 // Sign extend is zmm vpternlogd+vptruncdw.
1843 // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
1844 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 3 },
1845 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 4 },
1846 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 3 },
1847 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 4 },
1848 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 3 },
1849 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 4 },
1850 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 3 },
1851 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
1852
1853 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // zmm vpternlogd
1854 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // zmm vpternlogd+psrld
1855 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // zmm vpternlogd
1856 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // zmm vpternlogd+psrld
1857 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // zmm vpternlogd
1858 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // zmm vpternlogd+psrld
1859 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // zmm vpternlogq
1860 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // zmm vpternlogq+psrlq
1861 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // zmm vpternlogq
1862 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // zmm vpternlogq+psrlq
1863
1864 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, // vpternlogd
1865 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, // vpternlogd+psrld
1866 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 }, // vpternlogq
1867 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 }, // vpternlogq+psrlq
1868
1869 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
1870 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
1871 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
1872 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
1873 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
1874 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
1875 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
1876 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
1877 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
1878 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
1879
1880 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
1881 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
1882
1883 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
1884 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
1885 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v16i8, 2 },
1886 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 1 },
1887 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
1888 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 1 },
1889 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
1890 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
1891
1892 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
1893 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
1894 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v16i8, 2 },
1895 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 1 },
1896 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
1897 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 1 },
1898 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
1899 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
1900 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 },
1901 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 5 },
1902
1903 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 2 },
1904 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f64, 7 },
1905 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f64,15 },
1906 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f32,11 },
1907 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f64,31 },
1908 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, 3 },
1909 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f64, 7 },
1910 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f32, 5 },
1911 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f64,15 },
1912 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 1 },
1913 { ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f64, 3 },
1914
1915 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 },
1916 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, 3 },
1917 { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, 3 },
1918 { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 },
1919 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 3 },
1920 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, 3 },
1921 };
1922
1923 static const TypeConversionCostTblEntry AVX512BWVLConversionTbl[] {
1924 // Mask sign extend has an instruction.
1925 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 },
1926 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, 1 },
1927 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 },
1928 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, 1 },
1929 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 },
1930 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, 1 },
1931 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 },
1932 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, 1 },
1933 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 },
1934 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, 1 },
1935 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 },
1936 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 },
1937 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
1938 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 },
1939 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1, 1 },
1940 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v64i1, 1 },
1941 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1, 1 },
1942
1943 // Mask zero extend is a sext + shift.
1944 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 },
1945 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, 2 },
1946 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 },
1947 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, 2 },
1948 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 },
1949 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, 2 },
1950 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 },
1951 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, 2 },
1952 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 },
1953 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, 2 },
1954 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 },
1955 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 },
1956 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
1957 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 },
1958 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1, 2 },
1959 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v64i1, 2 },
1960 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1, 2 },
1961
1962 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 },
1963 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, 2 },
1964 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 },
1965 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, 2 },
1966 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 },
1967 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, 2 },
1968 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 },
1969 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, 2 },
1970 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 },
1971 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 },
1972 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 },
1973 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 },
1974 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 },
1975 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 },
1976 { ISD::TRUNCATE, MVT::v32i1, MVT::v16i16, 2 },
1977 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i8, 2 },
1978 { ISD::TRUNCATE, MVT::v64i1, MVT::v16i16, 2 },
1979
1980 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 },
1981 };
1982
1983 static const TypeConversionCostTblEntry AVX512DQVLConversionTbl[] = {
1984 // Mask sign extend has an instruction.
1985 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 },
1986 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, 1 },
1987 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 },
1988 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i1, 1 },
1989 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 },
1990 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i1, 1 },
1991 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, 1 },
1992 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 },
1993
1994 // Mask zero extend is a sext + shift.
1995 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 },
1996 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, 2 },
1997 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 },
1998 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i1, 2 },
1999 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 },
2000 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i1, 2 },
2001 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, 2 },
2002 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 },
2003
2004 { ISD::TRUNCATE, MVT::v16i1, MVT::v4i64, 2 },
2005 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, 2 },
2006 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 },
2007 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, 2 },
2008 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 },
2009 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 },
2010 { ISD::TRUNCATE, MVT::v8i1, MVT::v4i64, 2 },
2011 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },
2012
2013 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
2014 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
2015 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
2016 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
2017
2018 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
2019 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
2020 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
2021 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
2022
2023 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v4f32, 1 },
2024 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, 1 },
2025 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
2026 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, 1 },
2027
2028 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v4f32, 1 },
2029 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 },
2030 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
2031 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 },
2032 };
2033
2034 static const TypeConversionCostTblEntry AVX512VLConversionTbl[] = {
2035 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
2036 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
2037 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
2038 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 8 }, // split+2*v8i8
2039 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
2040 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
2041 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
2042 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 8 }, // split+2*v8i16
2043 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // vpslld+vptestmd
2044 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // vpslld+vptestmd
2045 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // vpslld+vptestmd
2046 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // vpsllq+vptestmq
2047 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // vpsllq+vptestmq
2048 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // vpmovqd
2049 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 }, // vpmovqb
2050 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 }, // vpmovqw
2051 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 }, // vpmovwb
2052
2053 // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
2054 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
2055 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 5 },
2056 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 6 },
2057 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 5 },
2058 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 6 },
2059 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 5 },
2060 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 6 },
2061 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 10 },
2062 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 12 },
2063
2064 // sign extend is vpcmpeq+maskedmove+vpmovdw
2065 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
2066 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 4 },
2067 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 5 },
2068 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 4 },
2069 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 5 },
2070 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 4 },
2071 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 5 },
2072 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 10 },
2073 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 12 },
2074
2075 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // vpternlogd
2076 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // vpternlogd+psrld
2077 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // vpternlogd
2078 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // vpternlogd+psrld
2079 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // vpternlogd
2080 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // vpternlogd+psrld
2081 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // vpternlogq
2082 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // vpternlogq+psrlq
2083 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // vpternlogq
2084 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // vpternlogq+psrlq
2085
2086 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 1 },
2087 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 1 },
2088 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 1 },
2089 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 1 },
2090 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
2091 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
2092 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 1 },
2093 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 1 },
2094 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
2095 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
2096 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
2097 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
2098
2099 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
2100 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 1 },
2101 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
2102 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 1 },
2103
2104 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 1 },
2105 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 1 },
2106 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
2107 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 1 },
2108 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
2109 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 1 },
2110 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
2111 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
2112 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
2113 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
2114 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 5 },
2115 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
2116 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 5 },
2117
2118 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, 2 },
2119 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 2 },
2120 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f32, 5 },
2121
2122 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 1 },
2123 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 1 },
2124 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
2125 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 1 },
2126 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 1 },
2127 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 },
2128 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 },
2129 };
2130
2131 static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
2132 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
2133 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
2134 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
2135 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
2136 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
2137 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
2138
2139 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 2 },
2140 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 2 },
2141 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 2 },
2142 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 2 },
2143 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
2144 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
2145 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 2 },
2146 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 2 },
2147 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
2148 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
2149 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
2150 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
2151 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
2152 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
2153
2154 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },
2155
2156 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 4 },
2157 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 4 },
2158 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 1 },
2159 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 1 },
2160 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 1 },
2161 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, 4 },
2162 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, 4 },
2163 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 1 },
2164 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 1 },
2165 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 5 },
2166 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 },
2167 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2 },
2168
2169 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 3 },
2170 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 3 },
2171
2172 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, 1 },
2173 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, 1 },
2174 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, 1 },
2175 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 3 },
2176
2177 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 3 },
2178 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 3 },
2179 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, 1 },
2180 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 3 },
2181 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
2182 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4 },
2183 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 3 },
2184 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, 4 },
2185
2186 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 2 },
2187 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 2 },
2188 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 2 },
2189 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
2190 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
2191 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
2192 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 3 },
2193
2194 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 2 },
2195 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 2 },
2196 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 2 },
2197 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
2198 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 },
2199 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
2200 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 2 },
2201 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
2202 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
2203 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 4 },
2204 };
2205
2206 static const TypeConversionCostTblEntry AVXConversionTbl[] = {
2207 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 6 },
2208 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
2209 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 },
2210 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
2211 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
2212 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
2213
2214 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 3 },
2215 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 3 },
2216 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 3 },
2217 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 3 },
2218 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
2219 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
2220 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 3 },
2221 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 3 },
2222 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
2223 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
2224 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 3 },
2225 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 3 },
2226
2227 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 4 },
2228 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 5 },
2229 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 4 },
2230 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 9 },
2231 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, 11 },
2232
2233 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 },
2234 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
2235 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // and+extract+packuswb
2236 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, 5 },
2237 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
2238 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, 5 },
2239 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 3 }, // and+extract+2*packusdw
2240 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 },
2241
2242 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
2243 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 },
2244 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 },
2245 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 4 },
2246 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v16i8, 2 },
2247 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
2248 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v8i16, 2 },
2249 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
2250 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
2251 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 4 },
2252 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 5 },
2253 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 8 },
2254
2255 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 7 },
2256 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, 7 },
2257 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, 6 },
2258 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 4 },
2259 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v16i8, 2 },
2260 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
2261 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v8i16, 2 },
2262 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 4 },
2263 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 4 },
2264 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
2265 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 },
2266 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 },
2267 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 10 },
2268 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 10 },
2269 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 18 },
2270 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
2271 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 10 },
2272
2273 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, 2 },
2274 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f64, 2 },
2275 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v8f32, 2 },
2276 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v4f64, 2 },
2277 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 2 },
2278 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f64, 2 },
2279 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, 2 },
2280 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v4f64, 2 },
2281 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, 2 },
2282 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, 2 },
2283 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 5 },
2284
2285 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v8f32, 2 },
2286 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f64, 2 },
2287 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v8f32, 2 },
2288 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v4f64, 2 },
2289 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 2 },
2290 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f64, 2 },
2291 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, 2 },
2292 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v4f64, 2 },
2293 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 3 },
2294 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
2295 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 6 },
2296 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 7 },
2297 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, 7 },
2298
2299 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 1 },
2300 { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 1 },
2301 };
2302
2303 static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
2304 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, 1 },
2305 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, 1 },
2306 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, 1 },
2307 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, 1 },
2308 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, 1 },
2309 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, 1 },
2310 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, 1 },
2311 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, 1 },
2312 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, 1 },
2313 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, 1 },
2314 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, 1 },
2315 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, 1 },
2316
2317 // These truncates end up widening elements.
2318 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 1 }, // PMOVXZBQ
2319 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 1 }, // PMOVXZWQ
2320 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 1 }, // PMOVXZBD
2321
2322 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 2 },
2323 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 2 },
2324 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 2 },
2325
2326 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 1 },
2327 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 1 },
2328 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 1 },
2329 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 1 },
2330 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 1 },
2331 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
2332 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 1 },
2333 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
2334 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
2335 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 1 },
2336 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
2337
2338 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 1 },
2339 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 1 },
2340 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 4 },
2341 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 4 },
2342 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 1 },
2343 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
2344 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 1 },
2345 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
2346 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 3 },
2347 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 3 },
2348 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 2 },
2349 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 12 },
2350 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 22 },
2351 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 4 },
2352
2353 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 1 },
2354 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 1 },
2355 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 1 },
2356 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 1 },
2357 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, 2 },
2358 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, 2 },
2359 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, 1 },
2360 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, 1 },
2361 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
2362 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, 1 },
2363
2364 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 1 },
2365 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 },
2366 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 1 },
2367 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 4 },
2368 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, 2 },
2369 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, 2 },
2370 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, 1 },
2371 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, 1 },
2372 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 4 },
2373 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
2374 };
2375
2376 static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
2377 // These are somewhat magic numbers justified by comparing the
2378 // output of llvm-mca for our various supported scheduler models
2379 // and basing it off the worst case scenario.
2380 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 3 },
2381 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 3 },
2382 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 3 },
2383 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 3 },
2384 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 3 },
2385 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 4 },
2386 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 3 },
2387 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 4 },
2388 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 3 },
2389 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4 },
2390 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 8 },
2391 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 8 },
2392
2393 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 3 },
2394 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 3 },
2395 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 8 },
2396 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 9 },
2397 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 4 },
2398 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 4 },
2399 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 4 },
2400 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 4 },
2401 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 7 },
2402 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 7 },
2403 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
2404 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 15 },
2405 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 18 },
2406
2407 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 4 },
2408 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 4 },
2409 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 4 },
2410 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 4 },
2411 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, 6 },
2412 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, 6 },
2413 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, 5 },
2414 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, 5 },
2415 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 4 },
2416 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, 4 },
2417
2418 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 4 },
2419 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 },
2420 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 4 },
2421 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 15 },
2422 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, 6 },
2423 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, 6 },
2424 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, 5 },
2425 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, 5 },
2426 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 8 },
2427 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 8 },
2428
2429 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, 4 },
2430 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, 4 },
2431 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, 2 },
2432 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, 3 },
2433 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, 1 },
2434 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, 2 },
2435 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, 2 },
2436 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, 3 },
2437 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, 1 },
2438 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, 2 },
2439 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, 1 },
2440 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, 2 },
2441
2442 // These truncates are really widening elements.
2443 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 1 }, // PSHUFD
2444 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // PUNPCKLWD+DQ
2445 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // PUNPCKLBW+WD+PSHUFD
2446 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 1 }, // PUNPCKLWD
2447 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // PUNPCKLBW+WD
2448 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 1 }, // PUNPCKLBW
2449
2450 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 2 }, // PAND+PACKUSWB
2451 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 },
2452 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 3 }, // PAND+2*PACKUSWB
2453 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 },
2454 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 1 },
2455 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 3 },
2456 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
2457 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32,10 },
2458 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 4 }, // PAND+3*PACKUSWB
2459 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 2 }, // PSHUFD+PSHUFLW
2460 { ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, 1 }, // PSHUFD
2461 };
2462
2463 // Attempt to map directly to (simple) MVT types to let us match custom entries.
2464 EVT SrcTy = TLI->getValueType(DL, Src);
2465 EVT DstTy = TLI->getValueType(DL, Dst);
2466
2467 // The function getSimpleVT only handles simple value types.
2468 if (SrcTy.isSimple() && DstTy.isSimple()) {
2469 MVT SimpleSrcTy = SrcTy.getSimpleVT();
2470 MVT SimpleDstTy = DstTy.getSimpleVT();
2471
2472 if (ST->useAVX512Regs()) {
2473 if (ST->hasBWI())
2474 if (const auto *Entry = ConvertCostTableLookup(
2475 AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2476 return AdjustCost(Entry->Cost);
2477
2478 if (ST->hasDQI())
2479 if (const auto *Entry = ConvertCostTableLookup(
2480 AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2481 return AdjustCost(Entry->Cost);
2482
2483 if (ST->hasAVX512())
2484 if (const auto *Entry = ConvertCostTableLookup(
2485 AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2486 return AdjustCost(Entry->Cost);
2487 }
2488
2489 if (ST->hasBWI())
2490 if (const auto *Entry = ConvertCostTableLookup(
2491 AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2492 return AdjustCost(Entry->Cost);
2493
2494 if (ST->hasDQI())
2495 if (const auto *Entry = ConvertCostTableLookup(
2496 AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2497 return AdjustCost(Entry->Cost);
2498
2499 if (ST->hasAVX512())
2500 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
2501 SimpleDstTy, SimpleSrcTy))
2502 return AdjustCost(Entry->Cost);
2503
2504 if (ST->hasAVX2()) {
2505 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
2506 SimpleDstTy, SimpleSrcTy))
2507 return AdjustCost(Entry->Cost);
2508 }
2509
2510 if (ST->hasAVX()) {
2511 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
2512 SimpleDstTy, SimpleSrcTy))
2513 return AdjustCost(Entry->Cost);
2514 }
2515
2516 if (ST->hasSSE41()) {
2517 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
2518 SimpleDstTy, SimpleSrcTy))
2519 return AdjustCost(Entry->Cost);
2520 }
2521
2522 if (ST->hasSSE2()) {
2523 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
2524 SimpleDstTy, SimpleSrcTy))
2525 return AdjustCost(Entry->Cost);
2526 }
2527 }
2528
2529 // Fall back to legalized types.
2530 std::pair<InstructionCost, MVT> LTSrc = getTypeLegalizationCost(Src);
2531 std::pair<InstructionCost, MVT> LTDest = getTypeLegalizationCost(Dst);
2532
2533 // If we're truncating to the same legalized type - just assume it's free.
2534 if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second)
2535 return TTI::TCC_Free;
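 // A truncate whose source and destination legalize to the same MVT is a
 // register-level no-op, so no cost is charged for it.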
2536
2537 if (ST->useAVX512Regs()) {
2538 if (ST->hasBWI())
2539 if (const auto *Entry = ConvertCostTableLookup(
2540 AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second))
2541 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2542
2543 if (ST->hasDQI())
2544 if (const auto *Entry = ConvertCostTableLookup(
2545 AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second))
2546 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2547
2548 if (ST->hasAVX512())
2549 if (const auto *Entry = ConvertCostTableLookup(
2550 AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second))
2551 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2552 }
2553
2554 if (ST->hasBWI())
2555 if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
2556 LTDest.second, LTSrc.second))
2557 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2558
2559 if (ST->hasDQI())
2560 if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
2561 LTDest.second, LTSrc.second))
2562 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2563
2564 if (ST->hasAVX512())
2565 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
2566 LTDest.second, LTSrc.second))
2567 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2568
2569 if (ST->hasAVX2())
2570 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
2571 LTDest.second, LTSrc.second))
2572 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2573
2574 if (ST->hasAVX())
2575 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
2576 LTDest.second, LTSrc.second))
2577 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2578
2579 if (ST->hasSSE41())
2580 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
2581 LTDest.second, LTSrc.second))
2582 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2583
2584 if (ST->hasSSE2())
2585 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
2586 LTDest.second, LTSrc.second))
2587 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2588
2589 // Fallback: for i8/i16 sitofp/uitofp cases we need to extend the source to
2590 // i32 and then convert via sitofp.
2591 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
2592 1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) {
2593 Type *ExtSrc = Src->getWithNewBitWidth(32);
2594 unsigned ExtOpc =
2595 (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt;
2596
2597 // For scalar loads the extend would be free.
2598 InstructionCost ExtCost = 0;
2599 if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0))))
2600 ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind);
2601
2602 return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc,
2603 TTI::CastContextHint::None, CostKind);
2604 }
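 // e.g. a uitofp from <4 x i16> is costed here as a zext to <4 x i32> plus a
 // <4 x i32> sitofp to the destination FP type; when the scalar source comes
 // straight from a load the extend folds away and only the conversion is charged.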
2605
2606 // Fallback: for fptosi/fptoui i8/i16 cases we convert via fptosi to i32 and
2607 // then truncate down to the destination width.
2608 if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) &&
2609 1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) {
2610 Type *TruncDst = Dst->getWithNewBitWidth(32);
2611 return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) +
2612 getCastInstrCost(Instruction::Trunc, Dst, TruncDst,
2613 TTI::CastContextHint::None, CostKind);
2614 }
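 // e.g. an fptoui to <4 x i8> is costed as an fptosi to <4 x i32> plus a trunc
 // from <4 x i32> down to <4 x i8>.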
2615
2616 return AdjustCost(
2617 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
2618}
2619
2620InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
2621 Type *CondTy,
2622 CmpInst::Predicate VecPred,
2623 TTI::TargetCostKind CostKind,
2624 const Instruction *I) {
2625 // Assume a 3cy latency for fp select ops.
2626 if (CostKind == TTI::TCK_Latency && Opcode == Instruction::Select)
2627 if (ValTy->getScalarType()->isFloatingPointTy())
2628 return 3;
2629
2630 // TODO: Handle other cost kinds.
2631 if (CostKind != TTI::TCK_RecipThroughput)
2632 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2633 I);
2634
2635 // Legalize the type.
2636 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2637
2638 MVT MTy = LT.second;
2639
2640 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2641 assert(ISD && "Invalid opcode");
2642
2643 InstructionCost ExtraCost = 0;
2644 if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
2645 // Some vector comparison predicates cost extra instructions.
2646 // TODO: Should we invert this and assume worst case cmp costs
2647 // and reduce for particular predicates?
2648 if (MTy.isVector() &&
2649 !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
2650 (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
2651 ST->hasBWI())) {
2652 // Fallback to I if a specific predicate wasn't specified.
2653 CmpInst::Predicate Pred = VecPred;
2654 if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE ||
2655 Pred == CmpInst::BAD_FCMP_PREDICATE))
2656 Pred = cast<CmpInst>(I)->getPredicate();
2657
2658 switch (Pred) {
2659 case CmpInst::Predicate::ICMP_NE:
2660 // xor(cmpeq(x,y),-1)
2661 ExtraCost = 1;
2662 break;
2663 case CmpInst::Predicate::ICMP_SGE:
2664 case CmpInst::Predicate::ICMP_SLE:
2665 // xor(cmpgt(x,y),-1)
2666 ExtraCost = 1;
2667 break;
2668 case CmpInst::Predicate::ICMP_ULT:
2669 case CmpInst::Predicate::ICMP_UGT:
2670 // cmpgt(xor(x,signbit),xor(y,signbit))
2671 // xor(cmpeq(pmaxu(x,y),x),-1)
2672 ExtraCost = 2;
2673 break;
2674 case CmpInst::Predicate::ICMP_ULE:
2675 case CmpInst::Predicate::ICMP_UGE:
2676 if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
2677 (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
2678 // cmpeq(psubus(x,y),0)
2679 // cmpeq(pminu(x,y),x)
2680 ExtraCost = 1;
2681 } else {
2682 // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
2683 ExtraCost = 3;
2684 }
2685 break;
2686 case CmpInst::Predicate::BAD_ICMP_PREDICATE:
2687 case CmpInst::Predicate::BAD_FCMP_PREDICATE:
2688 // Assume worst case scenario and add the maximum extra cost.
2689 ExtraCost = 3;
2690 break;
2691 default:
2692 break;
2693 }
2694 }
2695 }
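 // ExtraCost is added on top of the base SETCC/SELECT cost from the tables
 // below, and the sum is scaled by the legalization split factor (LT.first).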
2696
2697 static const CostTblEntry SLMCostTbl[] = {
2698 // slm pcmpeq/pcmpgt throughput is 2
2699 { ISD::SETCC, MVT::v2i64, 2 },
2700 // slm pblendvb/blendvpd/blendvps throughput is 4
2701 { ISD::SELECT, MVT::v2f64, 4 }, // vblendvpd
2702 { ISD::SELECT, MVT::v4f32, 4 }, // vblendvps
2703 { ISD::SELECT, MVT::v2i64, 4 }, // pblendvb
2704 { ISD::SELECT, MVT::v8i32, 4 }, // pblendvb
2705 { ISD::SELECT, MVT::v8i16, 4 }, // pblendvb
2706 { ISD::SELECT, MVT::v16i8, 4 }, // pblendvb
2707 };
2708
2709 static const CostTblEntry AVX512BWCostTbl[] = {
2710 { ISD::SETCC, MVT::v32i16, 1 },
2711 { ISD::SETCC, MVT::v64i8, 1 },
2712
2713 { ISD::SELECT, MVT::v32i16, 1 },
2714 { ISD::SELECT, MVT::v64i8, 1 },
2715 };
2716
2717 static const CostTblEntry AVX512CostTbl[] = {
2718 { ISD::SETCC, MVT::v8i64, 1 },
2719 { ISD::SETCC, MVT::v16i32, 1 },
2720 { ISD::SETCC, MVT::v8f64, 1 },
2721 { ISD::SETCC, MVT::v16f32, 1 },
2722
2723 { ISD::SELECT, MVT::v8i64, 1 },
2724 { ISD::SELECT, MVT::v4i64, 1 },
2725 { ISD::SELECT, MVT::v2i64, 1 },
2726 { ISD::SELECT, MVT::v16i32, 1 },
2727 { ISD::SELECT, MVT::v8i32, 1 },
2728 { ISD::SELECT, MVT::v4i32, 1 },
2729 { ISD::SELECT, MVT::v8f64, 1 },
2730 { ISD::SELECT, MVT::v4f64, 1 },
2731 { ISD::SELECT, MVT::v2f64, 1 },
2732 { ISD::SELECT, MVT::f64, 1 },
2733 { ISD::SELECT, MVT::v16f32, 1 },
2734 { ISD::SELECT, MVT::v8f32 , 1 },
2735 { ISD::SELECT, MVT::v4f32, 1 },
2736 { ISD::SELECT, MVT::f32 , 1 },
2737
2738 { ISD::SETCC, MVT::v32i16, 2 }, // FIXME: should probably be 4
2739 { ISD::SETCC, MVT::v64i8, 2 }, // FIXME: should probably be 4
2740
2741 { ISD::SELECT, MVT::v32i16, 2 },
2742 { ISD::SELECT, MVT::v16i16, 1 },
2743 { ISD::SELECT, MVT::v8i16, 1 },
2744 { ISD::SELECT, MVT::v64i8, 2 },
2745 { ISD::SELECT, MVT::v32i8, 1 },
2746 { ISD::SELECT, MVT::v16i8, 1 },
2747 };
2748
2749 static const CostTblEntry AVX2CostTbl[] = {
2750 { ISD::SETCC, MVT::v4i64, 1 },
2751 { ISD::SETCC, MVT::v8i32, 1 },
2752 { ISD::SETCC, MVT::v16i16, 1 },
2753 { ISD::SETCC, MVT::v32i8, 1 },
2754
2755 { ISD::SELECT, MVT::v4f64, 2 }, // vblendvpd
2756 { ISD::SELECT, MVT::v8f32, 2 }, // vblendvps
2757 { ISD::SELECT, MVT::v4i64, 2 }, // pblendvb
2758 { ISD::SELECT, MVT::v8i32, 2 }, // pblendvb
2759 { ISD::SELECT, MVT::v16i16, 2 }, // pblendvb
2760 { ISD::SELECT, MVT::v32i8, 2 }, // pblendvb
2761 };
2762
2763 static const CostTblEntry AVX1CostTbl[] = {
2764 { ISD::SETCC, MVT::v4f64, 1 },
2765 { ISD::SETCC, MVT::v8f32, 1 },
2766 // AVX1 does not support 8-wide integer compare.
2767 { ISD::SETCC, MVT::v4i64, 4 },
2768 { ISD::SETCC, MVT::v8i32, 4 },
2769 { ISD::SETCC, MVT::v16i16, 4 },
2770 { ISD::SETCC, MVT::v32i8, 4 },
2771
2772 { ISD::SELECT, MVT::v4f64, 3 }, // vblendvpd
2773 { ISD::SELECT, MVT::v8f32, 3 }, // vblendvps
2774 { ISD::SELECT, MVT::v4i64, 3 }, // vblendvpd
2775 { ISD::SELECT, MVT::v8i32, 3 }, // vblendvps
2776 { ISD::SELECT, MVT::v16i16, 3 }, // vandps + vandnps + vorps
2777 { ISD::SELECT, MVT::v32i8, 3 }, // vandps + vandnps + vorps
2778 };
2779
2780 static const CostTblEntry SSE42CostTbl[] = {
2781 { ISD::SETCC, MVT::v2i64, 1 },
2782 };
2783
2784 static const CostTblEntry SSE41CostTbl[] = {
2785 { ISD::SETCC, MVT::v2f64, 1 },
2786 { ISD::SETCC, MVT::v4f32, 1 },
2787
2788 { ISD::SELECT, MVT::v2f64, 2 }, // blendvpd
2789 { ISD::SELECT, MVT::f64, 2 }, // blendvpd
2790 { ISD::SELECT, MVT::v4f32, 2 }, // blendvps
2791 { ISD::SELECT, MVT::f32 , 2 }, // blendvps
2792 { ISD::SELECT, MVT::v2i64, 2 }, // pblendvb
2793 { ISD::SELECT, MVT::v4i32, 2 }, // pblendvb
2794 { ISD::SELECT, MVT::v8i16, 2 }, // pblendvb
2795 { ISD::SELECT, MVT::v16i8, 2 }, // pblendvb
2796 };
2797
2798 static const CostTblEntry SSE2CostTbl[] = {
2799 { ISD::SETCC, MVT::v2f64, 2 },
2800 { ISD::SETCC, MVT::f64, 1 },
2801 { ISD::SETCC, MVT::v2i64, 5 }, // pcmpeqd/pcmpgtd expansion
2802 { ISD::SETCC, MVT::v4i32, 1 },
2803 { ISD::SETCC, MVT::v8i16, 1 },
2804 { ISD::SETCC, MVT::v16i8, 1 },
2805
2806 { ISD::SELECT, MVT::v2f64, 2 }, // andpd + andnpd + orpd
2807 { ISD::SELECT, MVT::f64, 2 }, // andpd + andnpd + orpd
2808 { ISD::SELECT, MVT::v2i64, 2 }, // pand + pandn + por
2809 { ISD::SELECT, MVT::v4i32, 2 }, // pand + pandn + por
2810 { ISD::SELECT, MVT::v8i16, 2 }, // pand + pandn + por
2811 { ISD::SELECT, MVT::v16i8, 2 }, // pand + pandn + por
2812 };
2813
2814 static const CostTblEntry SSE1CostTbl[] = {
2815 { ISD::SETCC, MVT::v4f32, 2 },
2816 { ISD::SETCC, MVT::f32, 1 },
2817
2818 { ISD::SELECT, MVT::v4f32, 2 }, // andps + andnps + orps
2819 { ISD::SELECT, MVT::f32, 2 }, // andps + andnps + orps
2820 };
2821
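 // The tables are consulted from the most specific subtarget feature down to
 // SSE1; the first matching entry wins.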
2822 if (ST->useSLMArithCosts())
2823 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
2824 return LT.first * (ExtraCost + Entry->Cost);
2825
2826 if (ST->hasBWI())
2827 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
2828 return LT.first * (ExtraCost + Entry->Cost);
2829
2830 if (ST->hasAVX512())
2831 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
2832 return LT.first * (ExtraCost + Entry->Cost);
2833
2834 if (ST->hasAVX2())
2835 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
2836 return LT.first * (ExtraCost + Entry->Cost);
2837
2838 if (ST->hasAVX())
2839 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
2840 return LT.first * (ExtraCost + Entry->Cost);
2841
2842 if (ST->hasSSE42())
2843 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
2844 return LT.first * (ExtraCost + Entry->Cost);
2845
2846 if (ST->hasSSE41())
2847 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
2848 return LT.first * (ExtraCost + Entry->Cost);
2849
2850 if (ST->hasSSE2())
2851 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
2852 return LT.first * (ExtraCost + Entry->Cost);
2853
2854 if (ST->hasSSE1())
2855 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
2856 return LT.first * (ExtraCost + Entry->Cost);
2857
2858 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
2859}
2860
2861unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; }
2862
2863InstructionCost
2864X86TTIImpl::getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
2865 TTI::TargetCostKind CostKind) {
2866
2867 // Costs should match the codegen from:
2868 // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
2869 // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
2870 // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
2871 // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
2872 // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
2873
2874 // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
2875 // specialized in these tables yet.
2876 static const CostTblEntry AVX512BITALGCostTbl[] = {
2877 { ISD::CTPOP, MVT::v32i16, 1 },
2878 { ISD::CTPOP, MVT::v64i8, 1 },
2879 { ISD::CTPOP, MVT::v16i16, 1 },
2880 { ISD::CTPOP, MVT::v32i8, 1 },
2881 { ISD::CTPOP, MVT::v8i16, 1 },
2882 { ISD::CTPOP, MVT::v16i8, 1 },
2883 };
2884 static const CostTblEntry AVX512VPOPCNTDQCostTbl[] = {
2885 { ISD::CTPOP, MVT::v8i64, 1 },
2886 { ISD::CTPOP, MVT::v16i32, 1 },
2887 { ISD::CTPOP, MVT::v4i64, 1 },
2888 { ISD::CTPOP, MVT::v8i32, 1 },
2889 { ISD::CTPOP, MVT::v2i64, 1 },
2890 { ISD::CTPOP, MVT::v4i32, 1 },
2891 };
2892 static const CostTblEntry AVX512CDCostTbl[] = {
2893 { ISD::CTLZ, MVT::v8i64, 1 },
2894 { ISD::CTLZ, MVT::v16i32, 1 },
2895 { ISD::CTLZ, MVT::v32i16, 8 },
2896 { ISD::CTLZ, MVT::v64i8, 20 },
2897 { ISD::CTLZ, MVT::v4i64, 1 },
2898 { ISD::CTLZ, MVT::v8i32, 1 },
2899 { ISD::CTLZ, MVT::v16i16, 4 },
2900 { ISD::CTLZ, MVT::v32i8, 10 },
2901 { ISD::CTLZ, MVT::v2i64, 1 },
2902 { ISD::CTLZ, MVT::v4i32, 1 },
2903 { ISD::CTLZ, MVT::v8i16, 4 },
2904 { ISD::CTLZ, MVT::v16i8, 4 },
2905 };
2906 static const CostTblEntry AVX512BWCostTbl[] = {
2907 { ISD::ABS, MVT::v32i16, 1 },
2908 { ISD::ABS, MVT::v64i8, 1 },
2909 { ISD::BITREVERSE, MVT::v8i64, 3 },
2910 { ISD::BITREVERSE, MVT::v16i32, 3 },
2911 { ISD::BITREVERSE, MVT::v32i16, 3 },
2912 { ISD::BITREVERSE, MVT::v64i8, 2 },
2913 { ISD::BSWAP, MVT::v8i64, 1 },
2914 { ISD::BSWAP, MVT::v16i32, 1 },
2915 { ISD::BSWAP, MVT::v32i16, 1 },
2916 { ISD::CTLZ, MVT::v8i64, 23 },
2917 { ISD::CTLZ, MVT::v16i32, 22 },
2918 { ISD::CTLZ, MVT::v32i16, 18 },
2919 { ISD::CTLZ, MVT::v64i8, 17 },
2920 { ISD::CTPOP, MVT::v8i64, 7 },
2921 { ISD::CTPOP, MVT::v16i32, 11 },
2922 { ISD::CTPOP, MVT::v32i16, 9 },
2923 { ISD::CTPOP, MVT::v64i8, 6 },
2924 { ISD::CTTZ, MVT::v8i64, 10 },
2925 { ISD::CTTZ, MVT::v16i32, 14 },
2926 { ISD::CTTZ, MVT::v32i16, 12 },
2927 { ISD::CTTZ, MVT::v64i8, 9 },
2928 { ISD::SADDSAT, MVT::v32i16, 1 },
2929 { ISD::SADDSAT, MVT::v64i8, 1 },
2930 { ISD::SMAX, MVT::v32i16, 1 },
2931 { ISD::SMAX, MVT::v64i8, 1 },
2932 { ISD::SMIN, MVT::v32i16, 1 },
2933 { ISD::SMIN, MVT::v64i8, 1 },
2934 { ISD::SSUBSAT, MVT::v32i16, 1 },
2935 { ISD::SSUBSAT, MVT::v64i8, 1 },
2936 { ISD::UADDSAT, MVT::v32i16, 1 },
2937 { ISD::UADDSAT, MVT::v64i8, 1 },
2938 { ISD::UMAX, MVT::v32i16, 1 },
2939 { ISD::UMAX, MVT::v64i8, 1 },
2940 { ISD::UMIN, MVT::v32i16, 1 },
2941 { ISD::UMIN, MVT::v64i8, 1 },
2942 { ISD::USUBSAT, MVT::v32i16, 1 },
2943 { ISD::USUBSAT, MVT::v64i8, 1 },
2944 };
2945 static const CostTblEntry AVX512CostTbl[] = {
2946 { ISD::ABS, MVT::v8i64, 1 },
2947 { ISD::ABS, MVT::v16i32, 1 },
2948 { ISD::ABS, MVT::v32i16, 2 },
2949 { ISD::ABS, MVT::v64i8, 2 },
2950 { ISD::ABS, MVT::v4i64, 1 },
2951 { ISD::ABS, MVT::v2i64, 1 },
2952 { ISD::BITREVERSE, MVT::v8i64, 36 },
2953 { ISD::BITREVERSE, MVT::v16i32, 24 },
2954 { ISD::BITREVERSE, MVT::v32i16, 10 },
2955 { ISD::BITREVERSE, MVT::v64i8, 10 },
2956 { ISD::BSWAP, MVT::v8i64, 4 },
2957 { ISD::BSWAP, MVT::v16i32, 4 },
2958 { ISD::BSWAP, MVT::v32i16, 4 },
2959 { ISD::CTLZ, MVT::v8i64, 29 },
2960 { ISD::CTLZ, MVT::v16i32, 35 },
2961 { ISD::CTLZ, MVT::v32i16, 28 },
2962 { ISD::CTLZ, MVT::v64i8, 18 },
2963 { ISD::CTPOP, MVT::v8i64, 16 },
2964 { ISD::CTPOP, MVT::v16i32, 24 },
2965 { ISD::CTPOP, MVT::v32i16, 18 },
2966 { ISD::CTPOP, MVT::v64i8, 12 },
2967 { ISD::CTTZ, MVT::v8i64, 20 },
2968 { ISD::CTTZ, MVT::v16i32, 28 },
2969 { ISD::CTTZ, MVT::v32i16, 24 },
2970 { ISD::CTTZ, MVT::v64i8, 18 },
2971 { ISD::SMAX, MVT::v8i64, 1 },
2972 { ISD::SMAX, MVT::v16i32, 1 },
2973 { ISD::SMAX, MVT::v32i16, 2 },
2974 { ISD::SMAX, MVT::v64i8, 2 },
2975 { ISD::SMAX, MVT::v4i64, 1 },
2976 { ISD::SMAX, MVT::v2i64, 1 },
2977 { ISD::SMIN, MVT::v8i64, 1 },
2978 { ISD::SMIN, MVT::v16i32, 1 },
2979 { ISD::SMIN, MVT::v32i16, 2 },
2980 { ISD::SMIN, MVT::v64i8, 2 },
2981 { ISD::SMIN, MVT::v4i64, 1 },
2982 { ISD::SMIN, MVT::v2i64, 1 },
2983 { ISD::UMAX, MVT::v8i64, 1 },
2984 { ISD::UMAX, MVT::v16i32, 1 },
2985 { ISD::UMAX, MVT::v32i16, 2 },
2986 { ISD::UMAX, MVT::v64i8, 2 },
2987 { ISD::UMAX, MVT::v4i64, 1 },
2988 { ISD::UMAX, MVT::v2i64, 1 },
2989 { ISD::UMIN, MVT::v8i64, 1 },
2990 { ISD::UMIN, MVT::v16i32, 1 },
2991 { ISD::UMIN, MVT::v32i16, 2 },
2992 { ISD::UMIN, MVT::v64i8, 2 },
2993 { ISD::UMIN, MVT::v4i64, 1 },
2994 { ISD::UMIN, MVT::v2i64, 1 },
2995 { ISD::USUBSAT, MVT::v16i32, 2 }, // pmaxud + psubd
2996 { ISD::USUBSAT, MVT::v2i64, 2 }, // pmaxuq + psubq
2997 { ISD::USUBSAT, MVT::v4i64, 2 }, // pmaxuq + psubq
2998 { ISD::USUBSAT, MVT::v8i64, 2 }, // pmaxuq + psubq
2999 { ISD::UADDSAT, MVT::v16i32, 3 }, // not + pminud + paddd
3000 { ISD::UADDSAT, MVT::v2i64, 3 }, // not + pminuq + paddq
3001 { ISD::UADDSAT, MVT::v4i64, 3 }, // not + pminuq + paddq
3002 { ISD::UADDSAT, MVT::v8i64, 3 }, // not + pminuq + paddq
3003 { ISD::SADDSAT, MVT::v32i16, 2 },
3004 { ISD::SADDSAT, MVT::v64i8, 2 },
3005 { ISD::SSUBSAT, MVT::v32i16, 2 },
3006 { ISD::SSUBSAT, MVT::v64i8, 2 },
3007 { ISD::UADDSAT, MVT::v32i16, 2 },
3008 { ISD::UADDSAT, MVT::v64i8, 2 },
3009 { ISD::USUBSAT, MVT::v32i16, 2 },
3010 { ISD::USUBSAT, MVT::v64i8, 2 },
3011 { ISD::FMAXNUM, MVT::f32, 2 },
3012 { ISD::FMAXNUM, MVT::v4f32, 2 },
3013 { ISD::FMAXNUM, MVT::v8f32, 2 },
3014 { ISD::FMAXNUM, MVT::v16f32, 2 },
3015 { ISD::FMAXNUM, MVT::f64, 2 },
3016 { ISD::FMAXNUM, MVT::v2f64, 2 },
3017 { ISD::FMAXNUM, MVT::v4f64, 2 },
3018 { ISD::FMAXNUM, MVT::v8f64, 2 },
3019 };
3020 static const CostTblEntry XOPCostTbl[] = {
3021 { ISD::BITREVERSE, MVT::v4i64, 4 },
3022 { ISD::BITREVERSE, MVT::v8i32, 4 },
3023 { ISD::BITREVERSE, MVT::v16i16, 4 },
3024 { ISD::BITREVERSE, MVT::v32i8, 4 },
3025 { ISD::BITREVERSE, MVT::v2i64, 1 },
3026 { ISD::BITREVERSE, MVT::v4i32, 1 },
3027 { ISD::BITREVERSE, MVT::v8i16, 1 },
3028 { ISD::BITREVERSE, MVT::v16i8, 1 },
3029 { ISD::BITREVERSE, MVT::i64, 3 },
3030 { ISD::BITREVERSE, MVT::i32, 3 },
3031 { ISD::BITREVERSE, MVT::i16, 3 },
3032 { ISD::BITREVERSE, MVT::i8, 3 }
3033 };
3034 static const CostTblEntry AVX2CostTbl[] = {
3035 { ISD::ABS, MVT::v4i64, 2 }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3036 { ISD::ABS, MVT::v8i32, 1 },
3037 { ISD::ABS, MVT::v16i16, 1 },
3038 { ISD::ABS, MVT::v32i8, 1 },
3039 { ISD::BITREVERSE, MVT::v2i64, 3 },
3040 { ISD::BITREVERSE, MVT::v4i64, 3 },
3041 { ISD::BITREVERSE, MVT::v4i32, 3 },
3042 { ISD::BITREVERSE, MVT::v8i32, 3 },
3043 { ISD::BITREVERSE, MVT::v8i16, 3 },
3044 { ISD::BITREVERSE, MVT::v16i16, 3 },
3045 { ISD::BITREVERSE, MVT::v16i8, 3 },
3046 { ISD::BITREVERSE, MVT::v32i8, 3 },
3047 { ISD::BSWAP, MVT::v4i64, 1 },
3048 { ISD::BSWAP, MVT::v8i32, 1 },
3049 { ISD::BSWAP, MVT::v16i16, 1 },
3050 { ISD::CTLZ, MVT::v2i64, 7 },
3051 { ISD::CTLZ, MVT::v4i64, 7 },
3052 { ISD::CTLZ, MVT::v4i32, 5 },
3053 { ISD::CTLZ, MVT::v8i32, 5 },
3054 { ISD::CTLZ, MVT::v8i16, 4 },
3055 { ISD::CTLZ, MVT::v16i16, 4 },
3056 { ISD::CTLZ, MVT::v16i8, 3 },
3057 { ISD::CTLZ, MVT::v32i8, 3 },
3058 { ISD::CTPOP, MVT::v2i64, 3 },
3059 { ISD::CTPOP, MVT::v4i64, 3 },
3060 { ISD::CTPOP, MVT::v4i32, 7 },
3061 { ISD::CTPOP, MVT::v8i32, 7 },
3062 { ISD::CTPOP, MVT::v8i16, 3 },
3063 { ISD::CTPOP, MVT::v16i16, 3 },
3064 { ISD::CTPOP, MVT::v16i8, 2 },
3065 { ISD::CTPOP, MVT::v32i8, 2 },
3066 { ISD::CTTZ, MVT::v2i64, 4 },
3067 { ISD::CTTZ, MVT::v4i64, 4 },
3068 { ISD::CTTZ, MVT::v4i32, 7 },
3069 { ISD::CTTZ, MVT::v8i32, 7 },
3070 { ISD::CTTZ, MVT::v8i16, 4 },
3071 { ISD::CTTZ, MVT::v16i16, 4 },
3072 { ISD::CTTZ, MVT::v16i8, 3 },
3073 { ISD::CTTZ, MVT::v32i8, 3 },
3074 { ISD::SADDSAT, MVT::v16i16, 1 },
3075 { ISD::SADDSAT, MVT::v32i8, 1 },
3076 { ISD::SMAX, MVT::v8i32, 1 },
3077 { ISD::SMAX, MVT::v16i16, 1 },
3078 { ISD::SMAX, MVT::v32i8, 1 },
3079 { ISD::SMIN, MVT::v8i32, 1 },
3080 { ISD::SMIN, MVT::v16i16, 1 },
3081 { ISD::SMIN, MVT::v32i8, 1 },
3082 { ISD::SSUBSAT, MVT::v16i16, 1 },
3083 { ISD::SSUBSAT, MVT::v32i8, 1 },
3084 { ISD::UADDSAT, MVT::v16i16, 1 },
3085 { ISD::UADDSAT, MVT::v32i8, 1 },
3086 { ISD::UADDSAT, MVT::v8i32, 3 }, // not + pminud + paddd
3087 { ISD::UMAX, MVT::v8i32, 1 },
3088 { ISD::UMAX, MVT::v16i16, 1 },
3089 { ISD::UMAX, MVT::v32i8, 1 },
3090 { ISD::UMIN, MVT::v8i32, 1 },
3091 { ISD::UMIN, MVT::v16i16, 1 },
3092 { ISD::UMIN, MVT::v32i8, 1 },
3093 { ISD::USUBSAT, MVT::v16i16, 1 },
3094 { ISD::USUBSAT, MVT::v32i8, 1 },
3095 { ISD::USUBSAT, MVT::v8i32, 2 }, // pmaxud + psubd
3096 { ISD::FMAXNUM, MVT::v8f32, 3 }, // MAXPS + CMPUNORDPS + BLENDVPS
3097 { ISD::FMAXNUM, MVT::v4f64, 3 }, // MAXPD + CMPUNORDPD + BLENDVPD
3098 { ISD::FSQRT, MVT::f32, 7 }, // Haswell from http://www.agner.org/
3099 { ISD::FSQRT, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
3100 { ISD::FSQRT, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
3101 { ISD::FSQRT, MVT::f64, 14 }, // Haswell from http://www.agner.org/
3102 { ISD::FSQRT, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
3103 { ISD::FSQRT, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
3104 };
3105 static const CostTblEntry AVX1CostTbl[] = {
3106 { ISD::ABS, MVT::v4i64, 5 }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3107 { ISD::ABS, MVT::v8i32, 3 },
3108 { ISD::ABS, MVT::v16i16, 3 },
3109 { ISD::ABS, MVT::v32i8, 3 },
3110 { ISD::BITREVERSE, MVT::v4i64, 12 }, // 2 x 128-bit Op + extract/insert
3111 { ISD::BITREVERSE, MVT::v8i32, 12 }, // 2 x 128-bit Op + extract/insert
3112 { ISD::BITREVERSE, MVT::v16i16, 12 }, // 2 x 128-bit Op + extract/insert
3113 { ISD::BITREVERSE, MVT::v32i8, 12 }, // 2 x 128-bit Op + extract/insert
3114 { ISD::BSWAP, MVT::v4i64, 4 },
3115 { ISD::BSWAP, MVT::v8i32, 4 },
3116 { ISD::BSWAP, MVT::v16i16, 4 },
3117 { ISD::CTLZ, MVT::v4i64, 48 }, // 2 x 128-bit Op + extract/insert
3118 { ISD::CTLZ, MVT::v8i32, 38 }, // 2 x 128-bit Op + extract/insert
3119 { ISD::CTLZ, MVT::v16i16, 30 }, // 2 x 128-bit Op + extract/insert
3120 { ISD::CTLZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
3121 { ISD::CTPOP, MVT::v4i64, 16 }, // 2 x 128-bit Op + extract/insert
3122 { ISD::CTPOP, MVT::v8i32, 24 }, // 2 x 128-bit Op + extract/insert
3123 { ISD::CTPOP, MVT::v16i16, 20 }, // 2 x 128-bit Op + extract/insert
3124 { ISD::CTPOP, MVT::v32i8, 14 }, // 2 x 128-bit Op + extract/insert
3125 { ISD::CTTZ, MVT::v4i64, 22 }, // 2 x 128-bit Op + extract/insert
3126 { ISD::CTTZ, MVT::v8i32, 30 }, // 2 x 128-bit Op + extract/insert
3127 { ISD::CTTZ, MVT::v16i16, 26 }, // 2 x 128-bit Op + extract/insert
3128 { ISD::CTTZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
3129 { ISD::SADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
3130 { ISD::SADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
3131 { ISD::SMAX, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
3132 { ISD::SMAX, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
3133 { ISD::SMAX, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
3134 { ISD::SMIN, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
3135 { ISD::SMIN, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
3136 { ISD::SMIN, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
3137 { ISD::SSUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
3138 { ISD::SSUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
3139 { ISD::UADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
3140 { ISD::UADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
3141 { ISD::UADDSAT, MVT::v8i32, 8 }, // 2 x 128-bit Op + extract/insert
3142 { ISD::UMAX, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
3143 { ISD::UMAX, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
3144 { ISD::UMAX, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
3145 { ISD::UMIN, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
3146 { ISD::UMIN, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
3147 { ISD::UMIN, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
3148 { ISD::USUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
3149 { ISD::USUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
3150 { ISD::USUBSAT, MVT::v8i32, 6 }, // 2 x 128-bit Op + extract/insert
3151 { ISD::FMAXNUM, MVT::f32, 3 }, // MAXSS + CMPUNORDSS + BLENDVPS
3152 { ISD::FMAXNUM, MVT::v4f32, 3 }, // MAXPS + CMPUNORDPS + BLENDVPS
3153 { ISD::FMAXNUM, MVT::v8f32, 5 }, // MAXPS + CMPUNORDPS + BLENDVPS + ?
3154 { ISD::FMAXNUM, MVT::f64, 3 }, // MAXSD + CMPUNORDSD + BLENDVPD
3155 { ISD::FMAXNUM, MVT::v2f64, 3 }, // MAXPD + CMPUNORDPD + BLENDVPD
3156 { ISD::FMAXNUM, MVT::v4f64, 5 }, // MAXPD + CMPUNORDPD + BLENDVPD + ?
3157 { ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/
3158 { ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
3159 { ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
3160 { ISD::FSQRT, MVT::f64, 21 }, // SNB from http://www.agner.org/
3161 { ISD::FSQRT, MVT::v2f64, 21 }, // SNB from http://www.agner.org/
3162 { ISD::FSQRT, MVT::v4f64, 43 }, // SNB from http://www.agner.org/
3163 };
3164 static const CostTblEntry GLMCostTbl[] = {
3165 { ISD::FSQRT, MVT::f32, 19 }, // sqrtss
3166 { ISD::FSQRT, MVT::v4f32, 37 }, // sqrtps
3167 { ISD::FSQRT, MVT::f64, 34 }, // sqrtsd
3168 { ISD::FSQRT, MVT::v2f64, 67 }, // sqrtpd
3169 };
3170 static const CostTblEntry SLMCostTbl[] = {
3171 { ISD::FSQRT, MVT::f32, 20 }, // sqrtss
3172 { ISD::FSQRT, MVT::v4f32, 40 }, // sqrtps
3173 { ISD::FSQRT, MVT::f64, 35 }, // sqrtsd
3174 { ISD::FSQRT, MVT::v2f64, 70 }, // sqrtpd
3175 };
3176 static const CostTblEntry SSE42CostTbl[] = {
3177 { ISD::USUBSAT, MVT::v4i32, 2 }, // pmaxud + psubd
3178 { ISD::UADDSAT, MVT::v4i32, 3 }, // not + pminud + paddd
3179 { ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/
3180 { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
3181 };
3182 static const CostTblEntry SSE41CostTbl[] = {
3183 { ISD::ABS, MVT::v2i64, 2 }, // BLENDVPD(X,PSUBQ(0,X),X)
3184 { ISD::SMAX, MVT::v4i32, 1 },
3185 { ISD::SMAX, MVT::v16i8, 1 },
3186 { ISD::SMIN, MVT::v4i32, 1 },
3187 { ISD::SMIN, MVT::v16i8, 1 },
3188 { ISD::UMAX, MVT::v4i32, 1 },
3189 { ISD::UMAX, MVT::v8i16, 1 },
3190 { ISD::UMIN, MVT::v4i32, 1 },
3191 { ISD::UMIN, MVT::v8i16, 1 },
3192 };
3193 static const CostTblEntry SSSE3CostTbl[] = {
3194 { ISD::ABS, MVT::v4i32, 1 },
3195 { ISD::ABS, MVT::v8i16, 1 },
3196 { ISD::ABS, MVT::v16i8, 1 },
3197 { ISD::BITREVERSE, MVT::v2i64, 5 },
3198 { ISD::BITREVERSE, MVT::v4i32, 5 },
3199 { ISD::BITREVERSE, MVT::v8i16, 5 },
3200 { ISD::BITREVERSE, MVT::v16i8, 5 },
3201 { ISD::BSWAP, MVT::v2i64, 1 },
3202 { ISD::BSWAP, MVT::v4i32, 1 },
3203 { ISD::BSWAP, MVT::v8i16, 1 },
3204 { ISD::CTLZ, MVT::v2i64, 23 },
3205 { ISD::CTLZ, MVT::v4i32, 18 },
3206 { ISD::CTLZ, MVT::v8i16, 14 },
3207 { ISD::CTLZ, MVT::v16i8, 9 },
3208 { ISD::CTPOP, MVT::v2i64, 7 },
3209 { ISD::CTPOP, MVT::v4i32, 11 },
3210 { ISD::CTPOP, MVT::v8i16, 9 },
3211 { ISD::CTPOP, MVT::v16i8, 6 },
3212 { ISD::CTTZ, MVT::v2i64, 10 },
3213 { ISD::CTTZ, MVT::v4i32, 14 },
3214 { ISD::CTTZ, MVT::v8i16, 12 },
3215 { ISD::CTTZ, MVT::v16i8, 9 }
3216 };
3217 static const CostTblEntry SSE2CostTbl[] = {
3218 { ISD::ABS, MVT::v2i64, 4 },
3219 { ISD::ABS, MVT::v4i32, 3 },
3220 { ISD::ABS, MVT::v8i16, 2 },
3221 { ISD::ABS, MVT::v16i8, 2 },
3222 { ISD::BITREVERSE, MVT::v2i64, 29 },
3223 { ISD::BITREVERSE, MVT::v4i32, 27 },
3224 { ISD::BITREVERSE, MVT::v8i16, 27 },
3225 { ISD::BITREVERSE, MVT::v16i8, 20 },
3226 { ISD::BSWAP, MVT::v2i64, 7 },
3227 { ISD::BSWAP, MVT::v4i32, 7 },
3228 { ISD::BSWAP, MVT::v8i16, 7 },
3229 { ISD::CTLZ, MVT::v2i64, 25 },
3230 { ISD::CTLZ, MVT::v4i32, 26 },
3231 { ISD::CTLZ, MVT::v8i16, 20 },
3232 { ISD::CTLZ, MVT::v16i8, 17 },
3233 { ISD::CTPOP, MVT::v2i64, 12 },
3234 { ISD::CTPOP, MVT::v4i32, 15 },
3235 { ISD::CTPOP, MVT::v8i16, 13 },
3236 { ISD::CTPOP, MVT::v16i8, 10 },
3237 { ISD::CTTZ, MVT::v2i64, 14 },
3238 { ISD::CTTZ, MVT::v4i32, 18 },
3239 { ISD::CTTZ, MVT::v8i16, 16 },
3240 { ISD::CTTZ, MVT::v16i8, 13 },
3241 { ISD::SADDSAT, MVT::v8i16, 1 },
3242 { ISD::SADDSAT, MVT::v16i8, 1 },
3243 { ISD::SMAX, MVT::v8i16, 1 },
3244 { ISD::SMIN, MVT::v8i16, 1 },
3245 { ISD::SSUBSAT, MVT::v8i16, 1 },
3246 { ISD::SSUBSAT, MVT::v16i8, 1 },
3247 { ISD::UADDSAT, MVT::v8i16, 1 },
3248 { ISD::UADDSAT, MVT::v16i8, 1 },
3249 { ISD::UMAX, MVT::v8i16, 2 },
3250 { ISD::UMAX, MVT::v16i8, 1 },
3251 { ISD::UMIN, MVT::v8i16, 2 },
3252 { ISD::UMIN, MVT::v16i8, 1 },
3253 { ISD::USUBSAT, MVT::v8i16, 1 },
3254 { ISD::USUBSAT, MVT::v16i8, 1 },
3255 { ISD::FMAXNUM, MVT::f64, 4 },
3256 { ISD::FMAXNUM, MVT::v2f64, 4 },
3257 { ISD::FSQRT, MVT::f64, 32 }, // Nehalem from http://www.agner.org/
3258 { ISD::FSQRT, MVT::v2f64, 32 }, // Nehalem from http://www.agner.org/
3259 };
3260 static const CostTblEntry SSE1CostTbl[] = {
3261 { ISD::FMAXNUM, MVT::f32, 4 },
3262 { ISD::FMAXNUM, MVT::v4f32, 4 },
3263 { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/
3264 { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
3265 };
3266 static const CostTblEntry BMI64CostTbl[] = { // 64-bit targets
3267 { ISD::CTTZ, MVT::i64, 1 },
3268 };
3269 static const CostTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
3270 { ISD::CTTZ, MVT::i32, 1 },
3271 { ISD::CTTZ, MVT::i16, 1 },
3272 { ISD::CTTZ, MVT::i8, 1 },
3273 };
3274 static const CostTblEntry LZCNT64CostTbl[] = { // 64-bit targets
3275 { ISD::CTLZ, MVT::i64, 1 },
3276 };
3277 static const CostTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
3278 { ISD::CTLZ, MVT::i32, 1 },
3279 { ISD::CTLZ, MVT::i16, 1 },
3280 { ISD::CTLZ, MVT::i8, 1 },
3281 };
3282 static const CostTblEntry POPCNT64CostTbl[] = { // 64-bit targets
3283 { ISD::CTPOP, MVT::i64, 1 },
3284 };
3285 static const CostTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
3286 { ISD::CTPOP, MVT::i32, 1 },
3287 { ISD::CTPOP, MVT::i16, 1 },
3288 { ISD::CTPOP, MVT::i8, 1 },
3289 };
3290 static const CostTblEntry X64CostTbl[] = { // 64-bit targets
3291 { ISD::ABS, MVT::i64, 2 }, // SUB+CMOV
3292 { ISD::BITREVERSE, MVT::i64, 14 },
3293 { ISD::BSWAP, MVT::i64, 1 },
3294 { ISD::CTLZ, MVT::i64, 4 }, // BSR+XOR or BSR+XOR+CMOV
3295 { ISD::CTTZ, MVT::i64, 3 }, // TEST+BSF+CMOV/BRANCH
3296 { ISD::CTPOP, MVT::i64, 10 },
3297 { ISD::SADDO, MVT::i64, 1 },
3298 { ISD::UADDO, MVT::i64, 1 },
3299 { ISD::UMULO, MVT::i64, 2 }, // mulq + seto
3300 };
3301 static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
3302 { ISD::ABS, MVT::i32, 2 }, // SUB+CMOV
3303 { ISD::ABS, MVT::i16, 2 }, // SUB+CMOV
3304 { ISD::BITREVERSE, MVT::i32, 14 },
3305 { ISD::BITREVERSE, MVT::i16, 14 },
3306 { ISD::BITREVERSE, MVT::i8, 11 },
3307 { ISD::BSWAP, MVT::i32, 1 },
3308 { ISD::BSWAP, MVT::i16, 1 }, // ROL
3309 { ISD::CTLZ, MVT::i32, 4 }, // BSR+XOR or BSR+XOR+CMOV
3310 { ISD::CTLZ, MVT::i16, 4 }, // BSR+XOR or BSR+XOR+CMOV
3311 { ISD::CTLZ, MVT::i8, 4 }, // BSR+XOR or BSR+XOR+CMOV
3312 { ISD::CTTZ, MVT::i32, 3 }, // TEST+BSF+CMOV/BRANCH
3313 { ISD::CTTZ, MVT::i16, 3 }, // TEST+BSF+CMOV/BRANCH
3314 { ISD::CTTZ, MVT::i8, 3 }, // TEST+BSF+CMOV/BRANCH
3315 { ISD::CTPOP, MVT::i32, 8 },
3316 { ISD::CTPOP, MVT::i16, 9 },
3317 { ISD::CTPOP, MVT::i8, 7 },
3318 { ISD::SADDO, MVT::i32, 1 },
3319 { ISD::SADDO, MVT::i16, 1 },
3320 { ISD::SADDO, MVT::i8, 1 },
3321 { ISD::UADDO, MVT::i32, 1 },
3322 { ISD::UADDO, MVT::i16, 1 },
3323 { ISD::UADDO, MVT::i8, 1 },
3324 { ISD::UMULO, MVT::i32, 2 }, // mul + seto
3325 { ISD::UMULO, MVT::i16, 2 },
3326 { ISD::UMULO, MVT::i8, 2 },
3327 };
3328
3329 Type *RetTy = ICA.getReturnType();
3330 Type *OpTy = RetTy;
3331 Intrinsic::ID IID = ICA.getID();
3332 unsigned ISD = ISD::DELETED_NODE;
3333 switch (IID) {
3334 default:
3335 break;
3336 case Intrinsic::abs:
3337 ISD = ISD::ABS;
3338 break;
3339 case Intrinsic::bitreverse:
3340 ISD = ISD::BITREVERSE;
3341 break;
3342 case Intrinsic::bswap:
3343 ISD = ISD::BSWAP;
3344 break;
3345 case Intrinsic::ctlz:
3346 ISD = ISD::CTLZ;
3347 break;
3348 case Intrinsic::ctpop:
3349 ISD = ISD::CTPOP;
3350 break;
3351 case Intrinsic::cttz:
3352 ISD = ISD::CTTZ;
3353 break;
3354 case Intrinsic::maxnum:
3355 case Intrinsic::minnum:
3356 // FMINNUM has same costs so don't duplicate.
3357 ISD = ISD::FMAXNUM;
3358 break;
3359 case Intrinsic::sadd_sat:
3360 ISD = ISD::SADDSAT;
3361 break;
3362 case Intrinsic::smax:
3363 ISD = ISD::SMAX;
3364 break;
3365 case Intrinsic::smin:
3366 ISD = ISD::SMIN;
3367 break;
3368 case Intrinsic::ssub_sat:
3369 ISD = ISD::SSUBSAT;
3370 break;
3371 case Intrinsic::uadd_sat:
3372 ISD = ISD::UADDSAT;
3373 break;
3374 case Intrinsic::umax:
3375 ISD = ISD::UMAX;
3376 break;
3377 case Intrinsic::umin:
3378 ISD = ISD::UMIN;
3379 break;
3380 case Intrinsic::usub_sat:
3381 ISD = ISD::USUBSAT;
3382 break;
3383 case Intrinsic::sqrt:
3384 ISD = ISD::FSQRT;
3385 break;
3386 case Intrinsic::sadd_with_overflow:
3387 case Intrinsic::ssub_with_overflow:
3388 // SSUBO has same costs so don't duplicate.
3389 ISD = ISD::SADDO;
3390 OpTy = RetTy->getContainedType(0);
3391 break;
3392 case Intrinsic::uadd_with_overflow:
3393 case Intrinsic::usub_with_overflow:
3394 // USUBO has same costs so don't duplicate.
3395 ISD = ISD::UADDO;
3396 OpTy = RetTy->getContainedType(0);
3397 break;
3398 case Intrinsic::umul_with_overflow:
3399 case Intrinsic::smul_with_overflow:
3400 // SMULO has same costs so don't duplicate.
3401 ISD = ISD::UMULO;
3402 OpTy = RetTy->getContainedType(0);
3403 break;
3404 }
3405
3406 if (ISD != ISD::DELETED_NODE) {
3407 // Legalize the type.
3408 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(OpTy);
3409 MVT MTy = LT.second;
3410
3411 // Attempt to lookup cost.
3412 if (ISD == ISD::BITREVERSE && ST->hasGFNI() && ST->hasSSSE3() &&
3413 MTy.isVector()) {
3414 // With PSHUFB the code is very similar for all types. If we have integer
3415 // byte operations, we just need a GF2P8AFFINEQB for vXi8. For other types
3416 // we also need a PSHUFB.
3417 unsigned Cost = MTy.getVectorElementType() == MVT::i8 ? 1 : 2;
3418
3419 // Without byte operations, we need twice as many GF2P8AFFINEQB and PSHUFB
3420 // instructions. We also need an extract and an insert.
3421 if (!(MTy.is128BitVector() || (ST->hasAVX2() && MTy.is256BitVector()) ||
3422 (ST->hasBWI() && MTy.is512BitVector())))
3423 Cost = Cost * 2 + 2;
3424
3425 return LT.first * Cost;
3426 }
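 // e.g. with GFNI a vXi8 bitreverse of a natively supported width is a single
 // GF2P8AFFINEQB (cost 1); other element widths add a PSHUFB (cost 2), and
 // wider-than-supported vectors double that and add an extract + insert.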
3427
3428 auto adjustTableCost = [](const CostTblEntry &Entry,
3429 InstructionCost LegalizationCost,
3430 FastMathFlags FMF) {
3431 // If there are no NANs to deal with, then these are reduced to a
3432 // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we
3433 // assume is used in the non-fast case.
3434 if (Entry.ISD == ISD::FMAXNUM || Entry.ISD == ISD::FMINNUM) {
3435 if (FMF.noNaNs())
3436 return LegalizationCost * 1;
3437 }
3438 return LegalizationCost * (int)Entry.Cost;
3439 };
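 // With no-NaNs fast-math flags a maxnum/minnum therefore collapses to the
 // bare legalization factor (a single MAXP*/MINP*), ignoring the table cost.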
3440
3441 if (ST->useGLMDivSqrtCosts())
3442 if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
3443 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3444
3445 if (ST->useSLMArithCosts())
3446 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
3447 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3448
3449 if (ST->hasBITALG())
3450 if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy))
3451 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3452
3453 if (ST->hasVPOPCNTDQ())
3454 if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy))
3455 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3456
3457 if (ST->hasCDI())
3458 if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
3459 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3460
3461 if (ST->hasBWI())
3462 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
3463 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3464
3465 if (ST->hasAVX512())
3466 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
3467 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3468
3469 if (ST->hasXOP())
3470 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
3471 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3472
3473 if (ST->hasAVX2())
3474 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
3475 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3476
3477 if (ST->hasAVX())
3478 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
3479 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3480
3481 if (ST->hasSSE42())
3482 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
3483 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3484
3485 if (ST->hasSSE41())
3486 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
3487 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3488
3489 if (ST->hasSSSE3())
3490 if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
3491 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3492
3493 if (ST->hasSSE2())
3494 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
3495 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3496
3497 if (ST->hasSSE1())
3498 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
3499 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3500
3501 if (ST->hasBMI()) {
3502 if (ST->is64Bit())
3503 if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
3504 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3505
3506 if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
3507 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3508 }
3509
3510 if (ST->hasLZCNT()) {
3511 if (ST->is64Bit())
3512 if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
3513 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3514
3515 if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
3516 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3517 }
3518
3519 if (ST->hasPOPCNT()) {
3520 if (ST->is64Bit())
3521 if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
3522 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3523
3524 if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
3525 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3526 }
3527
3528 if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) {
3529 if (const Instruction *II = ICA.getInst()) {
3530 if (II->hasOneUse() && isa<StoreInst>(II->user_back()))
3531 return TTI::TCC_Free;
3532 if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) {
3533 if (LI->hasOneUse())
3534 return TTI::TCC_Free;
3535 }
3536 }
3537 }
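 // i.e. a bswap that feeds a single store, or whose operand is a single-use
 // load, folds into MOVBE on fast-MOVBE targets and is treated as free.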
3538
3539 if (ST->is64Bit())
3540 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
3541 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3542
3543 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
3544 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3545 }
3546
3547 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
3548}
3549
3550InstructionCost
3551X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
3552 TTI::TargetCostKind CostKind) {
3553 if (ICA.isTypeBasedOnly())
3554 return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
3555
3556 static const CostTblEntry AVX512BWCostTbl[] = {
3557 { ISD::ROTL, MVT::v32i16, 2 },
3558 { ISD::ROTL, MVT::v16i16, 2 },
3559 { ISD::ROTL, MVT::v8i16, 2 },
3560 { ISD::ROTL, MVT::v64i8, 5 },
3561 { ISD::ROTL, MVT::v32i8, 5 },
3562 { ISD::ROTL, MVT::v16i8, 5 },
3563 { ISD::ROTR, MVT::v32i16, 2 },
3564 { ISD::ROTR, MVT::v16i16, 2 },
3565 { ISD::ROTR, MVT::v8i16, 2 },
3566 { ISD::ROTR, MVT::v64i8, 5 },
3567 { ISD::ROTR, MVT::v32i8, 5 },
3568 { ISD::ROTR, MVT::v16i8, 5 }
3569 };
3570 static const CostTblEntry AVX512CostTbl[] = {
3571 { ISD::ROTL, MVT::v8i64, 1 },
3572 { ISD::ROTL, MVT::v4i64, 1 },
3573 { ISD::ROTL, MVT::v2i64, 1 },
3574 { ISD::ROTL, MVT::v16i32, 1 },
3575 { ISD::ROTL, MVT::v8i32, 1 },
3576 { ISD::ROTL, MVT::v4i32, 1 },
3577 { ISD::ROTR, MVT::v8i64, 1 },
3578 { ISD::ROTR, MVT::v4i64, 1 },
3579 { ISD::ROTR, MVT::v2i64, 1 },
3580 { ISD::ROTR, MVT::v16i32, 1 },
3581 { ISD::ROTR, MVT::v8i32, 1 },
3582 { ISD::ROTR, MVT::v4i32, 1 }
3583 };
3584 // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
3585 static const CostTblEntry XOPCostTbl[] = {
3586 { ISD::ROTL, MVT::v4i64, 4 },
3587 { ISD::ROTL, MVT::v8i32, 4 },
3588 { ISD::ROTL, MVT::v16i16, 4 },
3589 { ISD::ROTL, MVT::v32i8, 4 },
3590 { ISD::ROTL, MVT::v2i64, 1 },
3591 { ISD::ROTL, MVT::v4i32, 1 },
3592 { ISD::ROTL, MVT::v8i16, 1 },
3593 { ISD::ROTL, MVT::v16i8, 1 },
3594 { ISD::ROTR, MVT::v4i64, 6 },
3595 { ISD::ROTR, MVT::v8i32, 6 },
3596 { ISD::ROTR, MVT::v16i16, 6 },
3597 { ISD::ROTR, MVT::v32i8, 6 },
3598 { ISD::ROTR, MVT::v2i64, 2 },
3599 { ISD::ROTR, MVT::v4i32, 2 },
3600 { ISD::ROTR, MVT::v8i16, 2 },
3601 { ISD::ROTR, MVT::v16i8, 2 }
3602 };
3603 static const CostTblEntry X64CostTbl[] = { // 64-bit targets
3604 { ISD::ROTL, MVT::i64, 1 },
3605 { ISD::ROTR, MVT::i64, 1 },
3606 { ISD::FSHL, MVT::i64, 4 }
3607 };
3608 static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
3609 { ISD::ROTL, MVT::i32, 1 },
3610 { ISD::ROTL, MVT::i16, 1 },
3611 { ISD::ROTL, MVT::i8, 1 },
3612 { ISD::ROTR, MVT::i32, 1 },
3613 { ISD::ROTR, MVT::i16, 1 },
3614 { ISD::ROTR, MVT::i8, 1 },
3615 { ISD::FSHL, MVT::i32, 4 },
3616 { ISD::FSHL, MVT::i16, 4 },
3617 { ISD::FSHL, MVT::i8, 4 }
3618 };
3619
3620 Intrinsic::ID IID = ICA.getID();
3621 Type *RetTy = ICA.getReturnType();
3622 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
3623 unsigned ISD = ISD::DELETED_NODE;
3624 switch (IID) {
3625 default:
3626 break;
3627 case Intrinsic::fshl:
3628 ISD = ISD::FSHL;
3629 if (Args[0] == Args[1])
3630 ISD = ISD::ROTL;
3631 break;
3632 case Intrinsic::fshr:
3633 // FSHR has same costs so don't duplicate.
3634 ISD = ISD::FSHL;
3635 if (Args[0] == Args[1])
3636 ISD = ISD::ROTR;
3637 break;
3638 }
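 // A funnel shift with both inputs equal is a rotate, so it is costed as
 // ROTL/ROTR rather than FSHL.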
3639
3640 if (ISD != ISD::DELETED_NODE) {
3641 // Legalize the type.
3642 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
3643 MVT MTy = LT.second;
3644
3645 // Attempt to lookup cost.
3646 if (ST->hasBWI())
3647 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
3648 return LT.first * Entry->Cost;
3649
3650 if (ST->hasAVX512())
3651 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
3652 return LT.first * Entry->Cost;
3653
3654 if (ST->hasXOP())
3655 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
3656 return LT.first * Entry->Cost;
3657
3658 if (ST->is64Bit())
3659 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
3660 return LT.first * Entry->Cost;
3661
3662 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
3663 return LT.first * Entry->Cost;
3664 }
3665
3666 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
3667}
3668
3669InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
3670 unsigned Index) {
3671 static const CostTblEntry SLMCostTbl[] = {
3672 { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 },
3673 { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 },
3674 { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 },
3675 { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 7 }
3676 };
3677
3678 assert(Val->isVectorTy() && "This must be a vector type")(static_cast <bool> (Val->isVectorTy() && "This must be a vector type"
) ? void (0) : __assert_fail ("Val->isVectorTy() && \"This must be a vector type\""
, "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 3678, __extension__
__PRETTY_FUNCTION__))
;
16
'?' condition is true
3679 Type *ScalarType = Val->getScalarType();
3680 InstructionCost RegisterFileMoveCost = 0;
3681
3682 // Non-immediate extraction/insertion can be handled as a sequence of
3683 // aliased loads+stores via the stack.
3684 if (Index == -1U && (Opcode == Instruction::ExtractElement ||
3685 Opcode == Instruction::InsertElement)) {
3686 // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns:
3687 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
3688
3689 // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling.
3690 assert(isa<FixedVectorType>(Val) && "Fixed vector type expected")(static_cast <bool> (isa<FixedVectorType>(Val) &&
"Fixed vector type expected") ? void (0) : __assert_fail ("isa<FixedVectorType>(Val) && \"Fixed vector type expected\""
, "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 3690, __extension__
__PRETTY_FUNCTION__))
;
3691 Align VecAlign = DL.getPrefTypeAlign(Val);
3692 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
3693
3694 // Extract - store vector to stack, load scalar.
3695 if (Opcode == Instruction::ExtractElement) {
3696 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0,
3697 TTI::TargetCostKind::TCK_RecipThroughput) +
3698 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
3699 TTI::TargetCostKind::TCK_RecipThroughput);
3700 }
3701 // Insert - store vector to stack, store scalar, load vector.
3702 if (Opcode == Instruction::InsertElement) {
3703 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0,
3704 TTI::TargetCostKind::TCK_RecipThroughput) +
3705 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
3706 TTI::TargetCostKind::TCK_RecipThroughput) +
3707 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0,
3708 TTI::TargetCostKind::TCK_RecipThroughput);
3709 }
3710 }
3711
3712 if (Index != -1U && (Opcode == Instruction::ExtractElement ||
       16.1: 'Opcode' is equal to ExtractElement
3713 Opcode == Instruction::InsertElement)) {
3714 // Extractions of vXi1 elements are now efficiently handled by MOVMSK.
3715 if (Opcode == Instruction::ExtractElement &&
       16.2: 'Opcode' is equal to ExtractElement
3716 ScalarType->getScalarSizeInBits() == 1 &&
       17: Assuming the condition is false
3717 cast<FixedVectorType>(Val)->getNumElements() > 1)
3718 return 1;
3719
3720 // Legalize the type.
3721 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
3722
3723 // This type is legalized to a scalar type.
3724 if (!LT.second.isVector())
       18: Taking false branch
3725 return 0;
3726
3727 // The type may be split. Normalize the index to the new type.
3728 unsigned SizeInBits = LT.second.getSizeInBits();
3729 unsigned NumElts = LT.second.getVectorNumElements();
3730 unsigned SubNumElts = NumElts;
3731 Index = Index % NumElts;
3732
3733 // For >128-bit vectors, we need to extract higher 128-bit subvectors.
3734 // For inserts, we also need to insert the subvector back.
3735 if (SizeInBits > 128) {
       19: Assuming 'SizeInBits' is > 128
3736 assert((SizeInBits % 128) == 0 && "Illegal vector");
       20: Taking true branch
       21: Assuming the condition is true
       22: '?' condition is true
3737 unsigned NumSubVecs = SizeInBits / 128;
3738 SubNumElts = NumElts / NumSubVecs;
       23: Value assigned to 'SubNumElts'
3739 if (SubNumElts <= Index) {
       24: Assuming 'SubNumElts' is <= 'Index'
       25: Taking true branch
3740 RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
       25.1: 'Opcode' is not equal to InsertElement
       26: '?' condition is false
3741 Index %= SubNumElts;
       27: Division by zero
3742 }
3743 }
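 // The reported division by zero requires SubNumElts == 0, i.e. NumElts <
 // NumSubVecs (fewer legal elements than 128-bit subvectors). Every legalized
 // X86 vector type has at least one element per 128-bit chunk, so the code
 // implicitly relies on NumElts >= SizeInBits / 128, an invariant the analyzer
 // cannot prove from this function alone.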
3744
3745 if (Index == 0) {
3746 // Floating point scalars are already located in index #0.
3747 // Many insertions to #0 can fold away for scalar fp-ops, so let's assume
3748 // true for all.
3749 if (ScalarType->isFloatingPointTy())
3750 return RegisterFileMoveCost;
3751
3752 // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
3753 if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
3754 return 1 + RegisterFileMoveCost;
3755 }
3756
3757 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3758 assert(ISD && "Unexpected vector opcode");
3759 MVT MScalarTy = LT.second.getScalarType();
3760 if (ST->useSLMArithCosts())
3761 if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
3762 return Entry->Cost + RegisterFileMoveCost;
3763
3764 // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
3765 if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
3766 (MScalarTy.isInteger() && ST->hasSSE41()))
3767 return 1 + RegisterFileMoveCost;
3768
3769 // Assume insertps is relatively cheap on all targets.
3770 if (MScalarTy == MVT::f32 && ST->hasSSE41() &&
3771 Opcode == Instruction::InsertElement)
3772 return 1 + RegisterFileMoveCost;
3773
3774 // For extractions we just need to shuffle the element to index 0, which
3775 // should be very cheap (assume cost = 1). For insertions we need to shuffle
3776 // the elements to its destination. In both cases we must handle the
3777 // subvector move(s).
3778 // If the vector type is already less than 128-bits then don't reduce it.
3779 // TODO: Under what circumstances should we shuffle using the full width?
3780 InstructionCost ShuffleCost = 1;
3781 if (Opcode == Instruction::InsertElement) {
3782 auto *SubTy = cast<VectorType>(Val);
3783 EVT VT = TLI->getValueType(DL, Val);
3784 if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
3785 SubTy = FixedVectorType::get(ScalarType, SubNumElts);
3786 ShuffleCost =
3787 getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, None, 0, SubTy);
3788 }
3789 int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
3790 return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
3791 }
3792
3793 // Add to the base cost if we know that the extracted element of a vector is
3794 // destined to be moved to and used in the integer register file.
3795 if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy())
3796 RegisterFileMoveCost += 1;
3797
3798 return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
3799}
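For a concrete feel of the index normalization above, the sketch below traces lines 3727-3741 and the cheap pextr path at lines 3764-3767 for an extract of element 13 from a v16i32, assuming (not verified here) that the type legalizes to two v8i32 halves and that SSE4.1 is available; the estimate comes out as 1 subvector move plus 1 for the extract itself.

// Standalone trace of the extract cost path with assumed legalization values.
#include <cstdio>

int main() {
  unsigned Index = 13;
  unsigned SizeInBits = 256, NumElts = 8;       // assumed LT.second = v8i32
  unsigned RegisterFileMoveCost = 0;

  Index %= NumElts;                             // line 3731: 13 -> 5
  if (SizeInBits > 128) {
    unsigned NumSubVecs = SizeInBits / 128;     // 2
    unsigned SubNumElts = NumElts / NumSubVecs; // 4
    if (SubNumElts <= Index) {
      RegisterFileMoveCost += 1;                // extract the upper 128-bit lane
      Index %= SubNumElts;                      // 5 -> 1
    }
  }
  // i32 with SSE4.1 assumed: pextrd-style extract costs 1 (lines 3765-3767).
  std::printf("estimated extract cost: %u (index %u in the low lane)\n",
              1 + RegisterFileMoveCost, Index); // prints 2 (index 1 ...)
  return 0;
}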
3800
3801InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
3802 const APInt &DemandedElts,
3803 bool Insert,
3804 bool Extract) {
3805 assert(DemandedElts.getBitWidth() ==
3806 cast<FixedVectorType>(Ty)->getNumElements() &&
3807 "Vector size mismatch");
3808
3809 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
3810 MVT MScalarTy = LT.second.getScalarType();
3811 unsigned SizeInBits = LT.second.getSizeInBits();
3812
3813 InstructionCost Cost = 0;
3814
3815 // For insertions, a ISD::BUILD_VECTOR style vector initialization can be much
3816 // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT.
3817 if (Insert) {
3818 if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
3819 (MScalarTy.isInteger() && ST->hasSSE41()) ||
3820 (MScalarTy == MVT::f32 && ST->hasSSE41())) {
3821 // For types we can insert directly, insertion into 128-bit sub vectors is
3822 // cheap, followed by a cheap chain of concatenations.
3823 if (SizeInBits <= 128) {
3824 Cost +=
3825 BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, false);
3826 } else {
3827 // In each 128-lane, if at least one index is demanded but not all
3828 // indices are demanded and this 128-lane is not the first 128-lane of
3829 // the legalized-vector, then this 128-lane needs a extracti128; If in
3830 // each 128-lane, there is at least one demanded index, this 128-lane
3831 // needs a inserti128.
3832
3833 // The following cases will help you build a better understanding:
3834 // Assume we insert several elements into a v8i32 vector in avx2,
3835 // Case#1: inserting into 1th index needs vpinsrd + inserti128.
3836 // Case#2: inserting into 5th index needs extracti128 + vpinsrd +
3837 // inserti128.
3838 // Case#3: inserting into 4,5,6,7 index needs 4*vpinsrd + inserti128.
3839 const int CostValue = *LT.first.getValue();
3840 assert(CostValue >= 0 && "Negative cost!");
3841 unsigned Num128Lanes = SizeInBits / 128 * CostValue;
3842 unsigned NumElts = LT.second.getVectorNumElements() * CostValue;
3843 APInt WidenedDemandedElts = DemandedElts.zext(NumElts);
3844 unsigned Scale = NumElts / Num128Lanes;
3845 // We iterate each 128-lane, and check if we need a
3846 // extracti128/inserti128 for this 128-lane.
3847 for (unsigned I = 0; I < NumElts; I += Scale) {
3848 APInt Mask = WidenedDemandedElts.getBitsSet(NumElts, I, I + Scale);
3849 APInt MaskedDE = Mask & WidenedDemandedElts;
3850 unsigned Population = MaskedDE.countPopulation();
3851 Cost += (Population > 0 && Population != Scale &&
3852 I % LT.second.getVectorNumElements() != 0);
3853 Cost += Population > 0;
3854 }
3855 Cost += DemandedElts.countPopulation();
3856
3857 // For vXf32 cases, insertion into the 0'th index in each v4f32
3858 // 128-bit vector is free.
3859 // NOTE: This assumes legalization widens vXf32 vectors.
3860 if (MScalarTy == MVT::f32)
3861 for (unsigned i = 0, e = cast<FixedVectorType>(Ty)->getNumElements();
3862 i < e; i += 4)
3863 if (DemandedElts[i])
3864 Cost--;
3865 }
3866 } else if (LT.second.isVector()) {
3867 // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
3868 // integer element as a SCALAR_TO_VECTOR, then we build the vector as a
3869 // series of UNPCK followed by CONCAT_VECTORS - all of these can be
3870 // considered cheap.
3871 if (Ty->isIntOrIntVectorTy())
3872 Cost += DemandedElts.countPopulation();
3873
3874 // Get the smaller of the legalized or original pow2-extended number of
3875 // vector elements, which represents the number of unpacks we'll end up
3876 // performing.
3877 unsigned NumElts = LT.second.getVectorNumElements();
3878 unsigned Pow2Elts =
3879 PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements());
3880 Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
3881 }
3882 }
3883
3884 if (Extract) {
3885 // vXi1 can be efficiently extracted with MOVMSK.
3886 // TODO: AVX512 predicate mask handling.
3887 // NOTE: This doesn't work well for roundtrip scalarization.
3888 if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) {
3889 unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
3890 unsigned MaxElts = ST->hasAVX2() ? 32 : 16;
3891 unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts;
3892 return MOVMSKCost;
3893 }
3894
3895 if (LT.second.isVector()) {
3896 int CostValue = *LT.first.getValue();
3897 assert(CostValue >= 0 && "Negative cost!");
3898
3899 unsigned NumElts = LT.second.getVectorNumElements() * CostValue;
3900 assert(NumElts >= DemandedElts.getBitWidth() &&
3901 "Vector has been legalized to smaller element count");
3902
3903 // If we're extracting elements from a 128-bit subvector lane, we only need
3904 // to extract each lane once, not for every element.
3905 if (SizeInBits > 128) {
3906 assert((SizeInBits % 128) == 0 && "Illegal vector");
3907 unsigned NumLegal128Lanes = SizeInBits / 128;
3908 unsigned Num128Lanes = NumLegal128Lanes * CostValue;
3909 APInt WidenedDemandedElts = DemandedElts.zext(NumElts);
3910 unsigned Scale = NumElts / Num128Lanes;
3911
3912 // Add cost for each demanded 128-bit subvector extraction.
3913 // Luckily this is a lot easier than for insertion.
3914 APInt DemandedUpper128Lanes =
3915 APIntOps::ScaleBitMask(WidenedDemandedElts, Num128Lanes);
3916 auto *Ty128 = FixedVectorType::get(Ty->getElementType(), Scale);
3917 for (unsigned I = 0; I != Num128Lanes; ++I)
3918 if (DemandedUpper128Lanes[I])
3919 Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, None,
3920 I * Scale, Ty128);
3921
3922 // Add all the demanded element extractions together, but adjust the
3923 // index to use the equivalent of the bottom 128 bit lane.
3924 for (unsigned I = 0; I != NumElts; ++I)
3925 if (WidenedDemandedElts[I]) {
3926 unsigned Idx = I % Scale;
3927 Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, Idx);
3928 }
3929
3930 return Cost;
3931 }
3932 }
3933
3934 // Fallback to default extraction.
3935 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, false, Extract);
3936 }
3937
3938 return Cost;
3939}
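The Case#1/2/3 comment at lines 3833-3838 can be checked with the small standalone program below, which re-runs the per-128-bit-lane loop of lines 3847-3855 for a v8i32 insert on an assumed AVX2 legalization (NumElts = 8, Scale = 4, CostValue = 1); the three cases come out as 2, 3 and 5, matching the vpinsrd/extracti128/inserti128 counts in the comment.

// Recomputes the lane loop for assumed v8i32/AVX2 values; not a TTI query.
#include <cstdio>

static unsigned insertCost(unsigned DemandedMask) {
  const unsigned NumElts = 8, Scale = 4, LegalNumElts = 8; // assumed
  unsigned Cost = 0, TotalPop = 0;
  for (unsigned I = 0; I < NumElts; I += Scale) {
    unsigned Population = 0;
    for (unsigned J = I; J < I + Scale; ++J)
      Population += (DemandedMask >> J) & 1u;
    Cost += (Population > 0 && Population != Scale && I % LegalNumElts != 0); // extracti128
    Cost += Population > 0;                                                   // inserti128
  }
  for (unsigned J = 0; J < NumElts; ++J)
    TotalPop += (DemandedMask >> J) & 1u; // one vpinsrd per demanded element
  return Cost + TotalPop;
}

int main() {
  std::printf("Case#1 (elt 1):    %u\n", insertCost(0x02)); // 2: vpinsrd + inserti128
  std::printf("Case#2 (elt 5):    %u\n", insertCost(0x20)); // 3: extracti128 + vpinsrd + inserti128
  std::printf("Case#3 (elts 4-7): %u\n", insertCost(0xF0)); // 5: 4*vpinsrd + inserti128
  return 0;
}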
3940
3941InstructionCost
3942X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
3943 int VF, const APInt &DemandedDstElts,
3944 TTI::TargetCostKind CostKind) {
3945 const unsigned EltTyBits = DL.getTypeSizeInBits(EltTy);
3946 // We don't differentiate element types here, only element bit width.
3947 EltTy = IntegerType::getIntNTy(EltTy->getContext(), EltTyBits);
3948
3949 auto bailout = [&]() {
3950 return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF,
3951 DemandedDstElts, CostKind);
3952 };
3953
3954 // For now, only deal with AVX512 cases.
3955 if (!ST->hasAVX512())
3956 return bailout();
3957
3958 // Do we have a native shuffle for this element type, or should we promote?
3959 unsigned PromEltTyBits = EltTyBits;
3960 switch (EltTyBits) {
3961 case 32:
3962 case 64:
3963 break; // AVX512F.
3964 case 16:
3965 if (!ST->hasBWI())
3966 PromEltTyBits = 32; // promote to i32, AVX512F.
3967 break; // AVX512BW
3968 case 8:
3969 if (!ST->hasVBMI())
3970 PromEltTyBits = 32; // promote to i32, AVX512F.
3971 break; // AVX512VBMI
3972 case 1:
3973 // There is no support for shuffling i1 elements. We *must* promote.
3974 if (ST->hasBWI()) {
3975 if (ST->hasVBMI())
3976 PromEltTyBits = 8; // promote to i8, AVX512VBMI.
3977 else
3978 PromEltTyBits = 16; // promote to i16, AVX512BW.
3979 break;
3980 }
3981 if (ST->hasDQI()) {
3982 PromEltTyBits = 32; // promote to i32, AVX512F.
3983 break;
3984 }
3985 return bailout();
3986 default:
3987 return bailout();
3988 }
3989 auto *PromEltTy = IntegerType::getIntNTy(EltTy->getContext(), PromEltTyBits);
3990
3991 auto *SrcVecTy = FixedVectorType::get(EltTy, VF);
3992 auto *PromSrcVecTy = FixedVectorType::get(PromEltTy, VF);
3993
3994 int NumDstElements = VF * ReplicationFactor;
3995 auto *PromDstVecTy = FixedVectorType::get(PromEltTy, NumDstElements);
3996 auto *DstVecTy = FixedVectorType::get(EltTy, NumDstElements);
3997
3998 // Legalize the types.
3999 MVT LegalSrcVecTy = getTypeLegalizationCost(SrcVecTy).second;
4000 MVT LegalPromSrcVecTy = getTypeLegalizationCost(PromSrcVecTy).second;
4001 MVT LegalPromDstVecTy = getTypeLegalizationCost(PromDstVecTy).second;
4002 MVT LegalDstVecTy = getTypeLegalizationCost(DstVecTy).second;
4003 // They should have legalized into vector types.
4004 if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() ||
4005 !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector())
4006 return bailout();
4007
4008 if (PromEltTyBits != EltTyBits) {
4009 // If we have to perform the shuffle with wider elt type than our data type,
4010 // then we will first need to anyext (we don't care about the new bits)
4011 // the source elements, and then truncate Dst elements.
4012 InstructionCost PromotionCost;
4013 PromotionCost += getCastInstrCost(
4014 Instruction::SExt, /*Dst=*/PromSrcVecTy, /*Src=*/SrcVecTy,
4015 TargetTransformInfo::CastContextHint::None, CostKind);
4016 PromotionCost +=
4017 getCastInstrCost(Instruction::Trunc, /*Dst=*/DstVecTy,
4018 /*Src=*/PromDstVecTy,
4019 TargetTransformInfo::CastContextHint::None, CostKind);
4020 return PromotionCost + getReplicationShuffleCost(PromEltTy,
4021 ReplicationFactor, VF,
4022 DemandedDstElts, CostKind);
4023 }
4024
4025 assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits &&
4026 LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() &&
4027 "We expect that the legalization doesn't affect the element width, "
4028 "doesn't coalesce/split elements.");
4029
4030 unsigned NumEltsPerDstVec = LegalDstVecTy.getVectorNumElements();
4031 unsigned NumDstVectors =
4032 divideCeil(DstVecTy->getNumElements(), NumEltsPerDstVec);
4033
4034 auto *SingleDstVecTy = FixedVectorType::get(EltTy, NumEltsPerDstVec);
4035
4036 // Not all the produced Dst elements may be demanded. In our case,
4037 // given that a single Dst vector is formed by a single shuffle,
4038 // if all elements that will form a single Dst vector aren't demanded,
4039 // then we won't need to do that shuffle, so adjust the cost accordingly.
4040 APInt DemandedDstVectors = APIntOps::ScaleBitMask(
4041 DemandedDstElts.zext(NumDstVectors * NumEltsPerDstVec), NumDstVectors);
4042 unsigned NumDstVectorsDemanded = DemandedDstVectors.countPopulation();
4043
4044 InstructionCost SingleShuffleCost =
4045 getShuffleCost(TTI::SK_PermuteSingleSrc, SingleDstVecTy,
4046 /*Mask=*/None, /*Index=*/0, /*SubTp=*/nullptr);
4047 return NumDstVectorsDemanded * SingleShuffleCost;
4048}
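The demanded-destination-vector adjustment at lines 4036-4047 can be illustrated with the sketch below; VF = 4, ReplicationFactor = 3, a 4-element legalized destination vector and a unit shuffle cost are all assumed values, not queries of the real cost model.

// Sketch of scaling the demanded-destination-element mask to whole vectors.
#include <cstdio>

int main() {
  const unsigned NumDstElements = 12, NumEltsPerDstVec = 4;        // assumed
  const unsigned NumDstVectors =
      (NumDstElements + NumEltsPerDstVec - 1) / NumEltsPerDstVec;  // 3
  unsigned DemandedDstElts = 0b111000000111; // elements 0-2 and 9-11 demanded

  unsigned NumDstVectorsDemanded = 0;
  for (unsigned V = 0; V < NumDstVectors; ++V) {
    unsigned LaneMask = ((1u << NumEltsPerDstVec) - 1) << (V * NumEltsPerDstVec);
    // Mirrors ScaleBitMask + countPopulation: a vector counts if any of its
    // elements is demanded.
    NumDstVectorsDemanded += (DemandedDstElts & LaneMask) != 0;
  }
  const unsigned SingleShuffleCost = 1; // assumed SK_PermuteSingleSrc cost
  std::printf("replication shuffle cost: %u\n",
              NumDstVectorsDemanded * SingleShuffleCost); // prints 2
  return 0;
}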
4049
4050InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
4051 MaybeAlign Alignment,
4052 unsigned AddressSpace,
4053 TTI::TargetCostKind CostKind,
4054 TTI::OperandValueKind OpdInfo,
4055 const Instruction *I) {
4056 // TODO: Handle other cost kinds.
4057 if (CostKind != TTI::TCK_RecipThroughput) {
4058 if (auto *SI = dyn_cast_or_null<StoreInst>(I)) {
4059 // Store instruction with index and scale costs 2 Uops.
4060 // Check the preceding GEP to identify non-const indices.
4061 if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) {
4062 if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
4063 return TTI::TCC_Basic * 2;
4064 }
4065 }
4066 return TTI::TCC_Basic;
4067 }
4068
4069 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
4070 "Invalid Opcode");
4071 // Type legalization can't handle structs
4072 if (TLI->getValueType(DL, Src, true) == MVT::Other)
4073 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
4074 CostKind);
4075
4076 // Legalize the type.
4077 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
4078
4079 auto *VTy = dyn_cast<FixedVectorType>(Src);
4080
4081 InstructionCost Cost = 0;
4082
4083 // Add a cost for constant load to vector.
4084 if (Opcode == Instruction::Store &&
4085 (OpdInfo == TTI::OK_UniformConstantValue ||
4086 OpdInfo == TTI::OK_NonUniformConstantValue))
4087 Cost += getMemoryOpCost(Instruction::Load, Src, DL.getABITypeAlign(Src),
4088 /*AddressSpace=*/0, CostKind);
4089
4090 // Handle the simple case of non-vectors.
4091 // NOTE: this assumes that legalization never creates vector from scalars!
4092 if (!VTy || !LT.second.isVector()) {
4093 // Each load/store unit costs 1.
4094 return (LT.second.isFloatingPoint() ? Cost : 0) + LT.first * 1;
4095 }
4096
4097 bool IsLoad = Opcode == Instruction::Load;
4098
4099 Type *EltTy = VTy->getElementType();
4100
4101 const int EltTyBits = DL.getTypeSizeInBits(EltTy);
4102
4103 // Source of truth: how many elements were there in the original IR vector?
4104 const unsigned SrcNumElt = VTy->getNumElements();
4105
4106 // How far have we gotten?
4107 int NumEltRemaining = SrcNumElt;
4108 // Note that we intentionally capture by-reference, NumEltRemaining changes.
4109 auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; };
4110
4111 const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8);
4112
4113 // Note that even if we can store 64 bits of an XMM, we still operate on XMM.
4114 const unsigned XMMBits = 128;
4115 if (XMMBits % EltTyBits != 0)
4116 // Vector size must be a multiple of the element size. I.e. no padding.
4117 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
4118 CostKind);
4119 const int NumEltPerXMM = XMMBits / EltTyBits;
4120
4121 auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM);
4122
4123 for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0;
4124 NumEltRemaining > 0; CurrOpSizeBytes /= 2) {
4125 // How many elements would a single op deal with at once?
4126 if ((8 * CurrOpSizeBytes) % EltTyBits != 0)
4127 // Vector size must be a multiple of the element size. I.e. no padding.
4128 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
4129 CostKind);
4130 int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits;
4131
4132 assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?");
4133 assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||
4134 (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&
4135 "Unless we haven't halved the op size yet, "
4136 "we have less than two op's sized units of work left.");
4137
4138 auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM
4139 ? FixedVectorType::get(EltTy, CurrNumEltPerOp)
4140 : XMMVecTy;
4141
4142 assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&
4143 "After halving sizes, the vector elt count is no longer a multiple "
4144 "of number of elements per operation?");
4145 auto *CoalescedVecTy =
4146 CurrNumEltPerOp == 1
4147 ? CurrVecTy
4148 : FixedVectorType::get(
4149 IntegerType::get(Src->getContext(),
4150 EltTyBits * CurrNumEltPerOp),
4151 CurrVecTy->getNumElements() / CurrNumEltPerOp);
4152 assert(DL.getTypeSizeInBits(CoalescedVecTy) ==
4153 DL.getTypeSizeInBits(CurrVecTy) &&
4154 "coalesciing elements doesn't change vector width.");
4155
4156 while (NumEltRemaining > 0) {
4157 assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumtion?");
4158
4159 // Can we use this vector size, as per the remaining element count?
4160 // Iff the vector is naturally aligned, we can do a wide load regardless.
4161 if (NumEltRemaining < CurrNumEltPerOp &&
4162 (!IsLoad || Alignment.valueOrOne() < CurrOpSizeBytes) &&
4163 CurrOpSizeBytes != 1)
4164 break; // Try smalled vector size.
4165
4166 bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;
4167
4168 // If we have fully processed the previous reg, we need to replenish it.
4169 if (SubVecEltsLeft == 0) {
4170 SubVecEltsLeft += CurrVecTy->getNumElements();
4171 // And that's free only for the 0'th subvector of a legalized vector.
4172 if (!Is0thSubVec)
4173 Cost += getShuffleCost(IsLoad ? TTI::ShuffleKind::SK_InsertSubvector
4174 : TTI::ShuffleKind::SK_ExtractSubvector,
4175 VTy, None, NumEltDone(), CurrVecTy);
4176 }
4177
4178 // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
4179 // for smaller widths (32/16/8) we have to insert/extract them separately.
4180 // Again, it's free for the 0'th subreg (if op is 32/64 bit wide,
4181 // but let's pretend that it is also true for 16/8 bit wide ops...)
4182 if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) {
4183 int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM;
4184 assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "");
4185 int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp;
4186 APInt DemandedElts =
4187 APInt::getBitsSet(CoalescedVecTy->getNumElements(),
4188 CoalescedVecEltIdx, CoalescedVecEltIdx + 1);
4189 assert(DemandedElts.countPopulation() == 1 && "Inserting single value");
4190 Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad,
4191 !IsLoad);
4192 }
4193
4194 // This isn't exactly right. We're using slow unaligned 32-byte accesses
4195 // as a proxy for a double-pumped AVX memory interface such as on
4196 // Sandybridge.
4197 if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
4198 Cost += 2;
4199 else
4200 Cost += 1;
4201
4202 SubVecEltsLeft -= CurrNumEltPerOp;
4203 NumEltRemaining -= CurrNumEltPerOp;
4204 Alignment = commonAlignment(Alignment.valueOrOne(), CurrOpSizeBytes);
4205 }
4206 }
4207
4208 assert(NumEltRemaining <= 0 && "Should have processed all the elements.");
4209
4210 return Cost;
4211}
4212
4213InstructionCost
4214X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment,
4215 unsigned AddressSpace,
4216 TTI::TargetCostKind CostKind) {
4217 bool IsLoad = (Instruction::Load == Opcode);
4218 bool IsStore = (Instruction::Store == Opcode);
4219
4220 auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy);
4221 if (!SrcVTy)
4222 // To calculate scalar take the regular cost, without mask
4223 return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind);
4224
4225 unsigned NumElem = SrcVTy->getNumElements();
4226 auto *MaskTy =
4227 FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
4228 if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) ||
4229 (IsStore && !isLegalMaskedStore(SrcVTy, Alignment))) {
4230 // Scalarization
4231 APInt DemandedElts = APInt::getAllOnes(NumElem);
4232 InstructionCost MaskSplitCost =
4233 getScalarizationOverhead(MaskTy, DemandedElts, false, true);
4234 InstructionCost ScalarCompareCost = getCmpSelInstrCost(
4235 Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
4236 CmpInst::BAD_ICMP_PREDICATE, CostKind);
4237 InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
4238 InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
4239 InstructionCost ValueSplitCost =
4240 getScalarizationOverhead(SrcVTy, DemandedElts, IsLoad, IsStore);
4241 InstructionCost MemopCost =
4242 NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
4243 Alignment, AddressSpace, CostKind);
4244 return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
4245 }
4246
4247 // Legalize the type.
4248 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcVTy);
4249 auto VT = TLI->getValueType(DL, SrcVTy);
4250 InstructionCost Cost = 0;
4251 if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
4252 LT.second.getVectorNumElements() == NumElem)
4253 // Promotion requires extend/truncate for data and a shuffle for mask.
4254 Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, None, 0, nullptr) +
4255 getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, None, 0, nullptr);
4256
4257 else if (LT.first * LT.second.getVectorNumElements() > NumElem) {
4258 auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
4259 LT.second.getVectorNumElements());
4260 // Expanding requires fill mask with zeroes
4261 Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, None, 0, MaskTy);
4262 }
4263
4264 // Pre-AVX512 - each maskmov load costs 2 + store costs ~8.
4265 if (!ST->hasAVX512())
4266 return Cost + LT.first * (IsLoad ? 2 : 8);
4267
4268 // AVX-512 masked load/store is cheaper
4269 return Cost + LT.first;
4270}
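When the masked load/store is not legal, the scalarization fallback at lines 4230-4244 sums a mask split, a per-element compare-and-branch, a value split and per-element scalar memory ops. The sketch below only re-assembles that sum with assumed placeholder component costs to show the shape of the formula; none of the numbers come from the real X86 cost tables.

// Rough composition of the scalarization fallback, with assumed inputs.
#include <cstdio>

int main() {
  unsigned NumElem = 8;            // e.g. an assumed v8i32 masked store
  unsigned MaskSplitCost = 8;      // assumed getScalarizationOverhead(MaskTy, ...)
  unsigned ScalarCompareCost = 1;  // assumed icmp on an i8 mask element
  unsigned BranchCost = 1;         // assumed conditional branch per element
  unsigned ValueSplitCost = 8;     // assumed getScalarizationOverhead(SrcVTy, ...)
  unsigned ScalarMemopCost = 1;    // assumed scalar load/store per element

  unsigned MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost); // line 4238
  unsigned MemopCost = NumElem * ScalarMemopCost;                    // lines 4241-4243
  unsigned Total = MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost; // line 4244
  std::printf("scalarized masked-op cost (assumed inputs): %u\n", Total); // 40
  return 0;
}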
4271
4272InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty,
4273 ScalarEvolution *SE,
4274 const SCEV *Ptr) {
4275 // Address computations in vectorized code with non-consecutive addresses will
4276 // likely result in more instructions compared to scalar code where the
4277 // computation can more often be merged into the index mode. The resulting
4278 // extra micro-ops can significantly decrease throughput.
4279 const unsigned NumVectorInstToHideOverhead = 10;
4280
4281 // Cost modeling of Strided Access Computation is hidden by the indexing
4282 // modes of X86 regardless of the stride value. We dont believe that there
4283 // is a difference between constant strided access in gerenal and constant
4284 // strided value which is less than or equal to 64.
4285 // Even in the case of (loop invariant) stride whose value is not known at
4286 // compile time, the address computation will not incur more than one extra
4287 // ADD instruction.
4288 if (Ty->isVectorTy() && SE && !ST->hasAVX2()) {
4289 // TODO: AVX2 is the current cut-off because we don't have correct
4290 // interleaving costs for prior ISA's.
4291 if (!BaseT::isStridedAccess(Ptr))
4292 return NumVectorInstToHideOverhead;
4293 if (!BaseT::getConstantStrideStep(SE, Ptr))
4294 return 1;
4295 }
4296
4297 return BaseT::getAddressComputationCost(Ty, SE, Ptr);
4298}
4299
4300InstructionCost
4301X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
4302 Optional<FastMathFlags> FMF,
4303 TTI::TargetCostKind CostKind) {
4304 if (TTI::requiresOrderedReduction(FMF))
4305 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
4306
4307 // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
4308 // and make it as the cost.
4309
4310 static const CostTblEntry SLMCostTblNoPairWise[] = {
4311 { ISD::FADD, MVT::v2f64, 3 },
4312 { ISD::ADD, MVT::v2i64, 5 },
4313 };
4314
4315 static const CostTblEntry SSE2CostTblNoPairWise[] = {
4316 { ISD::FADD, MVT::v2f64, 2 },
4317 { ISD::FADD, MVT::v2f32, 2 },
4318 { ISD::FADD, MVT::v4f32, 4 },
4319 { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
4320 { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
4321 { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
4322 { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3".
4323 { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3".
4324 { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
4325 { ISD::ADD, MVT::v2i8, 2 },
4326 { ISD::ADD, MVT::v4i8, 2 },
4327 { ISD::ADD, MVT::v8i8, 2 },
4328 { ISD::ADD, MVT::v16i8, 3 },
4329 };
4330
4331 static const CostTblEntry AVX1CostTblNoPairWise[] = {
4332 { ISD::FADD, MVT::v4f64, 3 },
4333 { ISD::FADD, MVT::v4f32, 3 },
4334 { ISD::FADD, MVT::v8f32, 4 },
4335 { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
4336 { ISD::ADD, MVT::v4i64, 3 },
4337 { ISD::ADD, MVT::v8i32, 5 },
4338 { ISD::ADD, MVT::v16i16, 5 },
4339 { ISD::ADD, MVT::v32i8, 4 },
4340 };
4341
4342 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4343 assert(ISD && "Invalid opcode");
4344
4345 // Before legalizing the type, give a chance to look up illegal narrow types
4346 // in the table.
4347 // FIXME: Is there a better way to do this?
4348 EVT VT = TLI->getValueType(DL, ValTy);
4349 if (VT.isSimple()) {
4350 MVT MTy = VT.getSimpleVT();
4351 if (ST->useSLMArithCosts())
4352 if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy))
4353 return Entry->Cost;
4354
4355 if (ST->hasAVX())
4356 if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
4357 return Entry->Cost;
4358
4359 if (ST->hasSSE2())
4360 if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
4361 return Entry->Cost;
4362 }
4363
4364 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
4365
4366 MVT MTy = LT.second;
4367
4368 auto *ValVTy = cast<FixedVectorType>(ValTy);
4369
4370 // Special case: vXi8 mul reductions are performed as vXi16.
4371 if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) {
4372 auto *WideSclTy = IntegerType::get(ValVTy->getContext(), 16);
4373 auto *WideVecTy = FixedVectorType::get(WideSclTy, ValVTy->getNumElements());
4374 return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy,
4375 TargetTransformInfo::CastContextHint::None,
4376 CostKind) +
4377 getArithmeticReductionCost(Opcode, WideVecTy, FMF, CostKind);
4378 }
4379
4380 InstructionCost ArithmeticCost = 0;
4381 if (LT.first != 1 && MTy.isVector() &&
4382 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
4383 // Type needs to be split. We need LT.first - 1 arithmetic ops.
4384 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
4385 MTy.getVectorNumElements());
4386 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
4387 ArithmeticCost *= LT.first - 1;
4388 }
4389
4390 if (ST->useSLMArithCosts())
4391 if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy))
4392 return ArithmeticCost + Entry->Cost;
4393
4394 if (ST->hasAVX())
4395 if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
4396 return ArithmeticCost + Entry->Cost;
4397
4398 if (ST->hasSSE2())
4399 if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
4400 return ArithmeticCost + Entry->Cost;
4401
4402 // FIXME: These assume a naive kshift+binop lowering, which is probably
4403 // conservative in most cases.
4404 static const CostTblEntry AVX512BoolReduction[] = {
4405 { ISD::AND, MVT::v2i1, 3 },
4406 { ISD::AND, MVT::v4i1, 5 },
4407 { ISD::AND, MVT::v8i1, 7 },
4408 { ISD::AND, MVT::v16i1, 9 },
4409 { ISD::AND, MVT::v32i1, 11 },
4410 { ISD::AND, MVT::v64i1, 13 },
4411 { ISD::OR, MVT::v2i1, 3 },
4412 { ISD::OR, MVT::v4i1, 5 },
4413 { ISD::OR, MVT::v8i1, 7 },
4414 { ISD::OR, MVT::v16i1, 9 },
4415 { ISD::OR, MVT::v32i1, 11 },
4416 { ISD::OR, MVT::v64i1, 13 },
4417 };
4418
4419 static const CostTblEntry AVX2BoolReduction[] = {
4420 { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp
4421 { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp
4422 { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp
4423 { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp
4424 };
4425
4426 static const CostTblEntry AVX1BoolReduction[] = {
4427 { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp
4428 { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp
4429 { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
4430 { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
4431 { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp
4432 { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp
4433 { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
4434 { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
4435 };
4436
4437 static const CostTblEntry SSE2BoolReduction[] = {
4438 { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp
4439 { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp
4440 { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp
4441 { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp
4442 { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp
4443 { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp
4444 { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp
4445 { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp
4446 };
4447
4448 // Handle bool allof/anyof patterns.
4449 if (ValVTy->getElementType()->isIntegerTy(1)) {
4450 InstructionCost ArithmeticCost = 0;
4451 if (LT.first != 1 && MTy.isVector() &&
4452 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
4453 // Type needs to be split. We need LT.first - 1 arithmetic ops.
4454 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
4455 MTy.getVectorNumElements());
4456 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
4457 ArithmeticCost *= LT.first - 1;
4458 }
4459
4460 if (ST->hasAVX512())
4461 if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy))
4462 return ArithmeticCost + Entry->Cost;
4463 if (ST->hasAVX2())
4464 if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
4465 return ArithmeticCost + Entry->Cost;
4466 if (ST->hasAVX())
4467 if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
4468 return ArithmeticCost + Entry->Cost;
4469 if (ST->hasSSE2())
4470 if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
4471 return ArithmeticCost + Entry->Cost;
4472
4473 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
4474 }
4475
4476 unsigned NumVecElts = ValVTy->getNumElements();
4477 unsigned ScalarSize = ValVTy->getScalarSizeInBits();
4478
4479 // Special case power of 2 reductions where the scalar type isn't changed
4480 // by type legalization.
4481 if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
4482 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
4483
4484 InstructionCost ReductionCost = 0;
4485
4486 auto *Ty = ValVTy;
4487 if (LT.first != 1 && MTy.isVector() &&
4488 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
4489 // Type needs to be split. We need LT.first - 1 arithmetic ops.
4490 Ty = FixedVectorType::get(ValVTy->getElementType(),
4491 MTy.getVectorNumElements());
4492 ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
4493 ReductionCost *= LT.first - 1;
4494 NumVecElts = MTy.getVectorNumElements();
4495 }
4496
4497 // Now handle reduction with the legal type, taking into account size changes
4498 // at each level.
4499 while (NumVecElts > 1) {
4500 // Determine the size of the remaining vector we need to reduce.
4501 unsigned Size = NumVecElts * ScalarSize;
4502 NumVecElts /= 2;
4503 // If we're reducing from 256/512 bits, use an extract_subvector.
4504 if (Size > 128) {
4505 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
4506 ReductionCost +=
4507 getShuffleCost(TTI::SK_ExtractSubvector, Ty, None, NumVecElts, SubTy);
4508 Ty = SubTy;
4509 } else if (Size == 128) {
4510 // Reducing from 128 bits is a permute of v2f64/v2i64.
4511 FixedVectorType *ShufTy;
4512 if (ValVTy->isFloatingPointTy())
4513 ShufTy =
4514 FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2);
4515 else
4516 ShufTy =
4517 FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
4518 ReductionCost +=
4519 getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr);
4520 } else if (Size == 64) {
4521 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
4522 FixedVectorType *ShufTy;
4523 if (ValVTy->isFloatingPointTy())
4524 ShufTy =
4525 FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4);
4526 else
4527 ShufTy =
4528 FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
4529 ReductionCost +=
4530 getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr);
4531 } else {
4532 // Reducing from smaller size is a shift by immediate.
4533 auto *ShiftTy = FixedVectorType::get(
4534 Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size);
4535 ReductionCost += getArithmeticInstrCost(
4536 Instruction::LShr, ShiftTy, CostKind,
4537 TargetTransformInfo::OK_AnyValue,
4538 TargetTransformInfo::OK_UniformConstantValue,
4539 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
4540 }
4541
4542 // Add the arithmetic op for this level.
4543 ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind);
4544 }
4545
4546 // Add the final extract element to the cost.
4547 return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
4548}
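The level-by-level loop at lines 4499-4544 halves the element count each step and picks a shuffle kind from the remaining width. The sketch below walks those widths for an assumed v8i32 integer add reduction (ScalarSize = 32, already a legal AVX type) and prints the step the cost model would charge at each level; the per-step costs themselves would still come from getShuffleCost/getArithmeticInstrCost.

// Standalone walk of the reduction levels for an assumed v8i32 add reduction.
#include <cstdio>

int main() {
  unsigned ScalarSize = 32, NumVecElts = 8; // assumed, not a real legalization
  while (NumVecElts > 1) {
    unsigned Size = NumVecElts * ScalarSize;
    NumVecElts /= 2;
    if (Size > 128)
      std::printf("%4u bits -> SK_ExtractSubvector + add\n", Size);
    else if (Size == 128)
      std::printf("%4u bits -> SK_PermuteSingleSrc (v2i64) + add\n", Size);
    else if (Size == 64)
      std::printf("%4u bits -> SK_PermuteSingleSrc (v4i32) + add\n", Size);
    else
      std::printf("%4u bits -> LShr by immediate + add\n", Size);
  }
  std::printf("final ExtractElement at index 0\n");
  return 0;
}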
4549
4550InstructionCost X86TTIImpl::getMinMaxCost(Type *Ty, Type *CondTy,
4551 bool IsUnsigned) {
4552 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4553
4554 MVT MTy = LT.second;
4555
4556 int ISD;
4557 if (Ty->isIntOrIntVectorTy()) {
4558 ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN;
4559 } else {
4560 assert(Ty->isFPOrFPVectorTy() &&
4561 "Expected float point or integer vector type.");
4562 ISD = ISD::FMINNUM;
4563 }
4564
4565 static const CostTblEntry SSE1CostTbl[] = {
4566 {ISD::FMINNUM, MVT::v4f32, 1},
4567 };
4568
4569 static const CostTblEntry SSE2CostTbl[] = {
4570 {ISD::FMINNUM, MVT::v2f64, 1},
4571 {ISD::SMIN, MVT::v8i16, 1},
4572 {ISD::UMIN, MVT::v16i8, 1},
4573 };
4574
4575 static const CostTblEntry SSE41CostTbl[] = {
4576 {ISD::SMIN, MVT::v4i32, 1},
4577 {ISD::UMIN, MVT::v4i32, 1},
4578 {ISD::UMIN, MVT::v8i16, 1},
4579 {ISD::SMIN, MVT::v16i8, 1},
4580 };
4581
4582 static const CostTblEntry SSE42CostTbl[] = {
4583 {ISD::UMIN, MVT::v2i64, 3}, // xor+pcmpgtq+blendvpd
4584 };
4585
4586 static const CostTblEntry AVX1CostTbl[] = {
4587 {ISD::FMINNUM, MVT::v8f32, 1},
4588 {ISD::FMINNUM, MVT::v4f64, 1},
4589 {ISD::SMIN, MVT::v8i32, 3},
4590 {ISD::UMIN, MVT::v8i32, 3},
4591 {ISD::SMIN, MVT::v16i16, 3},
4592 {ISD::UMIN, MVT::v16i16, 3},
4593 {ISD::SMIN, MVT::v32i8, 3},
4594 {ISD::UMIN, MVT::v32i8, 3},
4595 };
4596
4597 static const CostTblEntry AVX2CostTbl[] = {
4598 {ISD::SMIN, MVT::v8i32, 1},
4599 {ISD::UMIN, MVT::v8i32, 1},
4600 {ISD::SMIN, MVT::v16i16, 1},
4601 {ISD::UMIN, MVT::v16i16, 1},
4602 {ISD::SMIN, MVT::v32i8, 1},
4603 {ISD::UMIN, MVT::v32i8, 1},
4604 };
4605
4606 static const CostTblEntry AVX512CostTbl[] = {
4607 {ISD::FMINNUM, MVT::v16f32, 1},
4608 {ISD::FMINNUM, MVT::v8f64, 1},
4609 {ISD::SMIN, MVT::v2i64, 1},
4610 {ISD::UMIN, MVT::v2i64, 1},
4611 {ISD::SMIN, MVT::v4i64, 1},
4612 {ISD::UMIN, MVT::v4i64, 1},
4613 {ISD::SMIN, MVT::v8i64, 1},
4614 {ISD::UMIN, MVT::v8i64, 1},
4615 {ISD::SMIN, MVT::v16i32, 1},
4616 {ISD::UMIN, MVT::v16i32, 1},
4617 };
4618
4619 static const CostTblEntry AVX512BWCostTbl[] = {
4620 {ISD::SMIN, MVT::v32i16, 1},
4621 {ISD::UMIN, MVT::v32i16, 1},
4622 {ISD::SMIN, MVT::v64i8, 1},
4623 {ISD::UMIN, MVT::v64i8, 1},
4624 };
4625
4626 // If we have a native MIN/MAX instruction for this type, use it.
4627 if (ST->hasBWI())
4628 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
4629 return LT.first * Entry->Cost;
4630
4631 if (ST->hasAVX512())
4632 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
4633 return LT.first * Entry->Cost;
4634
4635 if (ST->hasAVX2())
4636 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
4637 return LT.first * Entry->Cost;
4638
4639 if (ST->hasAVX())
4640 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
4641 return LT.first * Entry->Cost;
4642
4643 if (ST->hasSSE42())
4644 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
4645 return LT.first * Entry->Cost;
4646
4647 if (ST->hasSSE41())
4648 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
4649 return LT.first * Entry->Cost;
4650
4651 if (ST->hasSSE2())
4652 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
4653 return LT.first * Entry->Cost;
4654
4655 if (ST->hasSSE1())
4656 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
4657 return LT.first * Entry->Cost;
4658
4659 unsigned CmpOpcode;
4660 if (Ty->isFPOrFPVectorTy()) {
4661 CmpOpcode = Instruction::FCmp;
4662 } else {
4663 assert(Ty->isIntOrIntVectorTy() &&
4664 "expecting floating point or integer type for min/max reduction");
4665 CmpOpcode = Instruction::ICmp;
4666 }
4667
4668 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4669 // Otherwise fall back to cmp+select.
4670 InstructionCost Result =
4671 getCmpSelInstrCost(CmpOpcode, Ty, CondTy, CmpInst::BAD_ICMP_PREDICATE,
4672 CostKind) +
4673 getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
4674 CmpInst::BAD_ICMP_PREDICATE, CostKind);
4675 return Result;
4676}
4677
4678InstructionCost
4679X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy,
4680 bool IsUnsigned,
4681 TTI::TargetCostKind CostKind) {
4682 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
4683
4684 MVT MTy = LT.second;
4685
4686 int ISD;
4687 if (ValTy->isIntOrIntVectorTy()) {
4688 ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN;
1
Taking true branch
2
Assuming 'IsUnsigned' is false
3
'?' condition is false
4689 } else {
4690 assert(ValTy->isFPOrFPVectorTy() &&
4691 "Expected float point or integer vector type.");
4692 ISD = ISD::FMINNUM;
4693 }
4694
4695 // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
4696 // and make it as the cost.
4697
4698 static const CostTblEntry SSE2CostTblNoPairWise[] = {
4699 {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw
4700 {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw
4701 {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw
4702 };
4703
4704 static const CostTblEntry SSE41CostTblNoPairWise[] = {
4705 {ISD::SMIN, MVT::v2i16, 3}, // same as sse2
4706 {ISD::SMIN, MVT::v4i16, 5}, // same as sse2
4707 {ISD::UMIN, MVT::v2i16, 5}, // same as sse2
4708 {ISD::UMIN, MVT::v4i16, 7}, // same as sse2
4709 {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor
4710 {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax
4711 {ISD::SMIN, MVT::v2i8, 3}, // pminsb
4712 {ISD::SMIN, MVT::v4i8, 5}, // pminsb
4713 {ISD::SMIN, MVT::v8i8, 7}, // pminsb
4714 {ISD::SMIN, MVT::v16i8, 6},
4715 {ISD::UMIN, MVT::v2i8, 3}, // same as sse2
4716 {ISD::UMIN, MVT::v4i8, 5}, // same as sse2
4717 {ISD::UMIN, MVT::v8i8, 7}, // same as sse2
4718 {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax
4719 };
4720
4721 static const CostTblEntry AVX1CostTblNoPairWise[] = {
4722 {ISD::SMIN, MVT::v16i16, 6},
4723 {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax
4724 {ISD::SMIN, MVT::v32i8, 8},
4725 {ISD::UMIN, MVT::v32i8, 8},
4726 };
4727
4728 static const CostTblEntry AVX512BWCostTblNoPairWise[] = {
4729 {ISD::SMIN, MVT::v32i16, 8},
4730 {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax
4731 {ISD::SMIN, MVT::v64i8, 10},
4732 {ISD::UMIN, MVT::v64i8, 10},
4733 };
4734
4735 // Before legalizing the type, give a chance to look up illegal narrow types
4736 // in the table.
4737 // FIXME: Is there a better way to do this?
4738 EVT VT = TLI->getValueType(DL, ValTy);
4739 if (VT.isSimple()) {
4
Taking false branch
4740 MVT MTy = VT.getSimpleVT();
4741 if (ST->hasBWI())
4742 if (const auto *Entry = CostTableLookup(AVX512BWCostTblNoPairWise, ISD, MTy))
4743 return Entry->Cost;
4744
4745 if (ST->hasAVX())
4746 if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
4747 return Entry->Cost;
4748
4749 if (ST->hasSSE41())
4750 if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy))
4751 return Entry->Cost;
4752
4753 if (ST->hasSSE2())
4754 if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
4755 return Entry->Cost;
4756 }
4757
4758 auto *ValVTy = cast<FixedVectorType>(ValTy);
5
'ValTy' is a 'CastReturnType'
4759 unsigned NumVecElts = ValVTy->getNumElements();
4760
4761 auto *Ty = ValVTy;
4762 InstructionCost MinMaxCost = 0;
4763 if (LT.first != 1 && MTy.isVector() &&
4764 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
4765 // Type needs to be split. We need LT.first - 1 operations ops.
4766 Ty = FixedVectorType::get(ValVTy->getElementType(),
4767 MTy.getVectorNumElements());
4768 auto *SubCondTy = FixedVectorType::get(CondTy->getElementType(),
4769 MTy.getVectorNumElements());
4770 MinMaxCost = getMinMaxCost(Ty, SubCondTy, IsUnsigned);
4771 MinMaxCost *= LT.first - 1;
4772 NumVecElts = MTy.getVectorNumElements();
4773 }
4774
4775 if (ST->hasBWI())
6
Assuming the condition is false
7
Taking false branch
4776 if (const auto *Entry = CostTableLookup(AVX512BWCostTblNoPairWise, ISD, MTy))
4777 return MinMaxCost + Entry->Cost;
4778
4779 if (ST->hasAVX())
8
Taking false branch
4780 if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
4781 return MinMaxCost + Entry->Cost;
4782
4783 if (ST->hasSSE41())
9
Taking false branch
4784 if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy))
4785 return MinMaxCost + Entry->Cost;
4786
4787 if (ST->hasSSE2())
10
Taking false branch
4788 if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
4789 return MinMaxCost + Entry->Cost;
4790
4791 unsigned ScalarSize = ValTy->getScalarSizeInBits();
4792
4793 // Special case power of 2 reductions where the scalar type isn't changed
4794 // by type legalization.
4795 if (!isPowerOf2_32(ValVTy->getNumElements()) ||
12
Taking false branch
4796 ScalarSize != MTy.getScalarSizeInBits())
11
Assuming the condition is false
4797 return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsUnsigned, CostKind);
4798
4799 // Now handle reduction with the legal type, taking into account size changes
4800 // at each level.
4801 while (NumVecElts > 1) {
13
Assuming 'NumVecElts' is <= 1
14
Loop condition is false. Execution continues on line 4848
4802 // Determine the size of the remaining vector we need to reduce.
4803 unsigned Size = NumVecElts * ScalarSize;
4804 NumVecElts /= 2;
4805 // If we're reducing from 256/512 bits, use an extract_subvector.
4806 if (Size > 128) {
4807 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
4808 MinMaxCost +=
4809 getShuffleCost(TTI::SK_ExtractSubvector, Ty, None, NumVecElts, SubTy);
4810 Ty = SubTy;
4811 } else if (Size == 128) {
4812 // Reducing from 128 bits is a permute of v2f64/v2i64.
4813 VectorType *ShufTy;
4814 if (ValTy->isFloatingPointTy())
4815 ShufTy =
4816 FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2);
4817 else
4818 ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
4819 MinMaxCost +=
4820 getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr);
4821 } else if (Size == 64) {
4822 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
4823 FixedVectorType *ShufTy;
4824 if (ValTy->isFloatingPointTy())
4825 ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4);
4826 else
4827 ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
4828 MinMaxCost +=
4829 getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr);
4830 } else {
4831 // Reducing from a smaller size is a shift by immediate.
4832 auto *ShiftTy = FixedVectorType::get(
4833 Type::getIntNTy(ValTy->getContext(), Size), 128 / Size);
4834 MinMaxCost += getArithmeticInstrCost(
4835 Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput,
4836 TargetTransformInfo::OK_AnyValue,
4837 TargetTransformInfo::OK_UniformConstantValue,
4838 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
4839 }
4840
4841 // Add the arithmetic op for this level.
4842 auto *SubCondTy =
4843 FixedVectorType::get(CondTy->getElementType(), Ty->getNumElements());
4844 MinMaxCost += getMinMaxCost(Ty, SubCondTy, IsUnsigned);
4845 }
4846
4847 // Add the final extract element to the cost.
4848 return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
15
Calling 'X86TTIImpl::getVectorInstrCost'
4849}
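
As a rough illustration of the halving loop above (source lines 4801-4845): each level halves the element count, with levels wider than 128 bits modeled as an extract_subvector, 128/64-bit levels as an in-register shuffle, and narrower levels as a shift by immediate. The standalone C++ sketch below mirrors only that structure; the per-step weights are illustrative placeholders, not LLVM's tuned cost numbers.

// Minimal sketch of the reduction-tree walk; not the LLVM cost model.
#include <cstdio>

static unsigned reductionCostSketch(unsigned NumVecElts, unsigned ScalarSize) {
  const unsigned ExtractCost = 1, ShuffleCost = 1, ShiftCost = 1, MinMaxCost = 1;
  unsigned Cost = 0;
  while (NumVecElts > 1) {
    unsigned Size = NumVecElts * ScalarSize; // vector width before this level
    NumVecElts /= 2;
    if (Size > 128)
      Cost += ExtractCost;   // extract the upper subvector
    else if (Size >= 64)
      Cost += ShuffleCost;   // permute/shuffle within a 128-bit register
    else
      Cost += ShiftCost;     // shift the remaining lanes down
    Cost += MinMaxCost;      // one min/max operation per level
  }
  return Cost;
}

int main() {
  // A v16i32 min/max reduction walks 512 -> 256 -> 128 -> 64 bits: 4 levels.
  std::printf("v16i32 reduction cost ~= %u\n", reductionCostSketch(16, 32));
}
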
4850
4851/// Calculate the cost of materializing a 64-bit value. This helper
4852/// method might only calculate a fraction of a larger immediate. Therefore it
4853/// is valid to return a cost of ZERO.
4854InstructionCost X86TTIImpl::getIntImmCost(int64_t Val) {
4855 if (Val == 0)
4856 return TTI::TCC_Free;
4857
4858 if (isInt<32>(Val))
4859 return TTI::TCC_Basic;
4860
4861 return 2 * TTI::TCC_Basic;
4862}
4863
4864InstructionCost X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
4865 TTI::TargetCostKind CostKind) {
4866 assert(Ty->isIntegerTy());
4867
4868 unsigned BitSize = Ty->getPrimitiveSizeInBits();
4869 if (BitSize == 0)
4870 return ~0U;
4871
4872 // Never hoist constants larger than 128 bits, because this might lead to
4873 // incorrect code generation or assertions in codegen.
4874 // FIXME: Create a cost model for types larger than i128 once the codegen
4875 // issues have been fixed.
4876 if (BitSize > 128)
4877 return TTI::TCC_Free;
4878
4879 if (Imm == 0)
4880 return TTI::TCC_Free;
4881
4882 // Sign-extend all constants to a multiple of 64-bit.
4883 APInt ImmVal = Imm;
4884 if (BitSize % 64 != 0)
4885 ImmVal = Imm.sext(alignTo(BitSize, 64));
4886
4887 // Split the constant into 64-bit chunks and calculate the cost for each
4888 // chunk.
4889 InstructionCost Cost = 0;
4890 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
4891 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
4892 int64_t Val = Tmp.getSExtValue();
4893 Cost += getIntImmCost(Val);
4894 }
4895 // We need at least one instruction to materialize the constant.
4896 return std::max<InstructionCost>(1, Cost);
4897}
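
To make the chunking above concrete, here is a minimal standalone sketch of the same idea: split a wide immediate into 64-bit chunks and cost each chunk as in getIntImmCost(int64_t). unsigned __int128 stands in for APInt purely for illustration (an assumption), and the sign-extension handling of the real code (ashr + sextOrTrunc) is ignored.

// Each 64-bit chunk: 0 is free, a signed-32-bit-representable value costs 1,
// anything else costs 2; the whole constant needs at least one instruction.
#include <algorithm>
#include <cstdint>
#include <cstdio>

static int chunkCost(int64_t Val) {
  if (Val == 0)
    return 0;                                 // "free"
  if (Val >= INT32_MIN && Val <= INT32_MAX)
    return 1;                                 // one basic instruction
  return 2;                                   // movabs-style, two units
}

static int immCostSketch(unsigned __int128 Imm, unsigned BitSize) {
  int Cost = 0;
  for (unsigned Shift = 0; Shift < BitSize; Shift += 64)
    Cost += chunkCost(static_cast<int64_t>(Imm >> Shift));
  return std::max(1, Cost); // at least one instruction to materialize
}

int main() {
  // Low half fits in 32 bits (cost 1), high half does not (cost 2): total 3.
  unsigned __int128 Imm =
      (static_cast<unsigned __int128>(0x123456789ULL) << 64) | 42u;
  std::printf("cost = %d\n", immCostSketch(Imm, 128));
}
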
4898
4899InstructionCost X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
4900 const APInt &Imm, Type *Ty,
4901 TTI::TargetCostKind CostKind,
4902 Instruction *Inst) {
4903 assert(Ty->isIntegerTy());
4904
4905 unsigned BitSize = Ty->getPrimitiveSizeInBits();
4906 // There is no cost model for constants with a bit size of 0. Return TCC_Free
4907 // here, so that constant hoisting will ignore this constant.
4908 if (BitSize == 0)
4909 return TTI::TCC_Free;
4910
4911 unsigned ImmIdx = ~0U;
4912 switch (Opcode) {
4913 default:
4914 return TTI::TCC_Free;
4915 case Instruction::GetElementPtr:
4916 // Always hoist the base address of a GetElementPtr. This prevents the
4917 // creation of new constants for every base constant that gets constant
4918 // folded with the offset.
4919 if (Idx == 0)
4920 return 2 * TTI::TCC_Basic;
4921 return TTI::TCC_Free;
4922 case Instruction::Store:
4923 ImmIdx = 0;
4924 break;
4925 case Instruction::ICmp:
4926 // This is an imperfect hack to prevent constant hoisting of
4927 // compares that might be trying to check if a 64-bit value fits in
4928 // 32-bits. The backend can optimize these cases using a right shift by 32.
4929 // Ideally we would check the compare predicate here. There are also other
4930 // similar immediates that the backend can use shifts for.
4931 if (Idx == 1 && Imm.getBitWidth() == 64) {
4932 uint64_t ImmVal = Imm.getZExtValue();
4933 if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
4934 return TTI::TCC_Free;
4935 }
4936 ImmIdx = 1;
4937 break;
4938 case Instruction::And:
4939 // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
4940 // by using a 32-bit operation with implicit zero extension. Detect such
4941 // immediates here as the normal path expects bit 31 to be sign extended.
4942 if (Idx == 1 && Imm.getBitWidth() == 64 && isUInt<32>(Imm.getZExtValue()))
4943 return TTI::TCC_Free;
4944 ImmIdx = 1;
4945 break;
4946 case Instruction::Add:
4947 case Instruction::Sub:
4948 // For add/sub, we can use the opposite instruction for INT32_MIN.
4949 if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.getZExtValue() == 0x80000000)
4950 return TTI::TCC_Free;
4951 ImmIdx = 1;
4952 break;
4953 case Instruction::UDiv:
4954 case Instruction::SDiv:
4955 case Instruction::URem:
4956 case Instruction::SRem:
4957 // Division by constant is typically expanded later into a different
4958 // instruction sequence. This completely changes the constants.
4959 // Report them as "free" to stop ConstantHoist from marking them as opaque.
4960 return TTI::TCC_Free;
4961 case Instruction::Mul:
4962 case Instruction::Or:
4963 case Instruction::Xor:
4964 ImmIdx = 1;
4965 break;
4966 // Always return TCC_Free for the shift value of a shift instruction.
4967 case Instruction::Shl:
4968 case Instruction::LShr:
4969 case Instruction::AShr:
4970 if (Idx == 1)
4971 return TTI::TCC_Free;
4972 break;
4973 case Instruction::Trunc:
4974 case Instruction::ZExt:
4975 case Instruction::SExt:
4976 case Instruction::IntToPtr:
4977 case Instruction::PtrToInt:
4978 case Instruction::BitCast:
4979 case Instruction::PHI:
4980 case Instruction::Call:
4981 case Instruction::Select:
4982 case Instruction::Ret:
4983 case Instruction::Load:
4984 break;
4985 }
4986
4987 if (Idx == ImmIdx) {
4988 int NumConstants = divideCeil(BitSize, 64);
4989 InstructionCost Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
4990 return (Cost <= NumConstants * TTI::TCC_Basic)
4991 ? static_cast<int>(TTI::TCC_Free)
4992 : Cost;
4993 }
4994
4995 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
4996}
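
Two of the special cases above are easy to demonstrate in isolation: a 64-bit AND whose immediate fits in an unsigned 32-bit value is treated as free (it becomes a 32-bit operation with implicit zero extension), and the shift amount of shl/lshr/ashr is always free. The sketch below mirrors only those decisions; the enum and helper are illustrative stand-ins, not LLVM interfaces.

#include <cstdint>
#include <cstdio>

enum Op { OpAnd, OpShl };

// Returns true when the immediate at operand index Idx would be "free".
static bool immIsFree(Op O, unsigned Idx, uint64_t Imm, unsigned ImmBits) {
  switch (O) {
  case OpAnd:
    // 64-bit AND with a zero-extended 32-bit immediate is free.
    return Idx == 1 && ImmBits == 64 && Imm <= UINT32_MAX;
  case OpShl:
    return Idx == 1; // shift amounts are always encodable
  }
  return false;
}

int main() {
  std::printf("and rax, 0xffff0000 free? %d\n",
              immIsFree(OpAnd, 1, 0xffff0000ULL, 64));   // 1
  std::printf("and rax, 0x1ffffffff free? %d\n",
              immIsFree(OpAnd, 1, 0x1ffffffffULL, 64));  // 0
}
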
4997
4998InstructionCost X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
4999 const APInt &Imm, Type *Ty,
5000 TTI::TargetCostKind CostKind) {
5001 assert(Ty->isIntegerTy());
5002
5003 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5004 // There is no cost model for constants with a bit size of 0. Return TCC_Free
5005 // here, so that constant hoisting will ignore this constant.
5006 if (BitSize == 0)
5007 return TTI::TCC_Free;
5008
5009 switch (IID) {
5010 default:
5011 return TTI::TCC_Free;
5012 case Intrinsic::sadd_with_overflow:
5013 case Intrinsic::uadd_with_overflow:
5014 case Intrinsic::ssub_with_overflow:
5015 case Intrinsic::usub_with_overflow:
5016 case Intrinsic::smul_with_overflow:
5017 case Intrinsic::umul_with_overflow:
5018 if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
5019 return TTI::TCC_Free;
5020 break;
5021 case Intrinsic::experimental_stackmap:
5022 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
5023 return TTI::TCC_Free;
5024 break;
5025 case Intrinsic::experimental_patchpoint_void:
5026 case Intrinsic::experimental_patchpoint_i64:
5027 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
5028 return TTI::TCC_Free;
5029 break;
5030 }
5031 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
5032}
5033
5034InstructionCost X86TTIImpl::getCFInstrCost(unsigned Opcode,
5035 TTI::TargetCostKind CostKind,
5036 const Instruction *I) {
5037 if (CostKind != TTI::TCK_RecipThroughput)
5038 return Opcode == Instruction::PHI ? 0 : 1;
5039 // Branches are assumed to be predicted.
5040 return 0;
5041}
5042
5043int X86TTIImpl::getGatherOverhead() const {
5044 // Some CPUs have more overhead for gather. The specified overhead is relative
5045 // to the Load operation. "2" is the number provided by Intel architects. This
5046 // parameter is used for cost estimation of Gather Op and comparison with
5047 // other alternatives.
5048 // TODO: Remove the explicit hasAVX512()? That would mean we would only
5049 // enable gather with a -march.
5050 if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather()))
5051 return 2;
5052
5053 return 1024;
5054}
5055
5056int X86TTIImpl::getScatterOverhead() const {
5057 if (ST->hasAVX512())
5058 return 2;
5059
5060 return 1024;
5061}
5062
5063// Return an average cost of Gather / Scatter instruction, maybe improved later.
5064// FIXME: Add TargetCostKind support.
5065InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy,
5066 const Value *Ptr, Align Alignment,
5067 unsigned AddressSpace) {
5068
5069 assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
5070 unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
5071
5072 // Try to reduce the index size from 64 bits (the default for GEP) to 32.
5073 // This is essential for VF 16: if the index can't be reduced to 32 bits, the
5074 // operation will use 16 x 64-bit indices, which do not fit in a zmm and need
5075 // to be split. Also check that the base pointer is the same for all lanes,
5076 // and that there's at most one variable index.
5077 auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) {
5078 unsigned IndexSize = DL.getPointerSizeInBits();
5079 const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
5080 if (IndexSize < 64 || !GEP)
5081 return IndexSize;
5082
5083 unsigned NumOfVarIndices = 0;
5084 const Value *Ptrs = GEP->getPointerOperand();
5085 if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
5086 return IndexSize;
5087 for (unsigned i = 1; i < GEP->getNumOperands(); ++i) {
5088 if (isa<Constant>(GEP->getOperand(i)))
5089 continue;
5090 Type *IndxTy = GEP->getOperand(i)->getType();
5091 if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy))
5092 IndxTy = IndexVTy->getElementType();
5093 if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
5094 !isa<SExtInst>(GEP->getOperand(i))) ||
5095 ++NumOfVarIndices > 1)
5096 return IndexSize; // 64
5097 }
5098 return (unsigned)32;
5099 };
5100
5101 // Try to reduce IndexSize to 32 bits for a 16-element vector.
5102 // By default the IndexSize is equal to the pointer size.
5103 unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
5104 ? getIndexSizeInBits(Ptr, DL)
5105 : DL.getPointerSizeInBits();
5106
5107 auto *IndexVTy = FixedVectorType::get(
5108 IntegerType::get(SrcVTy->getContext(), IndexSize), VF);
5109 std::pair<InstructionCost, MVT> IdxsLT = getTypeLegalizationCost(IndexVTy);
5110 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(SrcVTy);
5111 InstructionCost::CostType SplitFactor =
5112 *std::max(IdxsLT.first, SrcLT.first).getValue();
5113 if (SplitFactor > 1) {
5114 // Handle splitting of vector of pointers
5115 auto *SplitSrcTy =
5116 FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
5117 return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment,
5118 AddressSpace);
5119 }
5120
5121 // The gather / scatter cost is given by Intel architects. It is a rough
5122 // number since we are looking at one instruction at a time.
5123 const int GSOverhead = (Opcode == Instruction::Load)
5124 ? getGatherOverhead()
5125 : getScatterOverhead();
5126 return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
5127 MaybeAlign(Alignment), AddressSpace,
5128 TTI::TCK_RecipThroughput);
5129}
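
The final formula above reduces to a fixed per-instruction overhead plus one scalar memory-op cost per lane. The sketch below shows that arithmetic in isolation; the overhead choice (2 vs. 1024) mirrors the helpers earlier in the file, while the scalar memory-op cost is a placeholder parameter.

#include <cstdio>

static int gsVectorCostSketch(bool FastGatherOrAVX512, unsigned VF,
                              int ScalarMemOpCost) {
  // Mirrors the getGatherOverhead()/getScatterOverhead() style choice:
  // cheap on targets with fast gather/AVX-512, effectively prohibitive otherwise.
  int Overhead = FastGatherOrAVX512 ? 2 : 1024;
  return Overhead + static_cast<int>(VF) * ScalarMemOpCost;
}

int main() {
  // A 16-lane gather on a fast-gather target with a unit scalar load cost:
  std::printf("v16 gather ~= %d\n", gsVectorCostSketch(true, 16, 1));  // 18
  // The same gather where gather support is effectively disabled:
  std::printf("v16 gather ~= %d\n", gsVectorCostSketch(false, 16, 1)); // 1040
}
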
5130
5131/// Return the cost of full scalarization of gather / scatter operation.
5132///
5133/// Opcode - Load or Store instruction.
5134/// SrcVTy - The type of the data vector that should be gathered or scattered.
5135/// VariableMask - The mask is non-constant at compile time.
5136/// Alignment - Alignment for one element.
5137/// AddressSpace - pointer[s] address space.
5138///
5139/// FIXME: Add TargetCostKind support.
5140InstructionCost X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
5141 bool VariableMask, Align Alignment,
5142 unsigned AddressSpace) {
5143 Type *ScalarTy = SrcVTy->getScalarType();
5144 unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
5145 APInt DemandedElts = APInt::getAllOnes(VF);
5146 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5147
5148 InstructionCost MaskUnpackCost = 0;
5149 if (VariableMask) {
5150 auto *MaskTy =
5151 FixedVectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
5152 MaskUnpackCost = getScalarizationOverhead(
5153 MaskTy, DemandedElts, /*Insert=*/false, /*Extract=*/true);
5154 InstructionCost ScalarCompareCost = getCmpSelInstrCost(
5155 Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()), nullptr,
5156 CmpInst::BAD_ICMP_PREDICATE, CostKind);
5157 InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
5158 MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
5159 }
5160
5161 InstructionCost AddressUnpackCost = getScalarizationOverhead(
5162 FixedVectorType::get(ScalarTy->getPointerTo(), VF), DemandedElts,
5163 /*Insert=*/false, /*Extract=*/true);
5164
5165 // The cost of the scalar loads/stores.
5166 InstructionCost MemoryOpCost =
5167 VF * getMemoryOpCost(Opcode, ScalarTy, MaybeAlign(Alignment),
5168 AddressSpace, CostKind);
5169
5170 // The cost of forming the vector from loaded scalars/
5171 // scalarizing the vector to perform scalar stores.
5172 InstructionCost InsertExtractCost =
5173 getScalarizationOverhead(cast<FixedVectorType>(SrcVTy), DemandedElts,
5174 /*Insert=*/Opcode == Instruction::Load,
5175 /*Extract=*/Opcode == Instruction::Store);
5176
5177 return AddressUnpackCost + MemoryOpCost + MaskUnpackCost + InsertExtractCost;
5178}
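
The scalarization cost above is just the sum of four components: unpacking the mask (plus a compare and branch per lane when the mask is variable), unpacking the per-lane addresses, the VF scalar loads or stores, and inserting/extracting the data elements. A tiny sketch with placeholder per-lane costs:

#include <cstdio>

struct GSScalarParts {
  int MaskUnpack, AddressUnpack, MemoryOps, InsertExtract;
  int total() const {
    return MaskUnpack + AddressUnpack + MemoryOps + InsertExtract;
  }
};

int main() {
  // e.g. VF = 8 with a variable mask and unit per-lane costs (placeholders):
  // mask extract (8) + compare/branch (8 * 2), address extracts (8),
  // scalar loads (8), element inserts (8).
  GSScalarParts P{8 + 8 * 2, 8, 8, 8};
  std::printf("scalarized v8 gather ~= %d\n", P.total()); // 48
}
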
5179
5180/// Calculate the cost of Gather / Scatter operation
5181InstructionCost X86TTIImpl::getGatherScatterOpCost(
5182 unsigned Opcode, Type *SrcVTy, const Value *Ptr, bool VariableMask,
5183 Align Alignment, TTI::TargetCostKind CostKind,
5184 const Instruction *I = nullptr) {
5185 if (CostKind != TTI::TCK_RecipThroughput) {
5186 if ((Opcode == Instruction::Load &&
5187 isLegalMaskedGather(SrcVTy, Align(Alignment)) &&
5188 !forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
5189 Align(Alignment))) ||
5190 (Opcode == Instruction::Store &&
5191 isLegalMaskedScatter(SrcVTy, Align(Alignment)) &&
5192 !forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
5193 Align(Alignment))))
5194 return 1;
5195 return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask,
5196 Alignment, CostKind, I);
5197 }
5198
5199 assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
5200 PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
5201 if (!PtrTy && Ptr->getType()->isVectorTy())
5202 PtrTy = dyn_cast<PointerType>(
5203 cast<VectorType>(Ptr->getType())->getElementType());
5204 assert(PtrTy && "Unexpected type for Ptr argument");
5205 unsigned AddressSpace = PtrTy->getAddressSpace();
5206
5207 if ((Opcode == Instruction::Load &&
5208 (!isLegalMaskedGather(SrcVTy, Align(Alignment)) ||
5209 forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
5210 Align(Alignment)))) ||
5211 (Opcode == Instruction::Store &&
5212 (!isLegalMaskedScatter(SrcVTy, Align(Alignment)) ||
5213 forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
5214 Align(Alignment)))))
5215 return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment,
5216 AddressSpace);
5217
5218 return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
5219}
5220
5221bool X86TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
5222 const TargetTransformInfo::LSRCost &C2) {
5223 // The X86-specific choice here is that the instruction count gets first priority.
5224 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
5225 C1.NumIVMuls, C1.NumBaseAdds,
5226 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
5227 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
5228 C2.NumIVMuls, C2.NumBaseAdds,
5229 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
5230}
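
The comparison above is a plain lexicographic tuple compare with Insns placed first, so instruction count dominates every other LSR criterion. A minimal standalone demonstration with two hypothetical cost records (only three fields kept for brevity):

#include <cstdio>
#include <tuple>

struct Cost { unsigned Insns, NumRegs, AddRecCost; };

static bool lessX86Style(const Cost &A, const Cost &B) {
  // std::tie compares field by field, left to right.
  return std::tie(A.Insns, A.NumRegs, A.AddRecCost) <
         std::tie(B.Insns, B.NumRegs, B.AddRecCost);
}

int main() {
  Cost FewerInsns{3, 9, 4}, FewerRegs{4, 2, 1};
  // Despite using far more registers, the 3-instruction solution wins.
  std::printf("%d\n", lessX86Style(FewerInsns, FewerRegs)); // 1
}
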
5231
5232bool X86TTIImpl::canMacroFuseCmp() {
5233 return ST->hasMacroFusion() || ST->hasBranchFusion();
5234}
5235
5236bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
5237 if (!ST->hasAVX())
5238 return false;
5239
5240 // The backend can't handle a single element vector.
5241 if (isa<VectorType>(DataTy) &&
5242 cast<FixedVectorType>(DataTy)->getNumElements() == 1)
5243 return false;
5244 Type *ScalarTy = DataTy->getScalarType();
5245
5246 if (ScalarTy->isPointerTy())
5247 return true;
5248
5249 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
5250 return true;
5251
5252 if (ScalarTy->isHalfTy() && ST->hasBWI())
5253 return true;
5254
5255 if (!ScalarTy->isIntegerTy())
5256 return false;
5257
5258 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
5259 return IntWidth == 32 || IntWidth == 64 ||
5260 ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
5261}
5262
5263bool X86TTIImpl::isLegalMaskedStore(Type *DataType, Align Alignment) {
5264 return isLegalMaskedLoad(DataType, Alignment);
5265}
5266
5267bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) {
5268 unsigned DataSize = DL.getTypeStoreSize(DataType);
5269 // The only supported nontemporal loads are for aligned vectors of 16 or 32
5270 // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2
5271 // (the equivalent stores only require AVX).
5272 if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
5273 return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2();
5274
5275 return false;
5276}
5277
5278bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) {
5279 unsigned DataSize = DL.getTypeStoreSize(DataType);
5280
5281 // SSE4A supports nontemporal stores of float and double at arbitrary
5282 // alignment.
5283 if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
5284 return true;
5285
5286 // Besides the SSE4A subtarget exception above, only aligned stores are
5287 // available nontemporally on any other subtarget. And only stores with a size
5288 // of 4..32 bytes (powers of 2 only) are permitted.
5289 if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
5290 !isPowerOf2_32(DataSize))
5291 return false;
5292
5293 // 32-byte vector nontemporal stores are supported by AVX (the equivalent
5294 // loads require AVX2).
5295 if (DataSize == 32)
5296 return ST->hasAVX();
5297 if (DataSize == 16)
5298 return ST->hasSSE1();
5299 return true;
5300}
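
Summarizing the rules above as a standalone predicate: SSE4A allows unaligned nontemporal float/double scalar stores; otherwise the store must be a naturally aligned power-of-two size between 4 and 32 bytes, with 16-byte stores needing SSE1 and 32-byte stores needing AVX. This sketch only restates the decision logic with plain parameters.

#include <cstdio>

static bool isLegalNTStoreSketch(unsigned DataSize, unsigned Alignment,
                                 bool HasSSE4A, bool IsFPScalar,
                                 bool HasSSE1, bool HasAVX) {
  if (HasSSE4A && IsFPScalar)
    return true; // SSE4A MOVNTSS/MOVNTSD: no alignment requirement
  if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
      (DataSize & (DataSize - 1)) != 0)
    return false;
  if (DataSize == 32)
    return HasAVX;
  if (DataSize == 16)
    return HasSSE1;
  return true; // 4- and 8-byte aligned stores
}

int main() {
  // A 32-byte store aligned to 32 bytes is legal only with AVX:
  std::printf("%d\n", isLegalNTStoreSketch(32, 32, false, false, true, true));
}
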
5301
5302bool X86TTIImpl::isLegalBroadcastLoad(Type *ElementTy,
5303 ElementCount NumElements) const {
5304 // movddup
5305 return ST->hasSSE3() && !NumElements.isScalable() &&
5306 NumElements.getFixedValue() == 2 &&
5307 ElementTy == Type::getDoubleTy(ElementTy->getContext());
5308}
5309
5310bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) {
5311 if (!isa<VectorType>(DataTy))
5312 return false;
5313
5314 if (!ST->hasAVX512())
5315 return false;
5316
5317 // The backend can't handle a single element vector.
5318 if (cast<FixedVectorType>(DataTy)->getNumElements() == 1)
5319 return false;
5320
5321 Type *ScalarTy = cast<VectorType>(DataTy)->getElementType();
5322
5323 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
5324 return true;
5325
5326 if (!ScalarTy->isIntegerTy())
5327 return false;
5328
5329 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
5330 return IntWidth == 32 || IntWidth == 64 ||
5331 ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
5332}
5333
5334bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy) {
5335 return isLegalMaskedExpandLoad(DataTy);
5336}
5337
5338bool X86TTIImpl::supportsGather() const {
5339 // Some CPUs have better gather performance than others.
5340 // TODO: Remove the explicit ST->hasAVX512()? That would mean we would only
5341 // enable gather with a -march.
5342 return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2());
5343}
5344
5345bool X86TTIImpl::forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) {
5346 // Gather / scatter for a 2-element vector is not profitable on KNL / SKX.
5347 // A 4-element gather/scatter instruction does not exist on KNL. We can extend
5348 // it to 8 elements, but zeroing the upper bits of the mask vector will add
5349 // more instructions. Right now we give the scalar cost of a 4-element vector
5350 // for KNL. TODO: check whether the gather/scatter instruction is better in
5351 // the VariableMask case.
5352 unsigned NumElts = cast<FixedVectorType>(VTy)->getNumElements();
5353 return NumElts == 1 ||
5354 (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX())));
5355}
5356
5357bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
5358 if (!supportsGather())
5359 return false;
5360 Type *ScalarTy = DataTy->getScalarType();
5361 if (ScalarTy->isPointerTy())
5362 return true;
5363
5364 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
5365 return true;
5366
5367 if (!ScalarTy->isIntegerTy())
5368 return false;
5369
5370 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
5371 return IntWidth == 32 || IntWidth == 64;
5372}
5373
5374bool X86TTIImpl::isLegalAltInstr(VectorType *VecTy, unsigned Opcode0,
5375 unsigned Opcode1,
5376 const SmallBitVector &OpcodeMask) const {
5377 // ADDSUBPS 4xf32 SSE3
5378 // VADDSUBPS 4xf32 AVX
5379 // VADDSUBPS 8xf32 AVX2
5380 // ADDSUBPD 2xf64 SSE3
5381 // VADDSUBPD 2xf64 AVX
5382 // VADDSUBPD 4xf64 AVX2
5383
5384 unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
5385 assert(OpcodeMask.size() == NumElements && "Mask and VecTy are incompatible");
5386 if (!isPowerOf2_32(NumElements))
5387 return false;
5388 // Check the opcode pattern. We apply the mask on the opcode arguments and
5389 // then check if it is what we expect.
5390 for (int Lane : seq<int>(0, NumElements)) {
5391 unsigned Opc = OpcodeMask.test(Lane) ? Opcode1 : Opcode0;
5392 // We expect FSub for even lanes and FAdd for odd lanes.
5393 if (Lane % 2 == 0 && Opc != Instruction::FSub)
5394 return false;
5395 if (Lane % 2 == 1 && Opc != Instruction::FAdd)
5396 return false;
5397 }
5398 // Now check that the pattern is supported by the target ISA.
5399 Type *ElemTy = cast<VectorType>(VecTy)->getElementType();
5400 if (ElemTy->isFloatTy())
5401 return ST->hasSSE3() && NumElements % 4 == 0;
5402 if (ElemTy->isDoubleTy())
5403 return ST->hasSSE3() && NumElements % 2 == 0;
5404 return false;
5405}
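
The lane check above encodes the (V)ADDSUBPS/(V)ADDSUBPD pattern: even lanes must resolve to FSub and odd lanes to FAdd. A standalone sketch of just that pattern check, with std::vector<bool> standing in for SmallBitVector (an illustration, not the LLVM API):

#include <cstdio>
#include <vector>

enum Opc { FAdd, FSub };

static bool isAddSubPattern(Opc Opcode0, Opc Opcode1,
                            const std::vector<bool> &OpcodeMask) {
  for (size_t Lane = 0; Lane < OpcodeMask.size(); ++Lane) {
    Opc O = OpcodeMask[Lane] ? Opcode1 : Opcode0;
    if (Lane % 2 == 0 && O != FSub)
      return false; // even lanes must subtract
    if (Lane % 2 == 1 && O != FAdd)
      return false; // odd lanes must add
  }
  return true;
}

int main() {
  // Mask 0101 with Opcode0 = FSub, Opcode1 = FAdd matches the addsub shape.
  std::printf("%d\n", isAddSubPattern(FSub, FAdd, {false, true, false, true}));
}
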
5406
5407bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) {
5408 // AVX2 doesn't support scatter
5409 if (!ST->hasAVX512())
5410 return false;
5411 return isLegalMaskedGather(DataType, Alignment);
5412}
5413
5414bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
5415 EVT VT = TLI->getValueType(DL, DataType);
5416 return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
5417}
5418
5419bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
5420 return false;
5421}
5422
5423bool X86TTIImpl::areInlineCompatible(const Function *Caller,
5424 const Function *Callee) const {
5425 const TargetMachine &TM = getTLI()->getTargetMachine();
5426
5427 // Work this as a subsetting of subtarget features.
5428 const FeatureBitset &CallerBits =
5429 TM.getSubtargetImpl(*Caller)->getFeatureBits();
5430 const FeatureBitset &CalleeBits =
5431 TM.getSubtargetImpl(*Callee)->getFeatureBits();
5432
5433 // Check whether features are the same (apart from the ignore list).
5434 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
5435 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
5436 if (RealCallerBits == RealCalleeBits)
5437 return true;
5438
5439 // If the features are a subset, we need to additionally check for calls
5440 // that may become ABI-incompatible as a result of inlining.
5441 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
5442 return false;
5443
5444 for (const Instruction &I : instructions(Callee)) {
5445 if (const auto *CB = dyn_cast<CallBase>(&I)) {
5446 SmallVector<Type *, 8> Types;
5447 for (Value *Arg : CB->args())
5448 Types.push_back(Arg->getType());
5449 if (!CB->getType()->isVoidTy())
5450 Types.push_back(CB->getType());
5451
5452 // Simple types are always ABI compatible.
5453 auto IsSimpleTy = [](Type *Ty) {
5454 return !Ty->isVectorTy() && !Ty->isAggregateType();
5455 };
5456 if (all_of(Types, IsSimpleTy))
5457 continue;
5458
5459 if (Function *NestedCallee = CB->getCalledFunction()) {
5460 // Assume that intrinsics are always ABI compatible.
5461 if (NestedCallee->isIntrinsic())
5462 continue;
5463
5464 // Do a precise compatibility check.
5465 if (!areTypesABICompatible(Caller, NestedCallee, Types))
5466 return false;
5467 } else {
5468 // We don't know the target features of the callee,
5469 // assume it is incompatible.
5470 return false;
5471 }
5472 }
5473 }
5474 return true;
5475}
5476
5477bool X86TTIImpl::areTypesABICompatible(const Function *Caller,
5478 const Function *Callee,
5479 const ArrayRef<Type *> &Types) const {
5480 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
5481 return false;
5482
5483 // If we get here, we know the target features match. If one function
5484 // considers 512-bit vectors legal and the other does not, consider them
5485 // incompatible.
5486 const TargetMachine &TM = getTLI()->getTargetMachine();
5487
5488 if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
5489 TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs())
5490 return true;
5491
5492 // Consider the arguments compatible if they aren't vectors or aggregates.
5493 // FIXME: Look at the size of vectors.
5494 // FIXME: Look at the element types of aggregates to see if there are vectors.
5495 return llvm::none_of(Types,
5496 [](Type *T) { return T->isVectorTy() || T->isAggregateType(); });
5497}
5498
5499X86TTIImpl::TTI::MemCmpExpansionOptions
5500X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
5501 TTI::MemCmpExpansionOptions Options;
5502 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
5503 Options.NumLoadsPerBlock = 2;
5504 // All GPR and vector loads can be unaligned.
5505 Options.AllowOverlappingLoads = true;
5506 if (IsZeroCmp) {
5507 // Only enable vector loads for equality comparison. Right now the vector
5508 // version is not as fast for three way compare (see #33329).
5509 const unsigned PreferredWidth = ST->getPreferVectorWidth();
5510 if (PreferredWidth >= 512 && ST->hasAVX512()) Options.LoadSizes.push_back(64);
5511 if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32);
5512 if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
5513 }
5514 if (ST->is64Bit()) {
5515 Options.LoadSizes.push_back(8);
5516 }
5517 Options.LoadSizes.push_back(4);
5518 Options.LoadSizes.push_back(2);
5519 Options.LoadSizes.push_back(1);
5520 return Options;
5521}
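
To illustrate how the LoadSizes list assembled above might be consumed, the sketch below does a greedy decomposition of a memcmp length into the enabled load widths. The real expansion is performed elsewhere in LLVM (it also exploits AllowOverlappingLoads and NumLoadsPerBlock), so this is only a rough picture of why wider vector loads shrink the expansion.

#include <cstdio>
#include <vector>

static unsigned countLoadsGreedy(unsigned Len,
                                 const std::vector<unsigned> &LoadSizes) {
  unsigned Loads = 0;
  for (unsigned Size : LoadSizes) // assumed sorted largest-first, as above
    while (Len >= Size) {
      Len -= Size;
      ++Loads;
    }
  return Loads;
}

int main() {
  // 64-bit target, AVX2 zero-compare: sizes {32, 16, 8, 4, 2, 1}.
  // A 31-byte compare decomposes into 16 + 8 + 4 + 2 + 1 = 5 loads per side.
  std::printf("%u\n", countLoadsGreedy(31, {32, 16, 8, 4, 2, 1}));
}
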
5522
5523bool X86TTIImpl::prefersVectorizedAddressing() const {
5524 return supportsGather();
5525}
5526
5527bool X86TTIImpl::supportsEfficientVectorElementLoadStore() const {
5528 return false;
5529}
5530
5531bool X86TTIImpl::enableInterleavedAccessVectorization() {
5532 // TODO: We expect this to be beneficial regardless of arch,
5533 // but there are currently some unexplained performance artifacts on Atom.
5534 // As a temporary solution, disable on Atom.
5535 return !(ST->isAtom());
5536}
5537
5538 // Get an estimate for interleaved load/store operations and strided loads.
5539 // \p Indices contains indices for strided load.
5540 // \p Factor - the factor of interleaving.
5541 // AVX-512 provides 3-src shuffles that significantly reduce the cost.
5542InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
5543 unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
5544 ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
5545 TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
5546 // VecTy for interleave memop is <VF*Factor x Elt>.
5547 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
5548 // VecTy = <12 x i32>.
5549
5550 // Calculate the number of memory operations (NumOfMemOps), required
5551 // for load/store the VecTy.
5552 MVT LegalVT = getTypeLegalizationCost(VecTy).second;
5553 unsigned VecTySize = DL.getTypeStoreSize(VecTy);
5554 unsigned LegalVTSize = LegalVT.getStoreSize();
5555 unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
5556
5557 // Get the cost of one memory operation.
5558 auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
5559 LegalVT.getVectorNumElements());
5560 InstructionCost MemOpCost;
5561 bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps;
5562 if (UseMaskedMemOp)
5563 MemOpCost = getMaskedMemoryOpCost(Opcode, SingleMemOpTy, Alignment,
5564 AddressSpace, CostKind);
5565 else
5566 MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, MaybeAlign(Alignment),
5567 AddressSpace, CostKind);
5568
5569 unsigned VF = VecTy->getNumElements() / Factor;
5570 MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);
5571
5572 InstructionCost MaskCost;
5573 if (UseMaskedMemOp) {
5574 APInt DemandedLoadStoreElts = APInt::getZero(VecTy->getNumElements());
5575 for (unsigned Index : Indices) {
5576 assert(Index < Factor && "Invalid index for interleaved memory op");
5577 for (unsigned Elm = 0; Elm < VF; Elm++)
5578 DemandedLoadStoreElts.setBit(Index + Elm * Factor);
5579 }
5580
5581 Type *I1Type = Type::getInt1Ty(VecTy->getContext());
5582
5583 MaskCost = getReplicationShuffleCost(
5584 I1Type, Factor, VF,
5585 UseMaskForGaps ? DemandedLoadStoreElts
5586 : APInt::getAllOnes(VecTy->getNumElements()),
5587 CostKind);
5588
5589 // The Gaps mask is invariant and created outside the loop, therefore the
5590 // cost of creating it is not accounted for here. However if we have both
5591 // a MaskForGaps and some other mask that guards the execution of the
5592 // memory access, we need to account for the cost of And-ing the two masks
5593 // inside the loop.
5594 if (UseMaskForGaps) {
5595 auto *MaskVT = FixedVectorType::get(I1Type, VecTy->getNumElements());
5596 MaskCost += getArithmeticInstrCost(BinaryOperator::And, MaskVT, CostKind);
5597 }
5598 }
5599
5600 if (Opcode == Instruction::Load) {
5601 // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
5602 // contain the cost of the optimized shuffle sequence that the
5603 // X86InterleavedAccess pass will generate.
5604 // The cost of loads and stores are computed separately from the table.
5605
5606 // X86InterleavedAccess support only the following interleaved-access group.
5607 static const CostTblEntry AVX512InterleavedLoadTbl[] = {
5608 {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8
5609 {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8
5610 {3, MVT::v64i8, 22}, //(load 96i8 and) deinterleave into 3 x 32i8
5611 };
5612
5613 if (const auto *Entry =
5614 CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
5615 return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
5616 // If an entry does not exist, fall back to the default implementation.
5617
5618 // The kind of shuffle depends on the number of loaded values.
5619 // If we load the entire data in one register, we can use a 1-src shuffle.
5620 // Otherwise, we'll merge 2 sources in each operation.
5621 TTI::ShuffleKind ShuffleKind =
5622 (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
5623
5624 InstructionCost ShuffleCost =
5625 getShuffleCost(ShuffleKind, SingleMemOpTy, None, 0, nullptr);
5626
5627 unsigned NumOfLoadsInInterleaveGrp =
5628 Indices.size() ? Indices.size() : Factor;
5629 auto *ResultTy = FixedVectorType::get(VecTy->getElementType(),
5630 VecTy->getNumElements() / Factor);
5631 InstructionCost NumOfResults =
5632 getTypeLegalizationCost(ResultTy).first * NumOfLoadsInInterleaveGrp;
5633
5634 // About half of the loads may be folded in shuffles when we have only
5635 // one result. If we have more than one result, or the loads are masked,
5636 // we do not fold loads at all.
5637 unsigned NumOfUnfoldedLoads =
5638 UseMaskedMemOp || NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
5639
5640 // Get a number of shuffle operations per result.
5641 unsigned NumOfShufflesPerResult =
5642 std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
5643
5644 // The SK_PermuteTwoSrc shuffle clobbers one of the source operands.
5645 // When we have more than one destination, we need additional instructions
5646 // to preserve the sources.
5647 InstructionCost NumOfMoves = 0;
5648 if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
5649 NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
5650
5651 InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
5652 MaskCost + NumOfUnfoldedLoads * MemOpCost +
5653 NumOfMoves;
5654
5655 return Cost;
5656 }
5657
5658 // Store.
5659 assert(Opcode == Instruction::Store &&
5660 "Expected Store Instruction at this point");
5661 // X86InterleavedAccess support only the following interleaved-access group.
5662 static const CostTblEntry AVX512InterleavedStoreTbl[] = {
5663 {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
5664 {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
5665 {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 96i8 (and store)
5666
5667 {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store)
5668 {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
5669 {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
5670 {4, MVT::v64i8, 24} // interleave 4 x 32i8 into 256i8 (and store)
5671 };
5672
5673 if (const auto *Entry =
5674 CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
5675 return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
5676 // If an entry does not exist, fall back to the default implementation.
5677
5678 // There are no strided stores for now, and a store can't be folded into a
5679 // shuffle.
5680 unsigned NumOfSources = Factor; // The number of values to be merged.
5681 InstructionCost ShuffleCost =
5682 getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, None, 0, nullptr);
5683 unsigned NumOfShufflesPerStore = NumOfSources - 1;
5684
5685 // The SK_PermuteTwoSrc shuffle clobbers one of the source operands.
5686 // We need additional instructions to preserve the sources.
5687 unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
5688 InstructionCost Cost =
5689 MaskCost +
5690 NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
5691 NumOfMoves;
5692 return Cost;
5693}
5694
5695InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
5696 unsigned Opcode, Type *BaseTy, unsigned Factor, ArrayRef<unsigned> Indices,
5697 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
5698 bool UseMaskForCond, bool UseMaskForGaps) {
5699 auto *VecTy = cast<FixedVectorType>(BaseTy);
5700
5701 auto isSupportedOnAVX512 = [&](Type *VecTy, bool HasBW) {
5702 Type *EltTy = cast<VectorType>(VecTy)->getElementType();
5703 if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
5704 EltTy->isIntegerTy(32) || EltTy->isPointerTy())
5705 return true;
5706 if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy())
5707 return HasBW;
5708 return false;
5709 };
5710 if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI()))
5711 return getInterleavedMemoryOpCostAVX512(
5712 Opcode, VecTy, Factor, Indices, Alignment,
5713 AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
5714
5715 if (UseMaskForCond || UseMaskForGaps)
5716 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
5717 Alignment, AddressSpace, CostKind,
5718 UseMaskForCond, UseMaskForGaps);
5719
5720 // Get estimation for interleaved load/store operations for SSE-AVX2.
5721 // As opposed to AVX-512, SSE-AVX2 do not have generic shuffles that allow
5722 // computing the cost using a generic formula as a function of generic
5723 // shuffles. We therefore use a lookup table instead, filled according to
5724 // the instruction sequences that codegen currently generates.
5725
5726 // VecTy for interleave memop is <VF*Factor x Elt>.
5727 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
5728 // VecTy = <12 x i32>.
5729 MVT LegalVT = getTypeLegalizationCost(VecTy).second;
5730
5731 // This function can be called with VecTy=<6xi128>, Factor=3, in which case
5732 // the VF=2, while v2i128 is an unsupported MVT vector type
5733 // (see MachineValueType.h::getVectorVT()).
5734 if (!LegalVT.isVector())
5735 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
5736 Alignment, AddressSpace, CostKind);
5737
5738 unsigned VF = VecTy->getNumElements() / Factor;
5739 Type *ScalarTy = VecTy->getElementType();
5740 // Deduplicate entries, model floats/pointers as appropriately-sized integers.
5741 if (!ScalarTy->isIntegerTy())
5742 ScalarTy =
5743 Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy));
5744
5745 // Get the cost of all the memory operations.
5746 // FIXME: discount dead loads.
5747 InstructionCost MemOpCosts = getMemoryOpCost(
5748 Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, CostKind);
5749
5750 auto *VT = FixedVectorType::get(ScalarTy, VF);
5751 EVT ETy = TLI->getValueType(DL, VT);
5752 if (!ETy.isSimple())
5753 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
5754 Alignment, AddressSpace, CostKind);
5755
5756 // TODO: Complete for other data-types and strides.
5757 // Each combination of Stride, element bit width and VF results in a different
5758 // sequence; The cost tables are therefore accessed with:
5759 // Factor (stride) and VectorType=VFxiN.
5760 // The Cost accounts only for the shuffle sequence;
5761 // The cost of the loads/stores is accounted for separately.
5762 //
5763 static const CostTblEntry AVX2InterleavedLoadTbl[] = {
5764 {2, MVT::v2i8, 2}, // (load 4i8 and) deinterleave into 2 x 2i8
5765 {2, MVT::v4i8, 2}, // (load 8i8 and) deinterleave into 2 x 4i8
5766 {2, MVT::v8i8, 2}, // (load 16i8 and) deinterleave into 2 x 8i8
5767 {2, MVT::v16i8, 4}, // (load 32i8 and) deinterleave into 2 x 16i8
5768 {2, MVT::v32i8, 6}, // (load 64i8 and) deinterleave into 2 x 32i8
5769
5770 {2, MVT::v8i16, 6}, // (load 16i16 and) deinterleave into 2 x 8i16
5771 {2, MVT::v16i16, 9}, // (load 32i16 and) deinterleave into 2 x 16i16
5772 {2, MVT::v32i16, 18}, // (load 64i16 and) deinterleave into 2 x 32i16
5773
5774 {2, MVT::v8i32, 4}, // (load 16i32 and) deinterleave into 2 x 8i32
5775 {2, MVT::v16i32, 8}, // (load 32i32 and) deinterleave into 2 x 16i32
5776 {2, MVT::v32i32, 16}, // (load 64i32 and) deinterleave into 2 x 32i32
5777
5778 {2, MVT::v4i64, 4}, // (load 8i64 and) deinterleave into 2 x 4i64
5779 {2, MVT::v8i64, 8}, // (load 16i64 and) deinterleave into 2 x 8i64
5780 {2, MVT::v16i64, 16}, // (load 32i64 and) deinterleave into 2 x 16i64
5781 {2, MVT::v32i64, 32}, // (load 64i64 and) deinterleave into 2 x 32i64
5782
5783 {3, MVT::v2i8, 3}, // (load 6i8 and) deinterleave into 3 x 2i8
5784 {3, MVT::v4i8, 3}, // (load 12i8 and) deinterleave into 3 x 4i8
5785 {3, MVT::v8i8, 6}, // (load 24i8 and) deinterleave into 3 x 8i8
5786 {3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8
5787 {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8
5788
5789 {3, MVT::v2i16, 5}, // (load 6i16 and) deinterleave into 3 x 2i16
5790 {3, MVT::v4i16, 7}, // (load 12i16 and) deinterleave into 3 x 4i16
5791 {3, MVT::v8i16, 9}, // (load 24i16 and) deinterleave into 3 x 8i16
5792 {3, MVT::v16i16, 28}, // (load 48i16 and) deinterleave into 3 x 16i16
5793 {3, MVT::v32i16, 56}, // (load 96i16 and) deinterleave into 3 x 32i16
5794
5795 {3, MVT::v2i32, 3}, // (load 6i32 and) deinterleave into 3 x 2i32
5796 {3, MVT::v4i32, 3}, // (load 12i32 and) deinterleave into 3 x 4i32
5797 {3, MVT::v8i32, 7}, // (load 24i32 and) deinterleave into 3 x 8i32
5798 {3, MVT::v16i32, 14}, // (load 48i32 and) deinterleave into 3 x 16i32
5799 {3, MVT::v32i32, 32}, // (load 96i32 and) deinterleave into 3 x 32i32
5800
5801 {3, MVT::v2i64, 1}, // (load 6i64 and) deinterleave into 3 x 2i64
5802 {3, MVT::v4i64, 5}, // (load 12i64 and) deinterleave into 3 x 4i64
5803 {3, MVT::v8i64, 10}, // (load 24i64 and) deinterleave into 3 x 8i64
5804 {3, MVT::v16i64, 20}, // (load 48i64 and) deinterleave into 3 x 16i64
5805
5806 {4, MVT::v2i8, 4}, // (load 8i8 and) deinterleave into 4 x 2i8
5807 {4, MVT::v4i8, 4}, // (load 16i8 and) deinterleave into 4 x 4i8
5808 {4, MVT::v8i8, 12}, // (load 32i8 and) deinterleave into 4 x 8i8
5809 {4, MVT::v16i8, 24}, // (load 64i8 and) deinterleave into 4 x 16i8
5810 {4, MVT::v32i8, 56}, // (load 128i8 and) deinterleave into 4 x 32i8
5811
5812 {4, MVT::v2i16, 6}, // (load 8i16 and) deinterleave into 4 x 2i16
5813 {4, MVT::v4i16, 17}, // (load 16i16 and) deinterleave into 4 x 4i16
5814 {4, MVT::v8i16, 33}, // (load 32i16 and) deinterleave into 4 x 8i16
5815 {4, MVT::v16i16, 75}, // (load 64i16 and) deinterleave into 4 x 16i16
5816 {4, MVT::v32i16, 150}, // (load 128i16 and) deinterleave into 4 x 32i16
5817
5818 {4, MVT::v2i32, 4}, // (load 8i32 and) deinterleave into 4 x 2i32
5819 {4, MVT::v4i32, 8}, // (load 16i32 and) deinterleave into 4 x 4i32
5820 {4, MVT::v8i32, 16}, // (load 32i32 and) deinterleave into 4 x 8i32
5821 {4, MVT::v16i32, 32}, // (load 64i32 and) deinterleave into 4 x 16i32
5822 {4, MVT::v32i32, 68}, // (load 128i32 and) deinterleave into 4 x 32i32
5823
5824 {4, MVT::v2i64, 6}, // (load 8i64 and) deinterleave into 4 x 2i64
5825 {4, MVT::v4i64, 8}, // (load 16i64 and) deinterleave into 4 x 4i64
5826 {4, MVT::v8i64, 20}, // (load 32i64 and) deinterleave into 4 x 8i64
5827 {4, MVT::v16i64, 40}, // (load 64i64 and) deinterleave into 4 x 16i64
5828
5829 {6, MVT::v2i8, 6}, // (load 12i8 and) deinterleave into 6 x 2i8
5830 {6, MVT::v4i8, 14}, // (load 24i8 and) deinterleave into 6 x 4i8
5831 {6, MVT::v8i8, 18}, // (load 48i8 and) deinterleave into 6 x 8i8
5832 {6, MVT::v16i8, 43}, // (load 96i8 and) deinterleave into 6 x 16i8
5833 {6, MVT::v32i8, 82}, // (load 192i8 and) deinterleave into 6 x 32i8
5834
5835 {6, MVT::v2i16, 13}, // (load 12i16 and) deinterleave into 6 x 2i16
5836 {6, MVT::v4i16, 9}, // (load 24i16 and) deinterleave into 6 x 4i16
5837 {6, MVT::v8i16, 39}, // (load 48i16 and) deinterleave into 6 x 8i16
5838 {6, MVT::v16i16, 106}, // (load 96i16 and) deinterleave into 6 x 16i16
5839 {6, MVT::v32i16, 212}, // (load 192i16 and) deinterleave into 6 x 32i16
5840
5841 {6, MVT::v2i32, 6}, // (load 12i32 and) deinterleave into 6 x 2i32
5842 {6, MVT::v4i32, 15}, // (load 24i32 and) deinterleave into 6 x 4i32
5843 {6, MVT::v8i32, 31}, // (load 48i32 and) deinterleave into 6 x 8i32
5844 {6, MVT::v16i32, 64}, // (load 96i32 and) deinterleave into 6 x 16i32
5845
5846 {6, MVT::v2i64, 6}, // (load 12i64 and) deinterleave into 6 x 2i64
5847 {6, MVT::v4i64, 18}, // (load 24i64 and) deinterleave into 6 x 4i64
5848 {6, MVT::v8i64, 36}, // (load 48i64 and) deinterleave into 6 x 8i64
5849
5850 {8, MVT::v8i32, 40} // (load 64i32 and) deinterleave into 8 x 8i32
5851 };
5852
5853 static const CostTblEntry SSSE3InterleavedLoadTbl[] = {
5854 {2, MVT::v4i16, 2}, // (load 8i16 and) deinterleave into 2 x 4i16
5855 };
5856
5857 static const CostTblEntry SSE2InterleavedLoadTbl[] = {
5858 {2, MVT::v2i16, 2}, // (load 4i16 and) deinterleave into 2 x 2i16
5859 {2, MVT::v4i16, 7}, // (load 8i16 and) deinterleave into 2 x 4i16
5860
5861 {2, MVT::v2i32, 2}, // (load 4i32 and) deinterleave into 2 x 2i32
5862 {2, MVT::v4i32, 2}, // (load 8i32 and) deinterleave into 2 x 4i32
5863
5864 {2, MVT::v2i64, 2}, // (load 4i64 and) deinterleave into 2 x 2i64
5865 };
5866
5867 static const CostTblEntry AVX2InterleavedStoreTbl[] = {
5868 {2, MVT::v16i8, 3}, // interleave 2 x 16i8 into 32i8 (and store)
5869 {2, MVT::v32i8, 4}, // interleave 2 x 32i8 into 64i8 (and store)
5870
5871 {2, MVT::v8i16, 3}, // interleave 2 x 8i16 into 16i16 (and store)
5872 {2, MVT::v16i16, 4}, // interleave 2 x 16i16 into 32i16 (and store)
5873 {2, MVT::v32i16, 8}, // interleave 2 x 32i16 into 64i16 (and store)
5874
5875 {2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32 (and store)
5876 {2, MVT::v8i32, 4}, // interleave 2 x 8i32 into 16i32 (and store)
5877 {2, MVT::v16i32, 8}, // interleave 2 x 16i32 into 32i32 (and store)
5878 {2, MVT::v32i32, 16}, // interleave 2 x 32i32 into 64i32 (and store)
5879
5880 {2, MVT::v2i64, 2}, // interleave 2 x 2i64 into 4i64 (and store)
5881 {2, MVT::v4i64, 4}, // interleave 2 x 4i64 into 8i64 (and store)
5882 {2, MVT::v8i64, 8}, // interleave 2 x 8i64 into 16i64 (and store)
5883 {2, MVT::v16i64, 16}, // interleave 2 x 16i64 into 32i64 (and store)
5884 {2, MVT::v32i64, 32}, // interleave 2 x 32i64 into 64i64 (and store)
5885
5886 {3, MVT::v2i8, 4}, // interleave 3 x 2i8 into 6i8 (and store)
5887 {3, MVT::v4i8, 4}, // interleave 3 x 4i8 into 12i8 (and store)
5888 {3, MVT::v8i8, 6}, // interleave 3 x 8i8 into 24i8 (and store)
5889 {3, MVT::v16i8, 11}, // interleave 3 x 16i8 into 48i8 (and store)
5890 {3, MVT::v32i8, 13}, // interleave 3 x 32i8 into 96i8 (and store)
5891
5892 {3, MVT::v2i16, 4}, // interleave 3 x 2i16 into 6i16 (and store)
5893 {3, MVT::v4i16, 6}, // interleave 3 x 4i16 into 12i16 (and store)
5894 {3, MVT::v8i16, 12}, // interleave 3 x 8i16 into 24i16 (and store)
5895 {3, MVT::v16i16, 27}, // interleave 3 x 16i16 into 48i16 (and store)
5896 {3, MVT::v32i16, 54}, // interleave 3 x 32i16 into 96i16 (and store)
5897
5898 {3, MVT::v2i32, 4}, // interleave 3 x 2i32 into 6i32 (and store)
5899 {3, MVT::v4i32, 5}, // interleave 3 x 4i32 into 12i32 (and store)
5900 {3, MVT::v8i32, 11}, // interleave 3 x 8i32 into 24i32 (and store)
5901 {3, MVT::v16i32, 22}, // interleave 3 x 16i32 into 48i32 (and store)
5902 {3, MVT::v32i32, 48}, // interleave 3 x 32i32 into 96i32 (and store)
5903
5904 {3, MVT::v2i64, 4}, // interleave 3 x 2i64 into 6i64 (and store)
5905 {3, MVT::v4i64, 6}, // interleave 3 x 4i64 into 12i64 (and store)
5906 {3, MVT::v8i64, 12}, // interleave 3 x 8i64 into 24i64 (and store)
5907 {3, MVT::v16i64, 24}, // interleave 3 x 16i64 into 48i64 (and store)
5908
5909 {4, MVT::v2i8, 4}, // interleave 4 x 2i8 into 8i8 (and store)
5910 {4, MVT::v4i8, 4}, // interleave 4 x 4i8 into 16i8 (and store)
5911 {4, MVT::v8i8, 4}, // interleave 4 x 8i8 into 32i8 (and store)
5912 {4, MVT::v16i8, 8}, // interleave 4 x 16i8 into 64i8 (and store)
5913 {4, MVT::v32i8, 12}, // interleave 4 x 32i8 into 128i8 (and store)
5914
5915 {4, MVT::v2i16, 2}, // interleave 4 x 2i16 into 8i16 (and store)
5916 {4, MVT::v4i16, 6}, // interleave 4 x 4i16 into 16i16 (and store)
5917 {4, MVT::v8i16, 10}, // interleave 4 x 8i16 into 32i16 (and store)
5918 {4, MVT::v16i16, 32}, // interleave 4 x 16i16 into 64i16 (and store)
5919 {4, MVT::v32i16, 64}, // interleave 4 x 32i16 into 128i16 (and store)
5920
5921 {4, MVT::v2i32, 5}, // interleave 4 x 2i32 into 8i32 (and store)
5922 {4, MVT::v4i32, 6}, // interleave 4 x 4i32 into 16i32 (and store)
5923 {4, MVT::v8i32, 16}, // interleave 4 x 8i32 into 32i32 (and store)
5924 {4, MVT::v16i32, 32}, // interleave 4 x 16i32 into 64i32 (and store)
5925 {4, MVT::v32i32, 64}, // interleave 4 x 32i32 into 128i32 (and store)
5926
5927 {4, MVT::v2i64, 6}, // interleave 4 x 2i64 into 8i64 (and store)
5928 {4, MVT::v4i64, 8}, // interleave 4 x 4i64 into 16i64 (and store)
5929 {4, MVT::v8i64, 20}, // interleave 4 x 8i64 into 32i64 (and store)
5930 {4, MVT::v16i64, 40}, // interleave 4 x 16i64 into 64i64 (and store)
5931
5932 {6, MVT::v2i8, 7}, // interleave 6 x 2i8 into 12i8 (and store)
5933 {6, MVT::v4i8, 9}, // interleave 6 x 4i8 into 24i8 (and store)
5934 {6, MVT::v8i8, 16}, // interleave 6 x 8i8 into 48i8 (and store)
5935 {6, MVT::v16i8, 27}, // interleave 6 x 16i8 into 96i8 (and store)
5936 {6, MVT::v32i8, 90}, // interleave 6 x 32i8 into 192i8 (and store)
5937
5938 {6, MVT::v2i16, 10}, // interleave 6 x 2i16 into 12i16 (and store)
5939 {6, MVT::v4i16, 15}, // interleave 6 x 4i16 into 24i16 (and store)
5940 {6, MVT::v8i16, 21}, // interleave 6 x 8i16 into 48i16 (and store)
5941 {6, MVT::v16i16, 58}, // interleave 6 x 16i16 into 96i16 (and store)
5942 {6, MVT::v32i16, 90}, // interleave 6 x 32i16 into 192i16 (and store)
5943
5944 {6, MVT::v2i32, 9}, // interleave 6 x 2i32 into 12i32 (and store)
5945 {6, MVT::v4i32, 12}, // interleave 6 x 4i32 into 24i32 (and store)
5946 {6, MVT::v8i32, 33}, // interleave 6 x 8i32 into 48i32 (and store)
5947 {6, MVT::v16i32, 66}, // interleave 6 x 16i32 into 96i32 (and store)
5948
5949 {6, MVT::v2i64, 8}, // interleave 6 x 2i64 into 12i64 (and store)
5950 {6, MVT::v4i64, 15}, // interleave 6 x 4i64 into 24i64 (and store)
5951 {6, MVT::v8i64, 30}, // interleave 6 x 8i64 into 48i64 (and store)
5952 };
5953
5954 static const CostTblEntry SSE2InterleavedStoreTbl[] = {
5955 {2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8 (and store)
5956 {2, MVT::v4i8, 1}, // interleave 2 x 4i8 into 8i8 (and store)
5957 {2, MVT::v8i8, 1}, // interleave 2 x 8i8 into 16i8 (and store)
5958
5959 {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16 (and store)
5960 {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16 (and store)
5961
5962 {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32 (and store)
5963 };
5964
5965 if (Opcode == Instruction::Load) {
5966 auto GetDiscountedCost = [Factor, NumMembers = Indices.size(),
5967 MemOpCosts](const CostTblEntry *Entry) {
5968 // NOTE: this is just an approximation!
5969 // It can over- or under-estimate the cost!
5970 return MemOpCosts + divideCeil(NumMembers * Entry->Cost, Factor);
5971 };
5972
5973 if (ST->hasAVX2())
5974 if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor,
5975 ETy.getSimpleVT()))
5976 return GetDiscountedCost(Entry);
5977
5978 if (ST->hasSSSE3())
5979 if (const auto *Entry = CostTableLookup(SSSE3InterleavedLoadTbl, Factor,
5980 ETy.getSimpleVT()))
5981 return GetDiscountedCost(Entry);
5982
5983 if (ST->hasSSE2())
5984 if (const auto *Entry = CostTableLookup(SSE2InterleavedLoadTbl, Factor,
5985 ETy.getSimpleVT()))
5986 return GetDiscountedCost(Entry);
5987 } else {
5988 assert(Opcode == Instruction::Store &&
5989 "Expected Store Instruction at this point");
5990 assert((!Indices.size() || Indices.size() == Factor) &&
5991 "Interleaved store only supports fully-interleaved groups.");
5992 if (ST->hasAVX2())
5993 if (const auto *Entry = CostTableLookup(AVX2InterleavedStoreTbl, Factor,
5994 ETy.getSimpleVT()))
5995 return MemOpCosts + Entry->Cost;
5996
5997 if (ST->hasSSE2())
5998 if (const auto *Entry = CostTableLookup(SSE2InterleavedStoreTbl, Factor,
5999 ETy.getSimpleVT()))
6000 return MemOpCosts + Entry->Cost;
6001 }
6002
6003 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6004 Alignment, AddressSpace, CostKind,
6005 UseMaskForCond, UseMaskForGaps);
6006}
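
The GetDiscountedCost lambda earlier in this function (source lines 5966-5971) scales the table's shuffle cost by the fraction of interleave-group members actually used, rounded up, and adds the raw memory-op cost. A small sketch of that arithmetic; the memory-op cost value used in the example is a placeholder.

#include <cstdio>

static unsigned discountedCost(unsigned MemOpCosts, unsigned NumMembers,
                               unsigned EntryCost, unsigned Factor) {
  // ceil(NumMembers * EntryCost / Factor) added to the memory-op cost.
  return MemOpCosts + (NumMembers * EntryCost + Factor - 1) / Factor;
}

int main() {
  // e.g. the AVX2 load entry {3, v8i32, 7} with only 2 of the 3 strided
  // results used and an assumed memory-op cost of 3:
  // 3 + ceil(2 * 7 / 3) = 3 + 5 = 8.
  std::printf("%u\n", discountedCost(3, 2, 7, 3));
}
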
6007
6008InstructionCost X86TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
6009 int64_t BaseOffset,
6010 bool HasBaseReg, int64_t Scale,
6011 unsigned AddrSpace) const {
6012 // Scaling factors are not free at all.
6013 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
6014 // will take 2 allocations in the out of order engine instead of 1
6015 // for plain addressing mode, i.e. inst (reg1).
6016 // E.g.,
6017 // vaddps (%rsi,%rdx), %ymm0, %ymm1
6018 // Requires two allocations (one for the load, one for the computation)
6019 // whereas:
6020 // vaddps (%rsi), %ymm0, %ymm1
6021 // Requires just 1 allocation, i.e., freeing allocations for other operations
6022 // and having fewer micro-operations to execute.
6023 //
6024 // For some X86 architectures, this is even worse because for instance for
6025 // stores, the complex addressing mode forces the instruction to use the
6026 // "load" ports instead of the dedicated "store" port.
6027 // E.g., on Haswell:
6028 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
6029 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
6030 TargetLoweringBase::AddrMode AM;
6031 AM.BaseGV = BaseGV;
6032 AM.BaseOffs = BaseOffset;
6033 AM.HasBaseReg = HasBaseReg;
6034 AM.Scale = Scale;
6035 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
6036 // Scale represents reg2 * scale, thus account for 1
6037 // as soon as we use a second register.
6038 return AM.Scale != 0;
6039 return -1;
6040}
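
The return convention above is worth spelling out: a legal addressing mode costs 0 when it uses no scaled index register and 1 when it does, while an illegal mode is reported as -1. The sketch below restates only that convention; the legality test is a trivial placeholder for the target-lowering query used in the real code.

#include <cstdio>

static int scalingFactorCostSketch(bool ModeIsLegal, long Scale) {
  if (ModeIsLegal)
    return Scale != 0; // the scaled index register costs one extra allocation
  return -1;           // addressing mode cannot be folded at all
}

int main() {
  std::printf("inst (reg)          -> %d\n", scalingFactorCostSketch(true, 0)); // 0
  std::printf("inst (reg, reg, 4)  -> %d\n", scalingFactorCostSketch(true, 4)); // 1
  std::printf("illegal mode        -> %d\n", scalingFactorCostSketch(false, 4)); // -1
}
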