Bug Summary

File: llvm/lib/Target/X86/X86TargetTransformInfo.cpp
Warning: line 2996, column 15
Division by zero

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name X86TargetTransformInfo.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -fno-split-dwarf-inlining -debugger-tuning=gdb -ffunction-sections -fdata-sections -resource-dir /usr/lib/llvm-12/lib/clang/12.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-12~++20200926111128+c6c5629f2fb/build-llvm/lib/Target/X86 -I /build/llvm-toolchain-snapshot-12~++20200926111128+c6c5629f2fb/llvm/lib/Target/X86 -I /build/llvm-toolchain-snapshot-12~++20200926111128+c6c5629f2fb/build-llvm/include -I /build/llvm-toolchain-snapshot-12~++20200926111128+c6c5629f2fb/llvm/include -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0/backward -internal-isystem /usr/local/include -internal-isystem /usr/lib/llvm-12/lib/clang/12.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir /build/llvm-toolchain-snapshot-12~++20200926111128+c6c5629f2fb/build-llvm/lib/Target/X86 -fdebug-prefix-map=/build/llvm-toolchain-snapshot-12~++20200926111128+c6c5629f2fb=. -ferror-limit 19 -fvisibility hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -o /tmp/scan-build-2020-09-26-161721-17566-1 -x c++ /build/llvm-toolchain-snapshot-12~++20200926111128+c6c5629f2fb/llvm/lib/Target/X86/X86TargetTransformInfo.cpp

/build/llvm-toolchain-snapshot-12~++20200926111128+c6c5629f2fb/llvm/lib/Target/X86/X86TargetTransformInfo.cpp

1//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements a TargetTransformInfo analysis pass specific to the
10/// X86 target machine. It uses the target's detailed information to provide
11/// more precise answers to certain TTI queries, while letting the target
12/// independent and default TTI implementations handle the rest.
13///
14//===----------------------------------------------------------------------===//
15/// A note about the cost-model numbers used below: they correspond to a
16/// "generic" X86 CPU rather than a concrete CPU model. Usually the numbers
17/// correspond to the CPU where the feature first appeared. For example, if we
18/// check Subtarget.hasSSE42() in the lookups below, the cost is based on
19/// Nehalem, as that was the first CPU to support that feature level and thus
20/// most likely has the worst-case cost.
21/// Some examples of other technologies/CPUs:
22/// SSE 3 - Pentium4 / Athlon64
23/// SSE 4.1 - Penryn
24/// SSE 4.2 - Nehalem
25/// AVX - Sandy Bridge
26/// AVX2 - Haswell
27/// AVX-512 - Xeon Phi / Skylake
28/// And some examples of instruction target dependent costs (latency)
29///                       divss     sqrtss    rsqrtss
30///   AMD K7              11-16     19        3
31///   Piledriver          9-24      13-15     5
32///   Jaguar              14        16        2
33///   Pentium II,III      18        30        2
34///   Nehalem             7-14      7-18      3
35///   Haswell             10-13     11        5
36/// TODO: Develop and implement the target dependent cost model and
37/// specialize cost numbers for different Cost Model Targets such as throughput,
38/// code size, latency and uop count.
39//===----------------------------------------------------------------------===//
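As a reading aid for the cost tables that follow: every query in this file first legalizes the IR type (yielding a split factor and a legal MVT), maps the IR opcode to an ISD node, and then probes a per-feature-level table, scaling any hit by the split factor. A condensed sketch of that pattern, using the CostTblEntry/CostTableLookup helpers and getTypeLegalizationCost that the file itself uses below; the single table entry is copied from the SSE2 table further down and is illustrative only:

  #include "llvm/CodeGen/CostTable.h"
  #include "llvm/CodeGen/TargetLowering.h"
  using namespace llvm;

  // Sketch of the lookup pattern used throughout getArithmeticInstrCost.
  static int lookupExampleCost(const TargetLoweringBase *TLI,
                               const DataLayout &DL, Type *Ty, int ISD) {
    // LT.first  = number of legal registers the type splits into,
    // LT.second = the legal MVT used as the table key.
    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
    static const CostTblEntry ExampleTable[] = {
        {ISD::FDIV, MVT::v4f32, 39}, // divps (entry taken from the SSE2 table)
    };
    if (const auto *Entry = CostTableLookup(ExampleTable, ISD, LT.second))
      return LT.first * Entry->Cost; // scale the per-register cost by the split
    return -1; // no entry: the real code falls back to the base implementation
  }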
40
41#include "X86TargetTransformInfo.h"
42#include "llvm/Analysis/TargetTransformInfo.h"
43#include "llvm/CodeGen/BasicTTIImpl.h"
44#include "llvm/CodeGen/CostTable.h"
45#include "llvm/CodeGen/TargetLowering.h"
46#include "llvm/IR/IntrinsicInst.h"
47#include "llvm/Support/Debug.h"
48
49using namespace llvm;
50
51#define DEBUG_TYPE "x86tti"
52
53//===----------------------------------------------------------------------===//
54//
55// X86 cost model.
56//
57//===----------------------------------------------------------------------===//
58
59TargetTransformInfo::PopcntSupportKind
60X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
61 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
62 // TODO: Currently the __builtin_popcount() implementation using SSE3
63 // instructions is inefficient. Once the problem is fixed, we should
64 // call ST->hasSSE3() instead of ST->hasPOPCNT().
65 return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
66}
67
68llvm::Optional<unsigned> X86TTIImpl::getCacheSize(
69 TargetTransformInfo::CacheLevel Level) const {
70 switch (Level) {
71 case TargetTransformInfo::CacheLevel::L1D:
72 // - Penryn
73 // - Nehalem
74 // - Westmere
75 // - Sandy Bridge
76 // - Ivy Bridge
77 // - Haswell
78 // - Broadwell
79 // - Skylake
80 // - Kabylake
81 return 32 * 1024; // 32 KByte
82 case TargetTransformInfo::CacheLevel::L2D:
83 // - Penryn
84 // - Nehalem
85 // - Westmere
86 // - Sandy Bridge
87 // - Ivy Bridge
88 // - Haswell
89 // - Broadwell
90 // - Skylake
91 // - Kabylake
92 return 256 * 1024; // 256 KByte
93 }
94
95 llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
96}
97
98llvm::Optional<unsigned> X86TTIImpl::getCacheAssociativity(
99 TargetTransformInfo::CacheLevel Level) const {
100 // - Penryn
101 // - Nehalem
102 // - Westmere
103 // - Sandy Bridge
104 // - Ivy Bridge
105 // - Haswell
106 // - Broadwell
107 // - Skylake
108 // - Kabylake
109 switch (Level) {
110 case TargetTransformInfo::CacheLevel::L1D:
111 LLVM_FALLTHROUGH;
112 case TargetTransformInfo::CacheLevel::L2D:
113 return 8;
114 }
115
116 llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
117}
118
119unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
120 bool Vector = (ClassID == 1);
121 if (Vector && !ST->hasSSE1())
122 return 0;
123
124 if (ST->is64Bit()) {
125 if (Vector && ST->hasAVX512())
126 return 32;
127 return 16;
128 }
129 return 8;
130}
131
132unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) const {
133 unsigned PreferVectorWidth = ST->getPreferVectorWidth();
134 if (Vector) {
135 if (ST->hasAVX512() && PreferVectorWidth >= 512)
136 return 512;
137 if (ST->hasAVX() && PreferVectorWidth >= 256)
138 return 256;
139 if (ST->hasSSE1() && PreferVectorWidth >= 128)
140 return 128;
141 return 0;
142 }
143
144 if (ST->is64Bit())
145 return 64;
146
147 return 32;
148}
149
150unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
151 return getRegisterBitWidth(true);
152}
153
154unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
155 // If the loop will not be vectorized, don't interleave the loop.
156 // Let the regular unroller unroll the loop, which saves the overflow
157 // check and memory check cost.
158 if (VF == 1)
159 return 1;
160
161 if (ST->isAtom())
162 return 1;
163
164 // Sandybridge and Haswell have multiple execution ports and pipelined
165 // vector units.
166 if (ST->hasAVX())
167 return 4;
168
169 return 2;
170}
171
172int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
173 TTI::TargetCostKind CostKind,
174 TTI::OperandValueKind Op1Info,
175 TTI::OperandValueKind Op2Info,
176 TTI::OperandValueProperties Opd1PropInfo,
177 TTI::OperandValueProperties Opd2PropInfo,
178 ArrayRef<const Value *> Args,
179 const Instruction *CxtI) {
180 // TODO: Handle more cost kinds.
181 if (CostKind != TTI::TCK_RecipThroughput)
182 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
183 Op2Info, Opd1PropInfo,
184 Opd2PropInfo, Args, CxtI);
185 // Legalize the type.
186 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
187
188 int ISD = TLI->InstructionOpcodeToISD(Opcode);
189 assert(ISD && "Invalid opcode");
190
191 static const CostTblEntry GLMCostTable[] = {
192 { ISD::FDIV, MVT::f32, 18 }, // divss
193 { ISD::FDIV, MVT::v4f32, 35 }, // divps
194 { ISD::FDIV, MVT::f64, 33 }, // divsd
195 { ISD::FDIV, MVT::v2f64, 65 }, // divpd
196 };
197
198 if (ST->useGLMDivSqrtCosts())
199 if (const auto *Entry = CostTableLookup(GLMCostTable, ISD,
200 LT.second))
201 return LT.first * Entry->Cost;
202
203 static const CostTblEntry SLMCostTable[] = {
204 { ISD::MUL, MVT::v4i32, 11 }, // pmulld
205 { ISD::MUL, MVT::v8i16, 2 }, // pmullw
206 { ISD::MUL, MVT::v16i8, 14 }, // extend/pmullw/trunc sequence.
207 { ISD::FMUL, MVT::f64, 2 }, // mulsd
208 { ISD::FMUL, MVT::v2f64, 4 }, // mulpd
209 { ISD::FMUL, MVT::v4f32, 2 }, // mulps
210 { ISD::FDIV, MVT::f32, 17 }, // divss
211 { ISD::FDIV, MVT::v4f32, 39 }, // divps
212 { ISD::FDIV, MVT::f64, 32 }, // divsd
213 { ISD::FDIV, MVT::v2f64, 69 }, // divpd
214 { ISD::FADD, MVT::v2f64, 2 }, // addpd
215 { ISD::FSUB, MVT::v2f64, 2 }, // subpd
216 // v2i64/v4i64 mul is custom lowered as a series of long:
217 // multiplies(3), shifts(3) and adds(2)
218 // slm muldq version throughput is 2 and addq throughput 4
219 // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
220 // 3X4 (addq throughput) = 17
221 { ISD::MUL, MVT::v2i64, 17 },
222 // slm addq\subq throughput is 4
223 { ISD::ADD, MVT::v2i64, 4 },
224 { ISD::SUB, MVT::v2i64, 4 },
225 };
226
227 if (ST->isSLM()) {
228 if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) {
229 // Check if the operands can be shrunk into a smaller datatype.
230 bool Op1Signed = false;
231 unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
232 bool Op2Signed = false;
233 unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
234
235 bool signedMode = Op1Signed | Op2Signed;
236 unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
237
238 if (OpMinSize <= 7)
239 return LT.first * 3; // pmullw/sext
240 if (!signedMode && OpMinSize <= 8)
241 return LT.first * 3; // pmullw/zext
242 if (OpMinSize <= 15)
243 return LT.first * 5; // pmullw/pmulhw/pshuf
244 if (!signedMode && OpMinSize <= 16)
245 return LT.first * 5; // pmullw/pmulhw/pshuf
246 }
247
248 if (const auto *Entry = CostTableLookup(SLMCostTable, ISD,
249 LT.second)) {
250 return LT.first * Entry->Cost;
251 }
252 }
253
254 if ((ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
255 ISD == ISD::UREM) &&
256 (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
257 Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
258 Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
259 if (ISD == ISD::SDIV || ISD == ISD::SREM) {
260 // On X86, vector signed division by a power-of-two constant is
261 // normally expanded to the sequence SRA + SRL + ADD + SRA.
262 // The OperandValue properties may not be the same as that of the previous
263 // operation; conservatively assume OP_None.
264 int Cost =
265 2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, Op1Info,
266 Op2Info,
267 TargetTransformInfo::OP_None,
268 TargetTransformInfo::OP_None);
269 Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info,
270 Op2Info,
271 TargetTransformInfo::OP_None,
272 TargetTransformInfo::OP_None);
273 Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind, Op1Info,
274 Op2Info,
275 TargetTransformInfo::OP_None,
276 TargetTransformInfo::OP_None);
277
278 if (ISD == ISD::SREM) {
279 // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
280 Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info,
281 Op2Info);
282 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info,
283 Op2Info);
284 }
285
286 return Cost;
287 }
288
289 // Vector unsigned division/remainder will be simplified to shifts/masks.
290 if (ISD == ISD::UDIV)
291 return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
292 Op1Info, Op2Info,
293 TargetTransformInfo::OP_None,
294 TargetTransformInfo::OP_None);
295
296 else // UREM
297 return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
298 Op1Info, Op2Info,
299 TargetTransformInfo::OP_None,
300 TargetTransformInfo::OP_None);
301 }
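The SREM branch above relies on the identity (X % C) == X - (X / C) * C, which holds for C/C++ truncating signed division (and LLVM's srem/sdiv) for any non-zero C. A minimal standalone check of that identity, illustrative and not part of this file:

  #include <cassert>

  int main() {
    const int C = 8; // a power-of-two divisor, as in the costing above
    for (int X = -32; X <= 32; ++X)
      assert(X % C == X - (X / C) * C); // the expansion the SREM cost models
    return 0;
  }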
302
303 static const CostTblEntry AVX512BWUniformConstCostTable[] = {
304 { ISD::SHL, MVT::v64i8, 2 }, // psllw + pand.
305 { ISD::SRL, MVT::v64i8, 2 }, // psrlw + pand.
306 { ISD::SRA, MVT::v64i8, 4 }, // psrlw, pand, pxor, psubb.
307 };
308
309 if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
310 ST->hasBWI()) {
311 if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD,
312 LT.second))
313 return LT.first * Entry->Cost;
314 }
315
316 static const CostTblEntry AVX512UniformConstCostTable[] = {
317 { ISD::SRA, MVT::v2i64, 1 },
318 { ISD::SRA, MVT::v4i64, 1 },
319 { ISD::SRA, MVT::v8i64, 1 },
320
321 { ISD::SHL, MVT::v64i8, 4 }, // psllw + pand.
322 { ISD::SRL, MVT::v64i8, 4 }, // psrlw + pand.
323 { ISD::SRA, MVT::v64i8, 8 }, // psrlw, pand, pxor, psubb.
324
325 { ISD::SDIV, MVT::v16i32, 6 }, // pmuludq sequence
326 { ISD::SREM, MVT::v16i32, 8 }, // pmuludq+mul+sub sequence
327 { ISD::UDIV, MVT::v16i32, 5 }, // pmuludq sequence
328 { ISD::UREM, MVT::v16i32, 7 }, // pmuludq+mul+sub sequence
329 };
330
331 if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
332 ST->hasAVX512()) {
333 if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD,
334 LT.second))
335 return LT.first * Entry->Cost;
336 }
337
338 static const CostTblEntry AVX2UniformConstCostTable[] = {
339 { ISD::SHL, MVT::v32i8, 2 }, // psllw + pand.
340 { ISD::SRL, MVT::v32i8, 2 }, // psrlw + pand.
341 { ISD::SRA, MVT::v32i8, 4 }, // psrlw, pand, pxor, psubb.
342
343 { ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle.
344
345 { ISD::SDIV, MVT::v8i32, 6 }, // pmuludq sequence
346 { ISD::SREM, MVT::v8i32, 8 }, // pmuludq+mul+sub sequence
347 { ISD::UDIV, MVT::v8i32, 5 }, // pmuludq sequence
348 { ISD::UREM, MVT::v8i32, 7 }, // pmuludq+mul+sub sequence
349 };
350
351 if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
352 ST->hasAVX2()) {
353 if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD,
354 LT.second))
355 return LT.first * Entry->Cost;
356 }
357
358 static const CostTblEntry SSE2UniformConstCostTable[] = {
359 { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand.
360 { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand.
361 { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb.
362
363 { ISD::SHL, MVT::v32i8, 4+2 }, // 2*(psllw + pand) + split.
364 { ISD::SRL, MVT::v32i8, 4+2 }, // 2*(psrlw + pand) + split.
365 { ISD::SRA, MVT::v32i8, 8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.
366
367 { ISD::SDIV, MVT::v8i32, 12+2 }, // 2*pmuludq sequence + split.
368 { ISD::SREM, MVT::v8i32, 16+2 }, // 2*pmuludq+mul+sub sequence + split.
369 { ISD::SDIV, MVT::v4i32, 6 }, // pmuludq sequence
370 { ISD::SREM, MVT::v4i32, 8 }, // pmuludq+mul+sub sequence
371 { ISD::UDIV, MVT::v8i32, 10+2 }, // 2*pmuludq sequence + split.
372 { ISD::UREM, MVT::v8i32, 14+2 }, // 2*pmuludq+mul+sub sequence + split.
373 { ISD::UDIV, MVT::v4i32, 5 }, // pmuludq sequence
374 { ISD::UREM, MVT::v4i32, 7 }, // pmuludq+mul+sub sequence
375 };
376
377 // XOP has faster vXi8 shifts.
378 if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
379 ST->hasSSE2() && !ST->hasXOP()) {
380 if (const auto *Entry =
381 CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
382 return LT.first * Entry->Cost;
383 }
384
385 static const CostTblEntry AVX512BWConstCostTable[] = {
386 { ISD::SDIV, MVT::v64i8, 14 }, // 2*ext+2*pmulhw sequence
387 { ISD::SREM, MVT::v64i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
388 { ISD::UDIV, MVT::v64i8, 14 }, // 2*ext+2*pmulhw sequence
389 { ISD::UREM, MVT::v64i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
390 { ISD::SDIV, MVT::v32i16, 6 }, // vpmulhw sequence
391 { ISD::SREM, MVT::v32i16, 8 }, // vpmulhw+mul+sub sequence
392 { ISD::UDIV, MVT::v32i16, 6 }, // vpmulhuw sequence
393 { ISD::UREM, MVT::v32i16, 8 }, // vpmulhuw+mul+sub sequence
394 };
395
396 if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
397 Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
398 ST->hasBWI()) {
399 if (const auto *Entry =
400 CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
401 return LT.first * Entry->Cost;
402 }
403
404 static const CostTblEntry AVX512ConstCostTable[] = {
405 { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
406 { ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence
407 { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
408 { ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence
409 { ISD::SDIV, MVT::v64i8, 28 }, // 4*ext+4*pmulhw sequence
410 { ISD::SREM, MVT::v64i8, 32 }, // 4*ext+4*pmulhw+mul+sub sequence
411 { ISD::UDIV, MVT::v64i8, 28 }, // 4*ext+4*pmulhw sequence
412 { ISD::UREM, MVT::v64i8, 32 }, // 4*ext+4*pmulhw+mul+sub sequence
413 { ISD::SDIV, MVT::v32i16, 12 }, // 2*vpmulhw sequence
414 { ISD::SREM, MVT::v32i16, 16 }, // 2*vpmulhw+mul+sub sequence
415 { ISD::UDIV, MVT::v32i16, 12 }, // 2*vpmulhuw sequence
416 { ISD::UREM, MVT::v32i16, 16 }, // 2*vpmulhuw+mul+sub sequence
417 };
418
419 if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
420 Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
421 ST->hasAVX512()) {
422 if (const auto *Entry =
423 CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
424 return LT.first * Entry->Cost;
425 }
426
427 static const CostTblEntry AVX2ConstCostTable[] = {
428 { ISD::SDIV, MVT::v32i8, 14 }, // 2*ext+2*pmulhw sequence
429 { ISD::SREM, MVT::v32i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
430 { ISD::UDIV, MVT::v32i8, 14 }, // 2*ext+2*pmulhw sequence
431 { ISD::UREM, MVT::v32i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
432 { ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence
433 { ISD::SREM, MVT::v16i16, 8 }, // vpmulhw+mul+sub sequence
434 { ISD::UDIV, MVT::v16i16, 6 }, // vpmulhuw sequence
435 { ISD::UREM, MVT::v16i16, 8 }, // vpmulhuw+mul+sub sequence
436 { ISD::SDIV, MVT::v8i32, 15 }, // vpmuldq sequence
437 { ISD::SREM, MVT::v8i32, 19 }, // vpmuldq+mul+sub sequence
438 { ISD::UDIV, MVT::v8i32, 15 }, // vpmuludq sequence
439 { ISD::UREM, MVT::v8i32, 19 }, // vpmuludq+mul+sub sequence
440 };
441
442 if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
443 Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
444 ST->hasAVX2()) {
445 if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
446 return LT.first * Entry->Cost;
447 }
448
449 static const CostTblEntry SSE2ConstCostTable[] = {
450 { ISD::SDIV, MVT::v32i8, 28+2 }, // 4*ext+4*pmulhw sequence + split.
451 { ISD::SREM, MVT::v32i8, 32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
452 { ISD::SDIV, MVT::v16i8, 14 }, // 2*ext+2*pmulhw sequence
453 { ISD::SREM, MVT::v16i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
454 { ISD::UDIV, MVT::v32i8, 28+2 }, // 4*ext+4*pmulhw sequence + split.
455 { ISD::UREM, MVT::v32i8, 32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
456 { ISD::UDIV, MVT::v16i8, 14 }, // 2*ext+2*pmulhw sequence
457 { ISD::UREM, MVT::v16i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
458 { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
459 { ISD::SREM, MVT::v16i16, 16+2 }, // 2*pmulhw+mul+sub sequence + split.
460 { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence
461 { ISD::SREM, MVT::v8i16, 8 }, // pmulhw+mul+sub sequence
462 { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split.
463 { ISD::UREM, MVT::v16i16, 16+2 }, // 2*pmulhuw+mul+sub sequence + split.
464 { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence
465 { ISD::UREM, MVT::v8i16, 8 }, // pmulhuw+mul+sub sequence
466 { ISD::SDIV, MVT::v8i32, 38+2 }, // 2*pmuludq sequence + split.
467 { ISD::SREM, MVT::v8i32, 48+2 }, // 2*pmuludq+mul+sub sequence + split.
468 { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence
469 { ISD::SREM, MVT::v4i32, 24 }, // pmuludq+mul+sub sequence
470 { ISD::UDIV, MVT::v8i32, 30+2 }, // 2*pmuludq sequence + split.
471 { ISD::UREM, MVT::v8i32, 40+2 }, // 2*pmuludq+mul+sub sequence + split.
472 { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence
473 { ISD::UREM, MVT::v4i32, 20 }, // pmuludq+mul+sub sequence
474 };
475
476 if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
477 Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
478 ST->hasSSE2()) {
479 // pmuldq sequence.
480 if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
481 return LT.first * 32;
482 if (ISD == ISD::SREM && LT.second == MVT::v8i32 && ST->hasAVX())
483 return LT.first * 38;
484 if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
485 return LT.first * 15;
486 if (ISD == ISD::SREM && LT.second == MVT::v4i32 && ST->hasSSE41())
487 return LT.first * 20;
488
489 if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
490 return LT.first * Entry->Cost;
491 }
492
493 static const CostTblEntry AVX512BWShiftCostTable[] = {
494 { ISD::SHL, MVT::v8i16, 1 }, // vpsllvw
495 { ISD::SRL, MVT::v8i16, 1 }, // vpsrlvw
496 { ISD::SRA, MVT::v8i16, 1 }, // vpsravw
497
498 { ISD::SHL, MVT::v16i16, 1 }, // vpsllvw
499 { ISD::SRL, MVT::v16i16, 1 }, // vpsrlvw
500 { ISD::SRA, MVT::v16i16, 1 }, // vpsravw
501
502 { ISD::SHL, MVT::v32i16, 1 }, // vpsllvw
503 { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw
504 { ISD::SRA, MVT::v32i16, 1 }, // vpsravw
505 };
506
507 if (ST->hasBWI())
508 if (const auto *Entry = CostTableLookup(AVX512BWShiftCostTable, ISD, LT.second))
509 return LT.first * Entry->Cost;
510
511 static const CostTblEntry AVX2UniformCostTable[] = {
512 // Uniform splats are cheaper for the following instructions.
513 { ISD::SHL, MVT::v16i16, 1 }, // psllw.
514 { ISD::SRL, MVT::v16i16, 1 }, // psrlw.
515 { ISD::SRA, MVT::v16i16, 1 }, // psraw.
516 { ISD::SHL, MVT::v32i16, 2 }, // 2*psllw.
517 { ISD::SRL, MVT::v32i16, 2 }, // 2*psrlw.
518 { ISD::SRA, MVT::v32i16, 2 }, // 2*psraw.
519 };
520
521 if (ST->hasAVX2() &&
522 ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
523 (Op2Info == TargetTransformInfo::OK_UniformValue))) {
524 if (const auto *Entry =
525 CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
526 return LT.first * Entry->Cost;
527 }
528
529 static const CostTblEntry SSE2UniformCostTable[] = {
530 // Uniform splats are cheaper for the following instructions.
531 { ISD::SHL, MVT::v8i16, 1 }, // psllw.
532 { ISD::SHL, MVT::v4i32, 1 }, // pslld
533 { ISD::SHL, MVT::v2i64, 1 }, // psllq.
534
535 { ISD::SRL, MVT::v8i16, 1 }, // psrlw.
536 { ISD::SRL, MVT::v4i32, 1 }, // psrld.
537 { ISD::SRL, MVT::v2i64, 1 }, // psrlq.
538
539 { ISD::SRA, MVT::v8i16, 1 }, // psraw.
540 { ISD::SRA, MVT::v4i32, 1 }, // psrad.
541 };
542
543 if (ST->hasSSE2() &&
544 ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
545 (Op2Info == TargetTransformInfo::OK_UniformValue))) {
546 if (const auto *Entry =
547 CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
548 return LT.first * Entry->Cost;
549 }
550
551 static const CostTblEntry AVX512DQCostTable[] = {
552 { ISD::MUL, MVT::v2i64, 1 },
553 { ISD::MUL, MVT::v4i64, 1 },
554 { ISD::MUL, MVT::v8i64, 1 }
555 };
556
557 // Look for AVX512DQ lowering tricks for custom cases.
558 if (ST->hasDQI())
559 if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
560 return LT.first * Entry->Cost;
561
562 static const CostTblEntry AVX512BWCostTable[] = {
563 { ISD::SHL, MVT::v64i8, 11 }, // vpblendvb sequence.
564 { ISD::SRL, MVT::v64i8, 11 }, // vpblendvb sequence.
565 { ISD::SRA, MVT::v64i8, 24 }, // vpblendvb sequence.
566
567 { ISD::MUL, MVT::v64i8, 11 }, // extend/pmullw/trunc sequence.
568 { ISD::MUL, MVT::v32i8, 4 }, // extend/pmullw/trunc sequence.
569 { ISD::MUL, MVT::v16i8, 4 }, // extend/pmullw/trunc sequence.
570 };
571
572 // Look for AVX512BW lowering tricks for custom cases.
573 if (ST->hasBWI())
574 if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
575 return LT.first * Entry->Cost;
576
577 static const CostTblEntry AVX512CostTable[] = {
578 { ISD::SHL, MVT::v16i32, 1 },
579 { ISD::SRL, MVT::v16i32, 1 },
580 { ISD::SRA, MVT::v16i32, 1 },
581
582 { ISD::SHL, MVT::v8i64, 1 },
583 { ISD::SRL, MVT::v8i64, 1 },
584
585 { ISD::SRA, MVT::v2i64, 1 },
586 { ISD::SRA, MVT::v4i64, 1 },
587 { ISD::SRA, MVT::v8i64, 1 },
588
589 { ISD::MUL, MVT::v64i8, 26 }, // extend/pmullw/trunc sequence.
590 { ISD::MUL, MVT::v32i8, 13 }, // extend/pmullw/trunc sequence.
591 { ISD::MUL, MVT::v16i8, 5 }, // extend/pmullw/trunc sequence.
592 { ISD::MUL, MVT::v16i32, 1 }, // pmulld (Skylake from agner.org)
593 { ISD::MUL, MVT::v8i32, 1 }, // pmulld (Skylake from agner.org)
594 { ISD::MUL, MVT::v4i32, 1 }, // pmulld (Skylake from agner.org)
595 { ISD::MUL, MVT::v8i64, 8 }, // 3*pmuludq/3*shift/2*add
596
597 { ISD::FADD, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
598 { ISD::FSUB, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
599 { ISD::FMUL, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
600
601 { ISD::FADD, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
602 { ISD::FSUB, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
603 { ISD::FMUL, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
604 };
605
606 if (ST->hasAVX512())
607 if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
608 return LT.first * Entry->Cost;
609
610 static const CostTblEntry AVX2ShiftCostTable[] = {
611 // Shifts on v4i64/v8i32 on AVX2 are legal even though we declare them
612 // custom so we can detect the cases where the shift amount is a scalar.
613 { ISD::SHL, MVT::v4i32, 1 },
614 { ISD::SRL, MVT::v4i32, 1 },
615 { ISD::SRA, MVT::v4i32, 1 },
616 { ISD::SHL, MVT::v8i32, 1 },
617 { ISD::SRL, MVT::v8i32, 1 },
618 { ISD::SRA, MVT::v8i32, 1 },
619 { ISD::SHL, MVT::v2i64, 1 },
620 { ISD::SRL, MVT::v2i64, 1 },
621 { ISD::SHL, MVT::v4i64, 1 },
622 { ISD::SRL, MVT::v4i64, 1 },
623 };
624
625 if (ST->hasAVX512()) {
626 if (ISD == ISD::SHL && LT.second == MVT::v32i16 &&
627 (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
628 Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
629 // On AVX512, a packed v32i16 shift left by a constant build_vector
630 // is lowered into a vector multiply (vpmullw).
631 return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
632 Op1Info, Op2Info,
633 TargetTransformInfo::OP_None,
634 TargetTransformInfo::OP_None);
635 }
636
637 // Look for AVX2 lowering tricks.
638 if (ST->hasAVX2()) {
639 if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
640 (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
641 Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
642 // On AVX2, a packed v16i16 shift left by a constant build_vector
643 // is lowered into a vector multiply (vpmullw).
644 return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
645 Op1Info, Op2Info,
646 TargetTransformInfo::OP_None,
647 TargetTransformInfo::OP_None);
648
649 if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
650 return LT.first * Entry->Cost;
651 }
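The vpmullw lowerings noted above (v32i16 on AVX512, v16i16 on AVX2) work because a left shift by a constant amount is, per lane, a multiply by a power of two: x << c == x * (1 << c), e.g. 5 << 3 == 5 * 8 == 40. A constant build_vector of shift amounts therefore becomes a constant build_vector of multipliers.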
652
653 static const CostTblEntry XOPShiftCostTable[] = {
654 // 128bit shifts take 1cy, but right shifts require negation beforehand.
655 { ISD::SHL, MVT::v16i8, 1 },
656 { ISD::SRL, MVT::v16i8, 2 },
657 { ISD::SRA, MVT::v16i8, 2 },
658 { ISD::SHL, MVT::v8i16, 1 },
659 { ISD::SRL, MVT::v8i16, 2 },
660 { ISD::SRA, MVT::v8i16, 2 },
661 { ISD::SHL, MVT::v4i32, 1 },
662 { ISD::SRL, MVT::v4i32, 2 },
663 { ISD::SRA, MVT::v4i32, 2 },
664 { ISD::SHL, MVT::v2i64, 1 },
665 { ISD::SRL, MVT::v2i64, 2 },
666 { ISD::SRA, MVT::v2i64, 2 },
667 // 256bit shifts require splitting if AVX2 didn't catch them above.
668 { ISD::SHL, MVT::v32i8, 2+2 },
669 { ISD::SRL, MVT::v32i8, 4+2 },
670 { ISD::SRA, MVT::v32i8, 4+2 },
671 { ISD::SHL, MVT::v16i16, 2+2 },
672 { ISD::SRL, MVT::v16i16, 4+2 },
673 { ISD::SRA, MVT::v16i16, 4+2 },
674 { ISD::SHL, MVT::v8i32, 2+2 },
675 { ISD::SRL, MVT::v8i32, 4+2 },
676 { ISD::SRA, MVT::v8i32, 4+2 },
677 { ISD::SHL, MVT::v4i64, 2+2 },
678 { ISD::SRL, MVT::v4i64, 4+2 },
679 { ISD::SRA, MVT::v4i64, 4+2 },
680 };
681
682 // Look for XOP lowering tricks.
683 if (ST->hasXOP()) {
684 // If the right shift is constant then we'll fold the negation so
685 // it's as cheap as a left shift.
686 int ShiftISD = ISD;
687 if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) &&
688 (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
689 Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
690 ShiftISD = ISD::SHL;
691 if (const auto *Entry =
692 CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
693 return LT.first * Entry->Cost;
694 }
695
696 static const CostTblEntry SSE2UniformShiftCostTable[] = {
697 // Uniform splats are cheaper for the following instructions.
698 { ISD::SHL, MVT::v16i16, 2+2 }, // 2*psllw + split.
699 { ISD::SHL, MVT::v8i32, 2+2 }, // 2*pslld + split.
700 { ISD::SHL, MVT::v4i64, 2+2 }, // 2*psllq + split.
701
702 { ISD::SRL, MVT::v16i16, 2+2 }, // 2*psrlw + split.
703 { ISD::SRL, MVT::v8i32, 2+2 }, // 2*psrld + split.
704 { ISD::SRL, MVT::v4i64, 2+2 }, // 2*psrlq + split.
705
706 { ISD::SRA, MVT::v16i16, 2+2 }, // 2*psraw + split.
707 { ISD::SRA, MVT::v8i32, 2+2 }, // 2*psrad + split.
708 { ISD::SRA, MVT::v2i64, 4 }, // 2*psrad + shuffle.
709 { ISD::SRA, MVT::v4i64, 8+2 }, // 2*(2*psrad + shuffle) + split.
710 };
711
712 if (ST->hasSSE2() &&
713 ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
714 (Op2Info == TargetTransformInfo::OK_UniformValue))) {
715
716 // Handle AVX2 uniform v4i64 ISD::SRA, it's not worth a table.
717 if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2())
718 return LT.first * 4; // 2*psrad + shuffle.
719
720 if (const auto *Entry =
721 CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second))
722 return LT.first * Entry->Cost;
723 }
724
725 if (ISD == ISD::SHL &&
726 Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
727 MVT VT = LT.second;
728 // Vector shift left by non uniform constant can be lowered
729 // into vector multiply.
730 if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
731 ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
732 ISD = ISD::MUL;
733 }
734
735 static const CostTblEntry AVX2CostTable[] = {
736 { ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence.
737 { ISD::SHL, MVT::v64i8, 22 }, // 2*vpblendvb sequence.
738 { ISD::SHL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
739 { ISD::SHL, MVT::v32i16, 20 }, // 2*extend/vpsrlvd/pack sequence.
740
741 { ISD::SRL, MVT::v32i8, 11 }, // vpblendvb sequence.
742 { ISD::SRL, MVT::v64i8, 22 }, // 2*vpblendvb sequence.
743 { ISD::SRL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
744 { ISD::SRL, MVT::v32i16, 20 }, // 2*extend/vpsrlvd/pack sequence.
745
746 { ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence.
747 { ISD::SRA, MVT::v64i8, 48 }, // 2*vpblendvb sequence.
748 { ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence.
749 { ISD::SRA, MVT::v32i16, 20 }, // 2*extend/vpsravd/pack sequence.
750 { ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence.
751 { ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence.
752
753 { ISD::SUB, MVT::v32i8, 1 }, // psubb
754 { ISD::ADD, MVT::v32i8, 1 }, // paddb
755 { ISD::SUB, MVT::v16i16, 1 }, // psubw
756 { ISD::ADD, MVT::v16i16, 1 }, // paddw
757 { ISD::SUB, MVT::v8i32, 1 }, // psubd
758 { ISD::ADD, MVT::v8i32, 1 }, // paddd
759 { ISD::SUB, MVT::v4i64, 1 }, // psubq
760 { ISD::ADD, MVT::v4i64, 1 }, // paddq
761
762 { ISD::MUL, MVT::v32i8, 17 }, // extend/pmullw/trunc sequence.
763 { ISD::MUL, MVT::v16i8, 7 }, // extend/pmullw/trunc sequence.
764 { ISD::MUL, MVT::v16i16, 1 }, // pmullw
765 { ISD::MUL, MVT::v8i32, 2 }, // pmulld (Haswell from agner.org)
766 { ISD::MUL, MVT::v4i64, 8 }, // 3*pmuludq/3*shift/2*add
767
768 { ISD::FADD, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
769 { ISD::FADD, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
770 { ISD::FSUB, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
771 { ISD::FSUB, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
772 { ISD::FMUL, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
773 { ISD::FMUL, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
774
775 { ISD::FDIV, MVT::f32, 7 }, // Haswell from http://www.agner.org/
776 { ISD::FDIV, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
777 { ISD::FDIV, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
778 { ISD::FDIV, MVT::f64, 14 }, // Haswell from http://www.agner.org/
779 { ISD::FDIV, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
780 { ISD::FDIV, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
781 };
782
783 // Look for AVX2 lowering tricks for custom cases.
784 if (ST->hasAVX2())
785 if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
786 return LT.first * Entry->Cost;
787
788 static const CostTblEntry AVX1CostTable[] = {
789 // We don't have to scalarize unsupported ops. We can issue two half-sized
790 // operations and we only need to extract the upper YMM half.
791 // Two ops + 1 extract + 1 insert = 4.
792 { ISD::MUL, MVT::v16i16, 4 },
793 { ISD::MUL, MVT::v8i32, 4 },
794 { ISD::SUB, MVT::v32i8, 4 },
795 { ISD::ADD, MVT::v32i8, 4 },
796 { ISD::SUB, MVT::v16i16, 4 },
797 { ISD::ADD, MVT::v16i16, 4 },
798 { ISD::SUB, MVT::v8i32, 4 },
799 { ISD::ADD, MVT::v8i32, 4 },
800 { ISD::SUB, MVT::v4i64, 4 },
801 { ISD::ADD, MVT::v4i64, 4 },
802
803 // A v4i64 multiply is custom lowered as two split v2i64 vectors that then
804 // are lowered as a series of long multiplies(3), shifts(3) and adds(2)
805 // Because we believe v4i64 to be a legal type, we must also include the
806 // extract+insert in the cost table. Therefore, the cost here is 18
807 // instead of 8.
808 { ISD::MUL, MVT::v4i64, 18 },
809
810 { ISD::MUL, MVT::v32i8, 26 }, // extend/pmullw/trunc sequence.
811
812 { ISD::FDIV, MVT::f32, 14 }, // SNB from http://www.agner.org/
813 { ISD::FDIV, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
814 { ISD::FDIV, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
815 { ISD::FDIV, MVT::f64, 22 }, // SNB from http://www.agner.org/
816 { ISD::FDIV, MVT::v2f64, 22 }, // SNB from http://www.agner.org/
817 { ISD::FDIV, MVT::v4f64, 44 }, // SNB from http://www.agner.org/
818 };
819
820 if (ST->hasAVX())
821 if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
822 return LT.first * Entry->Cost;
823
824 static const CostTblEntry SSE42CostTable[] = {
825 { ISD::FADD, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
826 { ISD::FADD, MVT::f32, 1 }, // Nehalem from http://www.agner.org/
827 { ISD::FADD, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
828 { ISD::FADD, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
829
830 { ISD::FSUB, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
831 { ISD::FSUB, MVT::f32 , 1 }, // Nehalem from http://www.agner.org/
832 { ISD::FSUB, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
833 { ISD::FSUB, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
834
835 { ISD::FMUL, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
836 { ISD::FMUL, MVT::f32, 1 }, // Nehalem from http://www.agner.org/
837 { ISD::FMUL, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
838 { ISD::FMUL, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
839
840 { ISD::FDIV, MVT::f32, 14 }, // Nehalem from http://www.agner.org/
841 { ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
842 { ISD::FDIV, MVT::f64, 22 }, // Nehalem from http://www.agner.org/
843 { ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/
844 };
845
846 if (ST->hasSSE42())
847 if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
848 return LT.first * Entry->Cost;
849
850 static const CostTblEntry SSE41CostTable[] = {
851 { ISD::SHL, MVT::v16i8, 11 }, // pblendvb sequence.
852 { ISD::SHL, MVT::v32i8, 2*11+2 }, // pblendvb sequence + split.
853 { ISD::SHL, MVT::v8i16, 14 }, // pblendvb sequence.
854 { ISD::SHL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
855 { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld
856 { ISD::SHL, MVT::v8i32, 2*4+2 }, // pslld/paddd/cvttps2dq/pmulld + split
857
858 { ISD::SRL, MVT::v16i8, 12 }, // pblendvb sequence.
859 { ISD::SRL, MVT::v32i8, 2*12+2 }, // pblendvb sequence + split.
860 { ISD::SRL, MVT::v8i16, 14 }, // pblendvb sequence.
861 { ISD::SRL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
862 { ISD::SRL, MVT::v4i32, 11 }, // Shift each lane + blend.
863 { ISD::SRL, MVT::v8i32, 2*11+2 }, // Shift each lane + blend + split.
864
865 { ISD::SRA, MVT::v16i8, 24 }, // pblendvb sequence.
866 { ISD::SRA, MVT::v32i8, 2*24+2 }, // pblendvb sequence + split.
867 { ISD::SRA, MVT::v8i16, 14 }, // pblendvb sequence.
868 { ISD::SRA, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
869 { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend.
870 { ISD::SRA, MVT::v8i32, 2*12+2 }, // Shift each lane + blend + split.
871
872 { ISD::MUL, MVT::v4i32, 2 } // pmulld (Nehalem from agner.org)
873 };
874
875 if (ST->hasSSE41())
876 if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
877 return LT.first * Entry->Cost;
878
879 static const CostTblEntry SSE2CostTable[] = {
880 // We don't correctly identify costs of casts because they are marked as
881 // custom.
882 { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence.
883 { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence.
884 { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul.
885 { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence.
886 { ISD::SHL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split.
887
888 { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence.
889 { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence.
890 { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend.
891 { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence.
892 { ISD::SRL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split.
893
894 { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence.
895 { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence.
896 { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend.
897 { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence.
898 { ISD::SRA, MVT::v4i64, 2*12+2 }, // srl/xor/sub sequence+split.
899
900 { ISD::MUL, MVT::v16i8, 12 }, // extend/pmullw/trunc sequence.
901 { ISD::MUL, MVT::v8i16, 1 }, // pmullw
902 { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle
903 { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add
904
905 { ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/
906 { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/
907 { ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/
908 { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/
909
910 { ISD::FADD, MVT::f32, 2 }, // Pentium IV from http://www.agner.org/
911 { ISD::FADD, MVT::f64, 2 }, // Pentium IV from http://www.agner.org/
912
913 { ISD::FSUB, MVT::f32, 2 }, // Pentium IV from http://www.agner.org/
914 { ISD::FSUB, MVT::f64, 2 }, // Pentium IV from http://www.agner.org/
915 };
916
917 if (ST->hasSSE2())
918 if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
919 return LT.first * Entry->Cost;
920
921 static const CostTblEntry SSE1CostTable[] = {
922 { ISD::FDIV, MVT::f32, 17 }, // Pentium III from http://www.agner.org/
923 { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/
924
925 { ISD::FADD, MVT::f32, 1 }, // Pentium III from http://www.agner.org/
926 { ISD::FADD, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/
927
928 { ISD::FSUB, MVT::f32, 1 }, // Pentium III from http://www.agner.org/
929 { ISD::FSUB, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/
930
931 { ISD::ADD, MVT::i8, 1 }, // Pentium III from http://www.agner.org/
932 { ISD::ADD, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
933 { ISD::ADD, MVT::i32, 1 }, // Pentium III from http://www.agner.org/
934
935 { ISD::SUB, MVT::i8, 1 }, // Pentium III from http://www.agner.org/
936 { ISD::SUB, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
937 { ISD::SUB, MVT::i32, 1 }, // Pentium III from http://www.agner.org/
938 };
939
940 if (ST->hasSSE1())
941 if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
942 return LT.first * Entry->Cost;
943
944 // It is not a good idea to vectorize division. We have to scalarize it and
945 // in the process we will often end up having to spill regular
946 // registers. The overhead of division is going to dominate most kernels
947 // anyways so try hard to prevent vectorization of division - it is
948 // generally a bad idea. Assume somewhat arbitrarily that we have to be able
949 // to hide "20 cycles" for each lane.
950 if (LT.second.isVector() && (ISD == ISD::SDIV || ISD == ISD::SREM ||
951 ISD == ISD::UDIV || ISD == ISD::UREM)) {
952 int ScalarCost = getArithmeticInstrCost(
953 Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info,
954 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
955 return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
956 }
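As a worked example of the penalty above: a v4i32 division that stays in a single legal register (LT.first == 1) is charged 20 * 1 * 4 * ScalarCost = 80 * ScalarCost, deliberately large enough to steer the vectorizers away from vector division.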
957
958 // Fallback to the default implementation.
959 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info);
960}
961
962int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *BaseTp,
963 int Index, VectorType *SubTp) {
964 // 64-bit packed float vectors (v2f32) are widened to type v4f32.
965 // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
966 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, BaseTp);
967
968 // Treat Transpose as 2-op shuffles - there's no difference in lowering.
969 if (Kind == TTI::SK_Transpose)
970 Kind = TTI::SK_PermuteTwoSrc;
971
972 // For Broadcasts we are splatting the first element from the first input
973 // register, so only need to reference that input and all the output
974 // registers are the same.
975 if (Kind == TTI::SK_Broadcast)
976 LT.first = 1;
977
978 // Subvector extractions are free if they start at the beginning of a
979 // vector and cheap if the subvectors are aligned.
980 if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
981 int NumElts = LT.second.getVectorNumElements();
982 if ((Index % NumElts) == 0)
983 return 0;
984 std::pair<int, MVT> SubLT = TLI->getTypeLegalizationCost(DL, SubTp);
985 if (SubLT.second.isVector()) {
986 int NumSubElts = SubLT.second.getVectorNumElements();
987 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
988 return SubLT.first;
989 // Handle some cases for widening legalization. For now we only handle
990 // cases where the original subvector was naturally aligned and evenly
991 // fit in its legalized subvector type.
992 // FIXME: Remove some of the alignment restrictions.
993 // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
994 // vectors.
995 int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
996 if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
997 (NumSubElts % OrigSubElts) == 0 &&
998 LT.second.getVectorElementType() ==
999 SubLT.second.getVectorElementType() &&
1000 LT.second.getVectorElementType().getSizeInBits() ==
1001 BaseTp->getElementType()->getPrimitiveSizeInBits()) {
1002 assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
1003 "Unexpected number of elements!");
1004 auto *VecTy = FixedVectorType::get(BaseTp->getElementType(),
1005 LT.second.getVectorNumElements());
1006 auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
1007 SubLT.second.getVectorNumElements());
1008 int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
1009 int ExtractCost = getShuffleCost(TTI::SK_ExtractSubvector, VecTy,
1010 ExtractIndex, SubTy);
1011
1012 // If the original size is 32-bits or more, we can use pshufd. Otherwise
1013 // if we have SSSE3 we can use pshufb.
1014 if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
1015 return ExtractCost + 1; // pshufd or pshufb
1016
1017 assert(SubTp->getPrimitiveSizeInBits() == 16 &&
1018 "Unexpected vector size");
1019
1020 return ExtractCost + 2; // worst case pshufhw + pshufd
1021 }
1022 }
1023 }
1024
1025 // Handle some common (illegal) sub-vector types as they are often very cheap
1026 // to shuffle even on targets without PSHUFB.
1027 EVT VT = TLI->getValueType(DL, BaseTp);
1028 if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
1029 !ST->hasSSSE3()) {
1030 static const CostTblEntry SSE2SubVectorShuffleTbl[] = {
1031 {TTI::SK_Broadcast, MVT::v4i16, 1}, // pshuflw
1032 {TTI::SK_Broadcast, MVT::v2i16, 1}, // pshuflw
1033 {TTI::SK_Broadcast, MVT::v8i8, 2}, // punpck/pshuflw
1034 {TTI::SK_Broadcast, MVT::v4i8, 2}, // punpck/pshuflw
1035 {TTI::SK_Broadcast, MVT::v2i8, 1}, // punpck
1036
1037 {TTI::SK_Reverse, MVT::v4i16, 1}, // pshuflw
1038 {TTI::SK_Reverse, MVT::v2i16, 1}, // pshuflw
1039 {TTI::SK_Reverse, MVT::v4i8, 3}, // punpck/pshuflw/packus
1040 {TTI::SK_Reverse, MVT::v2i8, 1}, // punpck
1041
1042 {TTI::SK_PermuteTwoSrc, MVT::v4i16, 2}, // punpck/pshuflw
1043 {TTI::SK_PermuteTwoSrc, MVT::v2i16, 2}, // punpck/pshuflw
1044 {TTI::SK_PermuteTwoSrc, MVT::v8i8, 7}, // punpck/pshuflw
1045 {TTI::SK_PermuteTwoSrc, MVT::v4i8, 4}, // punpck/pshuflw
1046 {TTI::SK_PermuteTwoSrc, MVT::v2i8, 2}, // punpck
1047
1048 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw
1049 {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw
1050 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 5}, // punpck/pshuflw
1051 {TTI::SK_PermuteSingleSrc, MVT::v4i8, 3}, // punpck/pshuflw
1052 {TTI::SK_PermuteSingleSrc, MVT::v2i8, 1}, // punpck
1053 };
1054
1055 if (ST->hasSSE2())
1056 if (const auto *Entry =
1057 CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
1058 return Entry->Cost;
1059 }
1060
1061 // We are going to permute multiple sources and the result will be in multiple
1062 // destinations. We provide an accurate cost only for splits where the element
1063 // type remains the same.
1064 if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
1065 MVT LegalVT = LT.second;
1066 if (LegalVT.isVector() &&
1067 LegalVT.getVectorElementType().getSizeInBits() ==
1068 BaseTp->getElementType()->getPrimitiveSizeInBits() &&
1069 LegalVT.getVectorNumElements() <
1070 cast<FixedVectorType>(BaseTp)->getNumElements()) {
1071
1072 unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
1073 unsigned LegalVTSize = LegalVT.getStoreSize();
1074 // Number of source vectors after legalization:
1075 unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
1076 // Number of destination vectors after legalization:
1077 unsigned NumOfDests = LT.first;
1078
1079 auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
1080 LegalVT.getVectorNumElements());
1081
1082 unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
1083 return NumOfShuffles *
1084 getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr);
1085 }
1086
1087 return BaseT::getShuffleCost(Kind, BaseTp, Index, SubTp);
1088 }
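Worked example for the split-permute estimate above: a source vector that legalizes into NumOfSrcs == 4 registers feeding NumOfDests == 4 destination registers is costed as (4 - 1) * 4 == 12 two-source shuffles of the single-register type.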
1089
1090 // For 2-input shuffles, we must account for splitting the 2 inputs into many.
1091 if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
1092 // We assume that source and destination have the same vector type.
1093 int NumOfDests = LT.first;
1094 int NumOfShufflesPerDest = LT.first * 2 - 1;
1095 LT.first = NumOfDests * NumOfShufflesPerDest;
1096 }
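Worked example for the two-source case: with LT.first == 2, the result needs 2 destination registers and 2 * 2 - 1 == 3 shuffles per destination, so LT.first is rewritten to 6 before the per-entry table costs below are applied.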
1097
1098 static const CostTblEntry AVX512VBMIShuffleTbl[] = {
1099 {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
1100 {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb
1101
1102 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
1103 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb
1104
1105 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b
1106 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b
1107 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 2} // vpermt2b
1108 };
1109
1110 if (ST->hasVBMI())
1111 if (const auto *Entry =
1112 CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
1113 return LT.first * Entry->Cost;
1114
1115 static const CostTblEntry AVX512BWShuffleTbl[] = {
1116 {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
1117 {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
1118
1119 {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw
1120 {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw
1121 {TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2
1122
1123 {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw
1124 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw
1125 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16
1126
1127 {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w
1128 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w
1129 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w
1130 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
1131
1132 {TTI::SK_Select, MVT::v32i16, 1}, // vblendmw
1133 {TTI::SK_Select, MVT::v64i8, 1}, // vblendmb
1134 };
1135
1136 if (ST->hasBWI())
1137 if (const auto *Entry =
1138 CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
1139 return LT.first * Entry->Cost;
1140
1141 static const CostTblEntry AVX512ShuffleTbl[] = {
1142 {TTI::SK_Broadcast, MVT::v8f64, 1}, // vbroadcastpd
1143 {TTI::SK_Broadcast, MVT::v16f32, 1}, // vbroadcastps
1144 {TTI::SK_Broadcast, MVT::v8i64, 1}, // vpbroadcastq
1145 {TTI::SK_Broadcast, MVT::v16i32, 1}, // vpbroadcastd
1146 {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
1147 {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
1148
1149 {TTI::SK_Reverse, MVT::v8f64, 1}, // vpermpd
1150 {TTI::SK_Reverse, MVT::v16f32, 1}, // vpermps
1151 {TTI::SK_Reverse, MVT::v8i64, 1}, // vpermq
1152 {TTI::SK_Reverse, MVT::v16i32, 1}, // vpermd
1153
1154 {TTI::SK_PermuteSingleSrc, MVT::v8f64, 1}, // vpermpd
1155 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
1156 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // vpermpd
1157 {TTI::SK_PermuteSingleSrc, MVT::v16f32, 1}, // vpermps
1158 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
1159 {TTI::SK_PermuteSingleSrc, MVT::v4f32, 1}, // vpermps
1160 {TTI::SK_PermuteSingleSrc, MVT::v8i64, 1}, // vpermq
1161 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
1162 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // vpermq
1163 {TTI::SK_PermuteSingleSrc, MVT::v16i32, 1}, // vpermd
1164 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
1165 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // vpermd
1166 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
1167
1168 {TTI::SK_PermuteTwoSrc, MVT::v8f64, 1}, // vpermt2pd
1169 {TTI::SK_PermuteTwoSrc, MVT::v16f32, 1}, // vpermt2ps
1170 {TTI::SK_PermuteTwoSrc, MVT::v8i64, 1}, // vpermt2q
1171 {TTI::SK_PermuteTwoSrc, MVT::v16i32, 1}, // vpermt2d
1172 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 1}, // vpermt2pd
1173 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 1}, // vpermt2ps
1174 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 1}, // vpermt2q
1175 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 1}, // vpermt2d
1176 {TTI::SK_PermuteTwoSrc, MVT::v2f64, 1}, // vpermt2pd
1177 {TTI::SK_PermuteTwoSrc, MVT::v4f32, 1}, // vpermt2ps
1178 {TTI::SK_PermuteTwoSrc, MVT::v2i64, 1}, // vpermt2q
1179 {TTI::SK_PermuteTwoSrc, MVT::v4i32, 1}, // vpermt2d
1180
1181 // FIXME: This just applies the type legalization cost rules above
1182 // assuming these completely split.
1183 {TTI::SK_PermuteSingleSrc, MVT::v32i16, 14},
1184 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 14},
1185 {TTI::SK_PermuteTwoSrc, MVT::v32i16, 42},
1186 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 42},
1187
1188 {TTI::SK_Select, MVT::v32i16, 1}, // vpternlogq
1189 {TTI::SK_Select, MVT::v64i8, 1}, // vpternlogq
1190 {TTI::SK_Select, MVT::v8f64, 1}, // vblendmpd
1191 {TTI::SK_Select, MVT::v16f32, 1}, // vblendmps
1192 {TTI::SK_Select, MVT::v8i64, 1}, // vblendmq
1193 {TTI::SK_Select, MVT::v16i32, 1}, // vblendmd
1194 };
1195
1196 if (ST->hasAVX512())
1197 if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
1198 return LT.first * Entry->Cost;
1199
1200 static const CostTblEntry AVX2ShuffleTbl[] = {
1201 {TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastpd
1202 {TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastps
1203 {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq
1204 {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd
1205 {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
1206 {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb
1207
1208 {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd
1209 {TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps
1210 {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq
1211 {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd
1212 {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
1213 {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb
1214
1215 {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
1216 {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb
1217
1218 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
1219 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
1220 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
1221 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
1222 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
1223 // + vpblendvb
1224 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb
1225 // + vpblendvb
1226
1227 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd
1228 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3}, // 2*vpermps + vblendps
1229 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd
1230 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd
1231 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
1232 // + vpblendvb
1233 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb
1234 // + vpblendvb
1235 };
1236
1237 if (ST->hasAVX2())
1238 if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
1239 return LT.first * Entry->Cost;
1240
1241 static const CostTblEntry XOPShuffleTbl[] = {
1242 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd
1243 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps
1244 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd
1245 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps
1246 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
1247 // + vinsertf128
1248 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm
1249 // + vinsertf128
1250
1251 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
1252 // + vinsertf128
1253 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm
1254 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm
1255 // + vinsertf128
1256 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm
1257 };
1258
1259 if (ST->hasXOP())
1260 if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
1261 return LT.first * Entry->Cost;
1262
1263 static const CostTblEntry AVX1ShuffleTbl[] = {
1264 {TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1265 {TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1266 {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1267 {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1268 {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
1269 {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128
1270
1271 {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1272 {TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1273 {TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1274 {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1275 {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
1276 // + vinsertf128
1277 {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb
1278 // + vinsertf128
1279
1280 {TTI::SK_Select, MVT::v4i64, 1}, // vblendpd
1281 {TTI::SK_Select, MVT::v4f64, 1}, // vblendpd
1282 {TTI::SK_Select, MVT::v8i32, 1}, // vblendps
1283 {TTI::SK_Select, MVT::v8f32, 1}, // vblendps
1284 {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
1285 {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor
1286
1287 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd
1288 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd
1289 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1290 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1291 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
1292 // + 2*por + vinsertf128
1293 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb
1294 // + 2*por + vinsertf128
1295
1296 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd
1297 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd
1298 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1299 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1300 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
1301 // + 4*por + vinsertf128
1302 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb
1303 // + 4*por + vinsertf128
1304 };
1305
1306 if (ST->hasAVX())
1307 if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
1308 return LT.first * Entry->Cost;
1309
1310 static const CostTblEntry SSE41ShuffleTbl[] = {
1311 {TTI::SK_Select, MVT::v2i64, 1}, // pblendw
1312 {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1313 {TTI::SK_Select, MVT::v4i32, 1}, // pblendw
1314 {TTI::SK_Select, MVT::v4f32, 1}, // blendps
1315 {TTI::SK_Select, MVT::v8i16, 1}, // pblendw
1316 {TTI::SK_Select, MVT::v16i8, 1} // pblendvb
1317 };
1318
1319 if (ST->hasSSE41())
1320 if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
1321 return LT.first * Entry->Cost;
1322
1323 static const CostTblEntry SSSE3ShuffleTbl[] = {
1324 {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
1325 {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb
1326
1327 {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
1328 {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb
1329
1330 {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
1331 {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por
1332
1333 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
1334 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
1335
1336 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
1337 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
1338 };
1339
1340 if (ST->hasSSSE3())
1341 if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
1342 return LT.first * Entry->Cost;
1343
1344 static const CostTblEntry SSE2ShuffleTbl[] = {
1345 {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd
1346 {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
1347 {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
1348 {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
1349 {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd
1350
1351 {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
1352 {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
1353 {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
1354 {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
1355 {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
1356 // + 2*pshufd + 2*unpck + packus
1357
1358 {TTI::SK_Select, MVT::v2i64, 1}, // movsd
1359 {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1360 {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
1361 {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
1362 {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por
1363
1364 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
1365 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd
1366 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
1367 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
1368 // + pshufd/unpck
1369 { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
1370 // + 2*pshufd + 2*unpck + 2*packus
1371
1372 { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd
1373 { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd
1374 { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd}
1375 { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute
1376 { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
1377 };
1378
1379 if (ST->hasSSE2())
1380 if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
1381 return LT.first * Entry->Cost;
1382
1383 static const CostTblEntry SSE1ShuffleTbl[] = {
1384 { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
1385 { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
1386 { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps
1387 { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
1388 { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
1389 };
1390
1391 if (ST->hasSSE1())
1392 if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
1393 return LT.first * Entry->Cost;
1394
1395 return BaseT::getShuffleCost(Kind, BaseTp, Index, SubTp);
1396}
1397
1398int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
1399 TTI::CastContextHint CCH,
1400 TTI::TargetCostKind CostKind,
1401 const Instruction *I) {
1402 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1403   assert(ISD && "Invalid opcode");
1404
1405 // TODO: Allow non-throughput costs that aren't binary.
1406 auto AdjustCost = [&CostKind](int Cost) {
1407 if (CostKind != TTI::TCK_RecipThroughput)
1408 return Cost == 0 ? 0 : 1;
1409 return Cost;
1410 };
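  // For any cost kind other than reciprocal throughput (latency, code size,
  // size-and-latency), AdjustCost collapses the table value to a binary cost:
  // 0 stays free, everything else becomes 1, as the TODO above notes.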
1411
1412 // FIXME: Need a better design of the cost table to handle non-simple types of
1413 // potential massive combinations (elem_num x src_type x dst_type).
1414
1415 static const TypeConversionCostTblEntry AVX512BWConversionTbl[] {
1416 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
1417 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
1418
1419 // Mask sign extend has an instruction.
1420 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 },
1421 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 },
1422 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 },
1423 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 },
1424 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 },
1425 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 },
1426 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 },
1427 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
1428 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 },
1429 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, 1 },
1430 { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, 1 },
1431
1432 // Mask zero extend is a sext + shift.
1433 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 },
1434 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 },
1435 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 },
1436 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 },
1437 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 },
1438 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 },
1439 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 },
1440 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
1441 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 },
1442 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, 2 },
1443 { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, 2 },
1444
1445 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 2 },
1446 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // widen to zmm
1447 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, // widen to zmm
1448 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // widen to zmm
1449 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // widen to zmm
1450 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, // widen to zmm
1451 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, // widen to zmm
1452 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, // widen to zmm
1453 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, // widen to zmm
1454 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, // widen to zmm
1455 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, // widen to zmm
1456 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, 2 },
1457 { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, 2 },
1458 };
1459
1460 static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
1461 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
1462 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },
1463
1464 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
1465 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },
1466
1467 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, 1 },
1468 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, 1 },
1469
1470 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 },
1471 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 },
1472 };
1473
1474 // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
1475 // 256-bit wide vectors.
1476
1477 static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
1478 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 },
1479 { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 },
1480 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 },
1481
1482 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
1483 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
1484 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
1485 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 3 }, // sext+vpslld+vptestmd
1486 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
1487 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
1488 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
1489 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 3 }, // sext+vpslld+vptestmd
1490 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // zmm vpslld+vptestmd
1491 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // zmm vpslld+vptestmd
1492 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // zmm vpslld+vptestmd
1493 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 }, // vpslld+vptestmd
1494 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // zmm vpsllq+vptestmq
1495 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // zmm vpsllq+vptestmq
1496 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, // vpsllq+vptestmq
1497 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 2 },
1498 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2 },
1499 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 2 },
1500 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 2 },
1501 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 },
1502 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // zmm vpmovqd
1503 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 5 },// 2*vpmovqd+concat+vpmovdb
1504
1505 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, // extend to v16i32
1506 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 8 },
1507
1508 // Sign extend is zmm vpternlogd+vptruncdb.
1509 // Zero extend is zmm broadcast load+vptruncdw.
1510 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 3 },
1511 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 4 },
1512 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 3 },
1513 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 4 },
1514 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 3 },
1515 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 4 },
1516 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 3 },
1517 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 4 },
1518
1519 // Sign extend is zmm vpternlogd+vptruncdw.
1520 // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
1521 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 3 },
1522 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 4 },
1523 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 3 },
1524 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 4 },
1525 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 3 },
1526 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 4 },
1527 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 3 },
1528 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
1529
1530 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // zmm vpternlogd
1531 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // zmm vpternlogd+psrld
1532 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // zmm vpternlogd
1533 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // zmm vpternlogd+psrld
1534 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // zmm vpternlogd
1535 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // zmm vpternlogd+psrld
1536 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // zmm vpternlogq
1537 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // zmm vpternlogq+psrlq
1538 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // zmm vpternlogq
1539 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // zmm vpternlogq+psrlq
1540
1541 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, // vpternlogd
1542 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, // vpternlogd+psrld
1543 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 }, // vpternlogq
1544 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 }, // vpternlogq+psrlq
1545
1546 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
1547 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
1548 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
1549 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
1550 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
1551 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
1552 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
1553 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
1554 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
1555 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
1556
1557 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
1558 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
1559
1560 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
1561 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
1562 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 },
1563 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 },
1564 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
1565 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 },
1566 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
1567 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
1568
1569 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
1570 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
1571 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 },
1572 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 },
1573 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
1574 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 },
1575 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
1576 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
1577 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 },
1578 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 5 },
1579
1580 { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f64, 3 },
1581 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, 3 },
1582 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 3 },
1583 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f32, 3 },
1584
1585 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 },
1586 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, 3 },
1587 { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, 3 },
1588 { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 },
1589 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 3 },
1590 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, 3 },
1591 };
1592
1593 static const TypeConversionCostTblEntry AVX512BWVLConversionTbl[] {
1594 // Mask sign extend has an instruction.
1595 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 },
1596 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 },
1597 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 },
1598 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 },
1599 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 },
1600 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 },
1601 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 },
1602 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
1603 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 },
1604
1605 // Mask zero extend is a sext + shift.
1606 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 },
1607 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 },
1608 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 },
1609 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 },
1610 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 },
1611 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 },
1612 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 },
1613 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
1614 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 },
1615
1616 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 },
1617 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, // vpsllw+vptestmb
1618 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // vpsllw+vptestmw
1619 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // vpsllw+vptestmb
1620 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, // vpsllw+vptestmw
1621 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, // vpsllw+vptestmb
1622 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, // vpsllw+vptestmw
1623 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, // vpsllw+vptestmb
1624 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, // vpsllw+vptestmw
1625 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, // vpsllw+vptestmb
1626 };
1627
1628 static const TypeConversionCostTblEntry AVX512DQVLConversionTbl[] = {
1629 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
1630 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
1631 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
1632 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
1633
1634 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
1635 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
1636 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
1637 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
1638
1639 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 1 },
1640 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, 1 },
1641 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
1642 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, 1 },
1643
1644 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 1 },
1645 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 },
1646 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
1647 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 },
1648 };
1649
1650 static const TypeConversionCostTblEntry AVX512VLConversionTbl[] = {
1651 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
1652 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
1653 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
1654 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 8 }, // split+2*v8i8
1655 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
1656 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
1657 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
1658 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 8 }, // split+2*v8i16
1659 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // vpslld+vptestmd
1660 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // vpslld+vptestmd
1661 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // vpslld+vptestmd
1662 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // vpsllq+vptestmq
1663 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // vpsllq+vptestmq
1664 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // vpmovqd
1665
1666 // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
1667 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
1668 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 5 },
1669 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 6 },
1670 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 5 },
1671 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 6 },
1672 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 5 },
1673 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 6 },
1674 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 10 },
1675 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 12 },
1676
1677 // sign extend is vpcmpeq+maskedmove+vpmovdw
1678 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
1679 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 4 },
1680 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 5 },
1681 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 4 },
1682 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 5 },
1683 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 4 },
1684 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 5 },
1685 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 10 },
1686 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 12 },
1687
1688 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // vpternlogd
1689 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // vpternlogd+psrld
1690 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // vpternlogd
1691 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // vpternlogd+psrld
1692 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // vpternlogd
1693 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // vpternlogd+psrld
1694 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // vpternlogq
1695 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // vpternlogq+psrlq
1696 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // vpternlogq
1697 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // vpternlogq+psrlq
1698
1699 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 2 },
1700 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 },
1701 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 2 },
1702 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 5 },
1703 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 },
1704 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
1705 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 },
1706 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
1707 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
1708 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
1709 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
1710 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 5 },
1711 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
1712 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 5 },
1713
1714 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 1 },
1715 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 1 },
1716
1717 { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 3 },
1718 { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f32, 3 },
1719
1720 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 1 },
1721 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 1 },
1722
1723 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
1724 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
1725 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 1 },
1726 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 1 },
1727 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 },
1728 };
1729
1730 static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
1731 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
1732 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
1733 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
1734 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
1735 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 1 },
1736 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 1 },
1737 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 1 },
1738 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 1 },
1739 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
1740 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
1741 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
1742 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
1743 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 1 },
1744 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 1 },
1745 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
1746 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
1747 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
1748 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
1749 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
1750 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
1751
1752 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 },
1753 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },
1754
1755 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 },
1756 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 },
1757 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 },
1758 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2 },
1759
1760 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 3 },
1761 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 3 },
1762
1763 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 },
1764 };
1765
1766 static const TypeConversionCostTblEntry AVXConversionTbl[] = {
1767 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 6 },
1768 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
1769 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 },
1770 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
1771 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 4 },
1772 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 },
1773 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 4 },
1774 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 4 },
1775 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
1776 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
1777 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
1778 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
1779 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 4 },
1780 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
1781 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
1782 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
1783 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 4 },
1784 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 4 },
1785
1786 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 4 },
1787 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 5 },
1788 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 4 },
1789 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 9 },
1790 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, 11 },
1791
1792 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 4 },
1793 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 },
1794 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
1795 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 4 },
1796 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 4 },
1797 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 },
1798 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 11 },
1799 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 9 },
1800 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 3 },
1801 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 11 },
1802
1803 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
1804 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 },
1805 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 },
1806 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
1807 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i8, 3 },
1808 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 8 },
1809 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 3 },
1810 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i16, 3 },
1811 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 },
1812 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
1813 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
1814 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
1815
1816 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 7 },
1817 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, 7 },
1818 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, 6 },
1819 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 2 },
1820 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 },
1821 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 5 },
1822 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
1823 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 },
1824 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 },
1825 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 6 },
1826 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 6 },
1827 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 },
1828 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 9 },
1829 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
1830 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 6 },
1831 // The generic code to compute the scalar overhead is currently broken.
1832 // Work around this limitation by estimating the scalarization overhead
1833 // here. We have roughly 10 instructions per scalar element.
1834 // Multiply that by the vector width.
1835 // FIXME: remove that when PR19268 is fixed.
1836 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 13 },
1837 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 13 },
1838
1839 { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 4 },
1840 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f64, 3 },
1841 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f64, 2 },
1842 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 3 },
1843
1844 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f64, 3 },
1845 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f64, 2 },
1846 { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f32, 4 },
1847 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 3 },
1848 // This node is expanded into scalarized operations but BasicTTI is overly
1849 // optimistic when estimating its cost. It computes 3 per element (one
1850 // vector-extract, one scalar conversion and one vector-insert). The
1851 // problem is that the inserts form a read-modify-write chain so latency
1852 // should be factored in too. Inflating the cost per element by 1.
1853 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 8*4 },
1854 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4*4 },
1855
1856 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 1 },
1857 { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 1 },
1858 };
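  // Spelling out the scalarized FP_TO_UINT products above: v8i32 <- v8f32 is
  // 8 elements * (3 + 1) = 32, and v4i32 <- v4f64 is 4 * (3 + 1) = 16.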
1859
1860 static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
1861 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 2 },
1862 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 2 },
1863 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 2 },
1864 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 2 },
1865 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
1866 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
1867
1868 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 },
1869 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 2 },
1870 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 1 },
1871 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 1 },
1872 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
1873 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
1874 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 2 },
1875 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 2 },
1876 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
1877 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
1878 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 4 },
1879 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 4 },
1880 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
1881 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
1882 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
1883 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
1884 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 4 },
1885 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 4 },
1886
1887 // These truncates end up widening elements.
1888   { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i8,   1 }, // PMOVZXBQ
1889   { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i16,  1 }, // PMOVZXWQ
1890   { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i8,   1 }, // PMOVZXBD
1891
1892 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 1 },
1893 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 1 },
1894 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1 },
1895 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1 },
1896 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
1897 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
1898 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 3 },
1899 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 },
1900 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1 }, // PSHUFB
1901
1902 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 4 },
1903 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 4 },
1904
1905 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 3 },
1906 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 3 },
1907
1908 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 3 },
1909 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 3 },
1910 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
1911 };
1912
1913 static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
1914 // These are somewhat magic numbers justified by looking at the output of
1915 // Intel's IACA, running some kernels and making sure when we take
1916 // legalization into account the throughput will be overestimated.
1917 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
1918 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
1919 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
1920 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
1921 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
1922 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 2*10 },
1923 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2*10 },
1924 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
1925 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
1926
1927 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
1928 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
1929 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
1930 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
1931 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
1932 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 8 },
1933 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 6 },
1934 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
1935
1936 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 4 },
1937 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 2 },
1938 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 3 },
1939 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
1940 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
1941 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 4 },
1942
1943 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 1 },
1944
1945 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 6 },
1946 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 6 },
1947
1948 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 },
1949 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 4 },
1950 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 4 },
1951 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 4 },
1952 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 3 },
1953 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 2 },
1954 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
1955 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 4 },
1956
1957 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 },
1958 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 6 },
1959 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
1960 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 3 },
1961 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 },
1962 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 8 },
1963 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
1964 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 2 },
1965 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 6 },
1966 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 6 },
1967 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
1968 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
1969 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 9 },
1970 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 12 },
1971 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
1972 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 2 },
1973 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
1974 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 10 },
1975 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
1976 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
1977 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 6 },
1978 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 8 },
1979 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 3 },
1980 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 5 },
1981
1982 // These truncates are really widening elements.
1983 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 1 }, // PSHUFD
1984 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // PUNPCKLWD+DQ
1985 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // PUNPCKLBW+WD+PSHUFD
1986 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 1 }, // PUNPCKLWD
1987 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // PUNPCKLBW+WD
1988 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 1 }, // PUNPCKLBW
1989
1990 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // PAND+PACKUSWB
1991 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // PAND+PACKUSWB
1992 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // PAND+PACKUSWB
1993 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 },
1994 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 3 }, // PAND+2*PACKUSWB
1995 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 1 },
1996 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 3 },
1997 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 3 },
1998 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 },
1999 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 },
2000 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
2001 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 10 },
2002 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 4 }, // PAND+3*PACKUSWB
2003 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 2 }, // PSHUFD+PSHUFLW
2004 { ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1 }, // PSHUFD
2005 };
2006
2007 std::pair<int, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
2008 std::pair<int, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst);
2009
2010 if (ST->hasSSE2() && !ST->hasAVX()) {
2011 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
2012 LTDest.second, LTSrc.second))
2013 return AdjustCost(LTSrc.first * Entry->Cost);
2014 }
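  // getTypeLegalizationCost returns a (split factor, legalized MVT) pair. This
  // pre-AVX SSE2 path scales the table cost by LTSrc.first, whereas the
  // simple-VT lookups below return Entry->Cost unscaled.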
2015
2016 EVT SrcTy = TLI->getValueType(DL, Src);
2017 EVT DstTy = TLI->getValueType(DL, Dst);
2018
2019 // The function getSimpleVT only handles simple value types.
2020 if (!SrcTy.isSimple() || !DstTy.isSimple())
2021 return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind));
2022
2023 MVT SimpleSrcTy = SrcTy.getSimpleVT();
2024 MVT SimpleDstTy = DstTy.getSimpleVT();
2025
2026 if (ST->useAVX512Regs()) {
2027 if (ST->hasBWI())
2028 if (const auto *Entry = ConvertCostTableLookup(AVX512BWConversionTbl, ISD,
2029 SimpleDstTy, SimpleSrcTy))
2030 return AdjustCost(Entry->Cost);
2031
2032 if (ST->hasDQI())
2033 if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD,
2034 SimpleDstTy, SimpleSrcTy))
2035 return AdjustCost(Entry->Cost);
2036
2037 if (ST->hasAVX512())
2038 if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD,
2039 SimpleDstTy, SimpleSrcTy))
2040 return AdjustCost(Entry->Cost);
2041 }
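  // Dispatch order is most-specific first: the zmm tables above are consulted
  // only when 512-bit registers are in use, and the BWI/DQI tables are checked
  // ahead of the generic AVX512 tables so their entries are found first.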
2042
2043 if (ST->hasBWI())
2044 if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
2045 SimpleDstTy, SimpleSrcTy))
2046 return AdjustCost(Entry->Cost);
2047
2048 if (ST->hasDQI())
2049 if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
2050 SimpleDstTy, SimpleSrcTy))
2051 return AdjustCost(Entry->Cost);
2052
2053 if (ST->hasAVX512())
2054 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
2055 SimpleDstTy, SimpleSrcTy))
2056 return AdjustCost(Entry->Cost);
2057
2058 if (ST->hasAVX2()) {
2059 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
2060 SimpleDstTy, SimpleSrcTy))
2061 return AdjustCost(Entry->Cost);
2062 }
2063
2064 if (ST->hasAVX()) {
2065 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
2066 SimpleDstTy, SimpleSrcTy))
2067 return AdjustCost(Entry->Cost);
2068 }
2069
2070 if (ST->hasSSE41()) {
2071 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
2072 SimpleDstTy, SimpleSrcTy))
2073 return AdjustCost(Entry->Cost);
2074 }
2075
2076 if (ST->hasSSE2()) {
2077 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
2078 SimpleDstTy, SimpleSrcTy))
2079 return AdjustCost(Entry->Cost);
2080 }
2081
2082 return AdjustCost(
2083 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
2084}
2085
2086int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
2087 TTI::TargetCostKind CostKind,
2088 const Instruction *I) {
2089 // TODO: Handle other cost kinds.
2090 if (CostKind != TTI::TCK_RecipThroughput)
2091 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind, I);
2092
2093 // Legalize the type.
2094 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
2095
2096 MVT MTy = LT.second;
2097
2098 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2099   assert(ISD && "Invalid opcode");
2100
2101 unsigned ExtraCost = 0;
2102 if (I && (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp)) {
2103 // Some vector comparison predicates cost extra instructions.
2104 if (MTy.isVector() &&
2105 !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
2106 (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
2107 ST->hasBWI())) {
2108 switch (cast<CmpInst>(I)->getPredicate()) {
2109 case CmpInst::Predicate::ICMP_NE:
2110 // xor(cmpeq(x,y),-1)
2111 ExtraCost = 1;
2112 break;
2113 case CmpInst::Predicate::ICMP_SGE:
2114 case CmpInst::Predicate::ICMP_SLE:
2115 // xor(cmpgt(x,y),-1)
2116 ExtraCost = 1;
2117 break;
2118 case CmpInst::Predicate::ICMP_ULT:
2119 case CmpInst::Predicate::ICMP_UGT:
2120 // cmpgt(xor(x,signbit),xor(y,signbit))
2121 // xor(cmpeq(pmaxu(x,y),x),-1)
2122 ExtraCost = 2;
2123 break;
2124 case CmpInst::Predicate::ICMP_ULE:
2125 case CmpInst::Predicate::ICMP_UGE:
2126 if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
2127 (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
2128 // cmpeq(psubus(x,y),0)
2129 // cmpeq(pminu(x,y),x)
2130 ExtraCost = 1;
2131 } else {
2132 // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
2133 ExtraCost = 3;
2134 }
2135 break;
2136 default:
2137 break;
2138 }
2139 }
2140 }
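  // ExtraCost models the instructions needed to synthesize predicates the
  // vector ISA lacks; e.g. an unsigned ICMP_UGT on v4i32 with plain SSE2 is
  // cmpgt(xor(x,signbit),xor(y,signbit)), which adds 2 on top of the base
  // SETCC cost looked up below.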
2141
2142 static const CostTblEntry SLMCostTbl[] = {
2143 // slm pcmpeq/pcmpgt throughput is 2
2144 { ISD::SETCC, MVT::v2i64, 2 },
2145 };
2146
2147 static const CostTblEntry AVX512BWCostTbl[] = {
2148 { ISD::SETCC, MVT::v32i16, 1 },
2149 { ISD::SETCC, MVT::v64i8, 1 },
2150
2151 { ISD::SELECT, MVT::v32i16, 1 },
2152 { ISD::SELECT, MVT::v64i8, 1 },
2153 };
2154
2155 static const CostTblEntry AVX512CostTbl[] = {
2156 { ISD::SETCC, MVT::v8i64, 1 },
2157 { ISD::SETCC, MVT::v16i32, 1 },
2158 { ISD::SETCC, MVT::v8f64, 1 },
2159 { ISD::SETCC, MVT::v16f32, 1 },
2160
2161 { ISD::SELECT, MVT::v8i64, 1 },
2162 { ISD::SELECT, MVT::v16i32, 1 },
2163 { ISD::SELECT, MVT::v8f64, 1 },
2164 { ISD::SELECT, MVT::v16f32, 1 },
2165
2166 { ISD::SETCC, MVT::v32i16, 2 }, // FIXME: should probably be 4
2167 { ISD::SETCC, MVT::v64i8, 2 }, // FIXME: should probably be 4
2168
2169 { ISD::SELECT, MVT::v32i16, 2 }, // FIXME: should be 3
2170 { ISD::SELECT, MVT::v64i8, 2 }, // FIXME: should be 3
2171 };
2172
2173 static const CostTblEntry AVX2CostTbl[] = {
2174 { ISD::SETCC, MVT::v4i64, 1 },
2175 { ISD::SETCC, MVT::v8i32, 1 },
2176 { ISD::SETCC, MVT::v16i16, 1 },
2177 { ISD::SETCC, MVT::v32i8, 1 },
2178
2179 { ISD::SELECT, MVT::v4i64, 1 }, // pblendvb
2180 { ISD::SELECT, MVT::v8i32, 1 }, // pblendvb
2181 { ISD::SELECT, MVT::v16i16, 1 }, // pblendvb
2182 { ISD::SELECT, MVT::v32i8, 1 }, // pblendvb
2183 };
2184
2185 static const CostTblEntry AVX1CostTbl[] = {
2186 { ISD::SETCC, MVT::v4f64, 1 },
2187 { ISD::SETCC, MVT::v8f32, 1 },
2188 // AVX1 does not support 8-wide integer compare.
2189 { ISD::SETCC, MVT::v4i64, 4 },
2190 { ISD::SETCC, MVT::v8i32, 4 },
2191 { ISD::SETCC, MVT::v16i16, 4 },
2192 { ISD::SETCC, MVT::v32i8, 4 },
2193
2194 { ISD::SELECT, MVT::v4f64, 1 }, // vblendvpd
2195 { ISD::SELECT, MVT::v8f32, 1 }, // vblendvps
2196 { ISD::SELECT, MVT::v4i64, 1 }, // vblendvpd
2197 { ISD::SELECT, MVT::v8i32, 1 }, // vblendvps
2198 { ISD::SELECT, MVT::v16i16, 3 }, // vandps + vandnps + vorps
2199 { ISD::SELECT, MVT::v32i8, 3 }, // vandps + vandnps + vorps
2200 };
2201
2202 static const CostTblEntry SSE42CostTbl[] = {
2203 { ISD::SETCC, MVT::v2f64, 1 },
2204 { ISD::SETCC, MVT::v4f32, 1 },
2205 { ISD::SETCC, MVT::v2i64, 1 },
2206 };
2207
2208 static const CostTblEntry SSE41CostTbl[] = {
2209 { ISD::SELECT, MVT::v2f64, 1 }, // blendvpd
2210 { ISD::SELECT, MVT::v4f32, 1 }, // blendvps
2211 { ISD::SELECT, MVT::v2i64, 1 }, // pblendvb
2212 { ISD::SELECT, MVT::v4i32, 1 }, // pblendvb
2213 { ISD::SELECT, MVT::v8i16, 1 }, // pblendvb
2214 { ISD::SELECT, MVT::v16i8, 1 }, // pblendvb
2215 };
2216
2217 static const CostTblEntry SSE2CostTbl[] = {
2218 { ISD::SETCC, MVT::v2f64, 2 },
2219 { ISD::SETCC, MVT::f64, 1 },
2220 { ISD::SETCC, MVT::v2i64, 8 },
2221 { ISD::SETCC, MVT::v4i32, 1 },
2222 { ISD::SETCC, MVT::v8i16, 1 },
2223 { ISD::SETCC, MVT::v16i8, 1 },
2224
2225 { ISD::SELECT, MVT::v2f64, 3 }, // andpd + andnpd + orpd
2226 { ISD::SELECT, MVT::v2i64, 3 }, // pand + pandn + por
2227 { ISD::SELECT, MVT::v4i32, 3 }, // pand + pandn + por
2228 { ISD::SELECT, MVT::v8i16, 3 }, // pand + pandn + por
2229 { ISD::SELECT, MVT::v16i8, 3 }, // pand + pandn + por
2230 };
2231
2232 static const CostTblEntry SSE1CostTbl[] = {
2233 { ISD::SETCC, MVT::v4f32, 2 },
2234 { ISD::SETCC, MVT::f32, 1 },
2235
2236 { ISD::SELECT, MVT::v4f32, 3 }, // andps + andnps + orps
2237 };
2238
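  // Final cost is LT.first * (ExtraCost + Entry->Cost): per-op table cost plus
  // any predicate fixup, multiplied by the number of legalized operations.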
2239 if (ST->isSLM())
2240 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
2241 return LT.first * (ExtraCost + Entry->Cost);
2242
2243 if (ST->hasBWI())
2244 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
2245 return LT.first * (ExtraCost + Entry->Cost);
2246
2247 if (ST->hasAVX512())
2248 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
2249 return LT.first * (ExtraCost + Entry->Cost);
2250
2251 if (ST->hasAVX2())
2252 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
2253 return LT.first * (ExtraCost + Entry->Cost);
2254
2255 if (ST->hasAVX())
2256 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
2257 return LT.first * (ExtraCost + Entry->Cost);
2258
2259 if (ST->hasSSE42())
2260 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
2261 return LT.first * (ExtraCost + Entry->Cost);
2262
2263 if (ST->hasSSE41())
2264 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
2265 return LT.first * (ExtraCost + Entry->Cost);
2266
2267 if (ST->hasSSE2())
2268 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
2269 return LT.first * (ExtraCost + Entry->Cost);
2270
2271 if (ST->hasSSE1())
2272 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
2273 return LT.first * (ExtraCost + Entry->Cost);
2274
2275 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind, I);
2276}
2277
2278unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; }
2279
2280int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
2281 const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) {
2282
2283 // Costs should match the codegen from:
2284 // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
2285 // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
2286 // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
2287 // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
2288 // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
2289 static const CostTblEntry AVX512CDCostTbl[] = {
2290 { ISD::CTLZ, MVT::v8i64, 1 },
2291 { ISD::CTLZ, MVT::v16i32, 1 },
2292 { ISD::CTLZ, MVT::v32i16, 8 },
2293 { ISD::CTLZ, MVT::v64i8, 20 },
2294 { ISD::CTLZ, MVT::v4i64, 1 },
2295 { ISD::CTLZ, MVT::v8i32, 1 },
2296 { ISD::CTLZ, MVT::v16i16, 4 },
2297 { ISD::CTLZ, MVT::v32i8, 10 },
2298 { ISD::CTLZ, MVT::v2i64, 1 },
2299 { ISD::CTLZ, MVT::v4i32, 1 },
2300 { ISD::CTLZ, MVT::v8i16, 4 },
2301 { ISD::CTLZ, MVT::v16i8, 4 },
2302 };
2303 static const CostTblEntry AVX512BWCostTbl[] = {
2304 { ISD::ABS, MVT::v32i16, 1 },
2305 { ISD::ABS, MVT::v64i8, 1 },
2306 { ISD::BITREVERSE, MVT::v8i64, 5 },
2307 { ISD::BITREVERSE, MVT::v16i32, 5 },
2308 { ISD::BITREVERSE, MVT::v32i16, 5 },
2309 { ISD::BITREVERSE, MVT::v64i8, 5 },
2310 { ISD::CTLZ, MVT::v8i64, 23 },
2311 { ISD::CTLZ, MVT::v16i32, 22 },
2312 { ISD::CTLZ, MVT::v32i16, 18 },
2313 { ISD::CTLZ, MVT::v64i8, 17 },
2314 { ISD::CTPOP, MVT::v8i64, 7 },
2315 { ISD::CTPOP, MVT::v16i32, 11 },
2316 { ISD::CTPOP, MVT::v32i16, 9 },
2317 { ISD::CTPOP, MVT::v64i8, 6 },
2318 { ISD::CTTZ, MVT::v8i64, 10 },
2319 { ISD::CTTZ, MVT::v16i32, 14 },
2320 { ISD::CTTZ, MVT::v32i16, 12 },
2321 { ISD::CTTZ, MVT::v64i8, 9 },
2322 { ISD::SADDSAT, MVT::v32i16, 1 },
2323 { ISD::SADDSAT, MVT::v64i8, 1 },
2324 { ISD::SMAX, MVT::v32i16, 1 },
2325 { ISD::SMAX, MVT::v64i8, 1 },
2326 { ISD::SMIN, MVT::v32i16, 1 },
2327 { ISD::SMIN, MVT::v64i8, 1 },
2328 { ISD::SSUBSAT, MVT::v32i16, 1 },
2329 { ISD::SSUBSAT, MVT::v64i8, 1 },
2330 { ISD::UADDSAT, MVT::v32i16, 1 },
2331 { ISD::UADDSAT, MVT::v64i8, 1 },
2332 { ISD::UMAX, MVT::v32i16, 1 },
2333 { ISD::UMAX, MVT::v64i8, 1 },
2334 { ISD::UMIN, MVT::v32i16, 1 },
2335 { ISD::UMIN, MVT::v64i8, 1 },
2336 { ISD::USUBSAT, MVT::v32i16, 1 },
2337 { ISD::USUBSAT, MVT::v64i8, 1 },
2338 };
2339 static const CostTblEntry AVX512CostTbl[] = {
2340 { ISD::ABS, MVT::v8i64, 1 },
2341 { ISD::ABS, MVT::v16i32, 1 },
2342 { ISD::ABS, MVT::v32i16, 2 }, // FIXME: include split
2343 { ISD::ABS, MVT::v64i8, 2 }, // FIXME: include split
2344 { ISD::ABS, MVT::v4i64, 1 },
2345 { ISD::ABS, MVT::v2i64, 1 },
2346 { ISD::BITREVERSE, MVT::v8i64, 36 },
2347 { ISD::BITREVERSE, MVT::v16i32, 24 },
2348 { ISD::BITREVERSE, MVT::v32i16, 10 },
2349 { ISD::BITREVERSE, MVT::v64i8, 10 },
2350 { ISD::CTLZ, MVT::v8i64, 29 },
2351 { ISD::CTLZ, MVT::v16i32, 35 },
2352 { ISD::CTLZ, MVT::v32i16, 28 },
2353 { ISD::CTLZ, MVT::v64i8, 18 },
2354 { ISD::CTPOP, MVT::v8i64, 16 },
2355 { ISD::CTPOP, MVT::v16i32, 24 },
2356 { ISD::CTPOP, MVT::v32i16, 18 },
2357 { ISD::CTPOP, MVT::v64i8, 12 },
2358 { ISD::CTTZ, MVT::v8i64, 20 },
2359 { ISD::CTTZ, MVT::v16i32, 28 },
2360 { ISD::CTTZ, MVT::v32i16, 24 },
2361 { ISD::CTTZ, MVT::v64i8, 18 },
2362 { ISD::SMAX, MVT::v8i64, 1 },
2363 { ISD::SMAX, MVT::v16i32, 1 },
2364 { ISD::SMAX, MVT::v32i16, 2 }, // FIXME: include split
2365 { ISD::SMAX, MVT::v64i8, 2 }, // FIXME: include split
2366 { ISD::SMAX, MVT::v4i64, 1 },
2367 { ISD::SMAX, MVT::v2i64, 1 },
2368 { ISD::SMIN, MVT::v8i64, 1 },
2369 { ISD::SMIN, MVT::v16i32, 1 },
2370 { ISD::SMIN, MVT::v32i16, 2 }, // FIXME: include split
2371 { ISD::SMIN, MVT::v64i8, 2 }, // FIXME: include split
2372 { ISD::SMIN, MVT::v4i64, 1 },
2373 { ISD::SMIN, MVT::v2i64, 1 },
2374 { ISD::UMAX, MVT::v8i64, 1 },
2375 { ISD::UMAX, MVT::v16i32, 1 },
2376 { ISD::UMAX, MVT::v32i16, 2 }, // FIXME: include split
2377 { ISD::UMAX, MVT::v64i8, 2 }, // FIXME: include split
2378 { ISD::UMAX, MVT::v4i64, 1 },
2379 { ISD::UMAX, MVT::v2i64, 1 },
2380 { ISD::UMIN, MVT::v8i64, 1 },
2381 { ISD::UMIN, MVT::v16i32, 1 },
2382 { ISD::UMIN, MVT::v32i16, 2 }, // FIXME: include split
2383 { ISD::UMIN, MVT::v64i8, 2 }, // FIXME: include split
2384 { ISD::UMIN, MVT::v4i64, 1 },
2385 { ISD::UMIN, MVT::v2i64, 1 },
2386 { ISD::USUBSAT, MVT::v16i32, 2 }, // pmaxud + psubd
2387 { ISD::USUBSAT, MVT::v2i64, 2 }, // pmaxuq + psubq
2388 { ISD::USUBSAT, MVT::v4i64, 2 }, // pmaxuq + psubq
2389 { ISD::USUBSAT, MVT::v8i64, 2 }, // pmaxuq + psubq
2390 { ISD::UADDSAT, MVT::v16i32, 3 }, // not + pminud + paddd
2391 { ISD::UADDSAT, MVT::v2i64, 3 }, // not + pminuq + paddq
2392 { ISD::UADDSAT, MVT::v4i64, 3 }, // not + pminuq + paddq
2393 { ISD::UADDSAT, MVT::v8i64, 3 }, // not + pminuq + paddq
2394 { ISD::SADDSAT, MVT::v32i16, 2 }, // FIXME: include split
2395 { ISD::SADDSAT, MVT::v64i8, 2 }, // FIXME: include split
2396 { ISD::SSUBSAT, MVT::v32i16, 2 }, // FIXME: include split
2397 { ISD::SSUBSAT, MVT::v64i8, 2 }, // FIXME: include split
2398 { ISD::UADDSAT, MVT::v32i16, 2 }, // FIXME: include split
2399 { ISD::UADDSAT, MVT::v64i8, 2 }, // FIXME: include split
2400 { ISD::USUBSAT, MVT::v32i16, 2 }, // FIXME: include split
2401 { ISD::USUBSAT, MVT::v64i8, 2 }, // FIXME: include split
2402 { ISD::FMAXNUM, MVT::f32, 2 },
2403 { ISD::FMAXNUM, MVT::v4f32, 2 },
2404 { ISD::FMAXNUM, MVT::v8f32, 2 },
2405 { ISD::FMAXNUM, MVT::v16f32, 2 },
2406 { ISD::FMAXNUM, MVT::f64, 2 },
2407 { ISD::FMAXNUM, MVT::v2f64, 2 },
2408 { ISD::FMAXNUM, MVT::v4f64, 2 },
2409 { ISD::FMAXNUM, MVT::v8f64, 2 },
2410 };
2411 static const CostTblEntry XOPCostTbl[] = {
2412 { ISD::BITREVERSE, MVT::v4i64, 4 },
2413 { ISD::BITREVERSE, MVT::v8i32, 4 },
2414 { ISD::BITREVERSE, MVT::v16i16, 4 },
2415 { ISD::BITREVERSE, MVT::v32i8, 4 },
2416 { ISD::BITREVERSE, MVT::v2i64, 1 },
2417 { ISD::BITREVERSE, MVT::v4i32, 1 },
2418 { ISD::BITREVERSE, MVT::v8i16, 1 },
2419 { ISD::BITREVERSE, MVT::v16i8, 1 },
2420 { ISD::BITREVERSE, MVT::i64, 3 },
2421 { ISD::BITREVERSE, MVT::i32, 3 },
2422 { ISD::BITREVERSE, MVT::i16, 3 },
2423 { ISD::BITREVERSE, MVT::i8, 3 }
2424 };
2425 static const CostTblEntry AVX2CostTbl[] = {
2426 { ISD::ABS, MVT::v4i64, 2 }, // VBLENDVPD(X,VPSUBQ(0,X),X)
2427 { ISD::ABS, MVT::v8i32, 1 },
2428 { ISD::ABS, MVT::v16i16, 1 },
2429 { ISD::ABS, MVT::v32i8, 1 },
2430 { ISD::BITREVERSE, MVT::v4i64, 5 },
2431 { ISD::BITREVERSE, MVT::v8i32, 5 },
2432 { ISD::BITREVERSE, MVT::v16i16, 5 },
2433 { ISD::BITREVERSE, MVT::v32i8, 5 },
2434 { ISD::BSWAP, MVT::v4i64, 1 },
2435 { ISD::BSWAP, MVT::v8i32, 1 },
2436 { ISD::BSWAP, MVT::v16i16, 1 },
2437 { ISD::CTLZ, MVT::v4i64, 23 },
2438 { ISD::CTLZ, MVT::v8i32, 18 },
2439 { ISD::CTLZ, MVT::v16i16, 14 },
2440 { ISD::CTLZ, MVT::v32i8, 9 },
2441 { ISD::CTPOP, MVT::v4i64, 7 },
2442 { ISD::CTPOP, MVT::v8i32, 11 },
2443 { ISD::CTPOP, MVT::v16i16, 9 },
2444 { ISD::CTPOP, MVT::v32i8, 6 },
2445 { ISD::CTTZ, MVT::v4i64, 10 },
2446 { ISD::CTTZ, MVT::v8i32, 14 },
2447 { ISD::CTTZ, MVT::v16i16, 12 },
2448 { ISD::CTTZ, MVT::v32i8, 9 },
2449 { ISD::SADDSAT, MVT::v16i16, 1 },
2450 { ISD::SADDSAT, MVT::v32i8, 1 },
2451 { ISD::SMAX, MVT::v8i32, 1 },
2452 { ISD::SMAX, MVT::v16i16, 1 },
2453 { ISD::SMAX, MVT::v32i8, 1 },
2454 { ISD::SMIN, MVT::v8i32, 1 },
2455 { ISD::SMIN, MVT::v16i16, 1 },
2456 { ISD::SMIN, MVT::v32i8, 1 },
2457 { ISD::SSUBSAT, MVT::v16i16, 1 },
2458 { ISD::SSUBSAT, MVT::v32i8, 1 },
2459 { ISD::UADDSAT, MVT::v16i16, 1 },
2460 { ISD::UADDSAT, MVT::v32i8, 1 },
2461 { ISD::UADDSAT, MVT::v8i32, 3 }, // not + pminud + paddd
2462 { ISD::UMAX, MVT::v8i32, 1 },
2463 { ISD::UMAX, MVT::v16i16, 1 },
2464 { ISD::UMAX, MVT::v32i8, 1 },
2465 { ISD::UMIN, MVT::v8i32, 1 },
2466 { ISD::UMIN, MVT::v16i16, 1 },
2467 { ISD::UMIN, MVT::v32i8, 1 },
2468 { ISD::USUBSAT, MVT::v16i16, 1 },
2469 { ISD::USUBSAT, MVT::v32i8, 1 },
2470 { ISD::USUBSAT, MVT::v8i32, 2 }, // pmaxud + psubd
2471 { ISD::FSQRT, MVT::f32, 7 }, // Haswell from http://www.agner.org/
2472 { ISD::FSQRT, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
2473 { ISD::FSQRT, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
2474 { ISD::FSQRT, MVT::f64, 14 }, // Haswell from http://www.agner.org/
2475 { ISD::FSQRT, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
2476 { ISD::FSQRT, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
2477 };
2478 static const CostTblEntry AVX1CostTbl[] = {
2479 { ISD::ABS, MVT::v4i64, 6 }, // VBLENDVPD(X,VPSUBQ(0,X),X)
2480 { ISD::ABS, MVT::v8i32, 3 },
2481 { ISD::ABS, MVT::v16i16, 3 },
2482 { ISD::ABS, MVT::v32i8, 3 },
2483 { ISD::BITREVERSE, MVT::v4i64, 12 }, // 2 x 128-bit Op + extract/insert
2484 { ISD::BITREVERSE, MVT::v8i32, 12 }, // 2 x 128-bit Op + extract/insert
2485 { ISD::BITREVERSE, MVT::v16i16, 12 }, // 2 x 128-bit Op + extract/insert
2486 { ISD::BITREVERSE, MVT::v32i8, 12 }, // 2 x 128-bit Op + extract/insert
2487 { ISD::BSWAP, MVT::v4i64, 4 },
2488 { ISD::BSWAP, MVT::v8i32, 4 },
2489 { ISD::BSWAP, MVT::v16i16, 4 },
2490 { ISD::CTLZ, MVT::v4i64, 48 }, // 2 x 128-bit Op + extract/insert
2491 { ISD::CTLZ, MVT::v8i32, 38 }, // 2 x 128-bit Op + extract/insert
2492 { ISD::CTLZ, MVT::v16i16, 30 }, // 2 x 128-bit Op + extract/insert
2493 { ISD::CTLZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
2494 { ISD::CTPOP, MVT::v4i64, 16 }, // 2 x 128-bit Op + extract/insert
2495 { ISD::CTPOP, MVT::v8i32, 24 }, // 2 x 128-bit Op + extract/insert
2496 { ISD::CTPOP, MVT::v16i16, 20 }, // 2 x 128-bit Op + extract/insert
2497 { ISD::CTPOP, MVT::v32i8, 14 }, // 2 x 128-bit Op + extract/insert
2498 { ISD::CTTZ, MVT::v4i64, 22 }, // 2 x 128-bit Op + extract/insert
2499 { ISD::CTTZ, MVT::v8i32, 30 }, // 2 x 128-bit Op + extract/insert
2500 { ISD::CTTZ, MVT::v16i16, 26 }, // 2 x 128-bit Op + extract/insert
2501 { ISD::CTTZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
2502 { ISD::SADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2503 { ISD::SADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2504 { ISD::SMAX, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
2505 { ISD::SMAX, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2506 { ISD::SMAX, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2507 { ISD::SMIN, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
2508 { ISD::SMIN, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2509 { ISD::SMIN, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2510 { ISD::SSUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2511 { ISD::SSUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2512 { ISD::UADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2513 { ISD::UADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2514 { ISD::UADDSAT, MVT::v8i32, 8 }, // 2 x 128-bit Op + extract/insert
2515 { ISD::UMAX, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
2516 { ISD::UMAX, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2517 { ISD::UMAX, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2518 { ISD::UMIN, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
2519 { ISD::UMIN, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2520 { ISD::UMIN, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2521 { ISD::USUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2522 { ISD::USUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2523 { ISD::USUBSAT, MVT::v8i32, 6 }, // 2 x 128-bit Op + extract/insert
2524 { ISD::FMAXNUM, MVT::f32, 3 },
2525 { ISD::FMAXNUM, MVT::v4f32, 3 },
2526 { ISD::FMAXNUM, MVT::v8f32, 5 },
2527 { ISD::FMAXNUM, MVT::f64, 3 },
2528 { ISD::FMAXNUM, MVT::v2f64, 3 },
2529 { ISD::FMAXNUM, MVT::v4f64, 5 },
2530 { ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/
2531 { ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
2532 { ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
2533 { ISD::FSQRT, MVT::f64, 21 }, // SNB from http://www.agner.org/
2534 { ISD::FSQRT, MVT::v2f64, 21 }, // SNB from http://www.agner.org/
2535 { ISD::FSQRT, MVT::v4f64, 43 }, // SNB from http://www.agner.org/
2536 };
2537 static const CostTblEntry GLMCostTbl[] = {
2538 { ISD::FSQRT, MVT::f32, 19 }, // sqrtss
2539 { ISD::FSQRT, MVT::v4f32, 37 }, // sqrtps
2540 { ISD::FSQRT, MVT::f64, 34 }, // sqrtsd
2541 { ISD::FSQRT, MVT::v2f64, 67 }, // sqrtpd
2542 };
2543 static const CostTblEntry SLMCostTbl[] = {
2544 { ISD::FSQRT, MVT::f32, 20 }, // sqrtss
2545 { ISD::FSQRT, MVT::v4f32, 40 }, // sqrtps
2546 { ISD::FSQRT, MVT::f64, 35 }, // sqrtsd
2547 { ISD::FSQRT, MVT::v2f64, 70 }, // sqrtpd
2548 };
2549 static const CostTblEntry SSE42CostTbl[] = {
2550 { ISD::ABS, MVT::v2i64, 3 }, // BLENDVPD(X,PSUBQ(0,X),X)
2551 { ISD::USUBSAT, MVT::v4i32, 2 }, // pmaxud + psubd
2552 { ISD::UADDSAT, MVT::v4i32, 3 }, // not + pminud + paddd
2553 { ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/
2554 { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
2555 };
2556 static const CostTblEntry SSE41CostTbl[] = {
2557 { ISD::SMAX, MVT::v4i32, 1 },
2558 { ISD::SMAX, MVT::v16i8, 1 },
2559 { ISD::SMIN, MVT::v4i32, 1 },
2560 { ISD::SMIN, MVT::v16i8, 1 },
2561 { ISD::UMAX, MVT::v4i32, 1 },
2562 { ISD::UMAX, MVT::v8i16, 1 },
2563 { ISD::UMIN, MVT::v4i32, 1 },
2564 { ISD::UMIN, MVT::v8i16, 1 },
2565 };
2566 static const CostTblEntry SSSE3CostTbl[] = {
2567 { ISD::ABS, MVT::v4i32, 1 },
2568 { ISD::ABS, MVT::v8i16, 1 },
2569 { ISD::ABS, MVT::v16i8, 1 },
2570 { ISD::BITREVERSE, MVT::v2i64, 5 },
2571 { ISD::BITREVERSE, MVT::v4i32, 5 },
2572 { ISD::BITREVERSE, MVT::v8i16, 5 },
2573 { ISD::BITREVERSE, MVT::v16i8, 5 },
2574 { ISD::BSWAP, MVT::v2i64, 1 },
2575 { ISD::BSWAP, MVT::v4i32, 1 },
2576 { ISD::BSWAP, MVT::v8i16, 1 },
2577 { ISD::CTLZ, MVT::v2i64, 23 },
2578 { ISD::CTLZ, MVT::v4i32, 18 },
2579 { ISD::CTLZ, MVT::v8i16, 14 },
2580 { ISD::CTLZ, MVT::v16i8, 9 },
2581 { ISD::CTPOP, MVT::v2i64, 7 },
2582 { ISD::CTPOP, MVT::v4i32, 11 },
2583 { ISD::CTPOP, MVT::v8i16, 9 },
2584 { ISD::CTPOP, MVT::v16i8, 6 },
2585 { ISD::CTTZ, MVT::v2i64, 10 },
2586 { ISD::CTTZ, MVT::v4i32, 14 },
2587 { ISD::CTTZ, MVT::v8i16, 12 },
2588 { ISD::CTTZ, MVT::v16i8, 9 }
2589 };
2590 static const CostTblEntry SSE2CostTbl[] = {
2591 { ISD::ABS, MVT::v2i64, 4 },
2592 { ISD::ABS, MVT::v4i32, 3 },
2593 { ISD::ABS, MVT::v8i16, 3 },
2594 { ISD::ABS, MVT::v16i8, 3 },
2595 { ISD::BITREVERSE, MVT::v2i64, 29 },
2596 { ISD::BITREVERSE, MVT::v4i32, 27 },
2597 { ISD::BITREVERSE, MVT::v8i16, 27 },
2598 { ISD::BITREVERSE, MVT::v16i8, 20 },
2599 { ISD::BSWAP, MVT::v2i64, 7 },
2600 { ISD::BSWAP, MVT::v4i32, 7 },
2601 { ISD::BSWAP, MVT::v8i16, 7 },
2602 { ISD::CTLZ, MVT::v2i64, 25 },
2603 { ISD::CTLZ, MVT::v4i32, 26 },
2604 { ISD::CTLZ, MVT::v8i16, 20 },
2605 { ISD::CTLZ, MVT::v16i8, 17 },
2606 { ISD::CTPOP, MVT::v2i64, 12 },
2607 { ISD::CTPOP, MVT::v4i32, 15 },
2608 { ISD::CTPOP, MVT::v8i16, 13 },
2609 { ISD::CTPOP, MVT::v16i8, 10 },
2610 { ISD::CTTZ, MVT::v2i64, 14 },
2611 { ISD::CTTZ, MVT::v4i32, 18 },
2612 { ISD::CTTZ, MVT::v8i16, 16 },
2613 { ISD::CTTZ, MVT::v16i8, 13 },
2614 { ISD::SADDSAT, MVT::v8i16, 1 },
2615 { ISD::SADDSAT, MVT::v16i8, 1 },
2616 { ISD::SMAX, MVT::v8i16, 1 },
2617 { ISD::SMIN, MVT::v8i16, 1 },
2618 { ISD::SSUBSAT, MVT::v8i16, 1 },
2619 { ISD::SSUBSAT, MVT::v16i8, 1 },
2620 { ISD::UADDSAT, MVT::v8i16, 1 },
2621 { ISD::UADDSAT, MVT::v16i8, 1 },
2622 { ISD::UMAX, MVT::v16i8, 1 },
2623 { ISD::UMIN, MVT::v16i8, 1 },
2624 { ISD::USUBSAT, MVT::v8i16, 1 },
2625 { ISD::USUBSAT, MVT::v16i8, 1 },
2626 { ISD::FMAXNUM, MVT::f64, 4 },
2627 { ISD::FMAXNUM, MVT::v2f64, 4 },
2628 { ISD::FSQRT, MVT::f64, 32 }, // Nehalem from http://www.agner.org/
2629 { ISD::FSQRT, MVT::v2f64, 32 }, // Nehalem from http://www.agner.org/
2630 };
2631 static const CostTblEntry SSE1CostTbl[] = {
2632 { ISD::FMAXNUM, MVT::f32, 4 },
2633 { ISD::FMAXNUM, MVT::v4f32, 4 },
2634 { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/
2635 { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
2636 };
2637 static const CostTblEntry BMI64CostTbl[] = { // 64-bit targets
2638 { ISD::CTTZ, MVT::i64, 1 },
2639 };
2640 static const CostTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
2641 { ISD::CTTZ, MVT::i32, 1 },
2642 { ISD::CTTZ, MVT::i16, 1 },
2643 { ISD::CTTZ, MVT::i8, 1 },
2644 };
2645 static const CostTblEntry LZCNT64CostTbl[] = { // 64-bit targets
2646 { ISD::CTLZ, MVT::i64, 1 },
2647 };
2648 static const CostTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
2649 { ISD::CTLZ, MVT::i32, 1 },
2650 { ISD::CTLZ, MVT::i16, 1 },
2651 { ISD::CTLZ, MVT::i8, 1 },
2652 };
2653 static const CostTblEntry POPCNT64CostTbl[] = { // 64-bit targets
2654 { ISD::CTPOP, MVT::i64, 1 },
2655 };
2656 static const CostTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
2657 { ISD::CTPOP, MVT::i32, 1 },
2658 { ISD::CTPOP, MVT::i16, 1 },
2659 { ISD::CTPOP, MVT::i8, 1 },
2660 };
2661 static const CostTblEntry X64CostTbl[] = { // 64-bit targets
2662 { ISD::BITREVERSE, MVT::i64, 14 },
2663 { ISD::CTLZ, MVT::i64, 4 }, // BSR+XOR or BSR+XOR+CMOV
2664 { ISD::CTTZ, MVT::i64, 3 }, // TEST+BSF+CMOV/BRANCH
2665 { ISD::CTPOP, MVT::i64, 10 },
2666 { ISD::SADDO, MVT::i64, 1 },
2667 { ISD::UADDO, MVT::i64, 1 },
2668 };
2669 static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
2670 { ISD::BITREVERSE, MVT::i32, 14 },
2671 { ISD::BITREVERSE, MVT::i16, 14 },
2672 { ISD::BITREVERSE, MVT::i8, 11 },
2673 { ISD::CTLZ, MVT::i32, 4 }, // BSR+XOR or BSR+XOR+CMOV
2674 { ISD::CTLZ, MVT::i16, 4 }, // BSR+XOR or BSR+XOR+CMOV
2675 { ISD::CTLZ, MVT::i8, 4 }, // BSR+XOR or BSR+XOR+CMOV
2676 { ISD::CTTZ, MVT::i32, 3 }, // TEST+BSF+CMOV/BRANCH
2677 { ISD::CTTZ, MVT::i16, 3 }, // TEST+BSF+CMOV/BRANCH
2678 { ISD::CTTZ, MVT::i8, 3 }, // TEST+BSF+CMOV/BRANCH
2679 { ISD::CTPOP, MVT::i32, 8 },
2680 { ISD::CTPOP, MVT::i16, 9 },
2681 { ISD::CTPOP, MVT::i8, 7 },
2682 { ISD::SADDO, MVT::i32, 1 },
2683 { ISD::SADDO, MVT::i16, 1 },
2684 { ISD::SADDO, MVT::i8, 1 },
2685 { ISD::UADDO, MVT::i32, 1 },
2686 { ISD::UADDO, MVT::i16, 1 },
2687 { ISD::UADDO, MVT::i8, 1 },
2688 };
2689
2690 Type *RetTy = ICA.getReturnType();
2691 Type *OpTy = RetTy;
2692 Intrinsic::ID IID = ICA.getID();
2693 unsigned ISD = ISD::DELETED_NODE;
2694 switch (IID) {
2695 default:
2696 break;
2697 case Intrinsic::abs:
2698 ISD = ISD::ABS;
2699 break;
2700 case Intrinsic::bitreverse:
2701 ISD = ISD::BITREVERSE;
2702 break;
2703 case Intrinsic::bswap:
2704 ISD = ISD::BSWAP;
2705 break;
2706 case Intrinsic::ctlz:
2707 ISD = ISD::CTLZ;
2708 break;
2709 case Intrinsic::ctpop:
2710 ISD = ISD::CTPOP;
2711 break;
2712 case Intrinsic::cttz:
2713 ISD = ISD::CTTZ;
2714 break;
2715 case Intrinsic::maxnum:
2716 case Intrinsic::minnum:
2717 // FMINNUM has same costs so don't duplicate.
2718 ISD = ISD::FMAXNUM;
2719 break;
2720 case Intrinsic::sadd_sat:
2721 ISD = ISD::SADDSAT;
2722 break;
2723 case Intrinsic::smax:
2724 ISD = ISD::SMAX;
2725 break;
2726 case Intrinsic::smin:
2727 ISD = ISD::SMIN;
2728 break;
2729 case Intrinsic::ssub_sat:
2730 ISD = ISD::SSUBSAT;
2731 break;
2732 case Intrinsic::uadd_sat:
2733 ISD = ISD::UADDSAT;
2734 break;
2735 case Intrinsic::umax:
2736 ISD = ISD::UMAX;
2737 break;
2738 case Intrinsic::umin:
2739 ISD = ISD::UMIN;
2740 break;
2741 case Intrinsic::usub_sat:
2742 ISD = ISD::USUBSAT;
2743 break;
2744 case Intrinsic::sqrt:
2745 ISD = ISD::FSQRT;
2746 break;
2747 case Intrinsic::sadd_with_overflow:
2748 case Intrinsic::ssub_with_overflow:
2749 // SSUBO has same costs so don't duplicate.
2750 ISD = ISD::SADDO;
2751 OpTy = RetTy->getContainedType(0);
2752 break;
2753 case Intrinsic::uadd_with_overflow:
2754 case Intrinsic::usub_with_overflow:
2755 // USUBO has same costs so don't duplicate.
2756 ISD = ISD::UADDO;
2757 OpTy = RetTy->getContainedType(0);
2758 break;
2759 }
2760
2761 if (ISD != ISD::DELETED_NODE) {
2762 // Legalize the type.
2763 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, OpTy);
2764 MVT MTy = LT.second;
2765
2766 // Attempt to lookup cost.
2767 if (ST->useGLMDivSqrtCosts())
2768 if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
2769 return LT.first * Entry->Cost;
2770
2771 if (ST->isSLM())
2772 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
2773 return LT.first * Entry->Cost;
2774
2775 if (ST->hasCDI())
2776 if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
2777 return LT.first * Entry->Cost;
2778
2779 if (ST->hasBWI())
2780 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
2781 return LT.first * Entry->Cost;
2782
2783 if (ST->hasAVX512())
2784 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
2785 return LT.first * Entry->Cost;
2786
2787 if (ST->hasXOP())
2788 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
2789 return LT.first * Entry->Cost;
2790
2791 if (ST->hasAVX2())
2792 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
2793 return LT.first * Entry->Cost;
2794
2795 if (ST->hasAVX())
2796 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
2797 return LT.first * Entry->Cost;
2798
2799 if (ST->hasSSE42())
2800 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
2801 return LT.first * Entry->Cost;
2802
2803 if (ST->hasSSE41())
2804 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
2805 return LT.first * Entry->Cost;
2806
2807 if (ST->hasSSSE3())
2808 if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
2809 return LT.first * Entry->Cost;
2810
2811 if (ST->hasSSE2())
2812 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
2813 return LT.first * Entry->Cost;
2814
2815 if (ST->hasSSE1())
2816 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
2817 return LT.first * Entry->Cost;
2818
2819 if (ST->hasBMI()) {
2820 if (ST->is64Bit())
2821 if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
2822 return LT.first * Entry->Cost;
2823
2824 if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
2825 return LT.first * Entry->Cost;
2826 }
2827
2828 if (ST->hasLZCNT()) {
2829 if (ST->is64Bit())
2830 if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
2831 return LT.first * Entry->Cost;
2832
2833 if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
2834 return LT.first * Entry->Cost;
2835 }
2836
2837 if (ST->hasPOPCNT()) {
2838 if (ST->is64Bit())
2839 if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
2840 return LT.first * Entry->Cost;
2841
2842 if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
2843 return LT.first * Entry->Cost;
2844 }
2845
2846 // TODO - add BMI (TZCNT) scalar handling
2847
2848 if (ST->is64Bit())
2849 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
2850 return LT.first * Entry->Cost;
2851
2852 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
2853 return LT.first * Entry->Cost;
2854 }
2855
2856 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
2857}
2858
2859int X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
2860 TTI::TargetCostKind CostKind) {
2861 if (CostKind != TTI::TCK_RecipThroughput)
2862 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
2863
2864 if (ICA.isTypeBasedOnly())
2865 return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
2866
2867 static const CostTblEntry AVX512CostTbl[] = {
2868 { ISD::ROTL, MVT::v8i64, 1 },
2869 { ISD::ROTL, MVT::v4i64, 1 },
2870 { ISD::ROTL, MVT::v2i64, 1 },
2871 { ISD::ROTL, MVT::v16i32, 1 },
2872 { ISD::ROTL, MVT::v8i32, 1 },
2873 { ISD::ROTL, MVT::v4i32, 1 },
2874 { ISD::ROTR, MVT::v8i64, 1 },
2875 { ISD::ROTR, MVT::v4i64, 1 },
2876 { ISD::ROTR, MVT::v2i64, 1 },
2877 { ISD::ROTR, MVT::v16i32, 1 },
2878 { ISD::ROTR, MVT::v8i32, 1 },
2879 { ISD::ROTR, MVT::v4i32, 1 }
2880 };
2881 // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
2882 static const CostTblEntry XOPCostTbl[] = {
2883 { ISD::ROTL, MVT::v4i64, 4 },
2884 { ISD::ROTL, MVT::v8i32, 4 },
2885 { ISD::ROTL, MVT::v16i16, 4 },
2886 { ISD::ROTL, MVT::v32i8, 4 },
2887 { ISD::ROTL, MVT::v2i64, 1 },
2888 { ISD::ROTL, MVT::v4i32, 1 },
2889 { ISD::ROTL, MVT::v8i16, 1 },
2890 { ISD::ROTL, MVT::v16i8, 1 },
2891 { ISD::ROTR, MVT::v4i64, 6 },
2892 { ISD::ROTR, MVT::v8i32, 6 },
2893 { ISD::ROTR, MVT::v16i16, 6 },
2894 { ISD::ROTR, MVT::v32i8, 6 },
2895 { ISD::ROTR, MVT::v2i64, 2 },
2896 { ISD::ROTR, MVT::v4i32, 2 },
2897 { ISD::ROTR, MVT::v8i16, 2 },
2898 { ISD::ROTR, MVT::v16i8, 2 }
2899 };
2900 static const CostTblEntry X64CostTbl[] = { // 64-bit targets
2901 { ISD::ROTL, MVT::i64, 1 },
2902 { ISD::ROTR, MVT::i64, 1 },
2903 { ISD::FSHL, MVT::i64, 4 }
2904 };
2905 static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
2906 { ISD::ROTL, MVT::i32, 1 },
2907 { ISD::ROTL, MVT::i16, 1 },
2908 { ISD::ROTL, MVT::i8, 1 },
2909 { ISD::ROTR, MVT::i32, 1 },
2910 { ISD::ROTR, MVT::i16, 1 },
2911 { ISD::ROTR, MVT::i8, 1 },
2912 { ISD::FSHL, MVT::i32, 4 },
2913 { ISD::FSHL, MVT::i16, 4 },
2914 { ISD::FSHL, MVT::i8, 4 }
2915 };
2916
2917 Intrinsic::ID IID = ICA.getID();
2918 Type *RetTy = ICA.getReturnType();
2919 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
2920 unsigned ISD = ISD::DELETED_NODE;
2921 switch (IID) {
2922 default:
2923 break;
2924 case Intrinsic::fshl:
2925 ISD = ISD::FSHL;
2926 if (Args[0] == Args[1])
2927 ISD = ISD::ROTL;
2928 break;
2929 case Intrinsic::fshr:
2930 // FSHR has same costs so don't duplicate.
2931 ISD = ISD::FSHL;
2932 if (Args[0] == Args[1])
2933 ISD = ISD::ROTR;
2934 break;
2935 }
2936
2937 if (ISD != ISD::DELETED_NODE) {
2938 // Legalize the type.
2939 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
2940 MVT MTy = LT.second;
2941
2942 // Attempt to lookup cost.
2943 if (ST->hasAVX512())
2944 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
2945 return LT.first * Entry->Cost;
2946
2947 if (ST->hasXOP())
2948 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
2949 return LT.first * Entry->Cost;
2950
2951 if (ST->is64Bit())
2952 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
2953 return LT.first * Entry->Cost;
2954
2955 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
2956 return LT.first * Entry->Cost;
2957 }
2958
2959 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
2960}
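The function above follows one pattern throughout: legalize the type, walk the feature-specific tables from the most to the least specialized subtarget, and return LT.first * Entry->Cost for the first hit. The standalone sketch below shows that lookup-and-scale step in isolation; it is not LLVM code, and every name in it (MiniCostEntry, MiniAVX512Tbl, lookupCost, the LTFirst value) is made up purely for illustration.

#include <cstddef>
#include <cstdio>
#include <optional>

enum class MiniISD { ROTL, ROTR, FSHL };
enum class MiniVT { i32, i64, v4i32, v8i32 };

struct MiniCostEntry {
  MiniISD ISD;
  MiniVT Ty;
  int Cost;
};

// Stand-in for one of the tables above: per-op cost for an already legal type.
static const MiniCostEntry MiniAVX512Tbl[] = {
    {MiniISD::ROTL, MiniVT::v8i32, 1},
    {MiniISD::ROTR, MiniVT::v8i32, 1},
};

// Stand-in for CostTableLookup: the first entry matching opcode and type wins.
template <std::size_t N>
std::optional<int> lookupCost(const MiniCostEntry (&Tbl)[N], MiniISD ISD,
                              MiniVT Ty) {
  for (const MiniCostEntry &E : Tbl)
    if (E.ISD == ISD && E.Ty == Ty)
      return E.Cost;
  return std::nullopt;
}

int main() {
  // Assume legalization split the requested rotate into LT.first == 2 legal
  // v8i32 operations (a hypothetical input, not a claim about a real type).
  int LTFirst = 2;
  if (std::optional<int> C = lookupCost(MiniAVX512Tbl, MiniISD::ROTL, MiniVT::v8i32))
    std::printf("estimated cost = %d\n", LTFirst * *C); // prints 2
  return 0;
}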
2961
2962int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
2963 static const CostTblEntry SLMCostTbl[] = {
2964 { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 },
2965 { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 },
2966 { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 },
2967 { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 7 }
2968 };
2969
2970 assert(Val->isVectorTy() && "This must be a vector type");
  [21] '?' condition is true
2971 Type *ScalarType = Val->getScalarType();
2972 int RegisterFileMoveCost = 0;
2973
2974 if (Index != -1U && (Opcode == Instruction::ExtractElement ||
  [21.1] 'Opcode' is equal to ExtractElement
2975 Opcode == Instruction::InsertElement)) {
2976 // Legalize the type.
2977 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
2978
2979 // This type is legalized to a scalar type.
2980 if (!LT.second.isVector())
  [22] Calling 'MVT::isVector'
  [26] Returning from 'MVT::isVector'
  [27] Taking false branch
2981 return 0;
2982
2983 // The type may be split. Normalize the index to the new type.
2984 unsigned NumElts = LT.second.getVectorNumElements();
2985 unsigned SubNumElts = NumElts;
2986 Index = Index % NumElts;
2987
2988 // For >128-bit vectors, we need to extract higher 128-bit subvectors.
2989 // For inserts, we also need to insert the subvector back.
2990 if (LT.second.getSizeInBits() > 128) {
  [28] Assuming the condition is true
  [29] Taking true branch
2991 assert((LT.second.getSizeInBits() % 128) == 0 && "Illegal vector");
  [30] Assuming the condition is true
  [31] '?' condition is true
2992 unsigned NumSubVecs = LT.second.getSizeInBits() / 128;
2993 SubNumElts = NumElts / NumSubVecs;
  [32] Value assigned to 'SubNumElts'
2994 if (SubNumElts <= Index) {
  [33] Assuming 'SubNumElts' is <= 'Index'
  [34] Taking true branch
2995 RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
  [34.1] 'Opcode' is not equal to InsertElement
  [35] '?' condition is false
2996 Index %= SubNumElts;
  [36] Division by zero
2997 }
2998 }
2999
3000 if (Index == 0) {
3001 // Floating point scalars are already located in index #0.
3002 // Many insertions to #0 can fold away for scalar fp-ops, so let's assume
3003 // true for all.
3004 if (ScalarType->isFloatingPointTy())
3005 return RegisterFileMoveCost;
3006
3007 // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
3008 if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
3009 return 1 + RegisterFileMoveCost;
3010 }
3011
3012 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3013 assert(ISD && "Unexpected vector opcode")((ISD && "Unexpected vector opcode") ? static_cast<
void> (0) : __assert_fail ("ISD && \"Unexpected vector opcode\""
, "/build/llvm-toolchain-snapshot-12~++20200926111128+c6c5629f2fb/llvm/lib/Target/X86/X86TargetTransformInfo.cpp"
, 3013, __PRETTY_FUNCTION__))
;
3014 MVT MScalarTy = LT.second.getScalarType();
3015 if (ST->isSLM())
3016 if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
3017 return Entry->Cost + RegisterFileMoveCost;
3018
3019 // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
3020 if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
3021 (MScalarTy.isInteger() && ST->hasSSE41()))
3022 return 1 + RegisterFileMoveCost;
3023
3024 // Assume insertps is relatively cheap on all targets.
3025 if (MScalarTy == MVT::f32 && ST->hasSSE41() &&
3026 Opcode == Instruction::InsertElement)
3027 return 1 + RegisterFileMoveCost;
3028
3029 // For extractions we just need to shuffle the element to index 0, which
3030 // should be very cheap (assume cost = 1). For insertions we need to shuffle
3031 // the element to its destination. In both cases we must handle the
3032 // subvector move(s).
3033 // If the vector type is already less than 128-bits then don't reduce it.
3034 // TODO: Under what circumstances should we shuffle using the full width?
3035 int ShuffleCost = 1;
3036 if (Opcode == Instruction::InsertElement) {
3037 auto *SubTy = cast<VectorType>(Val);
3038 EVT VT = TLI->getValueType(DL, Val);
3039 if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
3040 SubTy = FixedVectorType::get(ScalarType, SubNumElts);
3041 ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, 0, SubTy);
3042 }
3043 int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
3044 return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
3045 }
3046
3047 // Add to the base cost if we know that the extracted element of a vector is
3048 // destined to be moved to and used in the integer register file.
3049 if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy())
3050 RegisterFileMoveCost += 1;
3051
3052 return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
3053}
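The arithmetic the analyzer follows to reach the "Division by zero" at line 2996 can be restated outside the function: when the legalized vector is wider than 128 bits but reports fewer elements than it has 128-bit chunks, SubNumElts = NumElts / NumSubVecs truncates to zero, and the later 'Index %= SubNumElts' has a zero divisor. The snippet below is a standalone sketch, not code from the file; the concrete values are hypothetical, chosen only to reproduce the analyzer's assumptions.

#include <cstdio>

int main() {
  // Assumed values matching the analyzer's path: a >128-bit legalized type
  // whose reported element count is smaller than its number of 128-bit chunks.
  unsigned NumElts = 2;      // stand-in for LT.second.getVectorNumElements()
  unsigned SizeInBits = 512; // stand-in for LT.second.getSizeInBits()
  unsigned Index = 1;

  unsigned NumSubVecs = SizeInBits / 128;     // 4
  unsigned SubNumElts = NumElts / NumSubVecs; // 2 / 4 == 0
  if (SubNumElts <= Index) {
    if (SubNumElts == 0) { // guard that the original code lacks
      std::printf("Index %%= SubNumElts would divide by zero\n");
      return 1;
    }
    Index %= SubNumElts; // the operation flagged at line 2996
  }
  std::printf("Index = %u\n", Index);
  return 0;
}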
3054
3055unsigned X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
3056 const APInt &DemandedElts,
3057 bool Insert, bool Extract) {
3058 unsigned Cost = 0;
3059
3060 // For insertions, an ISD::BUILD_VECTOR style vector initialization can be much
3061 // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT.
3062 if (Insert) {
3063 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
3064 MVT MScalarTy = LT.second.getScalarType();
3065
3066 if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
3067 (MScalarTy.isInteger() && ST->hasSSE41()) ||
3068 (MScalarTy == MVT::f32 && ST->hasSSE41())) {
3069 // For types we can insert directly, insertion into 128-bit sub vectors is
3070 // cheap, followed by a cheap chain of concatenations.
3071 if (LT.second.getSizeInBits() <= 128) {
3072 Cost +=
3073 BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, false);
3074 } else {
3075 unsigned NumSubVecs = LT.second.getSizeInBits() / 128;
3076 Cost += (PowerOf2Ceil(NumSubVecs) - 1) * LT.first;
3077 Cost += DemandedElts.countPopulation();
3078
3079 // For vXf32 cases, insertion into the 0'th index in each v4f32
3080 // 128-bit vector is free.
3081 // NOTE: This assumes legalization widens vXf32 vectors.
3082 if (MScalarTy == MVT::f32)
3083 for (unsigned i = 0, e = cast<FixedVectorType>(Ty)->getNumElements();
3084 i < e; i += 4)
3085 if (DemandedElts[i])
3086 Cost--;
3087 }
3088 } else if (LT.second.isVector()) {
3089 // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
3090 // integer element as a SCALAR_TO_VECTOR, then we build the vector as a
3091 // series of UNPCK followed by CONCAT_VECTORS - all of these can be
3092 // considered cheap.
3093 if (Ty->isIntOrIntVectorTy())
3094 Cost += DemandedElts.countPopulation();
3095
3096 // Get the smaller of the legalized or original pow2-extended number of
3097 // vector elements, which represents the number of unpacks we'll end up
3098 // performing.
3099 unsigned NumElts = LT.second.getVectorNumElements();
3100 unsigned Pow2Elts =
3101 PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements());
3102 Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
3103 }
3104 }
3105
3106 // TODO: Use default extraction for now, but we should investigate extending this
3107 // to handle repeated subvector extraction.
3108 if (Extract)
3109 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, false, Extract);
3110
3111 return Cost;
3112}
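As a concrete reading of the fast-insertion branch above, the sketch below re-derives the cost for a hypothetical 512-bit v16f32 build with every element demanded, assuming LT.first == 1 and that legalization keeps the type 512 bits wide: PowerOf2Ceil(4) - 1 = 3 concatenations, 16 per-element inserts, minus one free f32 insert into lane 0 of each 128-bit chunk. This is illustrative only, not LLVM code; powerOf2Ceil here is a local stand-in for llvm::PowerOf2Ceil.

#include <cstdio>

static unsigned powerOf2Ceil(unsigned V) {
  unsigned P = 1;
  while (P < V)
    P <<= 1;
  return P;
}

int main() {
  const unsigned LegalSizeInBits = 512; // assumed legalized width
  const unsigned LTFirst = 1;           // assumed split factor
  const unsigned NumElts = 16;          // v16f32
  const unsigned DemandedPopCount = 16; // all elements demanded

  unsigned NumSubVecs = LegalSizeInBits / 128;              // 4 chunks
  unsigned Cost = (powerOf2Ceil(NumSubVecs) - 1) * LTFirst; // 3 concatenations
  Cost += DemandedPopCount;                                 // 16 element inserts
  for (unsigned I = 0; I < NumElts; I += 4)                 // lane 0 of each f32
    --Cost;                                                 // chunk is free: -4
  std::printf("estimated insert scalarization cost = %u\n", Cost); // 15
  return 0;
}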
3113
3114int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
3115 MaybeAlign Alignment, unsigned AddressSpace,
3116 TTI::TargetCostKind CostKind,
3117 const Instruction *I) {
3118 // TODO: Handle other cost kinds.
3119 if (CostKind != TTI::TCK_RecipThroughput) {
3120 if (isa_and_nonnull<StoreInst>(I)) {
3121 Value *Ptr = I->getOperand(1);
3122 // Store instruction with index and scale costs 2 Uops.
3123 // Check the preceding GEP to identify non-const indices.
3124 if (auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
3125 if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
3126 return TTI::TCC_Basic * 2;
3127 }
3128 }
3129 return TTI::TCC_Basic;
3130 }
3131
3132 // Handle non-power-of-two vectors such as <3 x float>
3133 if (auto *VTy = dyn_cast<FixedVectorType>(Src)) {
3134 unsigned NumElem = VTy->getNumElements();
3135
3136 // Handle a few common cases:
3137 // <3 x float>
3138 if (NumElem == 3 && VTy->getScalarSizeInBits() == 32)
3139 // Cost = 64 bit store + extract + 32 bit store.
3140 return 3;
3141
3142 // <3 x double>
3143 if (NumElem == 3 && VTy->getScalarSizeInBits() == 64)
3144 // Cost = 128 bit store + unpack + 64 bit store.
3145 return 3;
3146
3147 // Assume that all other non-power-of-two numbers are scalarized.
3148 if (!isPowerOf2_32(NumElem)) {
3149 APInt DemandedElts = APInt::getAllOnesValue(NumElem);
3150 int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment,
3151 AddressSpace, CostKind);
3152 int SplitCost = getScalarizationOverhead(VTy, DemandedElts,
3153 Opcode == Instruction::Load,
3154 Opcode == Instruction::Store);
3155 return NumElem * Cost + SplitCost;
3156 }
3157 }
3158
3159 // Type legalization can't handle structs
3160 if (TLI->getValueType(DL, Src, true) == MVT::Other)
3161 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
3162 CostKind);
3163
3164 // Legalize the type.
3165 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
3166 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&(((Opcode == Instruction::Load || Opcode == Instruction::Store
) && "Invalid Opcode") ? static_cast<void> (0) :
__assert_fail ("(Opcode == Instruction::Load || Opcode == Instruction::Store) && \"Invalid Opcode\""
, "/build/llvm-toolchain-snapshot-12~++20200926111128+c6c5629f2fb/llvm/lib/Target/X86/X86TargetTransformInfo.cpp"
, 3167, __PRETTY_FUNCTION__))
3167 "Invalid Opcode")(((Opcode == Instruction::Load || Opcode == Instruction::Store
) && "Invalid Opcode") ? static_cast<void> (0) :
__assert_fail ("(Opcode == Instruction::Load || Opcode == Instruction::Store) && \"Invalid Opcode\""
, "/build/llvm-toolchain-snapshot-12~++20200926111128+c6c5629f2fb/llvm/lib/Target/X86/X86TargetTransformInfo.cpp"
, 3167, __PRETTY_FUNCTION__))
;
3168
3169 // Each load/store unit costs 1.
3170 int Cost = LT.first * 1;
3171
3172 // This isn't exactly right. We're using slow unaligned 32-byte accesses as a
3173 // proxy for a double-pumped AVX memory interface such as on Sandybridge.
3174 if (LT.second.getStoreSize() == 32 && ST->isUnalignedMem32Slow())
3175 Cost *= 2;
3176
3177 return Cost;
3178}
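The tail of getMemoryOpCost above reduces to a small formula, restated in the sketch below with hypothetical inputs (memoryOpCostSketch is not an LLVM function): one unit per legalized load/store, doubled for 32-byte accesses on subtargets where unaligned 32-byte memory operations are slow.

#include <cstdio>

int memoryOpCostSketch(int NumLegalOps, unsigned StoreSizeBytes,
                       bool UnalignedMem32Slow) {
  int Cost = NumLegalOps * 1; // each legalized load/store unit costs 1
  if (StoreSizeBytes == 32 && UnalignedMem32Slow)
    Cost *= 2; // proxy for a double-pumped 256-bit memory interface
  return Cost;
}

int main() {
  // e.g. an access legalized into two 32-byte ops on a Sandy Bridge-like core.
  std::printf("%d\n", memoryOpCostSketch(2, 32, true)); // prints 4
  return 0;
}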
3179
3180int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
3181 Align Alignment, unsigned AddressSpace,
3182 TTI::TargetCostKind CostKind) {
3183 bool IsLoad = (Instruction::Load == Opcode);
3184 bool IsStore = (Instruction::Store == Opcode);
3185
3186 auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy);
3187 if (!SrcVTy)
3188 // To calculate the scalar cost, take the regular cost without the mask
3189 return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind);
3190
3191 unsigned NumElem = SrcVTy->getNumElements();
3192 auto *MaskTy =
3193 FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
3194 if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) ||
3195 (IsStore && !isLegalMaskedStore(SrcVTy, Alignment)) ||
3196 !isPowerOf2_32(NumElem)) {
3197 // Scalarization
3198 APInt DemandedElts = APInt::getAllOnesValue(NumElem);
3199 int MaskSplitCost =
3200 getScalarizationOverhead(MaskTy, DemandedElts, false, true);
3201 int ScalarCompareCost = getCmpSelInstrCost(
3202 Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
3203 CostKind);
3204 int BranchCost = getCFInstrCost(Instruction::Br, CostKind);
3205 int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
3206 int ValueSplitCost =
3207 getScalarizationOverhead(SrcVTy, DemandedElts, IsLoad, IsStore);
3208 int MemopCost =
3209 NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
3210 Alignment, AddressSpace, CostKind);
3211 return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
3212 }
3213
3214 // Legalize the type.
3215 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy);
3216 auto VT = TLI->getValueType(DL, SrcVTy);
3217 int Cost = 0;
3218 if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
3219 LT.second.getVectorNumElements() == NumElem)
3220 // Promotion requires expand/truncate for data and a shuffle for mask.
3221 Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, 0, nullptr) +
3222 getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, 0, nullptr);
3223
3224 else if (LT.second.getVectorNumElements() > NumElem) {
3225 auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
3226 LT.second.getVectorNumElements());
3227 // Expanding requires filling the mask with zeroes
3228 Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy);
3229 }
3230
3231 // Pre-AVX512 - each maskmov load costs 2 and each store costs ~8.
3232 if (!ST->hasAVX512())
3233 return Cost + LT.first * (IsLoad ? 2 : 8);
3234
3235 // AVX-512 masked load/store is cheaper
3236 return Cost + LT.first;
3237}
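The final return values above can be summarized the same way. The small sketch below (maskedMemTailCost is a hypothetical helper, not LLVM code) captures only the last step, after any promotion or expansion shuffles have been priced: pre-AVX-512 maskmov loads are charged 2 and stores roughly 8 per legalized op, while AVX-512 masked ops are charged 1.

#include <cstdio>

int maskedMemTailCost(int PromoteOrExpandCost, int NumLegalOps, bool IsLoad,
                      bool HasAVX512) {
  if (!HasAVX512)
    return PromoteOrExpandCost + NumLegalOps * (IsLoad ? 2 : 8);
  return PromoteOrExpandCost + NumLegalOps; // AVX-512 masked ops are cheaper
}

int main() {
  std::printf("%d %d\n",
              maskedMemTailCost(0, 1, /*IsLoad=*/true, /*HasAVX512=*/false),   // 2
              maskedMemTailCost(0, 1, /*IsLoad=*/false, /*HasAVX512=*/false)); // 8
  return 0;
}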
3238
3239int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
3240 const SCEV *Ptr) {
3241 // Address computations in vectorized code with non-consecutive addresses will
3242 // likely result in more instructions compared to scalar code where the
3243 // computation can more often be merged into the index mode. The resulting
3244 // extra micro-ops can significantly decrease throughput.
3245 const unsigned NumVectorInstToHideOverhead = 10;
3246
3247 // Cost modeling of Strided Access Computation is hidden by the indexing
3248 // modes of X86 regardless of the stride value. We don't believe that there
3249 // is a difference between constant strided access in general and constant
3250 // strided value which is less than or equal to 64.
3251 // Even in the case of (loop invariant) stride whose value is not known at
3252 // compile time, the address computation will not incur more than one extra
3253 // ADD instruction.
3254 if (Ty->isVectorTy() && SE) {
3255 if (!BaseT::isStridedAccess(Ptr))
3256 return NumVectorInstToHideOverhead;
3257 if (!BaseT::getConstantStrideStep(SE, Ptr))
3258 return 1;
3259 }
3260
3261 return BaseT::getAddressComputationCost(Ty, SE, Ptr);
3262}
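The branching above is short enough to restate as a tiny standalone helper (illustrative only; addressComputationSketch is not an LLVM function): vectorized non-strided addresses get the 10-instruction penalty, a strided access whose loop-invariant stride is not a compile-time constant costs a single extra ADD, and everything else defers to the generic estimate.

#include <cstdio>

// Returns -1 where the real code would defer to the generic base estimate.
int addressComputationSketch(bool IsVectorTy, bool IsStrided,
                             bool StrideIsCompileTimeConstant) {
  const int NumVectorInstToHideOverhead = 10;
  if (IsVectorTy) {
    if (!IsStrided)
      return NumVectorInstToHideOverhead;
    if (!StrideIsCompileTimeConstant)
      return 1;
  }
  return -1; // defer to the generic estimate
}

int main() {
  std::printf("%d %d\n", addressComputationSketch(true, false, false),  // 10
              addressComputationSketch(true, true, false));             // 1
  return 0;
}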
3263
3264int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
3265 bool IsPairwise,
3266 TTI::TargetCostKind CostKind) {
3267 // Just use the default implementation for pair reductions.
3268 if (IsPairwise)
3269 return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise, CostKind);
3270
3271 // We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
3272 // and use that measurement as the cost.
3273
3274 static const CostTblEntry SLMCostTblNoPairWise[] = {
3275 { ISD::FADD, MVT::v2f64, 3 },
3276 { ISD::ADD, MVT::v2i64, 5 },
3277 };
3278
3279 static const CostTblEntry SSE2CostTblNoPairWise[] = {
3280 { ISD::FADD, MVT::v2f64, 2 },
3281 { ISD::FADD, MVT::v4f32, 4 },
3282 { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
3283 { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
3284 { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
3285 { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3".
3286 { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3".
3287 { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
3288 { ISD::ADD, MVT::v2i8, 2 },
3289 { ISD::ADD, MVT::v4i8, 2 },
3290 { ISD::ADD, MVT::v8i8, 2 },
3291 { ISD::ADD, MVT::v16i8, 3 },
3292 };
3293
3294 static const CostTblEntry AVX1CostTblNoPairWise[] = {
3295 { ISD::FADD, MVT::v4f64, 3 },
3296 { ISD::FADD, MVT::v4f32, 3 },
3297 { ISD::FADD, MVT::v8f32, 4 },
3298 { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
3299 { ISD::ADD, MVT::v4i64, 3 },
3300 { ISD::ADD, MVT::v8i32, 5 },
3301 { ISD::ADD, MVT::v16i16, 5 },
3302 { ISD::ADD, MVT::v32i8, 4 },
3303 };
3304
3305 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3306 assert(ISD && "Invalid opcode")((ISD && "Invalid opcode") ? static_cast<void> (
0) : __assert_fail ("ISD && \"Invalid opcode\"", "/build/llvm-toolchain-snapshot-12~++20200926111128+c6c5629f2fb/llvm/lib/Target/X86/X86TargetTransformInfo.cpp"
, 3306, __PRETTY_FUNCTION__))
;
3307
3308 // Before legalizing the type, give a chance to look up illegal narrow types
3309 // in the table.
3310 // FIXME: Is there a better way to do this?
3311 EVT VT = TLI->getValueType(DL, ValTy);
3312 if (VT.isSimple()) {
3313 MVT MTy = VT.getSimpleVT();
3314 if (ST->isSLM())
3315 if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy))
3316 return Entry->Cost;
3317
3318 if (ST->hasAVX())
3319 if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
3320 return Entry->Cost;
3321
3322 if (ST->hasSSE2())
3323 if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
3324 return Entry->Cost;
3325 }
3326
3327 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
3328
3329 MVT MTy = LT.second;
3330
3331 auto *ValVTy = cast<FixedVectorType>(ValTy);
3332
3333 unsigned ArithmeticCost = 0;
3334 if (LT.first != 1 && MTy.isVector() &&
3335 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
3336 // Type needs to be split. We need LT.first - 1 arithmetic ops.
3337 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
3338 MTy.getVectorNumElements());
3339 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
3340 ArithmeticCost *= LT.first - 1;
3341 }
3342
3343 if (ST->isSLM())
3344 if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy))
3345 return ArithmeticCost + Entry->Cost;
3346
3347 if (ST->hasAVX())
3348 if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
3349 return ArithmeticCost + Entry->Cost;
3350
3351 if (ST->hasSSE2())
3352 if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
3353 return ArithmeticCost + Entry->Cost;
3354
3355 // FIXME: These assume a naive kshift+binop lowering, which is probably
3356 // conservative in most cases.
3357 static const CostTblEntry AVX512BoolReduction[] = {
3358 { ISD::AND, MVT::v2i1, 3 },
3359 { ISD::AND, MVT::v4i1, 5 },
3360 { ISD::AND, MVT::v8i1, 7 },
3361 { ISD::AND, MVT::v16i1, 9 },
3362 { ISD::AND, MVT::v32i1, 11 },
3363 { ISD::AND, MVT::v64i1, 13 },
3364 { ISD::OR, MVT::v2i1, 3 },
3365 { ISD::OR, MVT::v4i1, 5 },
3366 { ISD::OR, MVT::v8i1, 7 },
3367 { ISD::OR, MVT::v16i1, 9 },
3368 { ISD::OR, MVT::v32i1, 11 },
3369 { ISD::OR, MVT::v64i1, 13 },
3370 };
3371
3372 static const CostTblEntry AVX2BoolReduction[] = {
3373 { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp
3374 { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp
3375 { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp
3376 { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp
3377 };
3378
3379 static const CostTblEntry AVX1BoolReduction[] = {
3380 { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp
3381 { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp
3382 { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
3383 { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
3384 { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp
3385 { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp
3386 { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
3387 { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
3388 };
3389
3390 static const CostTblEntry SSE2BoolReduction[] = {
3391 { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp
3392 { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp
3393 { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp
3394 { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp
3395 { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp
3396 { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp
3397 { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp
3398 { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp
3399 };
3400
3401 // Handle bool allof/anyof patterns.
3402 if (ValVTy->getElementType()->isIntegerTy(1)) {
3403 unsigned ArithmeticCost = 0;
3404 if (LT.first != 1 && MTy.isVector() &&
3405 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
3406 // Type needs to be split. We need LT.first - 1 arithmetic ops.
3407 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
3408 MTy.getVectorNumElements());
3409 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
3410 ArithmeticCost *= LT.first - 1;
3411 }
3412
3413 if (ST->hasAVX512())
3414 if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy))
3415 return ArithmeticCost + Entry->Cost;
3416 if (ST->hasAVX2())
3417 if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
3418 return ArithmeticCost + Entry->Cost;
3419 if (ST->hasAVX())
3420 if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
3421 return ArithmeticCost + Entry->Cost;
3422 if (ST->hasSSE2())
3423 if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
3424 return ArithmeticCost + Entry->Cost;
3425
3426 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, IsPairwise,
3427 CostKind);
3428 }
3429
3430 unsigned NumVecElts = ValVTy->getNumElements();
3431 unsigned ScalarSize = ValVTy->getScalarSizeInBits();
3432
3433 // Special case power of 2 reductions where the scalar type isn't changed
3434 // by type legalization.
3435 if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
3436 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, IsPairwise,
3437 CostKind);
3438
3439 unsigned ReductionCost = 0;
3440
3441 auto *Ty = ValVTy;
3442 if (LT.first != 1 && MTy.isVector() &&
3443 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
3444 // Type needs to be split. We need LT.first - 1 arithmetic ops.
3445 Ty = FixedVectorType::get(ValVTy->getElementType(),
3446 MTy.getVectorNumElements());
3447 ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
3448 ReductionCost *= LT.first - 1;
3449 NumVecElts = MTy.getVectorNumElements();
3450 }
3451
3452 // Now handle reduction with the legal type, taking into account size changes
3453 // at each level.
3454 while (NumVecElts > 1) {
3455 // Determine the size of the remaining vector we need to reduce.
3456 unsigned Size = NumVecElts * ScalarSize;
3457 NumVecElts /= 2;
3458 // If we're reducing from 256/512 bits, use an extract_subvector.
3459 if (Size > 128) {
3460 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
3461 ReductionCost +=
3462 getShuffleCost(TTI::SK_ExtractSubvector, Ty, NumVecElts, SubTy);
3463 Ty = SubTy;
3464 } else if (Size == 128) {
3465 // Reducing from 128 bits is a permute of v2f64/v2i64.
3466 FixedVectorType *ShufTy;
3467 if (ValVTy->isFloatingPointTy())
3468 ShufTy =
3469 FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2);
3470 else
3471 ShufTy =
3472 FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
3473 ReductionCost +=
3474 getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr);
3475 } else if (Size == 64) {
3476 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
3477 FixedVectorType *ShufTy;
3478 if (ValVTy->isFloatingPointTy())
3479 ShufTy =
3480 FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4);
3481 else
3482 ShufTy =
3483 FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
3484 ReductionCost +=
3485 getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr);
3486 } else {
3487 // Reducing from smaller size is a shift by immediate.
3488 auto *ShiftTy = FixedVectorType::get(
3489 Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size);
3490 ReductionCost += getArithmeticInstrCost(
3491 Instruction::LShr, ShiftTy, CostKind,
3492 TargetTransformInfo::OK_AnyValue,
3493 TargetTransformInfo::OK_UniformConstantValue,
3494 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
3495 }
3496
3497 // Add the arithmetic op for this level.
3498 ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind);
3499 }
3500
3501 // Add the final extract element to the cost.
3502 return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
3503}
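The loop above implements a halving strategy. The sketch below counts its steps for a hypothetical legal v8i32 add reduction: each level pays one data-movement step (extract_subvector, permute, shuffle, or shift depending on the remaining width) plus one vector add, and a final extractelement reads lane 0. It is not LLVM code, and the per-step costs are placeholders rather than the real table values.

#include <cstdio>

int main() {
  unsigned NumVecElts = 8;        // hypothetical legal v8i32
  const unsigned ScalarSize = 32;
  const unsigned MoveCost = 1;    // placeholder per-level shuffle/extract cost
  const unsigned AddCost = 1;     // placeholder vector add cost
  const unsigned ExtractCost = 1; // placeholder extractelement cost

  unsigned Cost = 0;
  while (NumVecElts > 1) {
    unsigned Size = NumVecElts * ScalarSize; // 256 -> subvector extract,
    NumVecElts /= 2;                         // 128/64 -> permute/shuffle, else shift
    Cost += MoveCost + AddCost;
    std::printf("reduced from %u bits, running cost %u\n", Size, Cost);
  }
  Cost += ExtractCost; // final extractelement of lane 0
  std::printf("total with unit per-step costs = %u\n", Cost); // 7
  return 0;
}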
3504
3505int X86TTIImpl::getMinMaxCost(Type *Ty, Type *CondTy, bool IsUnsigned) {
3506 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
3507
3508 MVT MTy = LT.second;
3509
3510 int ISD;
3511 if (Ty->isIntOrIntVectorTy()) {
3512 ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN;
3513 } else {
3514 assert(Ty->isFPOrFPVectorTy() &&
3515 "Expected float point or integer vector type.");
3516 ISD = ISD::FMINNUM;
3517 }
3518
3519 static const CostTblEntry SSE1CostTbl[] = {
3520 {ISD::FMINNUM, MVT::v4f32, 1},
3521 };
3522
3523 static const CostTblEntry SSE2CostTbl[] = {
3524 {ISD::FMINNUM, MVT::v2f64, 1},
3525 {ISD::SMIN, MVT::v8i16, 1},
3526 {ISD::UMIN, MVT::v16i8, 1},
3527 };
3528
3529 static const CostTblEntry SSE41CostTbl[] = {
3530 {ISD::SMIN, MVT::v4i32, 1},
3531 {ISD::UMIN, MVT::v4i32, 1},
3532 {ISD::UMIN, MVT::v8i16, 1},
3533 {ISD::SMIN, MVT::v16i8, 1},
3534 };
3535
3536 static const CostTblEntry SSE42CostTbl[] = {
3537 {ISD::UMIN, MVT::v2i64, 3}, // xor+pcmpgtq+blendvpd
3538 };
3539
3540 static const CostTblEntry AVX1CostTbl[] = {
3541 {ISD::FMINNUM, MVT::v8f32, 1},
3542 {ISD::FMINNUM, MVT::v4f64, 1},
3543 {ISD::SMIN, MVT::v8i32, 3},
3544 {ISD::UMIN, MVT::v8i32, 3},
3545 {ISD::SMIN, MVT::v16i16, 3},
3546 {ISD::UMIN, MVT::v16i16, 3},
3547 {ISD::SMIN, MVT::v32i8, 3},
3548 {ISD::UMIN, MVT::v32i8, 3},
3549 };
3550
3551 static const CostTblEntry AVX2CostTbl[] = {
3552 {ISD::SMIN, MVT::v8i32, 1},
3553 {ISD::UMIN, MVT::v8i32, 1},
3554 {ISD::SMIN, MVT::v16i16, 1},
3555 {ISD::UMIN, MVT::v16i16, 1},
3556 {ISD::SMIN, MVT::v32i8, 1},
3557 {ISD::UMIN, MVT::v32i8, 1},
3558 };
3559
3560 static const CostTblEntry AVX512CostTbl[] = {
3561 {ISD::FMINNUM, MVT::v16f32, 1},
3562 {ISD::FMINNUM, MVT::v8f64, 1},
3563 {ISD::SMIN, MVT::v2i64, 1},
3564 {ISD::UMIN, MVT::v2i64, 1},
3565 {ISD::SMIN, MVT::v4i64, 1},
3566 {ISD::UMIN, MVT::v4i64, 1},
3567 {ISD::SMIN, MVT::v8i64, 1},
3568 {ISD::UMIN, MVT::v8i64, 1},
3569 {ISD::SMIN, MVT::v16i32, 1},
3570 {ISD::UMIN, MVT::v16i32, 1},
3571 };
3572
3573 static const CostTblEntry AVX512BWCostTbl[] = {
3574 {ISD::SMIN, MVT::v32i16, 1},
3575 {ISD::UMIN, MVT::v32i16, 1},
3576 {ISD::SMIN, MVT::v64i8, 1},
3577 {ISD::UMIN, MVT::v64i8, 1},
3578 };
3579
3580 // If we have a native MIN/MAX instruction for this type, use it.
3581 if (ST->hasBWI())
3582 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
3583 return LT.first * Entry->Cost;
3584
3585 if (ST->hasAVX512())
3586 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
3587 return LT.first * Entry->Cost;
3588
3589 if (ST->hasAVX2())
3590 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
3591 return LT.first * Entry->Cost;
3592
3593 if (ST->hasAVX())
3594 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
3595 return LT.first * Entry->Cost;
3596
3597 if (ST->hasSSE42())
3598 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
3599 return LT.first * Entry->Cost;
3600
3601 if (ST->hasSSE41())
3602 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
3603 return LT.first * Entry->Cost;
3604
3605 if (ST->hasSSE2())
3606 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
3607 return LT.first * Entry->Cost;
3608
3609 if (ST->hasSSE1())
3610 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
3611 return LT.first * Entry->Cost;
3612
3613 unsigned CmpOpcode;
3614 if (Ty->isFPOrFPVectorTy()) {
3615 CmpOpcode = Instruction::FCmp;
3616 } else {
3617 assert(Ty->isIntOrIntVectorTy() &&
3618 "expecting floating point or integer type for min/max reduction");
3619 CmpOpcode = Instruction::ICmp;
3620 }
3621
3622 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3623 // Otherwise fall back to cmp+select.
3624 return getCmpSelInstrCost(CmpOpcode, Ty, CondTy, CostKind) +
3625 getCmpSelInstrCost(Instruction::Select, Ty, CondTy, CostKind);
3626}
3627
3628int X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy,
3629 bool IsPairwise, bool IsUnsigned,
3630 TTI::TargetCostKind CostKind) {
3631 // Just use the default implementation for pair reductions.
3632 if (IsPairwise)
3633 return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned,
3634 CostKind);
3635
3636 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
3637
3638 MVT MTy = LT.second;
3639
3640 int ISD;
3641 if (ValTy->isIntOrIntVectorTy()) {
3642 ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN;
3643 } else {
3644 assert(ValTy->isFPOrFPVectorTy() &&
3645 "Expected float point or integer vector type.");
3646 ISD = ISD::FMINNUM;
3647 }
3648
3649 // We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
3650 // and use that measurement as the cost.
3651
3652 static const CostTblEntry SSE2CostTblNoPairWise[] = {
3653 {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw
3654 {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw
3655 {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw
3656 };
3657
3658 static const CostTblEntry SSE41CostTblNoPairWise[] = {
3659 {ISD::SMIN, MVT::v2i16, 3}, // same as sse2
3660 {ISD::SMIN, MVT::v4i16, 5}, // same as sse2
3661 {ISD::UMIN, MVT::v2i16, 5}, // same as sse2
3662 {ISD::UMIN, MVT::v4i16, 7}, // same as sse2
3663 {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor
3664 {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax
3665 {ISD::SMIN, MVT::v2i8, 3}, // pminsb
3666 {ISD::SMIN, MVT::v4i8, 5}, // pminsb
3667 {ISD::SMIN, MVT::v8i8, 7}, // pminsb
3668 {ISD::SMIN, MVT::v16i8, 6},
3669 {ISD::UMIN, MVT::v2i8, 3}, // same as sse2
3670 {ISD::UMIN, MVT::v4i8, 5}, // same as sse2
3671 {ISD::UMIN, MVT::v8i8, 7}, // same as sse2
3672 {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax
3673 };
3674
3675 static const CostTblEntry AVX1CostTblNoPairWise[] = {
3676 {ISD::SMIN, MVT::v16i16, 6},
3677 {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax
3678 {ISD::SMIN, MVT::v32i8, 8},
3679 {ISD::UMIN, MVT::v32i8, 8},
3680 };
3681
3682 static const CostTblEntry AVX512BWCostTblNoPairWise[] = {
3683 {ISD::SMIN, MVT::v32i16, 8},
3684 {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax
3685 {ISD::SMIN, MVT::v64i8, 10},
3686 {ISD::UMIN, MVT::v64i8, 10},
3687 };
3688
3689 // Before legalizing the type, give a chance to look up illegal narrow types
3690 // in the table.
3691 // FIXME: Is there a better way to do this?
3692 EVT VT = TLI->getValueType(DL, ValTy);
3693 if (VT.isSimple()) {
3694 MVT MTy = VT.getSimpleVT();
3695 if (ST->hasBWI())
3696 if (const auto *Entry = CostTableLookup(AVX512BWCostTblNoPairWise, ISD, MTy))
3697 return Entry->Cost;
3698
3699 if (ST->hasAVX())
3700 if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
3701 return Entry->Cost;
3702
3703 if (ST->hasSSE41())
3704 if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy))
3705 return Entry->Cost;
3706
3707 if (ST->hasSSE2())
3708 if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
3709 return Entry->Cost;
3710 }
3711
3712 auto *ValVTy = cast<FixedVectorType>(ValTy);
3713 unsigned NumVecElts = ValVTy->getNumElements();
3714
3715 auto *Ty = ValVTy;
3716 unsigned MinMaxCost = 0;
3717 if (LT.first != 1 && MTy.isVector() &&
3718 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
3719 // Type needs to be split. We need LT.first - 1 operations.
3720 Ty = FixedVectorType::get(ValVTy->getElementType(),
3721 MTy.getVectorNumElements());
3722 auto *SubCondTy = FixedVectorType::get(CondTy->getElementType(),
3723 MTy.getVectorNumElements());
3724 MinMaxCost = getMinMaxCost(Ty, SubCondTy, IsUnsigned);
3725 MinMaxCost *= LT.first - 1;
3726 NumVecElts = MTy.getVectorNumElements();
3727 }
3728
3729 if (ST->hasBWI())
3730 if (const auto *Entry = CostTableLookup(AVX512BWCostTblNoPairWise, ISD, MTy))
3731 return MinMaxCost + Entry->Cost;
3732
3733 if (ST->hasAVX())
3734 if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
3735 return MinMaxCost + Entry->Cost;
3736
3737 if (ST->hasSSE41())
3738 if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy))
3739 return MinMaxCost + Entry->Cost;
3740
3741 if (ST->hasSSE2())
3742 if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
3743 return MinMaxCost + Entry->Cost;
3744
3745 unsigned ScalarSize = ValTy->getScalarSizeInBits();
3746
3747 // Special case power of 2 reductions where the scalar type isn't changed
3748 // by type legalization.
3749 if (!isPowerOf2_32(ValVTy->getNumElements()) ||
3750 ScalarSize != MTy.getScalarSizeInBits())
3751 return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned,
3752 CostKind);
3753
3754 // Now handle reduction with the legal type, taking into account size changes
3755 // at each level.
3756 while (NumVecElts > 1) {
3757 // Determine the size of the remaining vector we need to reduce.
3758 unsigned Size = NumVecElts * ScalarSize;
3759 NumVecElts /= 2;
3760 // If we're reducing from 256/512 bits, use an extract_subvector.
3761 if (Size > 128) {
3762 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
3763 MinMaxCost +=
3764 getShuffleCost(TTI::SK_ExtractSubvector, Ty, NumVecElts, SubTy);
3765 Ty = SubTy;
3766 } else if (Size == 128) {
3767 // Reducing from 128 bits is a permute of v2f64/v2i64.
3768 VectorType *ShufTy;
3769 if (ValTy->isFloatingPointTy())
3770 ShufTy =
3771 FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2);
3772 else
3773 ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
3774 MinMaxCost +=
3775 getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr);
3776 } else if (Size == 64) {
3777 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
3778 FixedVectorType *ShufTy;
3779 if (ValTy->isFloatingPointTy())
3780 ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4);
3781 else
3782 ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
3783 MinMaxCost +=
3784 getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr);
3785 } else {
3786 // Reducing from smaller size is a shift by immediate.
3787 auto *ShiftTy = FixedVectorType::get(
3788 Type::getIntNTy(ValTy->getContext(), Size), 128 / Size);
3789 MinMaxCost += getArithmeticInstrCost(
3790 Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput,
3791 TargetTransformInfo::OK_AnyValue,
3792 TargetTransformInfo::OK_UniformConstantValue,
3793 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
3794 }
3795
3796 // Add the arithmetic op for this level.
3797 auto *SubCondTy =
3798 FixedVectorType::get(CondTy->getElementType(), Ty->getNumElements());
3799 MinMaxCost += getMinMaxCost(Ty, SubCondTy, IsUnsigned);
3800 }
3801
3802 // Add the final extract element to the cost.
3803 return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
3804}
3805
3806/// Calculate the cost of materializing a 64-bit value. This helper
3807/// method might only calculate a fraction of a larger immediate. Therefore it
3808/// is valid to return a cost of ZERO.
3809int X86TTIImpl::getIntImmCost(int64_t Val) {
3810 if (Val == 0)
3811 return TTI::TCC_Free;
3812
3813 if (isInt<32>(Val))
3814 return TTI::TCC_Basic;
3815
3816 return 2 * TTI::TCC_Basic;
3817}
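The three buckets above are easy to sanity-check in isolation. The snippet below is not LLVM code; intImmCostSketch is a hypothetical helper and the local constants merely stand in for TTI::TCC_Free and TTI::TCC_Basic.

#include <cstdint>
#include <cstdio>

int intImmCostSketch(int64_t Val) {
  const int TCC_Free = 0, TCC_Basic = 1; // placeholder cost constants
  if (Val == 0)
    return TCC_Free;
  if (Val >= INT32_MIN && Val <= INT32_MAX) // equivalent of isInt<32>(Val)
    return TCC_Basic;
  return 2 * TCC_Basic; // needs a full 64-bit materialization
}

int main() {
  std::printf("%d %d %d\n", intImmCostSketch(0), intImmCostSketch(42),
              intImmCostSketch(0x100000000LL)); // prints 0 1 2
  return 0;
}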
3818
3819int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
3820 TTI::TargetCostKind CostKind) {
3821 assert(Ty->isIntegerTy());
3822
3823 unsigned BitSize = Ty->getPrimitiveSizeInBits();
3824 if (BitSize == 0)
3825 return ~0U;
3826
3827 // Never hoist constants larger than 128 bits, because this might lead to
3828 // incorrect code generation or assertions in codegen.
3829 // Fixme: Create a cost model for types larger than i128 once the codegen
3830 // issues have been fixed.
3831 if (BitSize > 128)
3832 return TTI::TCC_Free;
3833
3834 if (Imm == 0)
3835 return TTI::TCC_Free;
3836
3837 // Sign-extend all constants to a multiple of 64-bit.
3838 APInt ImmVal = Imm;
3839 if (BitSize % 64 != 0)
3840 ImmVal = Imm.sext(alignTo(BitSize, 64));
3841
3842 // Split the constant into 64-bit chunks and calculate the cost for each
3843 // chunk.
3844 int Cost = 0;
3845 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
3846 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
3847 int64_t Val = Tmp.getSExtValue();
3848 Cost += getIntImmCost(Val);
3849 }
3850 // We need at least one instruction to materialize the constant.
3851 return std::max(1, Cost);
3852}
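The chunking loop above splits a wide immediate into 64-bit pieces, prices each piece with the scalar helper, and never reports less than one instruction. A worked example with hypothetical chunk values (chunkCost is a local stand-in, not LLVM code): an i128 constant whose low half does not fit in 32 bits and whose high half is small.

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Same buckets as the scalar helper sketched above.
int chunkCost(int64_t V) {
  if (V == 0)
    return 0;
  return (V >= INT32_MIN && V <= INT32_MAX) ? 1 : 2;
}

int main() {
  int64_t Lo = 0x123456789LL; // low 64 bits: does not fit in 32 bits -> 2
  int64_t Hi = 7;             // high 64 bits: small -> 1
  int Cost = chunkCost(Lo) + chunkCost(Hi);
  std::printf("%d\n", std::max(1, Cost)); // prints 3
  return 0;
}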
3853
3854int X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
3855 const APInt &Imm, Type *Ty,
3856 TTI::TargetCostKind CostKind,
3857 Instruction *Inst) {
3858 assert(Ty->isIntegerTy());
3859
3860 unsigned BitSize = Ty->getPrimitiveSizeInBits();
3861 // There is no cost model for constants with a bit size of 0. Return TCC_Free
3862 // here, so that constant hoisting will ignore this constant.
3863 if (BitSize == 0)
3864 return TTI::TCC_Free;
3865
3866 unsigned ImmIdx = ~0U;
3867 switch (Opcode) {
3868 default:
3869 return TTI::TCC_Free;
3870 case Instruction::GetElementPtr:
3871 // Always hoist the base address of a GetElementPtr. This prevents the
3872 // creation of new constants for every base constant that gets constant
3873 // folded with the offset.
3874 if (Idx == 0)
3875 return 2 * TTI::TCC_Basic;
3876 return TTI::TCC_Free;
3877 case Instruction::Store:
3878 ImmIdx = 0;
3879 break;
3880 case Instruction::ICmp:
3881 // This is an imperfect hack to prevent constant hoisting of
3882 // compares that might be trying to check if a 64-bit value fits in
3883 // 32-bits. The backend can optimize these cases using a right shift by 32.
3884 // Ideally we would check the compare predicate here. There are also other
3885 // similar immediates the backend can use shifts for.
3886 if (Idx == 1 && Imm.getBitWidth() == 64) {
3887 uint64_t ImmVal = Imm.getZExtValue();
3888 if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
3889 return TTI::TCC_Free;
3890 }
3891 ImmIdx = 1;
3892 break;
3893 case Instruction::And:
3894 // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
3895 // by using a 32-bit operation with implicit zero extension. Detect such
3896 // immediates here as the normal path expects bit 31 to be sign extended.
3897 if (Idx == 1 && Imm.getBitWidth() == 64 && isUInt<32>(Imm.getZExtValue()))
3898 return TTI::TCC_Free;
3899 ImmIdx = 1;
3900 break;
3901 case Instruction::Add:
3902 case Instruction::Sub:
3903 // For add/sub, we can use the opposite instruction for INT32_MIN.
3904 if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.getZExtValue() == 0x80000000)
3905 return TTI::TCC_Free;
3906 ImmIdx = 1;
3907 break;
3908 case Instruction::UDiv:
3909 case Instruction::SDiv:
3910 case Instruction::URem:
3911 case Instruction::SRem:
3912 // Division by constant is typically expanded later into a different
3913 // instruction sequence. This completely changes the constants.
3914 // Report them as "free" to stop ConstantHoist from marking them as opaque.
3915 return TTI::TCC_Free;
3916 case Instruction::Mul:
3917 case Instruction::Or:
3918 case Instruction::Xor:
3919 ImmIdx = 1;
3920 break;
3921 // Always return TCC_Free for the shift value of a shift instruction.
3922 case Instruction::Shl:
3923 case Instruction::LShr:
3924 case Instruction::AShr:
3925 if (Idx == 1)
3926 return TTI::TCC_Free;
3927 break;
3928 case Instruction::Trunc:
3929 case Instruction::ZExt:
3930 case Instruction::SExt:
3931 case Instruction::IntToPtr:
3932 case Instruction::PtrToInt:
3933 case Instruction::BitCast:
3934 case Instruction::PHI:
3935 case Instruction::Call:
3936 case Instruction::Select:
3937 case Instruction::Ret:
3938 case Instruction::Load:
3939 break;
3940 }
3941
3942 if (Idx == ImmIdx) {
3943 int NumConstants = divideCeil(BitSize, 64);
3944 int Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
3945 return (Cost <= NumConstants * TTI::TCC_Basic)
3946 ? static_cast<int>(TTI::TCC_Free)
3947 : Cost;
3948 }
3949
3950 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
3951}
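
The ICmp special case rests on an identity the backend exploits: for an unsigned 64-bit x, x < 0x100000000 holds exactly when (x >> 32) == 0, and x > 0xffffffff exactly when (x >> 32) != 0, so neither 64-bit immediate ever needs to be materialized in a register. A small standalone check of that identity:

#include <cassert>
#include <cstdint>

int main() {
  // A "fits in 32 bits" test can be done with a shift instead of a
  // 64-bit immediate, which is why 0x100000000 and 0xffffffff are
  // reported as TCC_Free above.
  for (uint64_t x : {0ull, 1ull, 0xffffffffull, 0x100000000ull, ~0ull}) {
    assert((x < 0x100000000ull) == ((x >> 32) == 0));
    assert((x > 0xffffffffull)  == ((x >> 32) != 0));
  }
}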
3952
3953int X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
3954 const APInt &Imm, Type *Ty,
3955 TTI::TargetCostKind CostKind) {
3956 assert(Ty->isIntegerTy());
3957
3958 unsigned BitSize = Ty->getPrimitiveSizeInBits();
3959 // There is no cost model for constants with a bit size of 0. Return TCC_Free
3960 // here, so that constant hoisting will ignore this constant.
3961 if (BitSize == 0)
3962 return TTI::TCC_Free;
3963
3964 switch (IID) {
3965 default:
3966 return TTI::TCC_Free;
3967 case Intrinsic::sadd_with_overflow:
3968 case Intrinsic::uadd_with_overflow:
3969 case Intrinsic::ssub_with_overflow:
3970 case Intrinsic::usub_with_overflow:
3971 case Intrinsic::smul_with_overflow:
3972 case Intrinsic::umul_with_overflow:
3973 if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
3974 return TTI::TCC_Free;
3975 break;
3976 case Intrinsic::experimental_stackmap:
3977 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
3978 return TTI::TCC_Free;
3979 break;
3980 case Intrinsic::experimental_patchpoint_void:
3981 case Intrinsic::experimental_patchpoint_i64:
3982 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
3983 return TTI::TCC_Free;
3984 break;
3985 }
3986 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
3987}
3988
3989unsigned
3990X86TTIImpl::getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) {
3991 if (CostKind != TTI::TCK_RecipThroughput)
3992 return Opcode == Instruction::PHI ? 0 : 1;
3993 // Branches are assumed to be predicted.
3994 return CostKind == TTI::TCK_RecipThroughput ? 0 : 1;
3995}
3996
3997int X86TTIImpl::getGatherOverhead() const {
3998 // Some CPUs have more overhead for gather. The specified overhead is relative
3999 // to the Load operation. "2" is the number provided by Intel architects. This
4000 // parameter is used for cost estimation of Gather Op and comparison with
4001 // other alternatives.
4002 // TODO: Remove the explicit hasAVX512()? That would mean we would only
4003 // enable gather with a -march.
4004 if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather()))
4005 return 2;
4006
4007 return 1024;
4008}
4009
4010int X86TTIImpl::getScatterOverhead() const {
4011 if (ST->hasAVX512())
4012 return 2;
4013
4014 return 1024;
4015}
4016
4017// Return an average cost of a Gather / Scatter instruction; may be improved later
4018int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, const Value *Ptr,
4019 Align Alignment, unsigned AddressSpace) {
4020
4021 assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
4022 unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
4023
4024 // Try to reduce index size from 64 bit (default for GEP)
4025 // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
4026 // operation will use 16 x 64 indices, which do not fit in a zmm register and
4027 // need to be split. Also check that the base pointer is the same for all lanes,
4028 // and that there's at most one variable index.
4029 auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) {
4030 unsigned IndexSize = DL.getPointerSizeInBits();
4031 const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
4032 if (IndexSize < 64 || !GEP)
4033 return IndexSize;
4034
4035 unsigned NumOfVarIndices = 0;
4036 const Value *Ptrs = GEP->getPointerOperand();
4037 if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
4038 return IndexSize;
4039 for (unsigned i = 1; i < GEP->getNumOperands(); ++i) {
4040 if (isa<Constant>(GEP->getOperand(i)))
4041 continue;
4042 Type *IndxTy = GEP->getOperand(i)->getType();
4043 if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy))
4044 IndxTy = IndexVTy->getElementType();
4045 if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
4046 !isa<SExtInst>(GEP->getOperand(i))) ||
4047 ++NumOfVarIndices > 1)
4048 return IndexSize; // 64
4049 }
4050 return (unsigned)32;
4051 };
4052
4053 // Trying to reduce IndexSize to 32 bits for vector 16.
4054 // By default the IndexSize is equal to pointer size.
4055 unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
4056 ? getIndexSizeInBits(Ptr, DL)
4057 : DL.getPointerSizeInBits();
4058
4059 auto *IndexVTy = FixedVectorType::get(
4060 IntegerType::get(SrcVTy->getContext(), IndexSize), VF);
4061 std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy);
4062 std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy);
4063 int SplitFactor = std::max(IdxsLT.first, SrcLT.first);
4064 if (SplitFactor > 1) {
4065 // Handle splitting of vector of pointers
4066 auto *SplitSrcTy =
4067 FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
4068 return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment,
4069 AddressSpace);
4070 }
4071
4072 // The gather / scatter cost is given by Intel architects. It is a rough
4073 // number since we are looking at one instruction at a time.
4074 const int GSOverhead = (Opcode == Instruction::Load)
4075 ? getGatherOverhead()
4076 : getScatterOverhead();
4077 return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
4078 MaybeAlign(Alignment), AddressSpace,
4079 TTI::TCK_RecipThroughput);
4080}
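
A concrete instance of the formula above, assuming an AVX-512 gather of <16 x float> whose indices can be narrowed to i32: both the v16i32 indices and the v16f32 data legalize to one zmm, so there is no splitting, and with an assumed per-element scalar load cost of 1 (the real number comes from getMemoryOpCost) the total is GSOverhead + VF * MemOpCost:

#include <cassert>

int main() {
  const int GatherOverhead = 2;   // hasAVX512() path above
  const int VF = 16;              // <16 x float>
  const int ScalarLoadCost = 1;   // assumed result of getMemoryOpCost
  const int SplitFactor = 1;      // v16i32 indices and v16f32 data fit one zmm

  int Cost = SplitFactor * (GatherOverhead + VF * ScalarLoadCost);
  assert(Cost == 18);
}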
4081
4082/// Return the cost of full scalarization of gather / scatter operation.
4083///
4084/// Opcode - Load or Store instruction.
4085/// SrcVTy - The type of the data vector that should be gathered or scattered.
4086/// VariableMask - The mask is non-constant at compile time.
4087/// Alignment - Alignment for one element.
4088/// AddressSpace - pointer[s] address space.
4089///
4090int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
4091 bool VariableMask, Align Alignment,
4092 unsigned AddressSpace) {
4093 unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
4094 APInt DemandedElts = APInt::getAllOnesValue(VF);
4095 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4096
4097 int MaskUnpackCost = 0;
4098 if (VariableMask) {
4099 auto *MaskTy =
4100 FixedVectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
4101 MaskUnpackCost =
4102 getScalarizationOverhead(MaskTy, DemandedElts, false, true);
4103 int ScalarCompareCost =
4104 getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()),
4105 nullptr, CostKind);
4106 int BranchCost = getCFInstrCost(Instruction::Br, CostKind);
4107 MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
4108 }
4109
4110 // The cost of the scalar loads/stores.
4111 int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
4112 MaybeAlign(Alignment), AddressSpace,
4113 CostKind);
4114
4115 int InsertExtractCost = 0;
4116 if (Opcode == Instruction::Load)
4117 for (unsigned i = 0; i < VF; ++i)
4118 // Add the cost of inserting each scalar load into the vector
4119 InsertExtractCost +=
4120 getVectorInstrCost(Instruction::InsertElement, SrcVTy, i);
4121 else
4122 for (unsigned i = 0; i < VF; ++i)
4123 // Add the cost of extracting each element out of the data vector
4124 InsertExtractCost +=
4125 getVectorInstrCost(Instruction::ExtractElement, SrcVTy, i);
4126
4127 return MemoryOpCost + MaskUnpackCost + InsertExtractCost;
4128}
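
A worked instance of the scalarization formula, for a variable-mask gather of <8 x i32> with assumed unit costs for the mask extraction, scalar compare, branch, scalar load, and insert (the real values come from the respective cost hooks):

#include <cassert>

int main() {
  const int VF = 8;                 // <8 x i32> gather
  const int ExtractMaskBit = 1;     // assumed per-lane cost of unpacking the i1 mask
  const int ScalarCmp = 1, Branch = 1, ScalarLoad = 1, InsertElt = 1; // assumed

  int MaskUnpackCost = VF * ExtractMaskBit + VF * (Branch + ScalarCmp);
  int MemoryOpCost   = VF * ScalarLoad;
  int InsertCost     = VF * InsertElt;   // a gather inserts each scalar result

  assert(MaskUnpackCost + MemoryOpCost + InsertCost == 40);
}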
4129
4130/// Calculate the cost of Gather / Scatter operation
4131int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
4132 const Value *Ptr, bool VariableMask,
4133 Align Alignment,
4134 TTI::TargetCostKind CostKind,
4135 const Instruction *I = nullptr) {
4136
4137 if (CostKind != TTI::TCK_RecipThroughput)
4138 return 1;
4139
4140 assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
4141 unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
4142 PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
4143 if (!PtrTy && Ptr->getType()->isVectorTy())
4144 PtrTy = dyn_cast<PointerType>(
4145 cast<VectorType>(Ptr->getType())->getElementType());
4146 assert(PtrTy && "Unexpected type for Ptr argument");
4147 unsigned AddressSpace = PtrTy->getAddressSpace();
4148
4149 bool Scalarize = false;
4150 if ((Opcode == Instruction::Load &&
4151 !isLegalMaskedGather(SrcVTy, Align(Alignment))) ||
4152 (Opcode == Instruction::Store &&
4153 !isLegalMaskedScatter(SrcVTy, Align(Alignment))))
4154 Scalarize = true;
4155 // Gather / Scatter for vector 2 is not profitable on KNL / SKX
4156 // A vector-4 gather/scatter instruction does not exist on KNL.
4157 // We can extend it to 8 elements, but zeroing the upper bits of
4158 // the mask vector will add more instructions. Right now we give the scalar
4159 // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction
4160 // is better in the VariableMask case.
4161 if (ST->hasAVX512() && (VF == 2 || (VF == 4 && !ST->hasVLX())))
4162 Scalarize = true;
4163
4164 if (Scalarize)
4165 return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment,
4166 AddressSpace);
4167
4168 return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
4169}
4170
4171bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
4172 TargetTransformInfo::LSRCost &C2) {
4173 // The X86-specific bit here is that instruction count gets first priority.
4174 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
4175 C1.NumIVMuls, C1.NumBaseAdds,
4176 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
4177 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
4178 C2.NumIVMuls, C2.NumBaseAdds,
4179 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
4180}
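
The comparison above is plain lexicographic ordering via std::tie with Insns first, so a solution that saves even one instruction wins regardless of the later fields. A tiny sketch with a hypothetical two-field stand-in for the LSRCost struct:

#include <cassert>
#include <tuple>

// Hypothetical, trimmed-down stand-in for TargetTransformInfo::LSRCost.
struct MiniLSRCost {
  unsigned Insns;
  unsigned NumRegs;
};

static bool lessCost(const MiniLSRCost &A, const MiniLSRCost &B) {
  return std::tie(A.Insns, A.NumRegs) < std::tie(B.Insns, B.NumRegs);
}

int main() {
  // Fewer instructions wins even though more registers are used.
  assert(lessCost({/*Insns=*/3, /*NumRegs=*/10}, {/*Insns=*/4, /*NumRegs=*/2}));
  // With equal instruction counts, the tie is broken by register count.
  assert(!lessCost({3, 5}, {3, 4}));
}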
4181
4182bool X86TTIImpl::canMacroFuseCmp() {
4183 return ST->hasMacroFusion() || ST->hasBranchFusion();
4184}
4185
4186bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
4187 if (!ST->hasAVX())
4188 return false;
4189
4190 // The backend can't handle a single element vector.
4191 if (isa<VectorType>(DataTy) &&
4192 cast<FixedVectorType>(DataTy)->getNumElements() == 1)
4193 return false;
4194 Type *ScalarTy = DataTy->getScalarType();
4195
4196 if (ScalarTy->isPointerTy())
4197 return true;
4198
4199 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
4200 return true;
4201
4202 if (!ScalarTy->isIntegerTy())
4203 return false;
4204
4205 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
4206 return IntWidth == 32 || IntWidth == 64 ||
4207 ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
4208}
4209
4210bool X86TTIImpl::isLegalMaskedStore(Type *DataType, Align Alignment) {
4211 return isLegalMaskedLoad(DataType, Alignment);
4212}
4213
4214bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) {
4215 unsigned DataSize = DL.getTypeStoreSize(DataType);
4216 // The only supported nontemporal loads are for aligned vectors of 16 or 32
4217 // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2
4218 // (the equivalent stores only require AVX).
4219 if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
4220 return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2();
4221
4222 return false;
4223}
4224
4225bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) {
4226 unsigned DataSize = DL.getTypeStoreSize(DataType);
4227
4228 // SSE4A supports nontemporal stores of float and double at arbitrary
4229 // alignment.
4230 if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
4231 return true;
4232
4233 // Besides the SSE4A subtarget exception above, only aligned stores are
4234 // available nontemporally on any other subtarget. And only stores with a size
4235 // of 4..32 bytes (powers of 2 only) are permitted.
4236 if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
4237 !isPowerOf2_32(DataSize))
4238 return false;
4239
4240 // 32-byte vector nontemporal stores are supported by AVX (the equivalent
4241 // loads require AVX2).
4242 if (DataSize == 32)
4243 return ST->hasAVX();
4244 else if (DataSize == 16)
4245 return ST->hasSSE1();
4246 return true;
4247}
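
The aligned-store rules above can be restated as a standalone predicate over the store size, the alignment, and two feature bits (plain bools here, not the real Subtarget queries); the SSE4A float/double exception is assumed to have been handled earlier, as in the code above:

#include <cassert>

static bool isPow2(unsigned X) { return X && !(X & (X - 1)); }

// Mirrors the aligned non-temporal store rules above.
static bool legalNTStore(unsigned SizeBytes, unsigned AlignBytes,
                         bool HasSSE1, bool HasAVX) {
  if (AlignBytes < SizeBytes || SizeBytes < 4 || SizeBytes > 32 ||
      !isPow2(SizeBytes))
    return false;
  if (SizeBytes == 32)
    return HasAVX;
  if (SizeBytes == 16)
    return HasSSE1;
  return true;                       // 4- and 8-byte scalar NT stores
}

int main() {
  assert(legalNTStore(32, 32, true, true));    // 32-byte vector store needs AVX
  assert(!legalNTStore(32, 32, true, false));
  assert(!legalNTStore(16, 8, true, true));    // under-aligned
  assert(legalNTStore(8, 8, false, false));    // plain 8-byte NT store
}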
4248
4249bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) {
4250 if (!isa<VectorType>(DataTy))
4251 return false;
4252
4253 if (!ST->hasAVX512())
4254 return false;
4255
4256 // The backend can't handle a single element vector.
4257 if (cast<FixedVectorType>(DataTy)->getNumElements() == 1)
4258 return false;
4259
4260 Type *ScalarTy = cast<VectorType>(DataTy)->getElementType();
4261
4262 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
4263 return true;
4264
4265 if (!ScalarTy->isIntegerTy())
4266 return false;
4267
4268 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
4269 return IntWidth == 32 || IntWidth == 64 ||
4270 ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
4271}
4272
4273bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy) {
4274 return isLegalMaskedExpandLoad(DataTy);
4275}
4276
4277bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
4278 // Some CPUs have better gather performance than others.
4279 // TODO: Remove the explicit ST->hasAVX512()? That would mean we would only
4280 // enable gather with a -march.
4281 if (!(ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2())))
4282 return false;
4283
4284 // This function is called now in two cases: from the Loop Vectorizer
4285 // and from the Scalarizer.
4286 // When the Loop Vectorizer asks about legality of the feature,
4287 // the vectorization factor is not calculated yet. The Loop Vectorizer
4288 // sends a scalar type and the decision is based on the width of the
4289 // scalar element.
4290 // Later on, the cost model will estimate usage of this intrinsic based on
4291 // the vector type.
4292 // The Scalarizer asks again about legality. It sends a vector type.
4293 // In this case we can reject non-power-of-2 vectors.
4294 // We also reject single element vectors as the type legalizer can't
4295 // scalarize it.
4296 if (auto *DataVTy = dyn_cast<FixedVectorType>(DataTy)) {
4297 unsigned NumElts = DataVTy->getNumElements();
4298 if (NumElts == 1)
4299 return false;
4300 }
4301 Type *ScalarTy = DataTy->getScalarType();
4302 if (ScalarTy->isPointerTy())
4303 return true;
4304
4305 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
4306 return true;
4307
4308 if (!ScalarTy->isIntegerTy())
4309 return false;
4310
4311 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
4312 return IntWidth == 32 || IntWidth == 64;
4313}
4314
4315bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) {
4316 // AVX2 doesn't support scatter
4317 if (!ST->hasAVX512())
4318 return false;
4319 return isLegalMaskedGather(DataType, Alignment);
4320}
4321
4322bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
4323 EVT VT = TLI->getValueType(DL, DataType);
4324 return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
4325}
4326
4327bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
4328 return false;
4329}
4330
4331bool X86TTIImpl::areInlineCompatible(const Function *Caller,
4332 const Function *Callee) const {
4333 const TargetMachine &TM = getTLI()->getTargetMachine();
4334
4335 // Work this as a subsetting of subtarget features.
4336 const FeatureBitset &CallerBits =
4337 TM.getSubtargetImpl(*Caller)->getFeatureBits();
4338 const FeatureBitset &CalleeBits =
4339 TM.getSubtargetImpl(*Callee)->getFeatureBits();
4340
4341 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
4342 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
4343 return (RealCallerBits & RealCalleeBits) == RealCalleeBits;
4344}
4345
4346bool X86TTIImpl::areFunctionArgsABICompatible(
4347 const Function *Caller, const Function *Callee,
4348 SmallPtrSetImpl<Argument *> &Args) const {
4349 if (!BaseT::areFunctionArgsABICompatible(Caller, Callee, Args))
4350 return false;
4351
4352 // If we get here, we know the target features match. If one function
4353 // considers 512-bit vectors legal and the other does not, consider them
4354 // incompatible.
4355 const TargetMachine &TM = getTLI()->getTargetMachine();
4356
4357 if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
4358 TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs())
4359 return true;
4360
4361 // Consider the arguments compatible if they aren't vectors or aggregates.
4362 // FIXME: Look at the size of vectors.
4363 // FIXME: Look at the element types of aggregates to see if there are vectors.
4364 // FIXME: The API of this function seems intended to allow arguments
4365 // to be removed from the set, but the caller doesn't check if the set
4366 // becomes empty so that may not work in practice.
4367 return llvm::none_of(Args, [](Argument *A) {
4368 auto *EltTy = cast<PointerType>(A->getType())->getElementType();
4369 return EltTy->isVectorTy() || EltTy->isAggregateType();
4370 });
4371}
4372
4373X86TTIImpl::TTI::MemCmpExpansionOptions
4374X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
4375 TTI::MemCmpExpansionOptions Options;
4376 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
4377 Options.NumLoadsPerBlock = 2;
4378 // All GPR and vector loads can be unaligned.
4379 Options.AllowOverlappingLoads = true;
4380 if (IsZeroCmp) {
4381 // Only enable vector loads for equality comparison. Right now the vector
4382 // version is not as fast for three way compare (see #33329).
4383 const unsigned PreferredWidth = ST->getPreferVectorWidth();
4384 if (PreferredWidth >= 512 && ST->hasAVX512()) Options.LoadSizes.push_back(64);
4385 if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32);
4386 if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
4387 }
4388 if (ST->is64Bit()) {
4389 Options.LoadSizes.push_back(8);
4390 }
4391 Options.LoadSizes.push_back(4);
4392 Options.LoadSizes.push_back(2);
4393 Options.LoadSizes.push_back(1);
4394 return Options;
4395}
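
The practical effect of these options is the ordered list of load sizes the expansion may pick from; on a hypothetical 64-bit AVX-512 target with a 512-bit preferred width an equality memcmp sees {64, 32, 16, 8, 4, 2, 1}, and with AllowOverlappingLoads a 31-byte compare can be covered by two overlapping 16-byte loads instead of five non-overlapping ones (16+8+4+2+1). The greedy counter below is only a rough model of that choice, not the MemCmpExpansion pass itself:

#include <cassert>
#include <vector>

// Count loads per operand needed to cover Len bytes, greedily taking the
// largest allowed size; with overlap, one extra load shifted back to end
// exactly at Len finishes a partially covered tail.
static int countLoads(int Len, const std::vector<int> &Sizes, bool AllowOverlap) {
  int Loads = 0, Covered = 0;
  for (int Sz : Sizes) {                    // Sizes sorted largest-first
    while (Len - Covered >= Sz) { Covered += Sz; ++Loads; }
    if (AllowOverlap && Covered < Len && Len >= Sz)
      return Loads + 1;                     // one overlapping load finishes it
  }
  return Loads;
}

int main() {
  std::vector<int> Sizes = {64, 32, 16, 8, 4, 2, 1};   // assumed AVX-512, 64-bit
  assert(countLoads(31, Sizes, /*AllowOverlap=*/false) == 5); // 16+8+4+2+1
  assert(countLoads(31, Sizes, /*AllowOverlap=*/true)  == 2); // 16 + overlapping 16
}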
4396
4397bool X86TTIImpl::enableInterleavedAccessVectorization() {
4398 // TODO: We expect this to be beneficial regardless of arch,
4399 // but there are currently some unexplained performance artifacts on Atom.
4400 // As a temporary solution, disable on Atom.
4401 return !(ST->isAtom());
4402}
4403
4404// Get estimation for interleaved load/store operations for AVX2.
4405// \p Factor is the interleaved-access factor (stride) - number of
4406// (interleaved) elements in the group.
4407// \p Indices contains the indices for a strided load: when the
4408// interleaved load has gaps they indicate which elements are used.
4409// If Indices is empty (or if the number of indices is equal to the size
4410// of the interleaved-access as given in \p Factor) the access has no gaps.
4411//
4412// As opposed to AVX-512, AVX2 does not have generic shuffles that allow
4413// computing the cost using a generic formula as a function of generic
4414// shuffles. We therefore use a lookup table instead, filled according to
4415// the instruction sequences that codegen currently generates.
4416int X86TTIImpl::getInterleavedMemoryOpCostAVX2(
4417 unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
4418 ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
4419 TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
4420
4421 if (UseMaskForCond || UseMaskForGaps)
4. Assuming 'UseMaskForCond' is false
5. Assuming 'UseMaskForGaps' is false
6. Taking false branch
4422 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
4423 Alignment, AddressSpace, CostKind,
4424 UseMaskForCond, UseMaskForGaps);
4425
4426 // We currently support only fully-interleaved groups, with no gaps.
4427 // TODO: Support also strided loads (interleaved-groups with gaps).
4428 if (Indices.size() && Indices.size() != Factor)
7. Assuming the condition is true
8. Assuming the condition is true
9. Taking true branch
4429 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
10. Calling 'BasicTTIImplBase::getInterleavedMemoryOpCost'
4430 Alignment, AddressSpace,
4431 CostKind);
4432
4433 // VecTy for interleave memop is <VF*Factor x Elt>.
4434 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
4435 // VecTy = <12 x i32>.
4436 MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
4437
4438 // This function can be called with VecTy=<6xi128>, Factor=3, in which case
4439 // the VF=2, while v2i128 is an unsupported MVT vector type
4440 // (see MachineValueType.h::getVectorVT()).
4441 if (!LegalVT.isVector())
4442 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
4443 Alignment, AddressSpace,
4444 CostKind);
4445
4446 unsigned VF = VecTy->getNumElements() / Factor;
4447 Type *ScalarTy = VecTy->getElementType();
4448
4449 // Calculate the number of memory operations (NumOfMemOps), required
4450 // for load/store the VecTy.
4451 unsigned VecTySize = DL.getTypeStoreSize(VecTy);
4452 unsigned LegalVTSize = LegalVT.getStoreSize();
4453 unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
4454
4455 // Get the cost of one memory operation.
4456 auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
4457 LegalVT.getVectorNumElements());
4458 unsigned MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy,
4459 MaybeAlign(Alignment), AddressSpace,
4460 CostKind);
4461
4462 auto *VT = FixedVectorType::get(ScalarTy, VF);
4463 EVT ETy = TLI->getValueType(DL, VT);
4464 if (!ETy.isSimple())
4465 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
4466 Alignment, AddressSpace,
4467 CostKind);
4468
4469 // TODO: Complete for other data-types and strides.
4470 // Each combination of Stride, ElementTy and VF results in a different
4471 // sequence; The cost tables are therefore accessed with:
4472 // Factor (stride) and VectorType=VFxElemType.
4473 // The Cost accounts only for the shuffle sequence;
4474 // The cost of the loads/stores is accounted for separately.
4475 //
4476 static const CostTblEntry AVX2InterleavedLoadTbl[] = {
4477 { 2, MVT::v4i64, 6 }, //(load 8i64 and) deinterleave into 2 x 4i64
4478 { 2, MVT::v4f64, 6 }, //(load 8f64 and) deinterleave into 2 x 4f64
4479
4480 { 3, MVT::v2i8, 10 }, //(load 6i8 and) deinterleave into 3 x 2i8
4481 { 3, MVT::v4i8, 4 }, //(load 12i8 and) deinterleave into 3 x 4i8
4482 { 3, MVT::v8i8, 9 }, //(load 24i8 and) deinterleave into 3 x 8i8
4483 { 3, MVT::v16i8, 11}, //(load 48i8 and) deinterleave into 3 x 16i8
4484 { 3, MVT::v32i8, 13}, //(load 96i8 and) deinterleave into 3 x 32i8
4485 { 3, MVT::v8f32, 17 }, //(load 24f32 and) deinterleave into 3 x 8f32
4486
4487 { 4, MVT::v2i8, 12 }, //(load 8i8 and) deinterleave into 4 x 2i8
4488 { 4, MVT::v4i8, 4 }, //(load 16i8 and) deinterleave into 4 x 4i8
4489 { 4, MVT::v8i8, 20 }, //(load 32i8 and) deinterleave into 4 x 8i8
4490 { 4, MVT::v16i8, 39 }, //(load 64i8 and) deinterleave into 4 x 16i8
4491 { 4, MVT::v32i8, 80 }, //(load 128i8 and) deinterleave into 4 x 32i8
4492
4493 { 8, MVT::v8f32, 40 } //(load 64f32 and) deinterleave into 8 x 8f32
4494 };
4495
4496 static const CostTblEntry AVX2InterleavedStoreTbl[] = {
4497 { 2, MVT::v4i64, 6 }, //interleave into 2 x 4i64 into 8i64 (and store)
4498 { 2, MVT::v4f64, 6 }, //interleave into 2 x 4f64 into 8f64 (and store)
4499
4500 { 3, MVT::v2i8, 7 }, //interleave 3 x 2i8 into 6i8 (and store)
4501 { 3, MVT::v4i8, 8 }, //interleave 3 x 4i8 into 12i8 (and store)
4502 { 3, MVT::v8i8, 11 }, //interleave 3 x 8i8 into 24i8 (and store)
4503 { 3, MVT::v16i8, 11 }, //interleave 3 x 16i8 into 48i8 (and store)
4504 { 3, MVT::v32i8, 13 }, //interleave 3 x 32i8 into 96i8 (and store)
4505
4506 { 4, MVT::v2i8, 12 }, //interleave 4 x 2i8 into 8i8 (and store)
4507 { 4, MVT::v4i8, 9 }, //interleave 4 x 4i8 into 16i8 (and store)
4508 { 4, MVT::v8i8, 10 }, //interleave 4 x 8i8 into 32i8 (and store)
4509 { 4, MVT::v16i8, 10 }, //interleave 4 x 16i8 into 64i8 (and store)
4510 { 4, MVT::v32i8, 12 } //interleave 4 x 32i8 into 128i8 (and store)
4511 };
4512
4513 if (Opcode == Instruction::Load) {
4514 if (const auto *Entry =
4515 CostTableLookup(AVX2InterleavedLoadTbl, Factor, ETy.getSimpleVT()))
4516 return NumOfMemOps * MemOpCost + Entry->Cost;
4517 } else {
4518 assert(Opcode == Instruction::Store &&
4519 "Expected Store Instruction at this point");
4520 if (const auto *Entry =
4521 CostTableLookup(AVX2InterleavedStoreTbl, Factor, ETy.getSimpleVT()))
4522 return NumOfMemOps * MemOpCost + Entry->Cost;
4523 }
4524
4525 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
4526 Alignment, AddressSpace, CostKind);
4527}
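
Putting the pieces together for one concrete query: an interleaved load with Factor = 4 from a <32 x i8> group on AVX2 gives VF = 8, the group legalizes to a single v32i8 load (NumOfMemOps = 1), and the {4, v8i8, 20} table entry supplies the shuffle-sequence cost. MemOpCost = 1 below is an assumption; the real value comes from getMemoryOpCost:

#include <cassert>

int main() {
  const int Factor = 4;
  const int NumElts = 32;            // VecTy = <32 x i8>
  const int VF = NumElts / Factor;   // 8 -> lookup key is (4, v8i8)
  const int NumOfMemOps = 1;         // <32 x i8> fits one ymm load on AVX2
  const int MemOpCost = 1;           // assumed getMemoryOpCost result
  const int TableShuffleCost = 20;   // AVX2InterleavedLoadTbl {4, v8i8, 20}

  assert(VF == 8);
  assert(NumOfMemOps * MemOpCost + TableShuffleCost == 21);
}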
4528
4529// Get estimation for interleaved load/store operations and strided load.
4530// \p Indices contains indices for strided load.
4531// \p Factor - the factor of interleaving.
4532// AVX-512 provides 3-src shuffles that significantly reduces the cost.
4533int X86TTIImpl::getInterleavedMemoryOpCostAVX512(
4534 unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
4535 ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
4536 TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
4537
4538 if (UseMaskForCond || UseMaskForGaps)
4539 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
4540 Alignment, AddressSpace, CostKind,
4541 UseMaskForCond, UseMaskForGaps);
4542
4543 // VecTy for interleave memop is <VF*Factor x Elt>.
4544 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
4545 // VecTy = <12 x i32>.
4546
4547 // Calculate the number of memory operations (NumOfMemOps), required
4548 // for load/store the VecTy.
4549 MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
4550 unsigned VecTySize = DL.getTypeStoreSize(VecTy);
4551 unsigned LegalVTSize = LegalVT.getStoreSize();
4552 unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
4553
4554 // Get the cost of one memory operation.
4555 auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
4556 LegalVT.getVectorNumElements());
4557 unsigned MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy,
4558 MaybeAlign(Alignment), AddressSpace,
4559 CostKind);
4560
4561 unsigned VF = VecTy->getNumElements() / Factor;
4562 MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);
4563
4564 if (Opcode == Instruction::Load) {
4565 // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
4566 // contain the cost of the optimized shuffle sequence that the
4567 // X86InterleavedAccess pass will generate.
4568 // The cost of loads and stores are computed separately from the table.
4569
4570 // X86InterleavedAccess support only the following interleaved-access group.
4571 static const CostTblEntry AVX512InterleavedLoadTbl[] = {
4572 {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8
4573 {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8
4574 {3, MVT::v64i8, 22}, //(load 192i8 and) deinterleave into 3 x 64i8
4575 };
4576
4577 if (const auto *Entry =
4578 CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
4579 return NumOfMemOps * MemOpCost + Entry->Cost;
4580 // If an entry does not exist, fall back to the default implementation.
4581
4582 // Kind of shuffle depends on number of loaded values.
4583 // If we load the entire data in one register, we can use a 1-src shuffle.
4584 // Otherwise, we'll merge 2 sources in each operation.
4585 TTI::ShuffleKind ShuffleKind =
4586 (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
4587
4588 unsigned ShuffleCost =
4589 getShuffleCost(ShuffleKind, SingleMemOpTy, 0, nullptr);
4590
4591 unsigned NumOfLoadsInInterleaveGrp =
4592 Indices.size() ? Indices.size() : Factor;
4593 auto *ResultTy = FixedVectorType::get(VecTy->getElementType(),
4594 VecTy->getNumElements() / Factor);
4595 unsigned NumOfResults =
4596 getTLI()->getTypeLegalizationCost(DL, ResultTy).first *
4597 NumOfLoadsInInterleaveGrp;
4598
4599 // About half of the loads may be folded into shuffles when we have only
4600 // one result. If we have more than one result, we do not fold loads at all.
4601 unsigned NumOfUnfoldedLoads =
4602 NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
4603
4604 // Get a number of shuffle operations per result.
4605 unsigned NumOfShufflesPerResult =
4606 std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
4607
4608 // The SK_PermuteTwoSrc shuffle clobbers one of its source operands.
4609 // When we have more than one destination, we need additional instructions
4610 // to keep sources.
4611 unsigned NumOfMoves = 0;
4612 if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
4613 NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
4614
4615 int Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
4616 NumOfUnfoldedLoads * MemOpCost + NumOfMoves;
4617
4618 return Cost;
4619 }
4620
4621 // Store.
4622 assert(Opcode == Instruction::Store &&
4623 "Expected Store Instruction at this point");
4624 // X86InterleavedAccess support only the following interleaved-access group.
4625 static const CostTblEntry AVX512InterleavedStoreTbl[] = {
4626 {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
4627 {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
4628 {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 192i8 (and store)
4629
4630 {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store)
4631 {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
4632 {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
4633 {4, MVT::v64i8, 24} // interleave 4 x 64i8 into 256i8 (and store)
4634 };
4635
4636 if (const auto *Entry =
4637 CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
4638 return NumOfMemOps * MemOpCost + Entry->Cost;
4639 // If an entry does not exist, fall back to the default implementation.
4640
4641 // There are no strided stores for now. And a store can't be folded into a
4642 // shuffle.
4643 unsigned NumOfSources = Factor; // The number of values to be merged.
4644 unsigned ShuffleCost =
4645 getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, 0, nullptr);
4646 unsigned NumOfShufflesPerStore = NumOfSources - 1;
4647
4648 // The SK_PermuteTwoSrc shuffle clobbers one of its source operands.
4649 // We need additional instructions to keep sources.
4650 unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
4651 int Cost = NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
4652 NumOfMoves;
4653 return Cost;
4654}
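
The store path has the same shape. For an interleaved store with Factor = 4 of a <64 x i8> group on an AVX-512/BWI target, VF = 16, the group legalizes to one v64i8 store (NumOfMemOps = 1), and the {4, v16i8, 11} entry supplies the shuffle cost; MemOpCost = 1 is again only an assumed placeholder for the getMemoryOpCost result:

#include <cassert>

int main() {
  const int Factor = 4;
  const int NumElts = 64;            // VecTy = <64 x i8>
  const int VF = NumElts / Factor;   // 16 -> lookup key is (4, v16i8)
  const int NumOfMemOps = 1;         // <64 x i8> fits one zmm store with BWI
  const int MemOpCost = 1;           // assumed getMemoryOpCost result
  const int TableShuffleCost = 11;   // AVX512InterleavedStoreTbl {4, v16i8, 11}

  assert(VF == 16);
  assert(NumOfMemOps * MemOpCost + TableShuffleCost == 12);
}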
4655
4656int X86TTIImpl::getInterleavedMemoryOpCost(
4657 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
4658 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
4659 bool UseMaskForCond, bool UseMaskForGaps) {
4660 auto isSupportedOnAVX512 = [](Type *VecTy, bool HasBW) {
4661 Type *EltTy = cast<VectorType>(VecTy)->getElementType();
4662 if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
4663 EltTy->isIntegerTy(32) || EltTy->isPointerTy())
4664 return true;
4665 if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8))
4666 return HasBW;
4667 return false;
4668 };
4669 if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI()))
4670 return getInterleavedMemoryOpCostAVX512(
4671 Opcode, cast<FixedVectorType>(VecTy), Factor, Indices, Alignment,
4672 AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
4673 if (ST->hasAVX2())
1. Taking true branch
4674 return getInterleavedMemoryOpCostAVX2(
3. Calling 'X86TTIImpl::getInterleavedMemoryOpCostAVX2'
4675 Opcode, cast<FixedVectorType>(VecTy), Factor, Indices, Alignment,
2. 'VecTy' is a 'FixedVectorType'
4676 AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
4677
4678 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
4679 Alignment, AddressSpace, CostKind,
4680 UseMaskForCond, UseMaskForGaps);
4681}

/build/llvm-toolchain-snapshot-12~++20200926111128+c6c5629f2fb/llvm/include/llvm/CodeGen/BasicTTIImpl.h

1//===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This file provides a helper that implements much of the TTI interface in
11/// terms of the target-independent code generator and TargetLowering
12/// interfaces.
13//
14//===----------------------------------------------------------------------===//
15
16#ifndef LLVM_CODEGEN_BASICTTIIMPL_H
17#define LLVM_CODEGEN_BASICTTIIMPL_H
18
19#include "llvm/ADT/APInt.h"
20#include "llvm/ADT/ArrayRef.h"
21#include "llvm/ADT/BitVector.h"
22#include "llvm/ADT/SmallPtrSet.h"
23#include "llvm/ADT/SmallVector.h"
24#include "llvm/Analysis/LoopInfo.h"
25#include "llvm/Analysis/TargetTransformInfo.h"
26#include "llvm/Analysis/TargetTransformInfoImpl.h"
27#include "llvm/CodeGen/ISDOpcodes.h"
28#include "llvm/CodeGen/TargetLowering.h"
29#include "llvm/CodeGen/TargetSubtargetInfo.h"
30#include "llvm/CodeGen/ValueTypes.h"
31#include "llvm/IR/BasicBlock.h"
32#include "llvm/IR/Constant.h"
33#include "llvm/IR/Constants.h"
34#include "llvm/IR/DataLayout.h"
35#include "llvm/IR/DerivedTypes.h"
36#include "llvm/IR/InstrTypes.h"
37#include "llvm/IR/Instruction.h"
38#include "llvm/IR/Instructions.h"
39#include "llvm/IR/Intrinsics.h"
40#include "llvm/IR/Operator.h"
41#include "llvm/IR/Type.h"
42#include "llvm/IR/Value.h"
43#include "llvm/Support/Casting.h"
44#include "llvm/Support/CommandLine.h"
45#include "llvm/Support/ErrorHandling.h"
46#include "llvm/Support/MachineValueType.h"
47#include "llvm/Support/MathExtras.h"
48#include <algorithm>
49#include <cassert>
50#include <cstdint>
51#include <limits>
52#include <utility>
53
54namespace llvm {
55
56class Function;
57class GlobalValue;
58class LLVMContext;
59class ScalarEvolution;
60class SCEV;
61class TargetMachine;
62
63extern cl::opt<unsigned> PartialUnrollingThreshold;
64
65/// Base class which can be used to help build a TTI implementation.
66///
67/// This class provides as much implementation of the TTI interface as is
68/// possible using the target independent parts of the code generator.
69///
70/// In order to subclass it, your class must implement a getST() method to
71/// return the subtarget, and a getTLI() method to return the target lowering.
72/// We need these methods implemented in the derived class so that this class
73/// doesn't have to duplicate storage for them.
74template <typename T>
75class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
76private:
77 using BaseT = TargetTransformInfoImplCRTPBase<T>;
78 using TTI = TargetTransformInfo;
79
80 /// Helper function to access this as a T.
81 T *thisT() { return static_cast<T *>(this); }
82
83 /// Estimate a cost of Broadcast as an extract and sequence of insert
84 /// operations.
85 unsigned getBroadcastShuffleOverhead(FixedVectorType *VTy) {
86 unsigned Cost = 0;
87 // Broadcast cost is equal to the cost of extracting the zero'th element
88 // plus the cost of inserting it into every element of the result vector.
89 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, 0);
90
91 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
92 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i);
93 }
94 return Cost;
95 }
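
With assumed unit costs for the element moves (the real numbers come from getVectorInstrCost), the generic broadcast estimate for a 4-element vector is one extract plus four inserts:

#include <cassert>

int main() {
  const int NumElts = 4;          // e.g. broadcasting a <4 x float>
  const int ExtractCost = 1;      // assumed getVectorInstrCost results
  const int InsertCost = 1;

  // Extract lane 0, then insert it into every lane of the result.
  int Cost = ExtractCost + NumElts * InsertCost;
  assert(Cost == 5);
}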
96
97 /// Estimate a cost of shuffle as a sequence of extract and insert
98 /// operations.
99 unsigned getPermuteShuffleOverhead(FixedVectorType *VTy) {
100 unsigned Cost = 0;
101 // Shuffle cost is equal to the cost of extracting elements from its arguments
102 // plus the cost of inserting them into the result vector.
103
104 // e.g. <4 x float> has a mask of <0,5,2,7>, i.e. we need to extract from
105 // index 0 of the first vector, index 1 of the second vector, index 2 of the
106 // first vector and finally index 3 of the second vector, and insert them at
107 // indices <0,1,2,3> of the result vector.
108 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
109 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i);
110 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, i);
111 }
112 return Cost;
113 }
114
115 /// Estimate a cost of subvector extraction as a sequence of extract and
116 /// insert operations.
117 unsigned getExtractSubvectorOverhead(FixedVectorType *VTy, int Index,
118 FixedVectorType *SubVTy) {
119 assert(VTy && SubVTy &&
120 "Can only extract subvectors from vectors");
121 int NumSubElts = SubVTy->getNumElements();
122 assert((Index + NumSubElts) <= (int)VTy->getNumElements() &&
123 "SK_ExtractSubvector index out of range");
124
125 unsigned Cost = 0;
126 // Subvector extraction cost is equal to the cost of extracting each element from
127 // the source type plus the cost of inserting them into the result vector
128 // type.
129 for (int i = 0; i != NumSubElts; ++i) {
130 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
131 i + Index);
132 Cost +=
133 thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy, i);
134 }
135 return Cost;
136 }
137
138 /// Estimate a cost of subvector insertion as a sequence of extract and
139 /// insert operations.
140 unsigned getInsertSubvectorOverhead(FixedVectorType *VTy, int Index,
141 FixedVectorType *SubVTy) {
142 assert(VTy && SubVTy &&
143 "Can only insert subvectors into vectors");
144 int NumSubElts = SubVTy->getNumElements();
145 assert((Index + NumSubElts) <= (int)VTy->getNumElements() &&
146 "SK_InsertSubvector index out of range");
147
148 unsigned Cost = 0;
149 // Subvector insertion cost is equal to the cost of extracting each element from
150 // the subvector type plus the cost of inserting them into the result vector
151 // type.
152 for (int i = 0; i != NumSubElts; ++i) {
153 Cost +=
154 thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy, i);
155 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
156 i + Index);
157 }
158 return Cost;
159 }
160
161 /// Local query method delegates up to T which *must* implement this!
162 const TargetSubtargetInfo *getST() const {
163 return static_cast<const T *>(this)->getST();
164 }
165
166 /// Local query method delegates up to T which *must* implement this!
167 const TargetLoweringBase *getTLI() const {
168 return static_cast<const T *>(this)->getTLI();
169 }
170
171 static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) {
172 switch (M) {
173 case TTI::MIM_Unindexed:
174 return ISD::UNINDEXED;
175 case TTI::MIM_PreInc:
176 return ISD::PRE_INC;
177 case TTI::MIM_PreDec:
178 return ISD::PRE_DEC;
179 case TTI::MIM_PostInc:
180 return ISD::POST_INC;
181 case TTI::MIM_PostDec:
182 return ISD::POST_DEC;
183 }
184 llvm_unreachable("Unexpected MemIndexedMode");
186
187protected:
188 explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
189 : BaseT(DL) {}
190 virtual ~BasicTTIImplBase() = default;
191
192 using TargetTransformInfoImplBase::DL;
193
194public:
195 /// \name Scalar TTI Implementations
196 /// @{
197 bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth,
198 unsigned AddressSpace, unsigned Alignment,
199 bool *Fast) const {
200 EVT E = EVT::getIntegerVT(Context, BitWidth);
201 return getTLI()->allowsMisalignedMemoryAccesses(
202 E, AddressSpace, Alignment, MachineMemOperand::MONone, Fast);
203 }
204
205 bool hasBranchDivergence() { return false; }
206
207 bool useGPUDivergenceAnalysis() { return false; }
208
209 bool isSourceOfDivergence(const Value *V) { return false; }
210
211 bool isAlwaysUniform(const Value *V) { return false; }
212
213 unsigned getFlatAddressSpace() {
214 // Return an invalid address space.
215 return -1;
216 }
217
218 bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
219 Intrinsic::ID IID) const {
220 return false;
221 }
222
223 bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
224 return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS);
225 }
226
227 Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
228 Value *NewV) const {
229 return nullptr;
230 }
231
232 bool isLegalAddImmediate(int64_t imm) {
233 return getTLI()->isLegalAddImmediate(imm);
234 }
235
236 bool isLegalICmpImmediate(int64_t imm) {
237 return getTLI()->isLegalICmpImmediate(imm);
238 }
239
240 bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
241 bool HasBaseReg, int64_t Scale,
242 unsigned AddrSpace, Instruction *I = nullptr) {
243 TargetLoweringBase::AddrMode AM;
244 AM.BaseGV = BaseGV;
245 AM.BaseOffs = BaseOffset;
246 AM.HasBaseReg = HasBaseReg;
247 AM.Scale = Scale;
248 return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
249 }
250
251 bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty,
252 const DataLayout &DL) const {
253 EVT VT = getTLI()->getValueType(DL, Ty);
254 return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT);
255 }
256
257 bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty,
258 const DataLayout &DL) const {
259 EVT VT = getTLI()->getValueType(DL, Ty);
260 return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT);
261 }
262
263 bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2) {
264 return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
265 }
266
267 bool isProfitableLSRChainElement(Instruction *I) {
268 return TargetTransformInfoImplBase::isProfitableLSRChainElement(I);
269 }
270
271 int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
272 bool HasBaseReg, int64_t Scale, unsigned AddrSpace) {
273 TargetLoweringBase::AddrMode AM;
274 AM.BaseGV = BaseGV;
275 AM.BaseOffs = BaseOffset;
276 AM.HasBaseReg = HasBaseReg;
277 AM.Scale = Scale;
278 return getTLI()->getScalingFactorCost(DL, AM, Ty, AddrSpace);
279 }
280
281 bool isTruncateFree(Type *Ty1, Type *Ty2) {
282 return getTLI()->isTruncateFree(Ty1, Ty2);
283 }
284
285 bool isProfitableToHoist(Instruction *I) {
286 return getTLI()->isProfitableToHoist(I);
287 }
288
289 bool useAA() const { return getST()->useAA(); }
290
291 bool isTypeLegal(Type *Ty) {
292 EVT VT = getTLI()->getValueType(DL, Ty);
293 return getTLI()->isTypeLegal(VT);
294 }
295
296 int getGEPCost(Type *PointeeType, const Value *Ptr,
297 ArrayRef<const Value *> Operands) {
298 return BaseT::getGEPCost(PointeeType, Ptr, Operands);
299 }
300
301 unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
302 unsigned &JumpTableSize,
303 ProfileSummaryInfo *PSI,
304 BlockFrequencyInfo *BFI) {
305 /// Try to find the estimated number of clusters. Note that the number of
306 /// clusters identified in this function could be different from the actual
307 /// numbers found in lowering. This function ignores switches that are
308 /// lowered with a mix of jump table / bit test / BTree. This function was
309 /// initially intended to be used when estimating the cost of a switch in the
310 /// inline cost heuristic, but it's a generic cost model to be used in other
311 /// places (e.g., in loop unrolling).
312 unsigned N = SI.getNumCases();
313 const TargetLoweringBase *TLI = getTLI();
314 const DataLayout &DL = this->getDataLayout();
315
316 JumpTableSize = 0;
317 bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());
318
319 // Early exit if both a jump table and bit test are not allowed.
320 if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N))
321 return N;
322
323 APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
324 APInt MinCaseVal = MaxCaseVal;
325 for (auto CI : SI.cases()) {
326 const APInt &CaseVal = CI.getCaseValue()->getValue();
327 if (CaseVal.sgt(MaxCaseVal))
328 MaxCaseVal = CaseVal;
329 if (CaseVal.slt(MinCaseVal))
330 MinCaseVal = CaseVal;
331 }
332
333 // Check if suitable for a bit test
334 if (N <= DL.getIndexSizeInBits(0u)) {
335 SmallPtrSet<const BasicBlock *, 4> Dests;
336 for (auto I : SI.cases())
337 Dests.insert(I.getCaseSuccessor());
338
339 if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
340 DL))
341 return 1;
342 }
343
344 // Check if suitable for a jump table.
345 if (IsJTAllowed) {
346 if (N < 2 || N < TLI->getMinimumJumpTableEntries())
347 return N;
348 uint64_t Range =
349 (MaxCaseVal - MinCaseVal)
350 .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
351 // Check whether a range of clusters is dense enough for a jump table
352 if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) {
353 JumpTableSize = Range;
354 return 1;
355 }
356 }
357 return N;
358 }
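
The jump-table branch of this heuristic reduces to a density check: the case range (max - min + 1) becomes the would-be table size, and the target decides whether N cases fill it densely enough. The sketch below uses a hypothetical 40% density rule in place of TLI->isSuitableForJumpTable, which is target-dependent:

#include <cassert>
#include <cstdint>

// Hypothetical density rule standing in for TLI->isSuitableForJumpTable.
static bool denseEnough(unsigned NumCases, uint64_t Range) {
  return Range != 0 && NumCases * 100 >= Range * 40;   // >= 40% of slots used
}

int main() {
  // switch over {0, 2, 3, 9}: 4 cases spanning a range of 10 slots.
  int64_t MinCase = 0, MaxCase = 9;
  unsigned N = 4;
  uint64_t Range = uint64_t(MaxCase - MinCase) + 1;

  unsigned JumpTableSize = 0, Clusters = N;
  if (denseEnough(N, Range)) {      // 4/10 = 40% -> one jump-table cluster
    JumpTableSize = Range;
    Clusters = 1;
  }
  assert(JumpTableSize == 10 && Clusters == 1);
}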
359
360 bool shouldBuildLookupTables() {
361 const TargetLoweringBase *TLI = getTLI();
362 return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
363 TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
364 }
365
366 bool haveFastSqrt(Type *Ty) {
367 const TargetLoweringBase *TLI = getTLI();
368 EVT VT = TLI->getValueType(DL, Ty);
369 return TLI->isTypeLegal(VT) &&
370 TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
371 }
372
373 bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
374 return true;
375 }
376
377 unsigned getFPOpCost(Type *Ty) {
378 // Check whether FADD is available, as a proxy for floating-point in
379 // general.
380 const TargetLoweringBase *TLI = getTLI();
381 EVT VT = TLI->getValueType(DL, Ty);
382 if (TLI->isOperationLegalOrCustomOrPromote(ISD::FADD, VT))
383 return TargetTransformInfo::TCC_Basic;
384 return TargetTransformInfo::TCC_Expensive;
385 }
386
387 unsigned getInliningThresholdMultiplier() { return 1; }
388
389 int getInlinerVectorBonusPercent() { return 150; }
390
391 void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
392 TTI::UnrollingPreferences &UP) {
393 // This unrolling functionality is target independent, but to provide some
394 // motivation for its intended use, for x86:
395
396 // According to the Intel 64 and IA-32 Architectures Optimization Reference
397 // Manual, Intel Core models and later have a loop stream detector (and
398 // associated uop queue) that can benefit from partial unrolling.
399 // The relevant requirements are:
400 // - The loop must have no more than 4 (8 for Nehalem and later) branches
401 // taken, and none of them may be calls.
402 // - The loop can have no more than 18 (28 for Nehalem and later) uops.
403
404 // According to the Software Optimization Guide for AMD Family 15h
405 // Processors, models 30h-4fh (Steamroller and later) have a loop predictor
406 // and loop buffer which can benefit from partial unrolling.
407 // The relevant requirements are:
408 // - The loop must have fewer than 16 branches
409 // - The loop must have less than 40 uops in all executed loop branches
410
411 // The number of taken branches in a loop is hard to estimate here, and
412 // benchmarking has revealed that it is better not to be conservative when
413 // estimating the branch count. As a result, we'll ignore the branch limits
414 // until someone finds a case where it matters in practice.
415
416 unsigned MaxOps;
417 const TargetSubtargetInfo *ST = getST();
418 if (PartialUnrollingThreshold.getNumOccurrences() > 0)
419 MaxOps = PartialUnrollingThreshold;
420 else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
421 MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
422 else
423 return;
424
425 // Scan the loop: don't unroll loops with calls.
426 for (BasicBlock *BB : L->blocks()) {
427 for (Instruction &I : *BB) {
428 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
429 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
430 if (!thisT()->isLoweredToCall(F))
431 continue;
432 }
433
434 return;
435 }
436 }
437 }
438
439 // Enable runtime and partial unrolling up to the specified size.
440 // Enable using trip count upper bound to unroll loops.
441 UP.Partial = UP.Runtime = UP.UpperBound = true;
442 UP.PartialThreshold = MaxOps;
443
444 // Avoid unrolling when optimizing for size.
445 UP.OptSizeThreshold = 0;
446 UP.PartialOptSizeThreshold = 0;
447
448 // Set number of instructions optimized when "back edge"
449 // becomes "fall through" to default value of 2.
450 UP.BEInsns = 2;
451 }
452
453 void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
454 TTI::PeelingPreferences &PP) {
455 PP.PeelCount = 0;
456 PP.AllowPeeling = true;
457 PP.AllowLoopNestsPeeling = false;
458 PP.PeelProfiledIterations = true;
459 }
460
461 bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
462 AssumptionCache &AC,
463 TargetLibraryInfo *LibInfo,
464 HardwareLoopInfo &HWLoopInfo) {
465 return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
466 }
467
468 bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
469 AssumptionCache &AC, TargetLibraryInfo *TLI,
470 DominatorTree *DT,
471 const LoopAccessInfo *LAI) {
472 return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
473 }
474
475 bool emitGetActiveLaneMask() {
476 return BaseT::emitGetActiveLaneMask();
477 }
478
479 Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
480 IntrinsicInst &II) {
481 return BaseT::instCombineIntrinsic(IC, II);
482 }
483
484 Optional<Value *> simplifyDemandedUseBitsIntrinsic(InstCombiner &IC,
485 IntrinsicInst &II,
486 APInt DemandedMask,
487 KnownBits &Known,
488 bool &KnownBitsComputed) {
489 return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known,
490 KnownBitsComputed);
491 }
492
493 Optional<Value *> simplifyDemandedVectorEltsIntrinsic(
494 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
495 APInt &UndefElts2, APInt &UndefElts3,
496 std::function<void(Instruction *, unsigned, APInt, APInt &)>
497 SimplifyAndSetOp) {
498 return BaseT::simplifyDemandedVectorEltsIntrinsic(
499 IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3,
500 SimplifyAndSetOp);
501 }
502
503 int getInstructionLatency(const Instruction *I) {
504 if (isa<LoadInst>(I))
505 return getST()->getSchedModel().DefaultLoadLatency;
506
507 return BaseT::getInstructionLatency(I);
508 }
509
510 virtual Optional<unsigned>
511 getCacheSize(TargetTransformInfo::CacheLevel Level) const {
512 return Optional<unsigned>(
513 getST()->getCacheSize(static_cast<unsigned>(Level)));
514 }
515
516 virtual Optional<unsigned>
517 getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const {
518 Optional<unsigned> TargetResult =
519 getST()->getCacheAssociativity(static_cast<unsigned>(Level));
520
521 if (TargetResult)
522 return TargetResult;
523
524 return BaseT::getCacheAssociativity(Level);
525 }
526
527 virtual unsigned getCacheLineSize() const {
528 return getST()->getCacheLineSize();
529 }
530
531 virtual unsigned getPrefetchDistance() const {
532 return getST()->getPrefetchDistance();
533 }
534
535 virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses,
536 unsigned NumStridedMemAccesses,
537 unsigned NumPrefetches,
538 bool HasCall) const {
539 return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
540 NumPrefetches, HasCall);
541 }
542
543 virtual unsigned getMaxPrefetchIterationsAhead() const {
544 return getST()->getMaxPrefetchIterationsAhead();
545 }
546
547 virtual bool enableWritePrefetching() const {
548 return getST()->enableWritePrefetching();
549 }
550
551 /// @}
552
553 /// \name Vector TTI Implementations
554 /// @{
555
556 unsigned getRegisterBitWidth(bool Vector) const { return 32; }
557
558 /// Estimate the overhead of scalarizing an instruction. Insert and Extract
559 /// are set if the demanded result elements need to be inserted and/or
560 /// extracted from vectors.
561 unsigned getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts,
562 bool Insert, bool Extract) {
563 /// FIXME: a bitfield is not a reasonable abstraction for talking about
564 /// which elements are needed from a scalable vector
565 auto *Ty = cast<FixedVectorType>(InTy);
566
567     assert(DemandedElts.getBitWidth() == Ty->getNumElements() &&
568            "Vector size mismatch");
569
570 unsigned Cost = 0;
571
572 for (int i = 0, e = Ty->getNumElements(); i < e; ++i) {
573 if (!DemandedElts[i])
574 continue;
575 if (Insert)
576 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty, i);
577 if (Extract)
578 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
579 }
580
581 return Cost;
582 }
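
The loop above charges one insert and/or one extract per demanded lane, asking getVectorInstrCost for each index. The same accumulation, sketched standalone with a plain 64-bit mask standing in for DemandedElts and a flat per-lane cost (both assumptions, not real target numbers):

#include <cstdint>

// Flat per-lane cost; the real code queries the target per (opcode, type, index).
constexpr unsigned kLaneCost = 1;

unsigned scalarizationOverhead(std::uint64_t DemandedMask, unsigned NumElts,
                               bool Insert, bool Extract) {
  unsigned Cost = 0;
  for (unsigned i = 0; i < NumElts; ++i) {
    if (!(DemandedMask & (std::uint64_t(1) << i)))
      continue;               // lane not demanded: nothing to build or read
    if (Insert)
      Cost += kLaneCost;      // insertelement to build the vector lane
    if (Extract)
      Cost += kLaneCost;      // extractelement to read the lane back
  }
  return Cost;
}
// Demanding lanes {0, 2} of a 4-wide vector with Insert and Extract set
// yields 2 lanes * 2 ops = 4.
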
583
584 /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
585 unsigned getScalarizationOverhead(VectorType *InTy, bool Insert,
586 bool Extract) {
587 auto *Ty = cast<FixedVectorType>(InTy);
588
589 APInt DemandedElts = APInt::getAllOnesValue(Ty->getNumElements());
590 return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
591 }
592
593   /// Estimate the overhead of scalarizing an instruction's unique
594 /// non-constant operands. The types of the arguments are ordinarily
595 /// scalar, in which case the costs are multiplied with VF.
596 unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
597 unsigned VF) {
598 unsigned Cost = 0;
599 SmallPtrSet<const Value*, 4> UniqueOperands;
600 for (const Value *A : Args) {
601 if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
602 auto *VecTy = dyn_cast<VectorType>(A->getType());
603 if (VecTy) {
604 // If A is a vector operand, VF should be 1 or correspond to A.
605           assert((VF == 1 ||
606                   VF == cast<FixedVectorType>(VecTy)->getNumElements()) &&
607                  "Vector argument does not match VF");
608 }
609 else
610 VecTy = FixedVectorType::get(A->getType(), VF);
611
612 Cost += getScalarizationOverhead(VecTy, false, true);
613 }
614 }
615
616 return Cost;
617 }
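
getOperandsScalarizationOverhead charges only unique, non-constant operands, widening scalar operands to VF lanes before pricing their extracts. A rough standalone sketch of that dedup-then-price idea, with a hypothetical Operand record playing the role of llvm::Value:

#include <set>
#include <vector>

struct Operand {
  const void *Id;    // identity, playing the role of the Value* pointer
  bool IsConstant;
  unsigned NumLanes; // 0 for a scalar operand
};

unsigned operandsScalarizationOverhead(const std::vector<Operand> &Args,
                                       unsigned VF, unsigned PerLaneExtract) {
  std::set<const void *> Seen;
  unsigned Cost = 0;
  for (const Operand &A : Args) {
    if (A.IsConstant || !Seen.insert(A.Id).second)
      continue;                                    // constants and repeats: free
    unsigned Lanes = A.NumLanes ? A.NumLanes : VF; // scalar operand -> VF lanes
    Cost += Lanes * PerLaneExtract;                // one extract per lane
  }
  return Cost;
}
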
618
619 unsigned getScalarizationOverhead(VectorType *InTy,
620 ArrayRef<const Value *> Args) {
621 auto *Ty = cast<FixedVectorType>(InTy);
622
623 unsigned Cost = 0;
624
625 Cost += getScalarizationOverhead(Ty, true, false);
626 if (!Args.empty())
627 Cost += getOperandsScalarizationOverhead(Args, Ty->getNumElements());
628 else
629 // When no information on arguments is provided, we add the cost
630 // associated with one argument as a heuristic.
631 Cost += getScalarizationOverhead(Ty, false, true);
632
633 return Cost;
634 }
635
636 unsigned getMaxInterleaveFactor(unsigned VF) { return 1; }
637
638 unsigned getArithmeticInstrCost(
639 unsigned Opcode, Type *Ty,
640 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
641 TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
642 TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
643 TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
644 TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
645 ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
646 const Instruction *CxtI = nullptr) {
647 // Check if any of the operands are vector operands.
648 const TargetLoweringBase *TLI = getTLI();
649 int ISD = TLI->InstructionOpcodeToISD(Opcode);
650     assert(ISD && "Invalid opcode");
651
652 // TODO: Handle more cost kinds.
653 if (CostKind != TTI::TCK_RecipThroughput)
654 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind,
655 Opd1Info, Opd2Info,
656 Opd1PropInfo, Opd2PropInfo,
657 Args, CxtI);
658
659 std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
660
661 bool IsFloat = Ty->isFPOrFPVectorTy();
662 // Assume that floating point arithmetic operations cost twice as much as
663 // integer operations.
664 unsigned OpCost = (IsFloat ? 2 : 1);
665
666 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
667 // The operation is legal. Assume it costs 1.
668 // TODO: Once we have extract/insert subvector cost we need to use them.
669 return LT.first * OpCost;
670 }
671
672 if (!TLI->isOperationExpand(ISD, LT.second)) {
673 // If the operation is custom lowered, then assume that the code is twice
674 // as expensive.
675 return LT.first * 2 * OpCost;
676 }
677
678 // Else, assume that we need to scalarize this op.
679 // TODO: If one of the types get legalized by splitting, handle this
680 // similarly to what getCastInstrCost() does.
681 if (auto *VTy = dyn_cast<VectorType>(Ty)) {
682 unsigned Num = cast<FixedVectorType>(VTy)->getNumElements();
683 unsigned Cost = thisT()->getArithmeticInstrCost(
684 Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
685 Opd1PropInfo, Opd2PropInfo, Args, CxtI);
686       // Return the cost of multiple scalar invocations plus the cost of
687 // inserting and extracting the values.
688 return getScalarizationOverhead(VTy, Args) + Num * Cost;
689 }
690
691 // We don't know anything about this scalar instruction.
692 return OpCost;
693 }
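
The three exits above give a simple tiering: a legal (or promoted) operation costs LT.first * OpCost, a custom-lowered one twice that, and anything that must expand is scalarized into per-element ops plus vector build/teardown. A toy calculation under assumed numbers, just to make the tiers concrete (none of the constants come from a real target):

#include <iostream>

// Hypothetical legalization outcome for one (opcode, type) pair.
enum class Action { Legal, Custom, Expand };

unsigned arithmeticCost(Action A, unsigned LegalizationSteps /* LT.first */,
                        bool IsFloat, unsigned NumElts, unsigned ScalarCost,
                        unsigned ScalarizeOverhead) {
  unsigned OpCost = IsFloat ? 2 : 1;            // FP assumed twice integer cost
  if (A == Action::Legal)
    return LegalizationSteps * OpCost;
  if (A == Action::Custom)
    return LegalizationSteps * 2 * OpCost;      // custom lowering: 2x
  // Expand: one scalar op per element plus insert/extract overhead.
  return ScalarizeOverhead + NumElts * ScalarCost;
}

int main() {
  // A <4 x float> op that must be scalarized, assuming overhead 8 and scalar
  // cost 2 per element: 8 + 4 * 2 = 16.
  std::cout << arithmeticCost(Action::Expand, 1, true, 4, 2, 8) << "\n";
  return 0;
}
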
694
695 unsigned getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index,
696 VectorType *SubTp) {
697
698 switch (Kind) {
699 case TTI::SK_Broadcast:
700 return getBroadcastShuffleOverhead(cast<FixedVectorType>(Tp));
701 case TTI::SK_Select:
702 case TTI::SK_Reverse:
703 case TTI::SK_Transpose:
704 case TTI::SK_PermuteSingleSrc:
705 case TTI::SK_PermuteTwoSrc:
706 return getPermuteShuffleOverhead(cast<FixedVectorType>(Tp));
707 case TTI::SK_ExtractSubvector:
708 return getExtractSubvectorOverhead(cast<FixedVectorType>(Tp), Index,
709 cast<FixedVectorType>(SubTp));
710 case TTI::SK_InsertSubvector:
711 return getInsertSubvectorOverhead(cast<FixedVectorType>(Tp), Index,
712 cast<FixedVectorType>(SubTp));
713 }
714     llvm_unreachable("Unknown TTI::ShuffleKind");
715 }
716
717 unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
718 TTI::CastContextHint CCH,
719 TTI::TargetCostKind CostKind,
720 const Instruction *I = nullptr) {
721 if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0)
722 return 0;
723
724 const TargetLoweringBase *TLI = getTLI();
725 int ISD = TLI->InstructionOpcodeToISD(Opcode);
726     assert(ISD && "Invalid opcode");
727 std::pair<unsigned, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, Src);
728 std::pair<unsigned, MVT> DstLT = TLI->getTypeLegalizationCost(DL, Dst);
729
730 TypeSize SrcSize = SrcLT.second.getSizeInBits();
731 TypeSize DstSize = DstLT.second.getSizeInBits();
732 bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy();
733 bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy();
734
735 switch (Opcode) {
736 default:
737 break;
738 case Instruction::Trunc:
739 // Check for NOOP conversions.
740 if (TLI->isTruncateFree(SrcLT.second, DstLT.second))
741 return 0;
742       LLVM_FALLTHROUGH;
743 case Instruction::BitCast:
744       // Bitcasts between types that are legalized to the same type are free;
745       // int to/from ptr casts of the same size are also assumed to be free.
746 if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst &&
747 SrcSize == DstSize)
748 return 0;
749 break;
750 case Instruction::FPExt:
751 if (I && getTLI()->isExtFree(I))
752 return 0;
753 break;
754 case Instruction::ZExt:
755 if (TLI->isZExtFree(SrcLT.second, DstLT.second))
756 return 0;
757       LLVM_FALLTHROUGH;
758 case Instruction::SExt:
759 if (I && getTLI()->isExtFree(I))
760 return 0;
761
762 // If this is a zext/sext of a load, return 0 if the corresponding
763 // extending load exists on target.
764 if (CCH == TTI::CastContextHint::Normal) {
765 EVT ExtVT = EVT::getEVT(Dst);
766 EVT LoadVT = EVT::getEVT(Src);
767 unsigned LType =
768 ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
769 if (TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
770 return 0;
771 }
772 break;
773 case Instruction::AddrSpaceCast:
774 if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(),
775 Dst->getPointerAddressSpace()))
776 return 0;
777 break;
778 }
779
780 auto *SrcVTy = dyn_cast<VectorType>(Src);
781 auto *DstVTy = dyn_cast<VectorType>(Dst);
782
783 // If the cast is marked as legal (or promote) then assume low cost.
784 if (SrcLT.first == DstLT.first &&
785 TLI->isOperationLegalOrPromote(ISD, DstLT.second))
786 return SrcLT.first;
787
788 // Handle scalar conversions.
789 if (!SrcVTy && !DstVTy) {
790 // Just check the op cost. If the operation is legal then assume it costs
791 // 1.
792 if (!TLI->isOperationExpand(ISD, DstLT.second))
793 return 1;
794
795       // Assume that illegal scalar instructions are expensive.
796 return 4;
797 }
798
799 // Check vector-to-vector casts.
800 if (DstVTy && SrcVTy) {
801 // If the cast is between same-sized registers, then the check is simple.
802 if (SrcLT.first == DstLT.first && SrcSize == DstSize) {
803
804 // Assume that Zext is done using AND.
805 if (Opcode == Instruction::ZExt)
806 return SrcLT.first;
807
808 // Assume that sext is done using SHL and SRA.
809 if (Opcode == Instruction::SExt)
810 return SrcLT.first * 2;
811
812 // Just check the op cost. If the operation is legal then assume it
813 // costs
814 // 1 and multiply by the type-legalization overhead.
815 if (!TLI->isOperationExpand(ISD, DstLT.second))
816 return SrcLT.first * 1;
817 }
818
819 // If we are legalizing by splitting, query the concrete TTI for the cost
820 // of casting the original vector twice. We also need to factor in the
821 // cost of the split itself. Count that as 1, to be consistent with
822 // TLI->getTypeLegalizationCost().
823 bool SplitSrc =
824 TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
825 TargetLowering::TypeSplitVector;
826 bool SplitDst =
827 TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
828 TargetLowering::TypeSplitVector;
829 if ((SplitSrc || SplitDst) &&
830 cast<FixedVectorType>(SrcVTy)->getNumElements() > 1 &&
831 cast<FixedVectorType>(DstVTy)->getNumElements() > 1) {
832 Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy);
833 Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy);
834 T *TTI = static_cast<T *>(this);
835 // If both types need to be split then the split is free.
836 unsigned SplitCost =
837 (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0;
838 return SplitCost +
839 (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH,
840 CostKind, I));
841 }
842
843 // In other cases where the source or destination are illegal, assume
844 // the operation will get scalarized.
845 unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements();
846 unsigned Cost = thisT()->getCastInstrCost(
847 Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I);
848
849       // Return the cost of multiple scalar invocations plus the cost of
850 // inserting and extracting the values.
851 return getScalarizationOverhead(DstVTy, true, true) + Num * Cost;
852 }
853
854 // We already handled vector-to-vector and scalar-to-scalar conversions.
855 // This
856 // is where we handle bitcast between vectors and scalars. We need to assume
857 // that the conversion is scalarized in one way or another.
858 if (Opcode == Instruction::BitCast) {
859 // Illegal bitcasts are done by storing and loading from a stack slot.
860 return (SrcVTy ? getScalarizationOverhead(SrcVTy, false, true) : 0) +
861 (DstVTy ? getScalarizationOverhead(DstVTy, true, false) : 0);
862 }
863
864     llvm_unreachable("Unhandled cast");
865 }
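
For vector casts that legalize by splitting, the code above halves both types, recurses through the concrete TTI, doubles the result, and charges a split only when exactly one side splits. A simplified recursion sketch under assumed costs; note it charges the split at every level, which is coarser than the listing's one-side-only rule:

// Hypothetical: cost of casting an N-element vector when anything wider than
// LegalWidth is split in half, with a fixed cost once the legal width is hit.
unsigned splitCastCost(unsigned NumElts, unsigned LegalWidth,
                       unsigned LegalCost, unsigned SplitCost) {
  if (NumElts <= LegalWidth)
    return LegalCost;                 // fits in a legal register: base cost
  return SplitCost +                  // pay for the split at this level
         2 * splitCastCost(NumElts / 2, LegalWidth, LegalCost, SplitCost);
}
// With NumElts = 8, LegalWidth = 2, LegalCost = 1, SplitCost = 1:
// 1 + 2 * (1 + 2 * 1) = 7.
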
866
867 unsigned getExtractWithExtendCost(unsigned Opcode, Type *Dst,
868 VectorType *VecTy, unsigned Index) {
869 return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy,
870 Index) +
871 thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(),
872 TTI::CastContextHint::None, TTI::TCK_RecipThroughput);
873 }
874
875 unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) {
876 return BaseT::getCFInstrCost(Opcode, CostKind);
877 }
878
879 unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
880 TTI::TargetCostKind CostKind,
881 const Instruction *I = nullptr) {
882 const TargetLoweringBase *TLI = getTLI();
883 int ISD = TLI->InstructionOpcodeToISD(Opcode);
884     assert(ISD && "Invalid opcode");
885
886 // TODO: Handle other cost kinds.
887 if (CostKind != TTI::TCK_RecipThroughput)
888 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind, I);
889
890 // Selects on vectors are actually vector selects.
891 if (ISD == ISD::SELECT) {
892       assert(CondTy && "CondTy must exist");
893 if (CondTy->isVectorTy())
894 ISD = ISD::VSELECT;
895 }
896 std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
897
898 if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
899 !TLI->isOperationExpand(ISD, LT.second)) {
900 // The operation is legal. Assume it costs 1. Multiply
901 // by the type-legalization overhead.
902 return LT.first * 1;
903 }
904
905 // Otherwise, assume that the cast is scalarized.
906 // TODO: If one of the types get legalized by splitting, handle this
907 // similarly to what getCastInstrCost() does.
908 if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) {
909 unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements();
910 if (CondTy)
911 CondTy = CondTy->getScalarType();
912 unsigned Cost = thisT()->getCmpSelInstrCost(
913 Opcode, ValVTy->getScalarType(), CondTy, CostKind, I);
914
915       // Return the cost of multiple scalar invocations plus the cost of
916 // inserting and extracting the values.
917 return getScalarizationOverhead(ValVTy, true, false) + Num * Cost;
918 }
919
920 // Unknown scalar opcode.
921 return 1;
922 }
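
When a vector compare/select is not legal, the fallback above is Num scalar compares plus the cost of inserting the scalar results back into a vector. Worked with assumed per-element numbers (not from any real cost table):

// Hypothetical per-element costs.
unsigned scalarizedCmpSelCost(unsigned NumElts, unsigned ScalarCmpCost,
                              unsigned InsertCostPerLane) {
  unsigned BuildResult = NumElts * InsertCostPerLane; // insert-only overhead
  return BuildResult + NumElts * ScalarCmpCost;       // plus Num scalar cmps
}
// An illegal <8 x i64> compare with cost 1 per scalar compare and 1 per
// insert: 8 + 8 * 1 = 16.
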
923
924 unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
925 std::pair<unsigned, MVT> LT =
926 getTLI()->getTypeLegalizationCost(DL, Val->getScalarType());
927
928 return LT.first;
929 }
930
931 unsigned getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
932 unsigned AddressSpace,
933 TTI::TargetCostKind CostKind,
934 const Instruction *I = nullptr) {
935     assert(!Src->isVoidTy() && "Invalid type");
936 // Assume types, such as structs, are expensive.
937 if (getTLI()->getValueType(DL, Src, true) == MVT::Other)
938 return 4;
939 std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(DL, Src);
940
941 // Assuming that all loads of legal types cost 1.
942 unsigned Cost = LT.first;
943 if (CostKind != TTI::TCK_RecipThroughput)
944 return Cost;
945
946 if (Src->isVectorTy() &&
947 Src->getPrimitiveSizeInBits() < LT.second.getSizeInBits()) {
948 // This is a vector load that legalizes to a larger type than the vector
949 // itself. Unless the corresponding extending load or truncating store is
950 // legal, then this will scalarize.
951 TargetLowering::LegalizeAction LA = TargetLowering::Expand;
952 EVT MemVT = getTLI()->getValueType(DL, Src);
953 if (Opcode == Instruction::Store)
954 LA = getTLI()->getTruncStoreAction(LT.second, MemVT);
955 else
956 LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT);
957
958 if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
959 // This is a vector load/store for some illegal type that is scalarized.
960 // We must account for the cost of building or decomposing the vector.
961 Cost += getScalarizationOverhead(cast<VectorType>(Src),
962 Opcode != Instruction::Store,
963 Opcode == Instruction::Store);
964 }
965 }
966
967 return Cost;
968 }
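
getMemoryOpCost only adds scalarization overhead when a vector memory type legalizes to a wider register and no extending load / truncating store covers the gap. That decision, restated standalone with assumed inputs (the booleans stand in for the getLoadExtAction / getTruncStoreAction legality queries above):

// Hypothetical: BaseCost plays the role of LT.first.
unsigned memoryOpCost(unsigned BaseCost, bool IsStore, unsigned MemBits,
                      unsigned RegBits, bool HasExtLoad, bool HasTruncStore,
                      unsigned ScalarizeOverhead) {
  if (MemBits >= RegBits)
    return BaseCost;                     // the type fills its register
  bool Covered = IsStore ? HasTruncStore : HasExtLoad;
  if (Covered)
    return BaseCost;                     // hardware widens/narrows for free
  return BaseCost + ScalarizeOverhead;   // otherwise build/decompose by hand
}
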
969
970 unsigned getInterleavedMemoryOpCost(
971 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
972 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
973 bool UseMaskForCond = false, bool UseMaskForGaps = false) {
974 auto *VT = cast<FixedVectorType>(VecTy);
11: 'VecTy' is a 'FixedVectorType'
975
976 unsigned NumElts = VT->getNumElements();
977     assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
12: Assuming 'Factor' is > 1
13: Assuming the condition is true
14: '?' condition is true
978
979 unsigned NumSubElts = NumElts / Factor;
980 auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts);
981
982 // Firstly, the cost of load/store operation.
983 unsigned Cost;
984     if (UseMaskForCond || UseMaskForGaps)
14.1: 'UseMaskForCond' is false
14.2: 'UseMaskForGaps' is false
15: Taking false branch
985 Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment,
986 AddressSpace, CostKind);
987 else
988 Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace,
989 CostKind);
990
991 // Legalize the vector type, and get the legalized and unlegalized type
992 // sizes.
993 MVT VecTyLT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
994 unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy);
995 unsigned VecTyLTSize = VecTyLT.getStoreSize();
996
997 // Return the ceiling of dividing A by B.
998 auto ceil = [](unsigned A, unsigned B) { return (A + B - 1) / B; };
999
1000 // Scale the cost of the memory operation by the fraction of legalized
1001 // instructions that will actually be used. We shouldn't account for the
1002 // cost of dead instructions since they will be removed.
1003 //
1004 // E.g., An interleaved load of factor 8:
1005 // %vec = load <16 x i64>, <16 x i64>* %ptr
1006 // %v0 = shufflevector %vec, undef, <0, 8>
1007 //
1008 // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be
1009 // used (those corresponding to elements [0:1] and [8:9] of the unlegalized
1010 // type). The other loads are unused.
1011 //
1012 // We only scale the cost of loads since interleaved store groups aren't
1013 // allowed to have gaps.
1014 if (Opcode == Instruction::Load && VecTySize > VecTyLTSize) {
16: Assuming 'Opcode' is not equal to Load
1015 // The number of loads of a legal type it will take to represent a load
1016 // of the unlegalized vector type.
1017 unsigned NumLegalInsts = ceil(VecTySize, VecTyLTSize);
1018
1019 // The number of elements of the unlegalized type that correspond to a
1020 // single legal instruction.
1021 unsigned NumEltsPerLegalInst = ceil(NumElts, NumLegalInsts);
1022
1023 // Determine which legal instructions will be used.
1024 BitVector UsedInsts(NumLegalInsts, false);
1025 for (unsigned Index : Indices)
1026 for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
1027 UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);
1028
1029 // Scale the cost of the load by the fraction of legal instructions that
1030 // will be used.
1031 Cost *= UsedInsts.count() / NumLegalInsts;
1032 }
1033
1034 // Then plus the cost of interleave operation.
1035    if (Opcode == Instruction::Load) {
16.1: 'Opcode' is not equal to Load
17: Taking false branch
1036 // The interleave cost is similar to extract sub vectors' elements
1037 // from the wide vector, and insert them into sub vectors.
1038 //
1039 // E.g. An interleaved load of factor 2 (with one member of index 0):
1040 // %vec = load <8 x i32>, <8 x i32>* %ptr
1041 // %v0 = shuffle %vec, undef, <0, 2, 4, 6> ; Index 0
1042 // The cost is estimated as extract elements at 0, 2, 4, 6 from the
1043 // <8 x i32> vector and insert them into a <4 x i32> vector.
1044
1045      assert(Indices.size() <= Factor &&
1046             "Interleaved memory op has too many members");
1047
1048 for (unsigned Index : Indices) {
1049        assert(Index < Factor && "Invalid index for interleaved memory op");
1050
1051 // Extract elements from loaded vector for each sub vector.
1052 for (unsigned i = 0; i < NumSubElts; i++)
1053 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VT,
1054 Index + i * Factor);
1055 }
1056
1057 unsigned InsSubCost = 0;
1058 for (unsigned i = 0; i < NumSubElts; i++)
1059 InsSubCost +=
1060 thisT()->getVectorInstrCost(Instruction::InsertElement, SubVT, i);
1061
1062 Cost += Indices.size() * InsSubCost;
1063 } else {
1064 // The interleave cost is extract all elements from sub vectors, and
1065 // insert them into the wide vector.
1066 //
1067 // E.g. An interleaved store of factor 2:
1068 // %v0_v1 = shuffle %v0, %v1, <0, 4, 1, 5, 2, 6, 3, 7>
1069 // store <8 x i32> %interleaved.vec, <8 x i32>* %ptr
1070 // The cost is estimated as extract all elements from both <4 x i32>
1071 // vectors and insert into the <8 x i32> vector.
1072
1073 unsigned ExtSubCost = 0;
1074 for (unsigned i = 0; i < NumSubElts; i++)
18: Assuming 'i' is < 'NumSubElts'
19: Loop condition is true. Entering loop body
1075 ExtSubCost +=
1076 thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVT, i);
20: Calling 'X86TTIImpl::getVectorInstrCost'
1077 Cost += ExtSubCost * Factor;
1078
1079 for (unsigned i = 0; i < NumElts; i++)
1080 Cost += static_cast<T *>(this)
1081 ->getVectorInstrCost(Instruction::InsertElement, VT, i);
1082 }
1083
1084 if (!UseMaskForCond)
1085 return Cost;
1086
1087 Type *I8Type = Type::getInt8Ty(VT->getContext());
1088 auto *MaskVT = FixedVectorType::get(I8Type, NumElts);
1089 SubVT = FixedVectorType::get(I8Type, NumSubElts);
1090
1091 // The Mask shuffling cost is extract all the elements of the Mask
1092 // and insert each of them Factor times into the wide vector:
1093 //
1094 // E.g. an interleaved group with factor 3:
1095 // %mask = icmp ult <8 x i32> %vec1, %vec2
1096 // %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
1097 // <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
1098 // The cost is estimated as extract all mask elements from the <8xi1> mask
1099 // vector and insert them factor times into the <24xi1> shuffled mask
1100 // vector.
1101 for (unsigned i = 0; i < NumSubElts; i++)
1102 Cost +=
1103 thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVT, i);
1104
1105 for (unsigned i = 0; i < NumElts; i++)
1106 Cost +=
1107 thisT()->getVectorInstrCost(Instruction::InsertElement, MaskVT, i);
1108
1109 // The Gaps mask is invariant and created outside the loop, therefore the
1110 // cost of creating it is not accounted for here. However if we have both
1111 // a MaskForGaps and some other mask that guards the execution of the
1112 // memory access, we need to account for the cost of And-ing the two masks
1113 // inside the loop.
1114 if (UseMaskForGaps)
1115 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT,
1116 CostKind);
1117
1118 return Cost;
1119 }
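
The load-scaling step near the top of this function is the subtle part: the memory cost is multiplied by the fraction of legalized loads whose elements are actually read by the requested indices. A standalone restatement of that bookkeeping, using the same ceiling division and the example from the comment (the helper name is hypothetical):

#include <set>
#include <vector>

// How many of the NumLegalInsts legalized loads are touched by the group.
unsigned usedLegalLoads(unsigned NumElts, unsigned Factor,
                        unsigned NumLegalInsts,
                        const std::vector<unsigned> &Indices) {
  auto ceilDiv = [](unsigned A, unsigned B) { return (A + B - 1) / B; };
  unsigned NumSubElts = NumElts / Factor;
  unsigned EltsPerInst = ceilDiv(NumElts, NumLegalInsts);
  std::set<unsigned> Used;
  for (unsigned Index : Indices)
    for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
      Used.insert((Index + Elt * Factor) / EltsPerInst);
  return static_cast<unsigned>(Used.size());
}
// For the comment's example (a <16 x i64> load, factor 8, one member at
// index 0, legalized into 8 v2i64 loads): elements 0 and 8 fall into legal
// loads 0 and 4, so 2 of the 8 loads are used and the memory cost is scaled
// accordingly.
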
1120
1121 /// Get intrinsic cost based on arguments.
1122 unsigned getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1123 TTI::TargetCostKind CostKind) {
1124 Intrinsic::ID IID = ICA.getID();
1125
1126 // Special case some scalar intrinsics.
1127 if (CostKind != TTI::TCK_RecipThroughput) {
1128 switch (IID) {
1129 default:
1130 break;
1131 case Intrinsic::cttz:
1132 if (getTLI()->isCheapToSpeculateCttz())
1133 return TargetTransformInfo::TCC_Basic;
1134 break;
1135 case Intrinsic::ctlz:
1136 if (getTLI()->isCheapToSpeculateCtlz())
1137 return TargetTransformInfo::TCC_Basic;
1138 break;
1139 case Intrinsic::memcpy:
1140 return thisT()->getMemcpyCost(ICA.getInst());
1141 // TODO: other libc intrinsics.
1142 }
1143 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1144 }
1145
1146 if (BaseT::getIntrinsicInstrCost(ICA, CostKind) == 0)
1147 return 0;
1148
1149 // TODO: Combine these two logic paths.
1150 if (ICA.isTypeBasedOnly())
1151 return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
1152
1153 Type *RetTy = ICA.getReturnType();
1154 unsigned VF = ICA.getVectorFactor();
1155 unsigned RetVF =
1156 (RetTy->isVectorTy() ? cast<FixedVectorType>(RetTy)->getNumElements()
1157 : 1);
1158    assert((RetVF == 1 || VF == 1) && "VF > 1 and RetVF is a vector type");
1159 const IntrinsicInst *I = ICA.getInst();
1160 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
1161 FastMathFlags FMF = ICA.getFlags();
1162
1163 switch (IID) {
1164 default: {
1165 // Assume that we need to scalarize this intrinsic.
1166 SmallVector<Type *, 4> Types;
1167 for (const Value *Op : Args) {
1168 Type *OpTy = Op->getType();
1169      assert(VF == 1 || !OpTy->isVectorTy());
1170 Types.push_back(VF == 1 ? OpTy : FixedVectorType::get(OpTy, VF));
1171 }
1172
1173 if (VF > 1 && !RetTy->isVoidTy())
1174 RetTy = FixedVectorType::get(RetTy, VF);
1175
1176 // Compute the scalarization overhead based on Args for a vector
1177 // intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while
1178 // CostModel will pass a vector RetTy and VF is 1.
1179 unsigned ScalarizationCost = std::numeric_limits<unsigned>::max();
1180 if (RetVF > 1 || VF > 1) {
1181 ScalarizationCost = 0;
1182 if (!RetTy->isVoidTy())
1183 ScalarizationCost +=
1184