Bug Summary

File: llvm/lib/Target/X86/X86TargetTransformInfo.cpp
Warning: line 3192, column 20
Called C++ object pointer is null
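
This class of warning is emitted when the analyzer finds an execution path on
which a member function is called through a pointer that is null on that path.
A minimal, hedged illustration of the pattern (this is not the code at line
3192 of the file; the type and function names below are invented for the
example):

    struct Widget {
      int size() const { return 42; }
    };

    int demo(bool HaveWidget) {
      Widget *W = nullptr;
      if (HaveWidget)
        W = new Widget();
      // When HaveWidget is false, W is still null here, so the analyzer
      // reports "Called C++ object pointer is null" on the member call.
      return W->size();
    }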

Annotated Source Code

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name X86TargetTransformInfo.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -fhalf-no-semantic-interposition -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -fno-split-dwarf-inlining -debugger-tuning=gdb -ffunction-sections -fdata-sections -resource-dir /usr/lib/llvm-12/lib/clang/12.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-12~++20210105111114+53a341a61d1f/build-llvm/lib/Target/X86 -I /build/llvm-toolchain-snapshot-12~++20210105111114+53a341a61d1f/llvm/lib/Target/X86 -I /build/llvm-toolchain-snapshot-12~++20210105111114+53a341a61d1f/build-llvm/include -I /build/llvm-toolchain-snapshot-12~++20210105111114+53a341a61d1f/llvm/include -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0/backward -internal-isystem /usr/local/include -internal-isystem /usr/lib/llvm-12/lib/clang/12.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir /build/llvm-toolchain-snapshot-12~++20210105111114+53a341a61d1f/build-llvm/lib/Target/X86 -fdebug-prefix-map=/build/llvm-toolchain-snapshot-12~++20210105111114+53a341a61d1f=. -ferror-limit 19 -fvisibility hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -o /tmp/scan-build-2021-01-05-120504-36406-1 -x c++ /build/llvm-toolchain-snapshot-12~++20210105111114+53a341a61d1f/llvm/lib/Target/X86/X86TargetTransformInfo.cpp

/build/llvm-toolchain-snapshot-12~++20210105111114+53a341a61d1f/llvm/lib/Target/X86/X86TargetTransformInfo.cpp

1//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements a TargetTransformInfo analysis pass specific to the
10/// X86 target machine. It uses the target's detailed information to provide
11/// more precise answers to certain TTI queries, while letting the target
12/// independent and default TTI implementations handle the rest.
13///
14//===----------------------------------------------------------------------===//
15/// A note about the cost model numbers used below: they correspond to some
16/// "generic" X86 CPU rather than to a concrete CPU model. Usually the numbers
17/// correspond to the CPU where the feature first appeared. For example, if we
18/// check Subtarget.hasSSE42() in the lookups below, the cost is based on
19/// Nehalem, as that was the first CPU to support that feature level and thus
20/// most likely has the worst-case cost.
21/// Some examples of other technologies/CPUs:
22/// SSE 3 - Pentium4 / Athlon64
23/// SSE 4.1 - Penryn
24/// SSE 4.2 - Nehalem
25/// AVX - Sandy Bridge
26/// AVX2 - Haswell
27/// AVX-512 - Xeon Phi / Skylake
28/// And some examples of instruction target-dependent costs (latency):
29///                     divss    sqrtss   rsqrtss
30///   AMD K7            11-16    19       3
31///   Piledriver        9-24     13-15    5
32///   Jaguar            14       16       2
33///   Pentium II,III    18       30       2
34///   Nehalem           7-14     7-18     3
35///   Haswell           10-13    11       5
36/// TODO: Develop and implement the target dependent cost model and
37/// specialize cost numbers for different Cost Model Targets such as throughput,
38/// code size, latency and uop count.
39//===----------------------------------------------------------------------===//
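// Illustration of the convention above (a hedged sketch; the table name
// SSE42ExampleTable is invented, while CostTableLookup, ST->hasSSE42() and
// the Nehalem FDIV cost of 14 match the real tables further down this file):
//
//   static const CostTblEntry SSE42ExampleTable[] = {
//     { ISD::FDIV, MVT::f32, 14 }, // divss, Nehalem
//   };
//   if (ST->hasSSE42())
//     if (const auto *Entry = CostTableLookup(SSE42ExampleTable, ISD, LT.second))
//       return LT.first * Entry->Cost;
//
// Every SSE4.2-capable subtarget gets these Nehalem-based numbers, even on
// newer CPUs where the true cost may be lower.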
40
41#include "X86TargetTransformInfo.h"
42#include "llvm/Analysis/TargetTransformInfo.h"
43#include "llvm/CodeGen/BasicTTIImpl.h"
44#include "llvm/CodeGen/CostTable.h"
45#include "llvm/CodeGen/TargetLowering.h"
46#include "llvm/IR/IntrinsicInst.h"
47#include "llvm/Support/Debug.h"
48
49using namespace llvm;
50
51#define DEBUG_TYPE "x86tti"
52
53//===----------------------------------------------------------------------===//
54//
55// X86 cost model.
56//
57//===----------------------------------------------------------------------===//
58
59TargetTransformInfo::PopcntSupportKind
60X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
61 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
62 // TODO: Currently the __builtin_popcount() implementation using SSE3
63 // instructions is inefficient. Once the problem is fixed, we should
64 // call ST->hasSSE3() instead of ST->hasPOPCNT().
65 return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
66}
67
68llvm::Optional<unsigned> X86TTIImpl::getCacheSize(
69 TargetTransformInfo::CacheLevel Level) const {
70 switch (Level) {
71 case TargetTransformInfo::CacheLevel::L1D:
72 // - Penryn
73 // - Nehalem
74 // - Westmere
75 // - Sandy Bridge
76 // - Ivy Bridge
77 // - Haswell
78 // - Broadwell
79 // - Skylake
80 // - Kabylake
81 return 32 * 1024; // 32 KByte
82 case TargetTransformInfo::CacheLevel::L2D:
83 // - Penryn
84 // - Nehalem
85 // - Westmere
86 // - Sandy Bridge
87 // - Ivy Bridge
88 // - Haswell
89 // - Broadwell
90 // - Skylake
91 // - Kabylake
92 return 256 * 1024; // 256 KByte
93 }
94
95 llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
96}
97
98llvm::Optional<unsigned> X86TTIImpl::getCacheAssociativity(
99 TargetTransformInfo::CacheLevel Level) const {
100 // - Penryn
101 // - Nehalem
102 // - Westmere
103 // - Sandy Bridge
104 // - Ivy Bridge
105 // - Haswell
106 // - Broadwell
107 // - Skylake
108 // - Kabylake
109 switch (Level) {
110 case TargetTransformInfo::CacheLevel::L1D:
111 LLVM_FALLTHROUGH;
112 case TargetTransformInfo::CacheLevel::L2D:
113 return 8;
114 }
115
116 llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
117}
118
119unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
120 bool Vector = (ClassID == 1);
121 if (Vector && !ST->hasSSE1())
122 return 0;
123
124 if (ST->is64Bit()) {
125 if (Vector && ST->hasAVX512())
126 return 32;
127 return 16;
128 }
129 return 8;
130}
131
132unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) const {
133 unsigned PreferVectorWidth = ST->getPreferVectorWidth();
134 if (Vector) {
135 if (ST->hasAVX512() && PreferVectorWidth >= 512)
136 return 512;
137 if (ST->hasAVX() && PreferVectorWidth >= 256)
138 return 256;
139 if (ST->hasSSE1() && PreferVectorWidth >= 128)
140 return 128;
141 return 0;
142 }
143
144 if (ST->is64Bit())
145 return 64;
146
147 return 32;
148}
149
150unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
151 return getRegisterBitWidth(true);
152}
153
154unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
155 // If the loop will not be vectorized, don't interleave the loop.
156 // Let the regular unroller unroll the loop, which saves the overflow
157 // check and memory check cost.
158 if (VF == 1)
159 return 1;
160
161 if (ST->isAtom())
162 return 1;
163
164 // Sandybridge and Haswell have multiple execution ports and pipelined
165 // vector units.
166 if (ST->hasAVX())
167 return 4;
168
169 return 2;
170}
171
172int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
173 TTI::TargetCostKind CostKind,
174 TTI::OperandValueKind Op1Info,
175 TTI::OperandValueKind Op2Info,
176 TTI::OperandValueProperties Opd1PropInfo,
177 TTI::OperandValueProperties Opd2PropInfo,
178 ArrayRef<const Value *> Args,
179 const Instruction *CxtI) {
180 // TODO: Handle more cost kinds.
181 if (CostKind != TTI::TCK_RecipThroughput)
182 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
183 Op2Info, Opd1PropInfo,
184 Opd2PropInfo, Args, CxtI);
185 // Legalize the type.
186 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
187
188 int ISD = TLI->InstructionOpcodeToISD(Opcode);
189 assert(ISD && "Invalid opcode");
190
191 static const CostTblEntry GLMCostTable[] = {
192 { ISD::FDIV, MVT::f32, 18 }, // divss
193 { ISD::FDIV, MVT::v4f32, 35 }, // divps
194 { ISD::FDIV, MVT::f64, 33 }, // divsd
195 { ISD::FDIV, MVT::v2f64, 65 }, // divpd
196 };
197
198 if (ST->useGLMDivSqrtCosts())
199 if (const auto *Entry = CostTableLookup(GLMCostTable, ISD,
200 LT.second))
201 return LT.first * Entry->Cost;
202
203 static const CostTblEntry SLMCostTable[] = {
204 { ISD::MUL, MVT::v4i32, 11 }, // pmulld
205 { ISD::MUL, MVT::v8i16, 2 }, // pmullw
206 { ISD::MUL, MVT::v16i8, 14 }, // extend/pmullw/trunc sequence.
207 { ISD::FMUL, MVT::f64, 2 }, // mulsd
208 { ISD::FMUL, MVT::v2f64, 4 }, // mulpd
209 { ISD::FMUL, MVT::v4f32, 2 }, // mulps
210 { ISD::FDIV, MVT::f32, 17 }, // divss
211 { ISD::FDIV, MVT::v4f32, 39 }, // divps
212 { ISD::FDIV, MVT::f64, 32 }, // divsd
213 { ISD::FDIV, MVT::v2f64, 69 }, // divpd
214 { ISD::FADD, MVT::v2f64, 2 }, // addpd
215 { ISD::FSUB, MVT::v2f64, 2 }, // subpd
216 // v2i64/v4i64 mul is custom lowered as a series of long
217 // multiplies (3), shifts (3) and adds (2).
218 // SLM muldq throughput is 2 and addq throughput is 4,
219 // thus: 3x2 (muldq throughput) + 3x1 (shift throughput) +
220 // 2x4 (addq throughput) = 17
221 { ISD::MUL, MVT::v2i64, 17 },
222 // slm addq\subq throughput is 4
223 { ISD::ADD, MVT::v2i64, 4 },
224 { ISD::SUB, MVT::v2i64, 4 },
225 };
226
227 if (ST->isSLM()) {
228 if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) {
229 // Check if the operands can be shrunk into a smaller data type.
230 bool Op1Signed = false;
231 unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
232 bool Op2Signed = false;
233 unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
234
235 bool SignedMode = Op1Signed || Op2Signed;
236 unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
237
238 if (OpMinSize <= 7)
239 return LT.first * 3; // pmullw/sext
240 if (!SignedMode && OpMinSize <= 8)
241 return LT.first * 3; // pmullw/zext
242 if (OpMinSize <= 15)
243 return LT.first * 5; // pmullw/pmulhw/pshuf
244 if (!SignedMode && OpMinSize <= 16)
245 return LT.first * 5; // pmullw/pmulhw/pshuf
246 }
247
248 if (const auto *Entry = CostTableLookup(SLMCostTable, ISD,
249 LT.second)) {
250 return LT.first * Entry->Cost;
251 }
252 }
253
254 if ((ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
255 ISD == ISD::UREM) &&
256 (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
257 Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
258 Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
259 if (ISD == ISD::SDIV || ISD == ISD::SREM) {
260 // On X86, vector signed division by power-of-two constants is normally
261 // expanded to the sequence SRA + SRL + ADD + SRA.
262 // The OperandValue properties may not be the same as that of the previous
263 // operation; conservatively assume OP_None.
264 int Cost =
265 2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, Op1Info,
266 Op2Info,
267 TargetTransformInfo::OP_None,
268 TargetTransformInfo::OP_None);
269 Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info,
270 Op2Info,
271 TargetTransformInfo::OP_None,
272 TargetTransformInfo::OP_None);
273 Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind, Op1Info,
274 Op2Info,
275 TargetTransformInfo::OP_None,
276 TargetTransformInfo::OP_None);
277
278 if (ISD == ISD::SREM) {
279 // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
280 Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info,
281 Op2Info);
282 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info,
283 Op2Info);
284 }
285
286 return Cost;
287 }
288
289 // Vector unsigned division/remainder will be simplified to shifts/masks.
290 if (ISD == ISD::UDIV)
291 return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
292 Op1Info, Op2Info,
293 TargetTransformInfo::OP_None,
294 TargetTransformInfo::OP_None);
295
296 else // UREM
297 return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
298 Op1Info, Op2Info,
299 TargetTransformInfo::OP_None,
300 TargetTransformInfo::OP_None);
301 }
302
303 static const CostTblEntry AVX512BWUniformConstCostTable[] = {
304 { ISD::SHL, MVT::v64i8, 2 }, // psllw + pand.
305 { ISD::SRL, MVT::v64i8, 2 }, // psrlw + pand.
306 { ISD::SRA, MVT::v64i8, 4 }, // psrlw, pand, pxor, psubb.
307 };
308
309 if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
310 ST->hasBWI()) {
311 if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD,
312 LT.second))
313 return LT.first * Entry->Cost;
314 }
315
316 static const CostTblEntry AVX512UniformConstCostTable[] = {
317 { ISD::SRA, MVT::v2i64, 1 },
318 { ISD::SRA, MVT::v4i64, 1 },
319 { ISD::SRA, MVT::v8i64, 1 },
320
321 { ISD::SHL, MVT::v64i8, 4 }, // psllw + pand.
322 { ISD::SRL, MVT::v64i8, 4 }, // psrlw + pand.
323 { ISD::SRA, MVT::v64i8, 8 }, // psrlw, pand, pxor, psubb.
324
325 { ISD::SDIV, MVT::v16i32, 6 }, // pmuludq sequence
326 { ISD::SREM, MVT::v16i32, 8 }, // pmuludq+mul+sub sequence
327 { ISD::UDIV, MVT::v16i32, 5 }, // pmuludq sequence
328 { ISD::UREM, MVT::v16i32, 7 }, // pmuludq+mul+sub sequence
329 };
330
331 if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
332 ST->hasAVX512()) {
333 if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD,
334 LT.second))
335 return LT.first * Entry->Cost;
336 }
337
338 static const CostTblEntry AVX2UniformConstCostTable[] = {
339 { ISD::SHL, MVT::v32i8, 2 }, // psllw + pand.
340 { ISD::SRL, MVT::v32i8, 2 }, // psrlw + pand.
341 { ISD::SRA, MVT::v32i8, 4 }, // psrlw, pand, pxor, psubb.
342
343 { ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle.
344
345 { ISD::SDIV, MVT::v8i32, 6 }, // pmuludq sequence
346 { ISD::SREM, MVT::v8i32, 8 }, // pmuludq+mul+sub sequence
347 { ISD::UDIV, MVT::v8i32, 5 }, // pmuludq sequence
348 { ISD::UREM, MVT::v8i32, 7 }, // pmuludq+mul+sub sequence
349 };
350
351 if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
352 ST->hasAVX2()) {
353 if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD,
354 LT.second))
355 return LT.first * Entry->Cost;
356 }
357
358 static const CostTblEntry SSE2UniformConstCostTable[] = {
359 { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand.
360 { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand.
361 { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb.
362
363 { ISD::SHL, MVT::v32i8, 4+2 }, // 2*(psllw + pand) + split.
364 { ISD::SRL, MVT::v32i8, 4+2 }, // 2*(psrlw + pand) + split.
365 { ISD::SRA, MVT::v32i8, 8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.
366
367 { ISD::SDIV, MVT::v8i32, 12+2 }, // 2*pmuludq sequence + split.
368 { ISD::SREM, MVT::v8i32, 16+2 }, // 2*pmuludq+mul+sub sequence + split.
369 { ISD::SDIV, MVT::v4i32, 6 }, // pmuludq sequence
370 { ISD::SREM, MVT::v4i32, 8 }, // pmuludq+mul+sub sequence
371 { ISD::UDIV, MVT::v8i32, 10+2 }, // 2*pmuludq sequence + split.
372 { ISD::UREM, MVT::v8i32, 14+2 }, // 2*pmuludq+mul+sub sequence + split.
373 { ISD::UDIV, MVT::v4i32, 5 }, // pmuludq sequence
374 { ISD::UREM, MVT::v4i32, 7 }, // pmuludq+mul+sub sequence
375 };
376
377 // XOP has faster vXi8 shifts.
378 if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
379 ST->hasSSE2() && !ST->hasXOP()) {
380 if (const auto *Entry =
381 CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
382 return LT.first * Entry->Cost;
383 }
384
385 static const CostTblEntry AVX512BWConstCostTable[] = {
386 { ISD::SDIV, MVT::v64i8, 14 }, // 2*ext+2*pmulhw sequence
387 { ISD::SREM, MVT::v64i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
388 { ISD::UDIV, MVT::v64i8, 14 }, // 2*ext+2*pmulhw sequence
389 { ISD::UREM, MVT::v64i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
390 { ISD::SDIV, MVT::v32i16, 6 }, // vpmulhw sequence
391 { ISD::SREM, MVT::v32i16, 8 }, // vpmulhw+mul+sub sequence
392 { ISD::UDIV, MVT::v32i16, 6 }, // vpmulhuw sequence
393 { ISD::UREM, MVT::v32i16, 8 }, // vpmulhuw+mul+sub sequence
394 };
395
396 if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
397 Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
398 ST->hasBWI()) {
399 if (const auto *Entry =
400 CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
401 return LT.first * Entry->Cost;
402 }
403
404 static const CostTblEntry AVX512ConstCostTable[] = {
405 { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
406 { ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence
407 { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
408 { ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence
409 { ISD::SDIV, MVT::v64i8, 28 }, // 4*ext+4*pmulhw sequence
410 { ISD::SREM, MVT::v64i8, 32 }, // 4*ext+4*pmulhw+mul+sub sequence
411 { ISD::UDIV, MVT::v64i8, 28 }, // 4*ext+4*pmulhw sequence
412 { ISD::UREM, MVT::v64i8, 32 }, // 4*ext+4*pmulhw+mul+sub sequence
413 { ISD::SDIV, MVT::v32i16, 12 }, // 2*vpmulhw sequence
414 { ISD::SREM, MVT::v32i16, 16 }, // 2*vpmulhw+mul+sub sequence
415 { ISD::UDIV, MVT::v32i16, 12 }, // 2*vpmulhuw sequence
416 { ISD::UREM, MVT::v32i16, 16 }, // 2*vpmulhuw+mul+sub sequence
417 };
418
419 if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
420 Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
421 ST->hasAVX512()) {
422 if (const auto *Entry =
423 CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
424 return LT.first * Entry->Cost;
425 }
426
427 static const CostTblEntry AVX2ConstCostTable[] = {
428 { ISD::SDIV, MVT::v32i8, 14 }, // 2*ext+2*pmulhw sequence
429 { ISD::SREM, MVT::v32i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
430 { ISD::UDIV, MVT::v32i8, 14 }, // 2*ext+2*pmulhw sequence
431 { ISD::UREM, MVT::v32i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
432 { ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence
433 { ISD::SREM, MVT::v16i16, 8 }, // vpmulhw+mul+sub sequence
434 { ISD::UDIV, MVT::v16i16, 6 }, // vpmulhuw sequence
435 { ISD::UREM, MVT::v16i16, 8 }, // vpmulhuw+mul+sub sequence
436 { ISD::SDIV, MVT::v8i32, 15 }, // vpmuldq sequence
437 { ISD::SREM, MVT::v8i32, 19 }, // vpmuldq+mul+sub sequence
438 { ISD::UDIV, MVT::v8i32, 15 }, // vpmuludq sequence
439 { ISD::UREM, MVT::v8i32, 19 }, // vpmuludq+mul+sub sequence
440 };
441
442 if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
443 Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
444 ST->hasAVX2()) {
445 if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
446 return LT.first * Entry->Cost;
447 }
448
449 static const CostTblEntry SSE2ConstCostTable[] = {
450 { ISD::SDIV, MVT::v32i8, 28+2 }, // 4*ext+4*pmulhw sequence + split.
451 { ISD::SREM, MVT::v32i8, 32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
452 { ISD::SDIV, MVT::v16i8, 14 }, // 2*ext+2*pmulhw sequence
453 { ISD::SREM, MVT::v16i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
454 { ISD::UDIV, MVT::v32i8, 28+2 }, // 4*ext+4*pmulhw sequence + split.
455 { ISD::UREM, MVT::v32i8, 32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
456 { ISD::UDIV, MVT::v16i8, 14 }, // 2*ext+2*pmulhw sequence
457 { ISD::UREM, MVT::v16i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
458 { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
459 { ISD::SREM, MVT::v16i16, 16+2 }, // 2*pmulhw+mul+sub sequence + split.
460 { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence
461 { ISD::SREM, MVT::v8i16, 8 }, // pmulhw+mul+sub sequence
462 { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split.
463 { ISD::UREM, MVT::v16i16, 16+2 }, // 2*pmulhuw+mul+sub sequence + split.
464 { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence
465 { ISD::UREM, MVT::v8i16, 8 }, // pmulhuw+mul+sub sequence
466 { ISD::SDIV, MVT::v8i32, 38+2 }, // 2*pmuludq sequence + split.
467 { ISD::SREM, MVT::v8i32, 48+2 }, // 2*pmuludq+mul+sub sequence + split.
468 { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence
469 { ISD::SREM, MVT::v4i32, 24 }, // pmuludq+mul+sub sequence
470 { ISD::UDIV, MVT::v8i32, 30+2 }, // 2*pmuludq sequence + split.
471 { ISD::UREM, MVT::v8i32, 40+2 }, // 2*pmuludq+mul+sub sequence + split.
472 { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence
473 { ISD::UREM, MVT::v4i32, 20 }, // pmuludq+mul+sub sequence
474 };
475
476 if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
477 Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
478 ST->hasSSE2()) {
479 // pmuldq sequence.
480 if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
481 return LT.first * 32;
482 if (ISD == ISD::SREM && LT.second == MVT::v8i32 && ST->hasAVX())
483 return LT.first * 38;
484 if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
485 return LT.first * 15;
486 if (ISD == ISD::SREM && LT.second == MVT::v4i32 && ST->hasSSE41())
487 return LT.first * 20;
488
489 if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
490 return LT.first * Entry->Cost;
491 }
492
493 static const CostTblEntry AVX512BWShiftCostTable[] = {
494 { ISD::SHL, MVT::v8i16, 1 }, // vpsllvw
495 { ISD::SRL, MVT::v8i16, 1 }, // vpsrlvw
496 { ISD::SRA, MVT::v8i16, 1 }, // vpsravw
497
498 { ISD::SHL, MVT::v16i16, 1 }, // vpsllvw
499 { ISD::SRL, MVT::v16i16, 1 }, // vpsrlvw
500 { ISD::SRA, MVT::v16i16, 1 }, // vpsravw
501
502 { ISD::SHL, MVT::v32i16, 1 }, // vpsllvw
503 { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw
504 { ISD::SRA, MVT::v32i16, 1 }, // vpsravw
505 };
506
507 if (ST->hasBWI())
508 if (const auto *Entry = CostTableLookup(AVX512BWShiftCostTable, ISD, LT.second))
509 return LT.first * Entry->Cost;
510
511 static const CostTblEntry AVX2UniformCostTable[] = {
512 // Uniform splats are cheaper for the following instructions.
513 { ISD::SHL, MVT::v16i16, 1 }, // psllw.
514 { ISD::SRL, MVT::v16i16, 1 }, // psrlw.
515 { ISD::SRA, MVT::v16i16, 1 }, // psraw.
516 { ISD::SHL, MVT::v32i16, 2 }, // 2*psllw.
517 { ISD::SRL, MVT::v32i16, 2 }, // 2*psrlw.
518 { ISD::SRA, MVT::v32i16, 2 }, // 2*psraw.
519 };
520
521 if (ST->hasAVX2() &&
522 ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
523 (Op2Info == TargetTransformInfo::OK_UniformValue))) {
524 if (const auto *Entry =
525 CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
526 return LT.first * Entry->Cost;
527 }
528
529 static const CostTblEntry SSE2UniformCostTable[] = {
530 // Uniform splats are cheaper for the following instructions.
531 { ISD::SHL, MVT::v8i16, 1 }, // psllw.
532 { ISD::SHL, MVT::v4i32, 1 }, // pslld
533 { ISD::SHL, MVT::v2i64, 1 }, // psllq.
534
535 { ISD::SRL, MVT::v8i16, 1 }, // psrlw.
536 { ISD::SRL, MVT::v4i32, 1 }, // psrld.
537 { ISD::SRL, MVT::v2i64, 1 }, // psrlq.
538
539 { ISD::SRA, MVT::v8i16, 1 }, // psraw.
540 { ISD::SRA, MVT::v4i32, 1 }, // psrad.
541 };
542
543 if (ST->hasSSE2() &&
544 ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
545 (Op2Info == TargetTransformInfo::OK_UniformValue))) {
546 if (const auto *Entry =
547 CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
548 return LT.first * Entry->Cost;
549 }
550
551 static const CostTblEntry AVX512DQCostTable[] = {
552 { ISD::MUL, MVT::v2i64, 1 },
553 { ISD::MUL, MVT::v4i64, 1 },
554 { ISD::MUL, MVT::v8i64, 1 }
555 };
556
557 // Look for AVX512DQ lowering tricks for custom cases.
558 if (ST->hasDQI())
559 if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
560 return LT.first * Entry->Cost;
561
562 static const CostTblEntry AVX512BWCostTable[] = {
563 { ISD::SHL, MVT::v64i8, 11 }, // vpblendvb sequence.
564 { ISD::SRL, MVT::v64i8, 11 }, // vpblendvb sequence.
565 { ISD::SRA, MVT::v64i8, 24 }, // vpblendvb sequence.
566
567 { ISD::MUL, MVT::v64i8, 11 }, // extend/pmullw/trunc sequence.
568 { ISD::MUL, MVT::v32i8, 4 }, // extend/pmullw/trunc sequence.
569 { ISD::MUL, MVT::v16i8, 4 }, // extend/pmullw/trunc sequence.
570 };
571
572 // Look for AVX512BW lowering tricks for custom cases.
573 if (ST->hasBWI())
574 if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
575 return LT.first * Entry->Cost;
576
577 static const CostTblEntry AVX512CostTable[] = {
578 { ISD::SHL, MVT::v16i32, 1 },
579 { ISD::SRL, MVT::v16i32, 1 },
580 { ISD::SRA, MVT::v16i32, 1 },
581
582 { ISD::SHL, MVT::v8i64, 1 },
583 { ISD::SRL, MVT::v8i64, 1 },
584
585 { ISD::SRA, MVT::v2i64, 1 },
586 { ISD::SRA, MVT::v4i64, 1 },
587 { ISD::SRA, MVT::v8i64, 1 },
588
589 { ISD::MUL, MVT::v64i8, 26 }, // extend/pmullw/trunc sequence.
590 { ISD::MUL, MVT::v32i8, 13 }, // extend/pmullw/trunc sequence.
591 { ISD::MUL, MVT::v16i8, 5 }, // extend/pmullw/trunc sequence.
592 { ISD::MUL, MVT::v16i32, 1 }, // pmulld (Skylake from agner.org)
593 { ISD::MUL, MVT::v8i32, 1 }, // pmulld (Skylake from agner.org)
594 { ISD::MUL, MVT::v4i32, 1 }, // pmulld (Skylake from agner.org)
595 { ISD::MUL, MVT::v8i64, 8 }, // 3*pmuludq/3*shift/2*add
596
597 { ISD::FADD, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
598 { ISD::FSUB, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
599 { ISD::FMUL, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
600
601 { ISD::FADD, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
602 { ISD::FSUB, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
603 { ISD::FMUL, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
604 };
605
606 if (ST->hasAVX512())
607 if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
608 return LT.first * Entry->Cost;
609
610 static const CostTblEntry AVX2ShiftCostTable[] = {
611 // Shifts on v4i64/v8i32 on AVX2 are legal even though we declare them
612 // custom so we can detect the cases where the shift amount is a scalar.
613 { ISD::SHL, MVT::v4i32, 1 },
614 { ISD::SRL, MVT::v4i32, 1 },
615 { ISD::SRA, MVT::v4i32, 1 },
616 { ISD::SHL, MVT::v8i32, 1 },
617 { ISD::SRL, MVT::v8i32, 1 },
618 { ISD::SRA, MVT::v8i32, 1 },
619 { ISD::SHL, MVT::v2i64, 1 },
620 { ISD::SRL, MVT::v2i64, 1 },
621 { ISD::SHL, MVT::v4i64, 1 },
622 { ISD::SRL, MVT::v4i64, 1 },
623 };
624
625 if (ST->hasAVX512()) {
626 if (ISD == ISD::SHL && LT.second == MVT::v32i16 &&
627 (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
628 Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
629 // On AVX512, a packed v32i16 shift left by a constant build_vector
630 // is lowered into a vector multiply (vpmullw).
631 return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
632 Op1Info, Op2Info,
633 TargetTransformInfo::OP_None,
634 TargetTransformInfo::OP_None);
635 }
636
637 // Look for AVX2 lowering tricks.
638 if (ST->hasAVX2()) {
639 if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
640 (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
641 Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
642 // On AVX2, a packed v16i16 shift left by a constant build_vector
643 // is lowered into a vector multiply (vpmullw).
644 return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
645 Op1Info, Op2Info,
646 TargetTransformInfo::OP_None,
647 TargetTransformInfo::OP_None);
648
649 if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
650 return LT.first * Entry->Cost;
651 }
652
653 static const CostTblEntry XOPShiftCostTable[] = {
654 // 128bit shifts take 1cy, but right shifts require negation beforehand.
655 { ISD::SHL, MVT::v16i8, 1 },
656 { ISD::SRL, MVT::v16i8, 2 },
657 { ISD::SRA, MVT::v16i8, 2 },
658 { ISD::SHL, MVT::v8i16, 1 },
659 { ISD::SRL, MVT::v8i16, 2 },
660 { ISD::SRA, MVT::v8i16, 2 },
661 { ISD::SHL, MVT::v4i32, 1 },
662 { ISD::SRL, MVT::v4i32, 2 },
663 { ISD::SRA, MVT::v4i32, 2 },
664 { ISD::SHL, MVT::v2i64, 1 },
665 { ISD::SRL, MVT::v2i64, 2 },
666 { ISD::SRA, MVT::v2i64, 2 },
667 // 256bit shifts require splitting if AVX2 didn't catch them above.
668 { ISD::SHL, MVT::v32i8, 2+2 },
669 { ISD::SRL, MVT::v32i8, 4+2 },
670 { ISD::SRA, MVT::v32i8, 4+2 },
671 { ISD::SHL, MVT::v16i16, 2+2 },
672 { ISD::SRL, MVT::v16i16, 4+2 },
673 { ISD::SRA, MVT::v16i16, 4+2 },
674 { ISD::SHL, MVT::v8i32, 2+2 },
675 { ISD::SRL, MVT::v8i32, 4+2 },
676 { ISD::SRA, MVT::v8i32, 4+2 },
677 { ISD::SHL, MVT::v4i64, 2+2 },
678 { ISD::SRL, MVT::v4i64, 4+2 },
679 { ISD::SRA, MVT::v4i64, 4+2 },
680 };
681
682 // Look for XOP lowering tricks.
683 if (ST->hasXOP()) {
684 // If the right shift is constant then we'll fold the negation so
685 // it's as cheap as a left shift.
686 int ShiftISD = ISD;
687 if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) &&
688 (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
689 Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
690 ShiftISD = ISD::SHL;
691 if (const auto *Entry =
692 CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
693 return LT.first * Entry->Cost;
694 }
695
696 static const CostTblEntry SSE2UniformShiftCostTable[] = {
697 // Uniform splats are cheaper for the following instructions.
698 { ISD::SHL, MVT::v16i16, 2+2 }, // 2*psllw + split.
699 { ISD::SHL, MVT::v8i32, 2+2 }, // 2*pslld + split.
700 { ISD::SHL, MVT::v4i64, 2+2 }, // 2*psllq + split.
701
702 { ISD::SRL, MVT::v16i16, 2+2 }, // 2*psrlw + split.
703 { ISD::SRL, MVT::v8i32, 2+2 }, // 2*psrld + split.
704 { ISD::SRL, MVT::v4i64, 2+2 }, // 2*psrlq + split.
705
706 { ISD::SRA, MVT::v16i16, 2+2 }, // 2*psraw + split.
707 { ISD::SRA, MVT::v8i32, 2+2 }, // 2*psrad + split.
708 { ISD::SRA, MVT::v2i64, 4 }, // 2*psrad + shuffle.
709 { ISD::SRA, MVT::v4i64, 8+2 }, // 2*(2*psrad + shuffle) + split.
710 };
711
712 if (ST->hasSSE2() &&
713 ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
714 (Op2Info == TargetTransformInfo::OK_UniformValue))) {
715
716 // Handle AVX2 uniform v4i64 ISD::SRA; it's not worth a table.
717 if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2())
718 return LT.first * 4; // 2*psrad + shuffle.
719
720 if (const auto *Entry =
721 CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second))
722 return LT.first * Entry->Cost;
723 }
724
725 if (ISD == ISD::SHL &&
726 Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
727 MVT VT = LT.second;
728 // A vector shift left by a non-uniform constant can be lowered
729 // into a vector multiply.
730 if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
731 ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
732 ISD = ISD::MUL;
733 }
734
735 static const CostTblEntry AVX2CostTable[] = {
736 { ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence.
737 { ISD::SHL, MVT::v64i8, 22 }, // 2*vpblendvb sequence.
738 { ISD::SHL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
739 { ISD::SHL, MVT::v32i16, 20 }, // 2*extend/vpsrlvd/pack sequence.
740
741 { ISD::SRL, MVT::v32i8, 11 }, // vpblendvb sequence.
742 { ISD::SRL, MVT::v64i8, 22 }, // 2*vpblendvb sequence.
743 { ISD::SRL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
744 { ISD::SRL, MVT::v32i16, 20 }, // 2*extend/vpsrlvd/pack sequence.
745
746 { ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence.
747 { ISD::SRA, MVT::v64i8, 48 }, // 2*vpblendvb sequence.
748 { ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence.
749 { ISD::SRA, MVT::v32i16, 20 }, // 2*extend/vpsravd/pack sequence.
750 { ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence.
751 { ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence.
752
753 { ISD::SUB, MVT::v32i8, 1 }, // psubb
754 { ISD::ADD, MVT::v32i8, 1 }, // paddb
755 { ISD::SUB, MVT::v16i16, 1 }, // psubw
756 { ISD::ADD, MVT::v16i16, 1 }, // paddw
757 { ISD::SUB, MVT::v8i32, 1 }, // psubd
758 { ISD::ADD, MVT::v8i32, 1 }, // paddd
759 { ISD::SUB, MVT::v4i64, 1 }, // psubq
760 { ISD::ADD, MVT::v4i64, 1 }, // paddq
761
762 { ISD::MUL, MVT::v32i8, 17 }, // extend/pmullw/trunc sequence.
763 { ISD::MUL, MVT::v16i8, 7 }, // extend/pmullw/trunc sequence.
764 { ISD::MUL, MVT::v16i16, 1 }, // pmullw
765 { ISD::MUL, MVT::v8i32, 2 }, // pmulld (Haswell from agner.org)
766 { ISD::MUL, MVT::v4i64, 8 }, // 3*pmuludq/3*shift/2*add
767
768 { ISD::FADD, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
769 { ISD::FADD, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
770 { ISD::FSUB, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
771 { ISD::FSUB, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
772 { ISD::FMUL, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
773 { ISD::FMUL, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
774
775 { ISD::FDIV, MVT::f32, 7 }, // Haswell from http://www.agner.org/
776 { ISD::FDIV, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
777 { ISD::FDIV, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
778 { ISD::FDIV, MVT::f64, 14 }, // Haswell from http://www.agner.org/
779 { ISD::FDIV, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
780 { ISD::FDIV, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
781 };
782
783 // Look for AVX2 lowering tricks for custom cases.
784 if (ST->hasAVX2())
785 if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
786 return LT.first * Entry->Cost;
787
788 static const CostTblEntry AVX1CostTable[] = {
789 // We don't have to scalarize unsupported ops. We can issue two half-sized
790 // operations and we only need to extract the upper YMM half.
791 // Two ops + 1 extract + 1 insert = 4.
792 { ISD::MUL, MVT::v16i16, 4 },
793 { ISD::MUL, MVT::v8i32, 4 },
794 { ISD::SUB, MVT::v32i8, 4 },
795 { ISD::ADD, MVT::v32i8, 4 },
796 { ISD::SUB, MVT::v16i16, 4 },
797 { ISD::ADD, MVT::v16i16, 4 },
798 { ISD::SUB, MVT::v8i32, 4 },
799 { ISD::ADD, MVT::v8i32, 4 },
800 { ISD::SUB, MVT::v4i64, 4 },
801 { ISD::ADD, MVT::v4i64, 4 },
802
803 // A v4i64 multiply is custom lowered as two split v2i64 vectors that then
804 // are lowered as a series of long multiplies(3), shifts(3) and adds(2)
805 // Because we believe v4i64 to be a legal type, we must also include the
806 // extract+insert in the cost table. Therefore, the cost here is 18
807 // instead of 8.
808 { ISD::MUL, MVT::v4i64, 18 },
809
810 { ISD::MUL, MVT::v32i8, 26 }, // extend/pmullw/trunc sequence.
811
812 { ISD::FDIV, MVT::f32, 14 }, // SNB from http://www.agner.org/
813 { ISD::FDIV, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
814 { ISD::FDIV, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
815 { ISD::FDIV, MVT::f64, 22 }, // SNB from http://www.agner.org/
816 { ISD::FDIV, MVT::v2f64, 22 }, // SNB from http://www.agner.org/
817 { ISD::FDIV, MVT::v4f64, 44 }, // SNB from http://www.agner.org/
818 };
819
820 if (ST->hasAVX())
821 if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
822 return LT.first * Entry->Cost;
823
824 static const CostTblEntry SSE42CostTable[] = {
825 { ISD::FADD, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
826 { ISD::FADD, MVT::f32, 1 }, // Nehalem from http://www.agner.org/
827 { ISD::FADD, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
828 { ISD::FADD, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
829
830 { ISD::FSUB, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
831 { ISD::FSUB, MVT::f32 , 1 }, // Nehalem from http://www.agner.org/
832 { ISD::FSUB, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
833 { ISD::FSUB, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
834
835 { ISD::FMUL, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
836 { ISD::FMUL, MVT::f32, 1 }, // Nehalem from http://www.agner.org/
837 { ISD::FMUL, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
838 { ISD::FMUL, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
839
840 { ISD::FDIV, MVT::f32, 14 }, // Nehalem from http://www.agner.org/
841 { ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
842 { ISD::FDIV, MVT::f64, 22 }, // Nehalem from http://www.agner.org/
843 { ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/
844 };
845
846 if (ST->hasSSE42())
847 if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
848 return LT.first * Entry->Cost;
849
850 static const CostTblEntry SSE41CostTable[] = {
851 { ISD::SHL, MVT::v16i8, 11 }, // pblendvb sequence.
852 { ISD::SHL, MVT::v32i8, 2*11+2 }, // pblendvb sequence + split.
853 { ISD::SHL, MVT::v8i16, 14 }, // pblendvb sequence.
854 { ISD::SHL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
855 { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld
856 { ISD::SHL, MVT::v8i32, 2*4+2 }, // pslld/paddd/cvttps2dq/pmulld + split
857
858 { ISD::SRL, MVT::v16i8, 12 }, // pblendvb sequence.
859 { ISD::SRL, MVT::v32i8, 2*12+2 }, // pblendvb sequence + split.
860 { ISD::SRL, MVT::v8i16, 14 }, // pblendvb sequence.
861 { ISD::SRL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
862 { ISD::SRL, MVT::v4i32, 11 }, // Shift each lane + blend.
863 { ISD::SRL, MVT::v8i32, 2*11+2 }, // Shift each lane + blend + split.
864
865 { ISD::SRA, MVT::v16i8, 24 }, // pblendvb sequence.
866 { ISD::SRA, MVT::v32i8, 2*24+2 }, // pblendvb sequence + split.
867 { ISD::SRA, MVT::v8i16, 14 }, // pblendvb sequence.
868 { ISD::SRA, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
869 { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend.
870 { ISD::SRA, MVT::v8i32, 2*12+2 }, // Shift each lane + blend + split.
871
872 { ISD::MUL, MVT::v4i32, 2 } // pmulld (Nehalem from agner.org)
873 };
874
875 if (ST->hasSSE41())
876 if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
877 return LT.first * Entry->Cost;
878
879 static const CostTblEntry SSE2CostTable[] = {
880 // We don't correctly identify costs of casts because they are marked as
881 // custom.
882 { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence.
883 { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence.
884 { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul.
885 { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence.
886 { ISD::SHL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split.
887
888 { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence.
889 { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence.
890 { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend.
891 { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence.
892 { ISD::SRL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split.
893
894 { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence.
895 { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence.
896 { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend.
897 { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence.
898 { ISD::SRA, MVT::v4i64, 2*12+2 }, // srl/xor/sub sequence+split.
899
900 { ISD::MUL, MVT::v16i8, 12 }, // extend/pmullw/trunc sequence.
901 { ISD::MUL, MVT::v8i16, 1 }, // pmullw
902 { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle
903 { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add
904
905 { ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/
906 { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/
907 { ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/
908 { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/
909
910 { ISD::FADD, MVT::f32, 2 }, // Pentium IV from http://www.agner.org/
911 { ISD::FADD, MVT::f64, 2 }, // Pentium IV from http://www.agner.org/
912
913 { ISD::FSUB, MVT::f32, 2 }, // Pentium IV from http://www.agner.org/
914 { ISD::FSUB, MVT::f64, 2 }, // Pentium IV from http://www.agner.org/
915 };
916
917 if (ST->hasSSE2())
918 if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
919 return LT.first * Entry->Cost;
920
921 static const CostTblEntry SSE1CostTable[] = {
922 { ISD::FDIV, MVT::f32, 17 }, // Pentium III from http://www.agner.org/
923 { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/
924
925 { ISD::FADD, MVT::f32, 1 }, // Pentium III from http://www.agner.org/
926 { ISD::FADD, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/
927
928 { ISD::FSUB, MVT::f32, 1 }, // Pentium III from http://www.agner.org/
929 { ISD::FSUB, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/
930
931 { ISD::ADD, MVT::i8, 1 }, // Pentium III from http://www.agner.org/
932 { ISD::ADD, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
933 { ISD::ADD, MVT::i32, 1 }, // Pentium III from http://www.agner.org/
934
935 { ISD::SUB, MVT::i8, 1 }, // Pentium III from http://www.agner.org/
936 { ISD::SUB, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
937 { ISD::SUB, MVT::i32, 1 }, // Pentium III from http://www.agner.org/
938 };
939
940 if (ST->hasSSE1())
941 if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
942 return LT.first * Entry->Cost;
943
944 // It is not a good idea to vectorize division. We have to scalarize it and
945 // in the process we will often end up having to spill regular
946 // registers. The overhead of division is going to dominate most kernels
947 // anyway, so try hard to prevent vectorization of division - it is
948 // generally a bad idea. Assume somewhat arbitrarily that we have to be able
949 // to hide "20 cycles" for each lane.
950 if (LT.second.isVector() && (ISD == ISD::SDIV || ISD == ISD::SREM ||
951 ISD == ISD::UDIV || ISD == ISD::UREM)) {
952 int ScalarCost = getArithmeticInstrCost(
953 Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info,
954 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
955 return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
956 }
957
958 // Fallback to the default implementation.
959 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info);
960}
961
962int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *BaseTp,
963 int Index, VectorType *SubTp) {
964 // 64-bit packed float vectors (v2f32) are widened to type v4f32.
965 // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
966 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, BaseTp);
967
968 // Treat Transpose as 2-op shuffles - there's no difference in lowering.
969 if (Kind == TTI::SK_Transpose)
970 Kind = TTI::SK_PermuteTwoSrc;
971
972 // For Broadcasts we are splatting the first element from the first input
973 // register, so we only need to reference that input; all the output
974 // registers are the same.
975 if (Kind == TTI::SK_Broadcast)
976 LT.first = 1;
977
978 // Subvector extractions are free if they start at the beginning of a
979 // vector and cheap if the subvectors are aligned.
980 if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
981 int NumElts = LT.second.getVectorNumElements();
982 if ((Index % NumElts) == 0)
983 return 0;
984 std::pair<int, MVT> SubLT = TLI->getTypeLegalizationCost(DL, SubTp);
985 if (SubLT.second.isVector()) {
986 int NumSubElts = SubLT.second.getVectorNumElements();
987 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
988 return SubLT.first;
989 // Handle some cases for widening legalization. For now we only handle
990 // cases where the original subvector was naturally aligned and evenly
991 // fit in its legalized subvector type.
992 // FIXME: Remove some of the alignment restrictions.
993 // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
994 // vectors.
995 int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
996 if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
997 (NumSubElts % OrigSubElts) == 0 &&
998 LT.second.getVectorElementType() ==
999 SubLT.second.getVectorElementType() &&
1000 LT.second.getVectorElementType().getSizeInBits() ==
1001 BaseTp->getElementType()->getPrimitiveSizeInBits()) {
1002 assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
1003 "Unexpected number of elements!");
1004 auto *VecTy = FixedVectorType::get(BaseTp->getElementType(),
1005 LT.second.getVectorNumElements());
1006 auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
1007 SubLT.second.getVectorNumElements());
1008 int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
1009 int ExtractCost = getShuffleCost(TTI::SK_ExtractSubvector, VecTy,
1010 ExtractIndex, SubTy);
1011
1012 // If the original size is 32-bits or more, we can use pshufd. Otherwise
1013 // if we have SSSE3 we can use pshufb.
1014 if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
1015 return ExtractCost + 1; // pshufd or pshufb
1016
1017 assert(SubTp->getPrimitiveSizeInBits() == 16 &&
1018 "Unexpected vector size");
1019
1020 return ExtractCost + 2; // worst case pshufhw + pshufd
1021 }
1022 }
1023 }
1024
1025 // Handle some common (illegal) sub-vector types as they are often very cheap
1026 // to shuffle even on targets without PSHUFB.
1027 EVT VT = TLI->getValueType(DL, BaseTp);
1028 if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
1029 !ST->hasSSSE3()) {
1030 static const CostTblEntry SSE2SubVectorShuffleTbl[] = {
1031 {TTI::SK_Broadcast, MVT::v4i16, 1}, // pshuflw
1032 {TTI::SK_Broadcast, MVT::v2i16, 1}, // pshuflw
1033 {TTI::SK_Broadcast, MVT::v8i8, 2}, // punpck/pshuflw
1034 {TTI::SK_Broadcast, MVT::v4i8, 2}, // punpck/pshuflw
1035 {TTI::SK_Broadcast, MVT::v2i8, 1}, // punpck
1036
1037 {TTI::SK_Reverse, MVT::v4i16, 1}, // pshuflw
1038 {TTI::SK_Reverse, MVT::v2i16, 1}, // pshuflw
1039 {TTI::SK_Reverse, MVT::v4i8, 3}, // punpck/pshuflw/packus
1040 {TTI::SK_Reverse, MVT::v2i8, 1}, // punpck
1041
1042 {TTI::SK_PermuteTwoSrc, MVT::v4i16, 2}, // punpck/pshuflw
1043 {TTI::SK_PermuteTwoSrc, MVT::v2i16, 2}, // punpck/pshuflw
1044 {TTI::SK_PermuteTwoSrc, MVT::v8i8, 7}, // punpck/pshuflw
1045 {TTI::SK_PermuteTwoSrc, MVT::v4i8, 4}, // punpck/pshuflw
1046 {TTI::SK_PermuteTwoSrc, MVT::v2i8, 2}, // punpck
1047
1048 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw
1049 {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw
1050 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 5}, // punpck/pshuflw
1051 {TTI::SK_PermuteSingleSrc, MVT::v4i8, 3}, // punpck/pshuflw
1052 {TTI::SK_PermuteSingleSrc, MVT::v2i8, 1}, // punpck
1053 };
1054
1055 if (ST->hasSSE2())
1056 if (const auto *Entry =
1057 CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
1058 return Entry->Cost;
1059 }
1060
1061 // We are going to permute multiple sources and the result will be in
1062 // multiple destinations. We provide an accurate cost only for splits
1063 // where the element type remains the same.
1064 if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
1065 MVT LegalVT = LT.second;
1066 if (LegalVT.isVector() &&
1067 LegalVT.getVectorElementType().getSizeInBits() ==
1068 BaseTp->getElementType()->getPrimitiveSizeInBits() &&
1069 LegalVT.getVectorNumElements() <
1070 cast<FixedVectorType>(BaseTp)->getNumElements()) {
1071
1072 unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
1073 unsigned LegalVTSize = LegalVT.getStoreSize();
1074 // Number of source vectors after legalization:
1075 unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
1076 // Number of destination vectors after legalization:
1077 unsigned NumOfDests = LT.first;
1078
1079 auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
1080 LegalVT.getVectorNumElements());
1081
1082 unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
1083 return NumOfShuffles *
1084 getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr);
1085 }
1086
1087 return BaseT::getShuffleCost(Kind, BaseTp, Index, SubTp);
1088 }
1089
1090 // For 2-input shuffles, we must account for splitting the 2 inputs into many.
1091 if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
1092 // We assume that source and destination have the same vector type.
1093 int NumOfDests = LT.first;
1094 int NumOfShufflesPerDest = LT.first * 2 - 1;
1095 LT.first = NumOfDests * NumOfShufflesPerDest;
1096 }
1097
1098 static const CostTblEntry AVX512VBMIShuffleTbl[] = {
1099 {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
1100 {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb
1101
1102 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
1103 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb
1104
1105 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b
1106 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b
1107 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 2} // vpermt2b
1108 };
1109
1110 if (ST->hasVBMI())
1111 if (const auto *Entry =
1112 CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
1113 return LT.first * Entry->Cost;
1114
1115 static const CostTblEntry AVX512BWShuffleTbl[] = {
1116 {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
1117 {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
1118
1119 {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw
1120 {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw
1121 {TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2
1122
1123 {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw
1124 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw
1125 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16
1126
1127 {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w
1128 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w
1129 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w
1130 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
1131
1132 {TTI::SK_Select, MVT::v32i16, 1}, // vblendmw
1133 {TTI::SK_Select, MVT::v64i8, 1}, // vblendmb
1134 };
1135
1136 if (ST->hasBWI())
1137 if (const auto *Entry =
1138 CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
1139 return LT.first * Entry->Cost;
1140
1141 static const CostTblEntry AVX512ShuffleTbl[] = {
1142 {TTI::SK_Broadcast, MVT::v8f64, 1}, // vbroadcastpd
1143 {TTI::SK_Broadcast, MVT::v16f32, 1}, // vbroadcastps
1144 {TTI::SK_Broadcast, MVT::v8i64, 1}, // vpbroadcastq
1145 {TTI::SK_Broadcast, MVT::v16i32, 1}, // vpbroadcastd
1146 {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
1147 {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
1148
1149 {TTI::SK_Reverse, MVT::v8f64, 1}, // vpermpd
1150 {TTI::SK_Reverse, MVT::v16f32, 1}, // vpermps
1151 {TTI::SK_Reverse, MVT::v8i64, 1}, // vpermq
1152 {TTI::SK_Reverse, MVT::v16i32, 1}, // vpermd
1153
1154 {TTI::SK_PermuteSingleSrc, MVT::v8f64, 1}, // vpermpd
1155 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
1156 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // vpermpd
1157 {TTI::SK_PermuteSingleSrc, MVT::v16f32, 1}, // vpermps
1158 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
1159 {TTI::SK_PermuteSingleSrc, MVT::v4f32, 1}, // vpermps
1160 {TTI::SK_PermuteSingleSrc, MVT::v8i64, 1}, // vpermq
1161 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
1162 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // vpermq
1163 {TTI::SK_PermuteSingleSrc, MVT::v16i32, 1}, // vpermd
1164 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
1165 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // vpermd
1166 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
1167
1168 {TTI::SK_PermuteTwoSrc, MVT::v8f64, 1}, // vpermt2pd
1169 {TTI::SK_PermuteTwoSrc, MVT::v16f32, 1}, // vpermt2ps
1170 {TTI::SK_PermuteTwoSrc, MVT::v8i64, 1}, // vpermt2q
1171 {TTI::SK_PermuteTwoSrc, MVT::v16i32, 1}, // vpermt2d
1172 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 1}, // vpermt2pd
1173 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 1}, // vpermt2ps
1174 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 1}, // vpermt2q
1175 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 1}, // vpermt2d
1176 {TTI::SK_PermuteTwoSrc, MVT::v2f64, 1}, // vpermt2pd
1177 {TTI::SK_PermuteTwoSrc, MVT::v4f32, 1}, // vpermt2ps
1178 {TTI::SK_PermuteTwoSrc, MVT::v2i64, 1}, // vpermt2q
1179 {TTI::SK_PermuteTwoSrc, MVT::v4i32, 1}, // vpermt2d
1180
1181 // FIXME: This just applies the type legalization cost rules above
1182 // assuming these completely split.
1183 {TTI::SK_PermuteSingleSrc, MVT::v32i16, 14},
1184 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 14},
1185 {TTI::SK_PermuteTwoSrc, MVT::v32i16, 42},
1186 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 42},
1187
1188 {TTI::SK_Select, MVT::v32i16, 1}, // vpternlogq
1189 {TTI::SK_Select, MVT::v64i8, 1}, // vpternlogq
1190 {TTI::SK_Select, MVT::v8f64, 1}, // vblendmpd
1191 {TTI::SK_Select, MVT::v16f32, 1}, // vblendmps
1192 {TTI::SK_Select, MVT::v8i64, 1}, // vblendmq
1193 {TTI::SK_Select, MVT::v16i32, 1}, // vblendmd
1194 };
1195
1196 if (ST->hasAVX512())
1197 if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
1198 return LT.first * Entry->Cost;
1199
1200 static const CostTblEntry AVX2ShuffleTbl[] = {
1201 {TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastpd
1202 {TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastps
1203 {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq
1204 {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd
1205 {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
1206 {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb
1207
1208 {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd
1209 {TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps
1210 {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq
1211 {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd
1212 {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
1213 {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb
1214
1215 {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
1216 {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb
1217
1218 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
1219 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
1220 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
1221 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
1222 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
1223 // + vpblendvb
1224 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb
1225 // + vpblendvb
1226
1227 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd
1228 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3}, // 2*vpermps + vblendps
1229 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd
1230 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd
1231 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
1232 // + vpblendvb
1233 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb
1234 // + vpblendvb
1235 };
1236
1237 if (ST->hasAVX2())
1238 if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
1239 return LT.first * Entry->Cost;
1240
1241 static const CostTblEntry XOPShuffleTbl[] = {
1242 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd
1243 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps
1244 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd
1245 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps
1246 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
1247 // + vinsertf128
1248 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm
1249 // + vinsertf128
1250
1251 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
1252 // + vinsertf128
1253 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm
1254 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm
1255 // + vinsertf128
1256 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm
1257 };
1258
1259 if (ST->hasXOP())
1260 if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
1261 return LT.first * Entry->Cost;
1262
1263 static const CostTblEntry AVX1ShuffleTbl[] = {
1264 {TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1265 {TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1266 {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1267 {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1268 {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
1269 {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128
1270
1271 {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1272 {TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1273 {TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1274 {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1275 {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
1276 // + vinsertf128
1277 {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb
1278 // + vinsertf128
1279
1280 {TTI::SK_Select, MVT::v4i64, 1}, // vblendpd
1281 {TTI::SK_Select, MVT::v4f64, 1}, // vblendpd
1282 {TTI::SK_Select, MVT::v8i32, 1}, // vblendps
1283 {TTI::SK_Select, MVT::v8f32, 1}, // vblendps
1284 {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
1285 {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor
1286
1287 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd
1288 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd
1289 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1290 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1291 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
1292 // + 2*por + vinsertf128
1293 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb
1294 // + 2*por + vinsertf128
1295
1296 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd
1297 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd
1298 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1299 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1300 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
1301 // + 4*por + vinsertf128
1302 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb
1303 // + 4*por + vinsertf128
1304 };
1305
1306 if (ST->hasAVX())
1307 if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
1308 return LT.first * Entry->Cost;
1309
1310 static const CostTblEntry SSE41ShuffleTbl[] = {
1311 {TTI::SK_Select, MVT::v2i64, 1}, // pblendw
1312 {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1313 {TTI::SK_Select, MVT::v4i32, 1}, // pblendw
1314 {TTI::SK_Select, MVT::v4f32, 1}, // blendps
1315 {TTI::SK_Select, MVT::v8i16, 1}, // pblendw
1316 {TTI::SK_Select, MVT::v16i8, 1} // pblendvb
1317 };
1318
1319 if (ST->hasSSE41())
1320 if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
1321 return LT.first * Entry->Cost;
1322
1323 static const CostTblEntry SSSE3ShuffleTbl[] = {
1324 {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
1325 {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb
1326
1327 {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
1328 {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb
1329
1330 {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
1331 {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por
1332
1333 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
1334 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
1335
1336 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
1337 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
1338 };
1339
1340 if (ST->hasSSSE3())
1341 if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
1342 return LT.first * Entry->Cost;
1343
1344 static const CostTblEntry SSE2ShuffleTbl[] = {
1345 {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd
1346 {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
1347 {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
1348 {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
1349 {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd
1350
1351 {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
1352 {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
1353 {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
1354 {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
1355 {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
1356 // + 2*pshufd + 2*unpck + packus
1357
1358 {TTI::SK_Select, MVT::v2i64, 1}, // movsd
1359 {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1360 {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
1361 {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
1362 {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por
1363
1364 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
1365 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd
1366 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
1367 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
1368 // + pshufd/unpck
1369 { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
1370 // + 2*pshufd + 2*unpck + 2*packus
1371
1372 { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd
1373 { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd
1374 { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd}
1375 { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute
1376 { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
1377 };
1378
1379 if (ST->hasSSE2())
1380 if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
1381 return LT.first * Entry->Cost;
1382
1383 static const CostTblEntry SSE1ShuffleTbl[] = {
1384 { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
1385 { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
1386 { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps
1387 { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
1388 { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
1389 };
1390
1391 if (ST->hasSSE1())
1392 if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
1393 return LT.first * Entry->Cost;
1394
1395 return BaseT::getShuffleCost(Kind, BaseTp, Index, SubTp);
1396}
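
// Editor's illustrative sketch (not part of the original file): every block in
// getShuffleCost above repeats one lookup-and-scale pattern - legalize the
// vector type, look up a per-legalized-vector cost for (Kind, MVT) in the most
// specific feature table available, then multiply by the split count LT.first.
// The table below is invented for demonstration only; CostTblEntry and
// CostTableLookup mirror the real helpers from llvm/CodeGen/CostTable.h.
static int demoShuffleLookupAndScale(llvm::TargetTransformInfo::ShuffleKind Kind,
                                     std::pair<int, llvm::MVT> LT) {
  using namespace llvm;
  static const CostTblEntry DemoShuffleTbl[] = {
      {TargetTransformInfo::SK_Reverse, MVT::v4f32, 1}, // hypothetical entry
  };
  if (const auto *Entry = CostTableLookup(DemoShuffleTbl, Kind, LT.second))
    return LT.first * Entry->Cost; // e.g. v8f32 on SSE1 -> 2 x v4f32 -> 2 * 1
  return -1; // no match: the real code falls through to the next table
}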
1397
1398int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
1399 TTI::CastContextHint CCH,
1400 TTI::TargetCostKind CostKind,
1401 const Instruction *I) {
1402 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1403 assert(ISD && "Invalid opcode");
1404
1405 // TODO: Allow non-throughput costs that aren't binary.
1406 auto AdjustCost = [&CostKind](int Cost) {
1407 if (CostKind != TTI::TCK_RecipThroughput)
1408 return Cost == 0 ? 0 : 1;
1409 return Cost;
1410 };
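  // Editor's note (illustrative, not part of the original source): for any
  // cost kind other than recip-throughput this lambda clamps a table cost to
  // a binary value, e.g. with CostKind == TTI::TCK_CodeSize, AdjustCost(5)
  // yields 1 and AdjustCost(0) yields 0, while with TCK_RecipThroughput
  // AdjustCost(5) stays 5.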
1411
1412 // FIXME: Need a better design of the cost table to handle non-simple types
1413 // and potentially massive combinations (elem_num x src_type x dst_type).
1414
1415 static const TypeConversionCostTblEntry AVX512BWConversionTbl[] {
1416 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
1417 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
1418
1419 // Mask sign extend has an instruction.
1420 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 },
1421 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 },
1422 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 },
1423 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 },
1424 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 },
1425 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 },
1426 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 },
1427 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
1428 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 },
1429 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, 1 },
1430 { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, 1 },
1431
1432 // Mask zero extend is a sext + shift.
1433 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 },
1434 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 },
1435 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 },
1436 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 },
1437 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 },
1438 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 },
1439 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 },
1440 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
1441 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 },
1442 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, 2 },
1443 { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, 2 },
1444
1445 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 2 },
1446 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // widen to zmm
1447 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, // widen to zmm
1448 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // widen to zmm
1449 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // widen to zmm
1450 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, // widen to zmm
1451 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, // widen to zmm
1452 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, // widen to zmm
1453 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, // widen to zmm
1454 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, // widen to zmm
1455 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, // widen to zmm
1456 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, 2 },
1457 { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, 2 },
1458 };
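  // Editor's note (illustrative, not from the original source): the "sext +
  // shift" zero-extend costs above rely on the identity
  //   zext(b) == lshr(sext(b), N - 1)
  // for a boolean b widened to an N-bit lane: sext of a set mask bit is
  // all-ones, and a logical right shift by N - 1 keeps only the low bit.
  // That is why each mask zero-extend entry costs exactly one more than the
  // matching mask sign-extend entry.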
1459
1460 static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
1461 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
1462 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },
1463
1464 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
1465 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },
1466
1467 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, 1 },
1468 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, 1 },
1469
1470 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 },
1471 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 },
1472 };
1473
1474 // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
1475 // 256-bit wide vectors.
1476
1477 static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
1478 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 },
1479 { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 },
1480 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 },
1481
1482 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
1483 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
1484 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
1485 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 3 }, // sext+vpslld+vptestmd
1486 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
1487 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
1488 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
1489 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 3 }, // sext+vpslld+vptestmd
1490 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // zmm vpslld+vptestmd
1491 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // zmm vpslld+vptestmd
1492 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // zmm vpslld+vptestmd
1493 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 }, // vpslld+vptestmd
1494 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // zmm vpsllq+vptestmq
1495 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // zmm vpsllq+vptestmq
1496 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, // vpsllq+vptestmq
1497 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 2 },
1498 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2 },
1499 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 2 },
1500 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 2 },
1501 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 },
1502 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // zmm vpmovqd
1503 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 5 },// 2*vpmovqd+concat+vpmovdb
1504
1505 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, // extend to v16i32
1506 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 8 },
1507
1508 // Sign extend is zmm vpternlogd+vptruncdb.
1509 // Zero extend is zmm broadcast load+vptruncdw.
1510 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 3 },
1511 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 4 },
1512 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 3 },
1513 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 4 },
1514 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 3 },
1515 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 4 },
1516 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 3 },
1517 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 4 },
1518
1519 // Sign extend is zmm vpternlogd+vptruncdw.
1520 // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
1521 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 3 },
1522 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 4 },
1523 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 3 },
1524 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 4 },
1525 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 3 },
1526 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 4 },
1527 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 3 },
1528 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
1529
1530 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // zmm vpternlogd
1531 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // zmm vpternlogd+psrld
1532 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // zmm vpternlogd
1533 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // zmm vpternlogd+psrld
1534 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // zmm vpternlogd
1535 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // zmm vpternlogd+psrld
1536 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // zmm vpternlogq
1537 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // zmm vpternlogq+psrlq
1538 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // zmm vpternlogq
1539 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // zmm vpternlogq+psrlq
1540
1541 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, // vpternlogd
1542 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, // vpternlogd+psrld
1543 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 }, // vpternlogq
1544 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 }, // vpternlogq+psrlq
1545
1546 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
1547 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
1548 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
1549 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
1550 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
1551 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
1552 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
1553 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
1554 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
1555 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
1556
1557 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
1558 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
1559
1560 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
1561 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
1562 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 },
1563 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 },
1564 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
1565 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 },
1566 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
1567 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
1568
1569 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
1570 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
1571 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 },
1572 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 },
1573 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
1574 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 },
1575 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
1576 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
1577 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 },
1578 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 5 },
1579
1580 { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f64, 3 },
1581 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, 3 },
1582 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 3 },
1583 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f32, 3 },
1584
1585 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 },
1586 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, 3 },
1587 { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, 3 },
1588 { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 },
1589 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 3 },
1590 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, 3 },
1591 };
1592
1593 static const TypeConversionCostTblEntry AVX512BWVLConversionTbl[] {
1594 // Mask sign extend has an instruction.
1595 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 },
1596 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 },
1597 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 },
1598 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 },
1599 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 },
1600 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 },
1601 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 },
1602 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
1603 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 },
1604
1605 // Mask zero extend is a sext + shift.
1606 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 },
1607 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 },
1608 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 },
1609 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 },
1610 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 },
1611 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 },
1612 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 },
1613 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
1614 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 },
1615
1616 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 },
1617 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, // vpsllw+vptestmb
1618 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // vpsllw+vptestmw
1619 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // vpsllw+vptestmb
1620 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, // vpsllw+vptestmw
1621 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, // vpsllw+vptestmb
1622 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, // vpsllw+vptestmw
1623 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, // vpsllw+vptestmb
1624 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, // vpsllw+vptestmw
1625 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, // vpsllw+vptestmb
1626 };
1627
1628 static const TypeConversionCostTblEntry AVX512DQVLConversionTbl[] = {
1629 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
1630 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
1631 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
1632 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
1633
1634 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
1635 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
1636 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
1637 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
1638
1639 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 1 },
1640 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, 1 },
1641 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
1642 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, 1 },
1643
1644 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 1 },
1645 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 },
1646 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
1647 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 },
1648 };
1649
1650 static const TypeConversionCostTblEntry AVX512VLConversionTbl[] = {
1651 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
1652 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
1653 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
1654 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 8 }, // split+2*v8i8
1655 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
1656 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
1657 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
1658 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 8 }, // split+2*v8i16
1659 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // vpslld+vptestmd
1660 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // vpslld+vptestmd
1661 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // vpslld+vptestmd
1662 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // vpsllq+vptestmq
1663 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // vpsllq+vptestmq
1664 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // vpmovqd
1665
1666 // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
1667 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
1668 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 5 },
1669 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 6 },
1670 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 5 },
1671 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 6 },
1672 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 5 },
1673 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 6 },
1674 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 10 },
1675 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 12 },
1676
1677 // sign extend is vpcmpeq+maskedmove+vpmovdw
1678 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
1679 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 4 },
1680 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 5 },
1681 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 4 },
1682 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 5 },
1683 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 4 },
1684 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 5 },
1685 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 10 },
1686 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 12 },
1687
1688 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // vpternlogd
1689 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // vpternlogd+psrld
1690 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // vpternlogd
1691 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // vpternlogd+psrld
1692 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // vpternlogd
1693 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // vpternlogd+psrld
1694 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // vpternlogq
1695 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // vpternlogq+psrlq
1696 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // vpternlogq
1697 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // vpternlogq+psrlq
1698
1699 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 2 },
1700 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 },
1701 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 2 },
1702 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 5 },
1703 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 },
1704 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
1705 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 },
1706 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
1707 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
1708 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
1709 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
1710 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 5 },
1711 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
1712 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 5 },
1713
1714 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 1 },
1715 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 1 },
1716
1717 { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 3 },
1718 { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f32, 3 },
1719
1720 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 1 },
1721 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 1 },
1722
1723 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
1724 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
1725 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 1 },
1726 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 1 },
1727 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 },
1728 };
1729
1730 static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
1731 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
1732 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
1733 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
1734 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
1735 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 1 },
1736 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 1 },
1737 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 1 },
1738 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 1 },
1739 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
1740 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
1741 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
1742 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
1743 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 1 },
1744 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 1 },
1745 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
1746 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
1747 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
1748 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
1749 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
1750 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
1751
1752 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 },
1753 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },
1754
1755 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 },
1756 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 },
1757 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 },
1758 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2 },
1759
1760 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 3 },
1761 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 3 },
1762
1763 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 },
1764 };
1765
1766 static const TypeConversionCostTblEntry AVXConversionTbl[] = {
1767 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 6 },
1768 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
1769 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 },
1770 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
1771 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 4 },
1772 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 },
1773 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 4 },
1774 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 4 },
1775 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
1776 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
1777 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
1778 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
1779 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 4 },
1780 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
1781 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
1782 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
1783 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 4 },
1784 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 4 },
1785
1786 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 4 },
1787 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 5 },
1788 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 4 },
1789 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 9 },
1790 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, 11 },
1791
1792 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 4 },
1793 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 },
1794 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
1795 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 4 },
1796 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 4 },
1797 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 },
1798 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 11 },
1799 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 9 },
1800 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 3 },
1801 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 11 },
1802
1803 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
1804 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 },
1805 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 },
1806 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
1807 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i8, 3 },
1808 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 8 },
1809 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 3 },
1810 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i16, 3 },
1811 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 },
1812 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
1813 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
1814 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
1815
1816 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 7 },
1817 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, 7 },
1818 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, 6 },
1819 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 2 },
1820 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 },
1821 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 5 },
1822 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
1823 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 },
1824 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 },
1825 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 6 },
1826 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 6 },
1827 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 },
1828 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 9 },
1829 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
1830 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 6 },
1831 // The generic code to compute the scalar overhead is currently broken.
1832 // Work around this limitation by estimating the scalarization overhead
1833 // here. We have roughly 10 instructions per scalar element.
1834 // Multiply that by the vector width.
1835 // FIXME: remove that when PR19268 is fixed.
1836 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 13 },
1837 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 13 },
1838
1839 { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 4 },
1840 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f64, 3 },
1841 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f64, 2 },
1842 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 3 },
1843
1844 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f64, 3 },
1845 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f64, 2 },
1846 { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f32, 4 },
1847 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 3 },
1848 // This node is expanded into scalarized operations but BasicTTI is overly
1849 // optimistic in estimating its cost. It computes 3 per element (one
1850 // vector-extract, one scalar conversion and one vector-insert). The
1851 // problem is that the inserts form a read-modify-write chain so latency
1852 // should be factored in too. Inflating the cost per element by 1.
1853 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 8*4 },
1854 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4*4 },
1855
1856 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 1 },
1857 { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 1 },
1858 };
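  // Editor's note (illustrative, not from the original source): the FP_TO_UINT
  // comment above translates directly into the 8*4 and 4*4 entries - the base
  // estimate of 3 instructions per element (extract + convert + insert) is
  // inflated by 1 for the insert chain, so v8i32 <- v8f32 is charged
  // (3 + 1) * 8 = 32 and v4i32 <- v4f64 is charged (3 + 1) * 4 = 16.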
1859
1860 static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
1861 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 2 },
1862 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 2 },
1863 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 2 },
1864 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 2 },
1865 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
1866 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
1867
1868 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 },
1869 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 2 },
1870 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 1 },
1871 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 1 },
1872 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
1873 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
1874 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 2 },
1875 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 2 },
1876 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
1877 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
1878 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 4 },
1879 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 4 },
1880 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
1881 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
1882 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
1883 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
1884 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 4 },
1885 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 4 },
1886
1887 // These truncates end up widening elements.
1888 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 1 }, // PMOVZXBQ
1889 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 1 }, // PMOVZXWQ
1890 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 1 }, // PMOVZXBD
1891
1892 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 1 },
1893 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 1 },
1894 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1 },
1895 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1 },
1896 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
1897 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
1898 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 3 },
1899 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 },
1900 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1 }, // PSHUFB
1901
1902 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 4 },
1903 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 4 },
1904
1905 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 3 },
1906 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 3 },
1907
1908 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 3 },
1909 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 3 },
1910 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
1911 };
1912
1913 static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
1914 // These are somewhat magic numbers justified by looking at the output of
1915 // Intel's IACA, running some kernels and making sure when we take
1916 // legalization into account the throughput will be overestimated.
1917 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
1918 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
1919 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
1920 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
1921 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
1922 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 2*10 },
1923 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2*10 },
1924 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
1925 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
1926
1927 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
1928 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
1929 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
1930 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
1931 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
1932 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 8 },
1933 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 6 },
1934 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
1935
1936 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 4 },
1937 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 2 },
1938 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 3 },
1939 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
1940 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
1941 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 4 },
1942
1943 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 1 },
1944
1945 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 6 },
1946 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 6 },
1947
1948 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 },
1949 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 4 },
1950 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 4 },
1951 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 4 },
1952 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 3 },
1953 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 2 },
1954 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
1955 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 4 },
1956
1957 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 },
1958 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 6 },
1959 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
1960 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 3 },
1961 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 },
1962 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 8 },
1963 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
1964 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 2 },
1965 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 6 },
1966 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 6 },
1967 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
1968 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
1969 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 9 },
1970 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 12 },
1971 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
1972 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 2 },
1973 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
1974 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 10 },
1975 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
1976 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
1977 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 6 },
1978 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 8 },
1979 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 3 },
1980 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 5 },
1981
1982 // These truncates are really widening elements.
1983 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 1 }, // PSHUFD
1984 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // PUNPCKLWD+DQ
1985 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // PUNPCKLBW+WD+PSHUFD
1986 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 1 }, // PUNPCKLWD
1987 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // PUNPCKLBW+WD
1988 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 1 }, // PUNPCKLBW
1989
1990 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // PAND+PACKUSWB
1991 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // PAND+PACKUSWB
1992 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // PAND+PACKUSWB
1993 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 },
1994 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 3 }, // PAND+2*PACKUSWB
1995 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 1 },
1996 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 3 },
1997 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 3 },
1998 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 },
1999 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 },
2000 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
2001 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 10 },
2002 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 4 }, // PAND+3*PACKUSWB
2003 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 2 }, // PSHUFD+PSHUFLW
2004 { ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1 }, // PSHUFD
2005 };
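  // Editor's note (illustrative, not from the original source): several SSE2
  // entries above are written as products (e.g. 16*10, 8*10, 2*10) so that the
  // per-element scalarization estimate of roughly 10 instructions stays
  // visible next to the element count it is scaled by.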
2006
2007 std::pair<int, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
2008 std::pair<int, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst);
2009
2010 if (ST->hasSSE2() && !ST->hasAVX()) {
2011 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
2012 LTDest.second, LTSrc.second))
2013 return AdjustCost(LTSrc.first * Entry->Cost);
2014 }
2015
2016 EVT SrcTy = TLI->getValueType(DL, Src);
2017 EVT DstTy = TLI->getValueType(DL, Dst);
2018
2019 // The function getSimpleVT only handles simple value types.
2020 if (!SrcTy.isSimple() || !DstTy.isSimple())
2021 return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind));
2022
2023 MVT SimpleSrcTy = SrcTy.getSimpleVT();
2024 MVT SimpleDstTy = DstTy.getSimpleVT();
2025
2026 if (ST->useAVX512Regs()) {
2027 if (ST->hasBWI())
2028 if (const auto *Entry = ConvertCostTableLookup(AVX512BWConversionTbl, ISD,
2029 SimpleDstTy, SimpleSrcTy))
2030 return AdjustCost(Entry->Cost);
2031
2032 if (ST->hasDQI())
2033 if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD,
2034 SimpleDstTy, SimpleSrcTy))
2035 return AdjustCost(Entry->Cost);
2036
2037 if (ST->hasAVX512())
2038 if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD,
2039 SimpleDstTy, SimpleSrcTy))
2040 return AdjustCost(Entry->Cost);
2041 }
2042
2043 if (ST->hasBWI())
2044 if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
2045 SimpleDstTy, SimpleSrcTy))
2046 return AdjustCost(Entry->Cost);
2047
2048 if (ST->hasDQI())
2049 if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
2050 SimpleDstTy, SimpleSrcTy))
2051 return AdjustCost(Entry->Cost);
2052
2053 if (ST->hasAVX512())
2054 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
2055 SimpleDstTy, SimpleSrcTy))
2056 return AdjustCost(Entry->Cost);
2057
2058 if (ST->hasAVX2()) {
2059 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
2060 SimpleDstTy, SimpleSrcTy))
2061 return AdjustCost(Entry->Cost);
2062 }
2063
2064 if (ST->hasAVX()) {
2065 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
2066 SimpleDstTy, SimpleSrcTy))
2067 return AdjustCost(Entry->Cost);
2068 }
2069
2070 if (ST->hasSSE41()) {
2071 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
2072 SimpleDstTy, SimpleSrcTy))
2073 return AdjustCost(Entry->Cost);
2074 }
2075
2076 if (ST->hasSSE2()) {
2077 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
2078 SimpleDstTy, SimpleSrcTy))
2079 return AdjustCost(Entry->Cost);
2080 }
2081
2082 return AdjustCost(
2083 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
2084}
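
// Editor's illustrative sketch (not part of the original file): one way the
// cast tables above are exercised is through the public TargetTransformInfo
// query matching the signature in this file (the LLVM 12-era integer cost
// return type is assumed). Obtaining a TargetTransformInfo for an AVX2-capable
// function is left out; only the query itself is shown, and demoTruncateCost
// is a hypothetical helper name.
static int demoTruncateCost(const llvm::TargetTransformInfo &TTI,
                            llvm::LLVMContext &Ctx) {
  using namespace llvm;
  auto *SrcTy = FixedVectorType::get(Type::getInt64Ty(Ctx), 4); // v4i64
  auto *DstTy = FixedVectorType::get(Type::getInt32Ty(Ctx), 4); // v4i32
  // On an AVX2 subtarget this is expected to hit the AVX2ConversionTbl entry
  // { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 } above, giving a
  // recip-throughput cost of 2.
  return TTI.getCastInstrCost(Instruction::Trunc, DstTy, SrcTy,
                              TargetTransformInfo::CastContextHint::None,
                              TargetTransformInfo::TCK_RecipThroughput);
}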
2085
2086int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
2087 CmpInst::Predicate VecPred,
2088 TTI::TargetCostKind CostKind,
2089 const Instruction *I) {
2090 // TODO: Handle other cost kinds.
2091 if (CostKind != TTI::TCK_RecipThroughput)
2092 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2093 I);
2094
2095 // Legalize the type.
2096 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
2097
2098 MVT MTy = LT.second;
2099
2100 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2101 assert(ISD && "Invalid opcode");
2102
2103 unsigned ExtraCost = 0;
2104 if (I && (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp)) {
2105 // Some vector comparison predicates cost extra instructions.
2106 if (MTy.isVector() &&
2107 !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
2108 (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
2109 ST->hasBWI())) {
2110 switch (cast<CmpInst>(I)->getPredicate()) {
2111 case CmpInst::Predicate::ICMP_NE:
2112 // xor(cmpeq(x,y),-1)
2113 ExtraCost = 1;
2114 break;
2115 case CmpInst::Predicate::ICMP_SGE:
2116 case CmpInst::Predicate::ICMP_SLE:
2117 // xor(cmpgt(x,y),-1)
2118 ExtraCost = 1;
2119 break;
2120 case CmpInst::Predicate::ICMP_ULT:
2121 case CmpInst::Predicate::ICMP_UGT:
2122 // cmpgt(xor(x,signbit),xor(y,signbit))
2123 // xor(cmpeq(pmaxu(x,y),x),-1)
2124 ExtraCost = 2;
2125 break;
2126 case CmpInst::Predicate::ICMP_ULE:
2127 case CmpInst::Predicate::ICMP_UGE:
2128 if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
2129 (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
2130 // cmpeq(psubus(x,y),0)
2131 // cmpeq(pminu(x,y),x)
2132 ExtraCost = 1;
2133 } else {
2134 // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
2135 ExtraCost = 3;
2136 }
2137 break;
2138 default:
2139 break;
2140 }
2141 }
2142 }
2143
2144 static const CostTblEntry SLMCostTbl[] = {
2145 // slm pcmpeq/pcmpgt throughput is 2
2146 { ISD::SETCC, MVT::v2i64, 2 },
2147 };
2148
2149 static const CostTblEntry AVX512BWCostTbl[] = {
2150 { ISD::SETCC, MVT::v32i16, 1 },
2151 { ISD::SETCC, MVT::v64i8, 1 },
2152
2153 { ISD::SELECT, MVT::v32i16, 1 },
2154 { ISD::SELECT, MVT::v64i8, 1 },
2155 };
2156
2157 static const CostTblEntry AVX512CostTbl[] = {
2158 { ISD::SETCC, MVT::v8i64, 1 },
2159 { ISD::SETCC, MVT::v16i32, 1 },
2160 { ISD::SETCC, MVT::v8f64, 1 },
2161 { ISD::SETCC, MVT::v16f32, 1 },
2162
2163 { ISD::SELECT, MVT::v8i64, 1 },
2164 { ISD::SELECT, MVT::v16i32, 1 },
2165 { ISD::SELECT, MVT::v8f64, 1 },
2166 { ISD::SELECT, MVT::v16f32, 1 },
2167
2168 { ISD::SETCC, MVT::v32i16, 2 }, // FIXME: should probably be 4
2169 { ISD::SETCC, MVT::v64i8, 2 }, // FIXME: should probably be 4
2170
2171 { ISD::SELECT, MVT::v32i16, 2 }, // FIXME: should be 3
2172 { ISD::SELECT, MVT::v64i8, 2 }, // FIXME: should be 3
2173 };
2174
2175 static const CostTblEntry AVX2CostTbl[] = {
2176 { ISD::SETCC, MVT::v4i64, 1 },
2177 { ISD::SETCC, MVT::v8i32, 1 },
2178 { ISD::SETCC, MVT::v16i16, 1 },
2179 { ISD::SETCC, MVT::v32i8, 1 },
2180
2181 { ISD::SELECT, MVT::v4i64, 1 }, // pblendvb
2182 { ISD::SELECT, MVT::v8i32, 1 }, // pblendvb
2183 { ISD::SELECT, MVT::v16i16, 1 }, // pblendvb
2184 { ISD::SELECT, MVT::v32i8, 1 }, // pblendvb
2185 };
2186
2187 static const CostTblEntry AVX1CostTbl[] = {
2188 { ISD::SETCC, MVT::v4f64, 1 },
2189 { ISD::SETCC, MVT::v8f32, 1 },
2190 // AVX1 does not support 8-wide integer compare.
2191 { ISD::SETCC, MVT::v4i64, 4 },
2192 { ISD::SETCC, MVT::v8i32, 4 },
2193 { ISD::SETCC, MVT::v16i16, 4 },
2194 { ISD::SETCC, MVT::v32i8, 4 },
2195
2196 { ISD::SELECT, MVT::v4f64, 1 }, // vblendvpd
2197 { ISD::SELECT, MVT::v8f32, 1 }, // vblendvps
2198 { ISD::SELECT, MVT::v4i64, 1 }, // vblendvpd
2199 { ISD::SELECT, MVT::v8i32, 1 }, // vblendvps
2200 { ISD::SELECT, MVT::v16i16, 3 }, // vandps + vandnps + vorps
2201 { ISD::SELECT, MVT::v32i8, 3 }, // vandps + vandnps + vorps
2202 };
2203
2204 static const CostTblEntry SSE42CostTbl[] = {
2205 { ISD::SETCC, MVT::v2f64, 1 },
2206 { ISD::SETCC, MVT::v4f32, 1 },
2207 { ISD::SETCC, MVT::v2i64, 1 },
2208 };
2209
2210 static const CostTblEntry SSE41CostTbl[] = {
2211 { ISD::SELECT, MVT::v2f64, 1 }, // blendvpd
2212 { ISD::SELECT, MVT::v4f32, 1 }, // blendvps
2213 { ISD::SELECT, MVT::v2i64, 1 }, // pblendvb
2214 { ISD::SELECT, MVT::v4i32, 1 }, // pblendvb
2215 { ISD::SELECT, MVT::v8i16, 1 }, // pblendvb
2216 { ISD::SELECT, MVT::v16i8, 1 }, // pblendvb
2217 };
2218
2219 static const CostTblEntry SSE2CostTbl[] = {
2220 { ISD::SETCC, MVT::v2f64, 2 },
2221 { ISD::SETCC, MVT::f64, 1 },
2222 { ISD::SETCC, MVT::v2i64, 8 },
2223 { ISD::SETCC, MVT::v4i32, 1 },
2224 { ISD::SETCC, MVT::v8i16, 1 },
2225 { ISD::SETCC, MVT::v16i8, 1 },
2226
2227 { ISD::SELECT, MVT::v2f64, 3 }, // andpd + andnpd + orpd
2228 { ISD::SELECT, MVT::v2i64, 3 }, // pand + pandn + por
2229 { ISD::SELECT, MVT::v4i32, 3 }, // pand + pandn + por
2230 { ISD::SELECT, MVT::v8i16, 3 }, // pand + pandn + por
2231 { ISD::SELECT, MVT::v16i8, 3 }, // pand + pandn + por
2232 };
2233
2234 static const CostTblEntry SSE1CostTbl[] = {
2235 { ISD::SETCC, MVT::v4f32, 2 },
2236 { ISD::SETCC, MVT::f32, 1 },
2237
2238 { ISD::SELECT, MVT::v4f32, 3 }, // andps + andnps + orps
2239 };
2240
2241 if (ST->isSLM())
2242 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
2243 return LT.first * (ExtraCost + Entry->Cost);
2244
2245 if (ST->hasBWI())
2246 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
2247 return LT.first * (ExtraCost + Entry->Cost);
2248
2249 if (ST->hasAVX512())
2250 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
2251 return LT.first * (ExtraCost + Entry->Cost);
2252
2253 if (ST->hasAVX2())
2254 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
2255 return LT.first * (ExtraCost + Entry->Cost);
2256
2257 if (ST->hasAVX())
2258 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
2259 return LT.first * (ExtraCost + Entry->Cost);
2260
2261 if (ST->hasSSE42())
2262 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
2263 return LT.first * (ExtraCost + Entry->Cost);
2264
2265 if (ST->hasSSE41())
2266 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
2267 return LT.first * (ExtraCost + Entry->Cost);
2268
2269 if (ST->hasSSE2())
2270 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
2271 return LT.first * (ExtraCost + Entry->Cost);
2272
2273 if (ST->hasSSE1())
2274 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
2275 return LT.first * (ExtraCost + Entry->Cost);
2276
2277 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
2278}
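
// Editor's illustrative sketch (not part of the original file): the ExtraCost
// logic in getCmpSelInstrCost above boils down to a small mapping from the
// comparison predicate to the number of extra instructions needed when the
// target has no native form. HasUnsignedMinOrSubus below is a hypothetical
// stand-in for the SSE41/SSE2 element-width checks in the real code.
static unsigned demoExtraCmpCost(llvm::CmpInst::Predicate Pred,
                                 bool HasUnsignedMinOrSubus) {
  using llvm::CmpInst;
  switch (Pred) {
  case CmpInst::ICMP_NE:  // xor(cmpeq(x,y),-1)
  case CmpInst::ICMP_SGE:
  case CmpInst::ICMP_SLE: // xor(cmpgt(x,y),-1)
    return 1;
  case CmpInst::ICMP_ULT:
  case CmpInst::ICMP_UGT: // cmpgt(xor(x,signbit),xor(y,signbit))
    return 2;
  case CmpInst::ICMP_ULE:
  case CmpInst::ICMP_UGE:
    // cmpeq(pminu(x,y),x) / cmpeq(psubus(x,y),0) when available,
    // otherwise the inverted signed-compare sequence (3 extra ops).
    return HasUnsignedMinOrSubus ? 1 : 3;
  default:
    return 0; // equality and signed GT/LT map directly to pcmpeq/pcmpgt
  }
}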
2279
2280unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; }
2281
2282int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
2283 const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) {
2284
2285 // Costs should match the codegen from:
2286 // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
2287 // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
2288 // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
2289 // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
2290 // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
2291
2292 // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
2293 // specialized in these tables yet.
2294 static const CostTblEntry AVX512CDCostTbl[] = {
2295 { ISD::CTLZ, MVT::v8i64, 1 },
2296 { ISD::CTLZ, MVT::v16i32, 1 },
2297 { ISD::CTLZ, MVT::v32i16, 8 },
2298 { ISD::CTLZ, MVT::v64i8, 20 },
2299 { ISD::CTLZ, MVT::v4i64, 1 },
2300 { ISD::CTLZ, MVT::v8i32, 1 },
2301 { ISD::CTLZ, MVT::v16i16, 4 },
2302 { ISD::CTLZ, MVT::v32i8, 10 },
2303 { ISD::CTLZ, MVT::v2i64, 1 },
2304 { ISD::CTLZ, MVT::v4i32, 1 },
2305 { ISD::CTLZ, MVT::v8i16, 4 },
2306 { ISD::CTLZ, MVT::v16i8, 4 },
2307 };
2308 static const CostTblEntry AVX512BWCostTbl[] = {
2309 { ISD::ABS, MVT::v32i16, 1 },
2310 { ISD::ABS, MVT::v64i8, 1 },
2311 { ISD::BITREVERSE, MVT::v8i64, 5 },
2312 { ISD::BITREVERSE, MVT::v16i32, 5 },
2313 { ISD::BITREVERSE, MVT::v32i16, 5 },
2314 { ISD::BITREVERSE, MVT::v64i8, 5 },
2315 { ISD::CTLZ, MVT::v8i64, 23 },
2316 { ISD::CTLZ, MVT::v16i32, 22 },
2317 { ISD::CTLZ, MVT::v32i16, 18 },
2318 { ISD::CTLZ, MVT::v64i8, 17 },
2319 { ISD::CTPOP, MVT::v8i64, 7 },
2320 { ISD::CTPOP, MVT::v16i32, 11 },
2321 { ISD::CTPOP, MVT::v32i16, 9 },
2322 { ISD::CTPOP, MVT::v64i8, 6 },
2323 { ISD::CTTZ, MVT::v8i64, 10 },
2324 { ISD::CTTZ, MVT::v16i32, 14 },
2325 { ISD::CTTZ, MVT::v32i16, 12 },
2326 { ISD::CTTZ, MVT::v64i8, 9 },
2327 { ISD::SADDSAT, MVT::v32i16, 1 },
2328 { ISD::SADDSAT, MVT::v64i8, 1 },
2329 { ISD::SMAX, MVT::v32i16, 1 },
2330 { ISD::SMAX, MVT::v64i8, 1 },
2331 { ISD::SMIN, MVT::v32i16, 1 },
2332 { ISD::SMIN, MVT::v64i8, 1 },
2333 { ISD::SSUBSAT, MVT::v32i16, 1 },
2334 { ISD::SSUBSAT, MVT::v64i8, 1 },
2335 { ISD::UADDSAT, MVT::v32i16, 1 },
2336 { ISD::UADDSAT, MVT::v64i8, 1 },
2337 { ISD::UMAX, MVT::v32i16, 1 },
2338 { ISD::UMAX, MVT::v64i8, 1 },
2339 { ISD::UMIN, MVT::v32i16, 1 },
2340 { ISD::UMIN, MVT::v64i8, 1 },
2341 { ISD::USUBSAT, MVT::v32i16, 1 },
2342 { ISD::USUBSAT, MVT::v64i8, 1 },
2343 };
2344 static const CostTblEntry AVX512CostTbl[] = {
2345 { ISD::ABS, MVT::v8i64, 1 },
2346 { ISD::ABS, MVT::v16i32, 1 },
2347 { ISD::ABS, MVT::v32i16, 2 }, // FIXME: include split
2348 { ISD::ABS, MVT::v64i8, 2 }, // FIXME: include split
2349 { ISD::ABS, MVT::v4i64, 1 },
2350 { ISD::ABS, MVT::v2i64, 1 },
2351 { ISD::BITREVERSE, MVT::v8i64, 36 },
2352 { ISD::BITREVERSE, MVT::v16i32, 24 },
2353 { ISD::BITREVERSE, MVT::v32i16, 10 },
2354 { ISD::BITREVERSE, MVT::v64i8, 10 },
2355 { ISD::CTLZ, MVT::v8i64, 29 },
2356 { ISD::CTLZ, MVT::v16i32, 35 },
2357 { ISD::CTLZ, MVT::v32i16, 28 },
2358 { ISD::CTLZ, MVT::v64i8, 18 },
2359 { ISD::CTPOP, MVT::v8i64, 16 },
2360 { ISD::CTPOP, MVT::v16i32, 24 },
2361 { ISD::CTPOP, MVT::v32i16, 18 },
2362 { ISD::CTPOP, MVT::v64i8, 12 },
2363 { ISD::CTTZ, MVT::v8i64, 20 },
2364 { ISD::CTTZ, MVT::v16i32, 28 },
2365 { ISD::CTTZ, MVT::v32i16, 24 },
2366 { ISD::CTTZ, MVT::v64i8, 18 },
2367 { ISD::SMAX, MVT::v8i64, 1 },
2368 { ISD::SMAX, MVT::v16i32, 1 },
2369 { ISD::SMAX, MVT::v32i16, 2 }, // FIXME: include split
2370 { ISD::SMAX, MVT::v64i8, 2 }, // FIXME: include split
2371 { ISD::SMAX, MVT::v4i64, 1 },
2372 { ISD::SMAX, MVT::v2i64, 1 },
2373 { ISD::SMIN, MVT::v8i64, 1 },
2374 { ISD::SMIN, MVT::v16i32, 1 },
2375 { ISD::SMIN, MVT::v32i16, 2 }, // FIXME: include split
2376 { ISD::SMIN, MVT::v64i8, 2 }, // FIXME: include split
2377 { ISD::SMIN, MVT::v4i64, 1 },
2378 { ISD::SMIN, MVT::v2i64, 1 },
2379 { ISD::UMAX, MVT::v8i64, 1 },
2380 { ISD::UMAX, MVT::v16i32, 1 },
2381 { ISD::UMAX, MVT::v32i16, 2 }, // FIXME: include split
2382 { ISD::UMAX, MVT::v64i8, 2 }, // FIXME: include split
2383 { ISD::UMAX, MVT::v4i64, 1 },
2384 { ISD::UMAX, MVT::v2i64, 1 },
2385 { ISD::UMIN, MVT::v8i64, 1 },
2386 { ISD::UMIN, MVT::v16i32, 1 },
2387 { ISD::UMIN, MVT::v32i16, 2 }, // FIXME: include split
2388 { ISD::UMIN, MVT::v64i8, 2 }, // FIXME: include split
2389 { ISD::UMIN, MVT::v4i64, 1 },
2390 { ISD::UMIN, MVT::v2i64, 1 },
2391 { ISD::USUBSAT, MVT::v16i32, 2 }, // pmaxud + psubd
2392 { ISD::USUBSAT, MVT::v2i64, 2 }, // pmaxuq + psubq
2393 { ISD::USUBSAT, MVT::v4i64, 2 }, // pmaxuq + psubq
2394 { ISD::USUBSAT, MVT::v8i64, 2 }, // pmaxuq + psubq
2395 { ISD::UADDSAT, MVT::v16i32, 3 }, // not + pminud + paddd
2396 { ISD::UADDSAT, MVT::v2i64, 3 }, // not + pminuq + paddq
2397 { ISD::UADDSAT, MVT::v4i64, 3 }, // not + pminuq + paddq
2398 { ISD::UADDSAT, MVT::v8i64, 3 }, // not + pminuq + paddq
2399 { ISD::SADDSAT, MVT::v32i16, 2 }, // FIXME: include split
2400 { ISD::SADDSAT, MVT::v64i8, 2 }, // FIXME: include split
2401 { ISD::SSUBSAT, MVT::v32i16, 2 }, // FIXME: include split
2402 { ISD::SSUBSAT, MVT::v64i8, 2 }, // FIXME: include split
2403 { ISD::UADDSAT, MVT::v32i16, 2 }, // FIXME: include split
2404 { ISD::UADDSAT, MVT::v64i8, 2 }, // FIXME: include split
2405 { ISD::USUBSAT, MVT::v32i16, 2 }, // FIXME: include split
2406 { ISD::USUBSAT, MVT::v64i8, 2 }, // FIXME: include split
2407 { ISD::FMAXNUM, MVT::f32, 2 },
2408 { ISD::FMAXNUM, MVT::v4f32, 2 },
2409 { ISD::FMAXNUM, MVT::v8f32, 2 },
2410 { ISD::FMAXNUM, MVT::v16f32, 2 },
2411 { ISD::FMAXNUM, MVT::f64, 2 },
2412 { ISD::FMAXNUM, MVT::v2f64, 2 },
2413 { ISD::FMAXNUM, MVT::v4f64, 2 },
2414 { ISD::FMAXNUM, MVT::v8f64, 2 },
2415 };
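  // Editor's note (illustrative, not from the original source): the saturating
  // add/subtract entries above follow from two unsigned identities,
  //   usubsat(x, y) == umax(x, y) - y        (pmaxu* + psub*)
  //   uaddsat(x, y) == x + umin(y, ~x)       (not + pminu* + padd*)
  // e.g. usubsat(3, 5) == umax(3, 5) - 5 == 0 and usubsat(7, 5) == 2.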
2416 static const CostTblEntry XOPCostTbl[] = {
2417 { ISD::BITREVERSE, MVT::v4i64, 4 },
2418 { ISD::BITREVERSE, MVT::v8i32, 4 },
2419 { ISD::BITREVERSE, MVT::v16i16, 4 },
2420 { ISD::BITREVERSE, MVT::v32i8, 4 },
2421 { ISD::BITREVERSE, MVT::v2i64, 1 },
2422 { ISD::BITREVERSE, MVT::v4i32, 1 },
2423 { ISD::BITREVERSE, MVT::v8i16, 1 },
2424 { ISD::BITREVERSE, MVT::v16i8, 1 },
2425 { ISD::BITREVERSE, MVT::i64, 3 },
2426 { ISD::BITREVERSE, MVT::i32, 3 },
2427 { ISD::BITREVERSE, MVT::i16, 3 },
2428 { ISD::BITREVERSE, MVT::i8, 3 }
2429 };
2430 static const CostTblEntry AVX2CostTbl[] = {
2431 { ISD::ABS, MVT::v4i64, 2 }, // VBLENDVPD(X,VPSUBQ(0,X),X)
2432 { ISD::ABS, MVT::v8i32, 1 },
2433 { ISD::ABS, MVT::v16i16, 1 },
2434 { ISD::ABS, MVT::v32i8, 1 },
2435 { ISD::BITREVERSE, MVT::v4i64, 5 },
2436 { ISD::BITREVERSE, MVT::v8i32, 5 },
2437 { ISD::BITREVERSE, MVT::v16i16, 5 },
2438 { ISD::BITREVERSE, MVT::v32i8, 5 },
2439 { ISD::BSWAP, MVT::v4i64, 1 },
2440 { ISD::BSWAP, MVT::v8i32, 1 },
2441 { ISD::BSWAP, MVT::v16i16, 1 },
2442 { ISD::CTLZ, MVT::v4i64, 23 },
2443 { ISD::CTLZ, MVT::v8i32, 18 },
2444 { ISD::CTLZ, MVT::v16i16, 14 },
2445 { ISD::CTLZ, MVT::v32i8, 9 },
2446 { ISD::CTPOP, MVT::v4i64, 7 },
2447 { ISD::CTPOP, MVT::v8i32, 11 },
2448 { ISD::CTPOP, MVT::v16i16, 9 },
2449 { ISD::CTPOP, MVT::v32i8, 6 },
2450 { ISD::CTTZ, MVT::v4i64, 10 },
2451 { ISD::CTTZ, MVT::v8i32, 14 },
2452 { ISD::CTTZ, MVT::v16i16, 12 },
2453 { ISD::CTTZ, MVT::v32i8, 9 },
2454 { ISD::SADDSAT, MVT::v16i16, 1 },
2455 { ISD::SADDSAT, MVT::v32i8, 1 },
2456 { ISD::SMAX, MVT::v8i32, 1 },
2457 { ISD::SMAX, MVT::v16i16, 1 },
2458 { ISD::SMAX, MVT::v32i8, 1 },
2459 { ISD::SMIN, MVT::v8i32, 1 },
2460 { ISD::SMIN, MVT::v16i16, 1 },
2461 { ISD::SMIN, MVT::v32i8, 1 },
2462 { ISD::SSUBSAT, MVT::v16i16, 1 },
2463 { ISD::SSUBSAT, MVT::v32i8, 1 },
2464 { ISD::UADDSAT, MVT::v16i16, 1 },
2465 { ISD::UADDSAT, MVT::v32i8, 1 },
2466 { ISD::UADDSAT, MVT::v8i32, 3 }, // not + pminud + paddd
2467 { ISD::UMAX, MVT::v8i32, 1 },
2468 { ISD::UMAX, MVT::v16i16, 1 },
2469 { ISD::UMAX, MVT::v32i8, 1 },
2470 { ISD::UMIN, MVT::v8i32, 1 },
2471 { ISD::UMIN, MVT::v16i16, 1 },
2472 { ISD::UMIN, MVT::v32i8, 1 },
2473 { ISD::USUBSAT, MVT::v16i16, 1 },
2474 { ISD::USUBSAT, MVT::v32i8, 1 },
2475 { ISD::USUBSAT, MVT::v8i32, 2 }, // pmaxud + psubd
2476 { ISD::FMAXNUM, MVT::v8f32, 3 }, // MAXPS + CMPUNORDPS + BLENDVPS
2477 { ISD::FMAXNUM, MVT::v4f64, 3 }, // MAXPD + CMPUNORDPD + BLENDVPD
2478 { ISD::FSQRT, MVT::f32, 7 }, // Haswell from http://www.agner.org/
2479 { ISD::FSQRT, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
2480 { ISD::FSQRT, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
2481 { ISD::FSQRT, MVT::f64, 14 }, // Haswell from http://www.agner.org/
2482 { ISD::FSQRT, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
2483 { ISD::FSQRT, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
2484 };
2485 static const CostTblEntry AVX1CostTbl[] = {
2486 { ISD::ABS, MVT::v4i64, 5 }, // VBLENDVPD(X,VPSUBQ(0,X),X)
2487 { ISD::ABS, MVT::v8i32, 3 },
2488 { ISD::ABS, MVT::v16i16, 3 },
2489 { ISD::ABS, MVT::v32i8, 3 },
2490 { ISD::BITREVERSE, MVT::v4i64, 12 }, // 2 x 128-bit Op + extract/insert
2491 { ISD::BITREVERSE, MVT::v8i32, 12 }, // 2 x 128-bit Op + extract/insert
2492 { ISD::BITREVERSE, MVT::v16i16, 12 }, // 2 x 128-bit Op + extract/insert
2493 { ISD::BITREVERSE, MVT::v32i8, 12 }, // 2 x 128-bit Op + extract/insert
2494 { ISD::BSWAP, MVT::v4i64, 4 },
2495 { ISD::BSWAP, MVT::v8i32, 4 },
2496 { ISD::BSWAP, MVT::v16i16, 4 },
2497 { ISD::CTLZ, MVT::v4i64, 48 }, // 2 x 128-bit Op + extract/insert
2498 { ISD::CTLZ, MVT::v8i32, 38 }, // 2 x 128-bit Op + extract/insert
2499 { ISD::CTLZ, MVT::v16i16, 30 }, // 2 x 128-bit Op + extract/insert
2500 { ISD::CTLZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
2501 { ISD::CTPOP, MVT::v4i64, 16 }, // 2 x 128-bit Op + extract/insert
2502 { ISD::CTPOP, MVT::v8i32, 24 }, // 2 x 128-bit Op + extract/insert
2503 { ISD::CTPOP, MVT::v16i16, 20 }, // 2 x 128-bit Op + extract/insert
2504 { ISD::CTPOP, MVT::v32i8, 14 }, // 2 x 128-bit Op + extract/insert
2505 { ISD::CTTZ, MVT::v4i64, 22 }, // 2 x 128-bit Op + extract/insert
2506 { ISD::CTTZ, MVT::v8i32, 30 }, // 2 x 128-bit Op + extract/insert
2507 { ISD::CTTZ, MVT::v16i16, 26 }, // 2 x 128-bit Op + extract/insert
2508 { ISD::CTTZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
2509 { ISD::SADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2510 { ISD::SADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2511 { ISD::SMAX, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
2512 { ISD::SMAX, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2513 { ISD::SMAX, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2514 { ISD::SMIN, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
2515 { ISD::SMIN, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2516 { ISD::SMIN, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2517 { ISD::SSUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2518 { ISD::SSUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2519 { ISD::UADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2520 { ISD::UADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2521 { ISD::UADDSAT, MVT::v8i32, 8 }, // 2 x 128-bit Op + extract/insert
2522 { ISD::UMAX, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
2523 { ISD::UMAX, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2524 { ISD::UMAX, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2525 { ISD::UMIN, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
2526 { ISD::UMIN, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2527 { ISD::UMIN, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2528 { ISD::USUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2529 { ISD::USUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2530 { ISD::USUBSAT, MVT::v8i32, 6 }, // 2 x 128-bit Op + extract/insert
2531 { ISD::FMAXNUM, MVT::f32, 3 }, // MAXSS + CMPUNORDSS + BLENDVPS
2532 { ISD::FMAXNUM, MVT::v4f32, 3 }, // MAXPS + CMPUNORDPS + BLENDVPS
2533 { ISD::FMAXNUM, MVT::v8f32, 5 }, // MAXPS + CMPUNORDPS + BLENDVPS + ?
2534 { ISD::FMAXNUM, MVT::f64, 3 }, // MAXSD + CMPUNORDSD + BLENDVPD
2535 { ISD::FMAXNUM, MVT::v2f64, 3 }, // MAXPD + CMPUNORDPD + BLENDVPD
2536 { ISD::FMAXNUM, MVT::v4f64, 5 }, // MAXPD + CMPUNORDPD + BLENDVPD + ?
2537 { ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/
2538 { ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
2539 { ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
2540 { ISD::FSQRT, MVT::f64, 21 }, // SNB from http://www.agner.org/
2541 { ISD::FSQRT, MVT::v2f64, 21 }, // SNB from http://www.agner.org/
2542 { ISD::FSQRT, MVT::v4f64, 43 }, // SNB from http://www.agner.org/
2543 };
2544 static const CostTblEntry GLMCostTbl[] = {
2545 { ISD::FSQRT, MVT::f32, 19 }, // sqrtss
2546 { ISD::FSQRT, MVT::v4f32, 37 }, // sqrtps
2547 { ISD::FSQRT, MVT::f64, 34 }, // sqrtsd
2548 { ISD::FSQRT, MVT::v2f64, 67 }, // sqrtpd
2549 };
2550 static const CostTblEntry SLMCostTbl[] = {
2551 { ISD::FSQRT, MVT::f32, 20 }, // sqrtss
2552 { ISD::FSQRT, MVT::v4f32, 40 }, // sqrtps
2553 { ISD::FSQRT, MVT::f64, 35 }, // sqrtsd
2554 { ISD::FSQRT, MVT::v2f64, 70 }, // sqrtpd
2555 };
2556 static const CostTblEntry SSE42CostTbl[] = {
2557 { ISD::USUBSAT, MVT::v4i32, 2 }, // pmaxud + psubd
2558 { ISD::UADDSAT, MVT::v4i32, 3 }, // not + pminud + paddd
2559 { ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/
2560 { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
2561 };
2562 static const CostTblEntry SSE41CostTbl[] = {
2563 { ISD::ABS, MVT::v2i64, 2 }, // BLENDVPD(X,PSUBQ(0,X),X)
2564 { ISD::SMAX, MVT::v4i32, 1 },
2565 { ISD::SMAX, MVT::v16i8, 1 },
2566 { ISD::SMIN, MVT::v4i32, 1 },
2567 { ISD::SMIN, MVT::v16i8, 1 },
2568 { ISD::UMAX, MVT::v4i32, 1 },
2569 { ISD::UMAX, MVT::v8i16, 1 },
2570 { ISD::UMIN, MVT::v4i32, 1 },
2571 { ISD::UMIN, MVT::v8i16, 1 },
2572 };
2573 static const CostTblEntry SSSE3CostTbl[] = {
2574 { ISD::ABS, MVT::v4i32, 1 },
2575 { ISD::ABS, MVT::v8i16, 1 },
2576 { ISD::ABS, MVT::v16i8, 1 },
2577 { ISD::BITREVERSE, MVT::v2i64, 5 },
2578 { ISD::BITREVERSE, MVT::v4i32, 5 },
2579 { ISD::BITREVERSE, MVT::v8i16, 5 },
2580 { ISD::BITREVERSE, MVT::v16i8, 5 },
2581 { ISD::BSWAP, MVT::v2i64, 1 },
2582 { ISD::BSWAP, MVT::v4i32, 1 },
2583 { ISD::BSWAP, MVT::v8i16, 1 },
2584 { ISD::CTLZ, MVT::v2i64, 23 },
2585 { ISD::CTLZ, MVT::v4i32, 18 },
2586 { ISD::CTLZ, MVT::v8i16, 14 },
2587 { ISD::CTLZ, MVT::v16i8, 9 },
2588 { ISD::CTPOP, MVT::v2i64, 7 },
2589 { ISD::CTPOP, MVT::v4i32, 11 },
2590 { ISD::CTPOP, MVT::v8i16, 9 },
2591 { ISD::CTPOP, MVT::v16i8, 6 },
2592 { ISD::CTTZ, MVT::v2i64, 10 },
2593 { ISD::CTTZ, MVT::v4i32, 14 },
2594 { ISD::CTTZ, MVT::v8i16, 12 },
2595 { ISD::CTTZ, MVT::v16i8, 9 }
2596 };
2597 static const CostTblEntry SSE2CostTbl[] = {
2598 { ISD::ABS, MVT::v2i64, 4 },
2599 { ISD::ABS, MVT::v4i32, 3 },
2600 { ISD::ABS, MVT::v8i16, 2 },
2601 { ISD::ABS, MVT::v16i8, 2 },
2602 { ISD::BITREVERSE, MVT::v2i64, 29 },
2603 { ISD::BITREVERSE, MVT::v4i32, 27 },
2604 { ISD::BITREVERSE, MVT::v8i16, 27 },
2605 { ISD::BITREVERSE, MVT::v16i8, 20 },
2606 { ISD::BSWAP, MVT::v2i64, 7 },
2607 { ISD::BSWAP, MVT::v4i32, 7 },
2608 { ISD::BSWAP, MVT::v8i16, 7 },
2609 { ISD::CTLZ, MVT::v2i64, 25 },
2610 { ISD::CTLZ, MVT::v4i32, 26 },
2611 { ISD::CTLZ, MVT::v8i16, 20 },
2612 { ISD::CTLZ, MVT::v16i8, 17 },
2613 { ISD::CTPOP, MVT::v2i64, 12 },
2614 { ISD::CTPOP, MVT::v4i32, 15 },
2615 { ISD::CTPOP, MVT::v8i16, 13 },
2616 { ISD::CTPOP, MVT::v16i8, 10 },
2617 { ISD::CTTZ, MVT::v2i64, 14 },
2618 { ISD::CTTZ, MVT::v4i32, 18 },
2619 { ISD::CTTZ, MVT::v8i16, 16 },
2620 { ISD::CTTZ, MVT::v16i8, 13 },
2621 { ISD::SADDSAT, MVT::v8i16, 1 },
2622 { ISD::SADDSAT, MVT::v16i8, 1 },
2623 { ISD::SMAX, MVT::v8i16, 1 },
2624 { ISD::SMIN, MVT::v8i16, 1 },
2625 { ISD::SSUBSAT, MVT::v8i16, 1 },
2626 { ISD::SSUBSAT, MVT::v16i8, 1 },
2627 { ISD::UADDSAT, MVT::v8i16, 1 },
2628 { ISD::UADDSAT, MVT::v16i8, 1 },
2629 { ISD::UMAX, MVT::v8i16, 2 },
2630 { ISD::UMAX, MVT::v16i8, 1 },
2631 { ISD::UMIN, MVT::v8i16, 2 },
2632 { ISD::UMIN, MVT::v16i8, 1 },
2633 { ISD::USUBSAT, MVT::v8i16, 1 },
2634 { ISD::USUBSAT, MVT::v16i8, 1 },
2635 { ISD::FMAXNUM, MVT::f64, 4 },
2636 { ISD::FMAXNUM, MVT::v2f64, 4 },
2637 { ISD::FSQRT, MVT::f64, 32 }, // Nehalem from http://www.agner.org/
2638 { ISD::FSQRT, MVT::v2f64, 32 }, // Nehalem from http://www.agner.org/
2639 };
2640 static const CostTblEntry SSE1CostTbl[] = {
2641 { ISD::FMAXNUM, MVT::f32, 4 },
2642 { ISD::FMAXNUM, MVT::v4f32, 4 },
2643 { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/
2644 { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
2645 };
2646 static const CostTblEntry BMI64CostTbl[] = { // 64-bit targets
2647 { ISD::CTTZ, MVT::i64, 1 },
2648 };
2649 static const CostTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
2650 { ISD::CTTZ, MVT::i32, 1 },
2651 { ISD::CTTZ, MVT::i16, 1 },
2652 { ISD::CTTZ, MVT::i8, 1 },
2653 };
2654 static const CostTblEntry LZCNT64CostTbl[] = { // 64-bit targets
2655 { ISD::CTLZ, MVT::i64, 1 },
2656 };
2657 static const CostTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
2658 { ISD::CTLZ, MVT::i32, 1 },
2659 { ISD::CTLZ, MVT::i16, 1 },
2660 { ISD::CTLZ, MVT::i8, 1 },
2661 };
2662 static const CostTblEntry POPCNT64CostTbl[] = { // 64-bit targets
2663 { ISD::CTPOP, MVT::i64, 1 },
2664 };
2665 static const CostTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
2666 { ISD::CTPOP, MVT::i32, 1 },
2667 { ISD::CTPOP, MVT::i16, 1 },
2668 { ISD::CTPOP, MVT::i8, 1 },
2669 };
2670 static const CostTblEntry X64CostTbl[] = { // 64-bit targets
2671 { ISD::ABS, MVT::i64, 2 }, // SUB+CMOV
2672 { ISD::BITREVERSE, MVT::i64, 14 },
2673 { ISD::CTLZ, MVT::i64, 4 }, // BSR+XOR or BSR+XOR+CMOV
2674 { ISD::CTTZ, MVT::i64, 3 }, // TEST+BSF+CMOV/BRANCH
2675 { ISD::CTPOP, MVT::i64, 10 },
2676 { ISD::SADDO, MVT::i64, 1 },
2677 { ISD::UADDO, MVT::i64, 1 },
2678 { ISD::UMULO, MVT::i64, 2 }, // mulq + seto
2679 };
2680 static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
2681 { ISD::ABS, MVT::i32, 2 }, // SUB+CMOV
2682 { ISD::ABS, MVT::i16, 2 }, // SUB+CMOV
2683 { ISD::BITREVERSE, MVT::i32, 14 },
2684 { ISD::BITREVERSE, MVT::i16, 14 },
2685 { ISD::BITREVERSE, MVT::i8, 11 },
2686 { ISD::CTLZ, MVT::i32, 4 }, // BSR+XOR or BSR+XOR+CMOV
2687 { ISD::CTLZ, MVT::i16, 4 }, // BSR+XOR or BSR+XOR+CMOV
2688 { ISD::CTLZ, MVT::i8, 4 }, // BSR+XOR or BSR+XOR+CMOV
2689 { ISD::CTTZ, MVT::i32, 3 }, // TEST+BSF+CMOV/BRANCH
2690 { ISD::CTTZ, MVT::i16, 3 }, // TEST+BSF+CMOV/BRANCH
2691 { ISD::CTTZ, MVT::i8, 3 }, // TEST+BSF+CMOV/BRANCH
2692 { ISD::CTPOP, MVT::i32, 8 },
2693 { ISD::CTPOP, MVT::i16, 9 },
2694 { ISD::CTPOP, MVT::i8, 7 },
2695 { ISD::SADDO, MVT::i32, 1 },
2696 { ISD::SADDO, MVT::i16, 1 },
2697 { ISD::SADDO, MVT::i8, 1 },
2698 { ISD::UADDO, MVT::i32, 1 },
2699 { ISD::UADDO, MVT::i16, 1 },
2700 { ISD::UADDO, MVT::i8, 1 },
2701 { ISD::UMULO, MVT::i32, 2 }, // mul + seto
2702 { ISD::UMULO, MVT::i16, 2 },
2703 { ISD::UMULO, MVT::i8, 2 },
2704 };
2705
2706 Type *RetTy = ICA.getReturnType();
2707 Type *OpTy = RetTy;
2708 Intrinsic::ID IID = ICA.getID();
2709 unsigned ISD = ISD::DELETED_NODE;
2710 switch (IID) {
2711 default:
2712 break;
2713 case Intrinsic::abs:
2714 ISD = ISD::ABS;
2715 break;
2716 case Intrinsic::bitreverse:
2717 ISD = ISD::BITREVERSE;
2718 break;
2719 case Intrinsic::bswap:
2720 ISD = ISD::BSWAP;
2721 break;
2722 case Intrinsic::ctlz:
2723 ISD = ISD::CTLZ;
2724 break;
2725 case Intrinsic::ctpop:
2726 ISD = ISD::CTPOP;
2727 break;
2728 case Intrinsic::cttz:
2729 ISD = ISD::CTTZ;
2730 break;
2731 case Intrinsic::maxnum:
2732 case Intrinsic::minnum:
2733 // FMINNUM has same costs so don't duplicate.
2734 ISD = ISD::FMAXNUM;
2735 break;
2736 case Intrinsic::sadd_sat:
2737 ISD = ISD::SADDSAT;
2738 break;
2739 case Intrinsic::smax:
2740 ISD = ISD::SMAX;
2741 break;
2742 case Intrinsic::smin:
2743 ISD = ISD::SMIN;
2744 break;
2745 case Intrinsic::ssub_sat:
2746 ISD = ISD::SSUBSAT;
2747 break;
2748 case Intrinsic::uadd_sat:
2749 ISD = ISD::UADDSAT;
2750 break;
2751 case Intrinsic::umax:
2752 ISD = ISD::UMAX;
2753 break;
2754 case Intrinsic::umin:
2755 ISD = ISD::UMIN;
2756 break;
2757 case Intrinsic::usub_sat:
2758 ISD = ISD::USUBSAT;
2759 break;
2760 case Intrinsic::sqrt:
2761 ISD = ISD::FSQRT;
2762 break;
2763 case Intrinsic::sadd_with_overflow:
2764 case Intrinsic::ssub_with_overflow:
2765 // SSUBO has same costs so don't duplicate.
2766 ISD = ISD::SADDO;
2767 OpTy = RetTy->getContainedType(0);
2768 break;
2769 case Intrinsic::uadd_with_overflow:
2770 case Intrinsic::usub_with_overflow:
2771 // USUBO has same costs so don't duplicate.
2772 ISD = ISD::UADDO;
2773 OpTy = RetTy->getContainedType(0);
2774 break;
2775 case Intrinsic::umul_with_overflow:
2776 case Intrinsic::smul_with_overflow:
2777 // SMULO has same costs so don't duplicate.
2778 ISD = ISD::UMULO;
2779 OpTy = RetTy->getContainedType(0);
2780 break;
2781 }
2782
2783 if (ISD != ISD::DELETED_NODE) {
2784 // Legalize the type.
2785 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, OpTy);
2786 MVT MTy = LT.second;
2787
2788 // Attempt to lookup cost.
2789 if (ISD == ISD::BITREVERSE && ST->hasGFNI() && ST->hasSSSE3() &&
2790 MTy.isVector()) {
2791 // With PSHUFB the code is very similar for all types. If we have integer
2792 // byte operations, we just need a GF2P8AFFINEQB for vXi8. For other types
2793 // we also need a PSHUFB.
2794 unsigned Cost = MTy.getVectorElementType() == MVT::i8 ? 1 : 2;
2795
2796 // Without byte operations, we need twice as many GF2P8AFFINEQB and PSHUFB
2797 // instructions. We also need an extract and an insert.
2798 if (!(MTy.is128BitVector() || (ST->hasAVX2() && MTy.is256BitVector()) ||
2799 (ST->hasBWI() && MTy.is512BitVector())))
2800 Cost = Cost * 2 + 2;
2801
2802 return LT.first * Cost;
2803 }
2804
2805 auto adjustTableCost = [](const CostTblEntry &Entry, int LegalizationCost,
2806 FastMathFlags FMF) {
2807 // If there are no NANs to deal with, then these are reduced to a
2808 // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we
2809 // assume is used in the non-fast case.
2810 if (Entry.ISD == ISD::FMAXNUM || Entry.ISD == ISD::FMINNUM) {
2811 if (FMF.noNaNs())
2812 return LegalizationCost * 1;
2813 }
2814 return LegalizationCost * (int)Entry.Cost;
2815 };
2816
2817 if (ST->useGLMDivSqrtCosts())
2818 if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
2819 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2820
2821 if (ST->isSLM())
2822 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
2823 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2824
2825 if (ST->hasCDI())
2826 if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
2827 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2828
2829 if (ST->hasBWI())
2830 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
2831 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2832
2833 if (ST->hasAVX512())
2834 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
2835 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2836
2837 if (ST->hasXOP())
2838 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
2839 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2840
2841 if (ST->hasAVX2())
2842 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
2843 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2844
2845 if (ST->hasAVX())
2846 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
2847 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2848
2849 if (ST->hasSSE42())
2850 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
2851 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2852
2853 if (ST->hasSSE41())
2854 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
2855 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2856
2857 if (ST->hasSSSE3())
2858 if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
2859 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2860
2861 if (ST->hasSSE2())
2862 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
2863 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2864
2865 if (ST->hasSSE1())
2866 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
2867 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2868
2869 if (ST->hasBMI()) {
2870 if (ST->is64Bit())
2871 if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
2872 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2873
2874 if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
2875 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2876 }
2877
2878 if (ST->hasLZCNT()) {
2879 if (ST->is64Bit())
2880 if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
2881 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2882
2883 if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
2884 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2885 }
2886
2887 if (ST->hasPOPCNT()) {
2888 if (ST->is64Bit())
2889 if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
2890 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2891
2892 if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
2893 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2894 }
2895
2896 // TODO - add BMI (TZCNT) scalar handling
2897
2898 if (ST->is64Bit())
2899 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
2900 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2901
2902 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
2903 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
2904 }
2905
2906 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
2907}
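
The pattern used throughout the function above - legalize the type, probe the per-feature cost tables from the most capable subtarget down to the baseline, and scale the matching entry by the number of legalized registers (LT.first) - can be illustrated with a small standalone sketch. The CostEntry struct, lookup() helper and enum values below are simplified stand-ins invented for the example, not LLVM's CostTblEntry/CostTableLookup.

    #include <cstddef>
    #include <cstdio>

    // Simplified stand-in for a per-feature cost table entry.
    struct CostEntry { int ISD; int MTy; int Cost; };

    // Linear scan of one table for a matching (opcode, type) pair.
    template <std::size_t N>
    const CostEntry *lookup(const CostEntry (&Tbl)[N], int ISD, int MTy) {
      for (const CostEntry &E : Tbl)
        if (E.ISD == ISD && E.MTy == MTy)
          return &E;
      return nullptr; // caller falls through to the next (older) table
    }

    int main() {
      enum { CTPOP = 1, V8I32 = 2 };                   // made-up identifiers
      static const CostEntry AVX2Tbl[] = {{CTPOP, V8I32, 11}};
      int LTFirst = 1;                                 // v8i32 is legal on AVX2: one register
      if (const CostEntry *E = lookup(AVX2Tbl, CTPOP, V8I32))
        std::printf("cost = %d\n", LTFirst * E->Cost); // prints: cost = 11
      return 0;
    }
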
2908
2909int X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
2910 TTI::TargetCostKind CostKind) {
2911 if (ICA.isTypeBasedOnly())
2912 return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
2913
2914 static const CostTblEntry AVX512CostTbl[] = {
2915 { ISD::ROTL, MVT::v8i64, 1 },
2916 { ISD::ROTL, MVT::v4i64, 1 },
2917 { ISD::ROTL, MVT::v2i64, 1 },
2918 { ISD::ROTL, MVT::v16i32, 1 },
2919 { ISD::ROTL, MVT::v8i32, 1 },
2920 { ISD::ROTL, MVT::v4i32, 1 },
2921 { ISD::ROTR, MVT::v8i64, 1 },
2922 { ISD::ROTR, MVT::v4i64, 1 },
2923 { ISD::ROTR, MVT::v2i64, 1 },
2924 { ISD::ROTR, MVT::v16i32, 1 },
2925 { ISD::ROTR, MVT::v8i32, 1 },
2926 { ISD::ROTR, MVT::v4i32, 1 }
2927 };
2928 // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
2929 static const CostTblEntry XOPCostTbl[] = {
2930 { ISD::ROTL, MVT::v4i64, 4 },
2931 { ISD::ROTL, MVT::v8i32, 4 },
2932 { ISD::ROTL, MVT::v16i16, 4 },
2933 { ISD::ROTL, MVT::v32i8, 4 },
2934 { ISD::ROTL, MVT::v2i64, 1 },
2935 { ISD::ROTL, MVT::v4i32, 1 },
2936 { ISD::ROTL, MVT::v8i16, 1 },
2937 { ISD::ROTL, MVT::v16i8, 1 },
2938 { ISD::ROTR, MVT::v4i64, 6 },
2939 { ISD::ROTR, MVT::v8i32, 6 },
2940 { ISD::ROTR, MVT::v16i16, 6 },
2941 { ISD::ROTR, MVT::v32i8, 6 },
2942 { ISD::ROTR, MVT::v2i64, 2 },
2943 { ISD::ROTR, MVT::v4i32, 2 },
2944 { ISD::ROTR, MVT::v8i16, 2 },
2945 { ISD::ROTR, MVT::v16i8, 2 }
2946 };
2947 static const CostTblEntry X64CostTbl[] = { // 64-bit targets
2948 { ISD::ROTL, MVT::i64, 1 },
2949 { ISD::ROTR, MVT::i64, 1 },
2950 { ISD::FSHL, MVT::i64, 4 }
2951 };
2952 static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
2953 { ISD::ROTL, MVT::i32, 1 },
2954 { ISD::ROTL, MVT::i16, 1 },
2955 { ISD::ROTL, MVT::i8, 1 },
2956 { ISD::ROTR, MVT::i32, 1 },
2957 { ISD::ROTR, MVT::i16, 1 },
2958 { ISD::ROTR, MVT::i8, 1 },
2959 { ISD::FSHL, MVT::i32, 4 },
2960 { ISD::FSHL, MVT::i16, 4 },
2961 { ISD::FSHL, MVT::i8, 4 }
2962 };
2963
2964 Intrinsic::ID IID = ICA.getID();
2965 Type *RetTy = ICA.getReturnType();
2966 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
2967 unsigned ISD = ISD::DELETED_NODE;
2968 switch (IID) {
2969 default:
2970 break;
2971 case Intrinsic::fshl:
2972 ISD = ISD::FSHL;
2973 if (Args[0] == Args[1])
2974 ISD = ISD::ROTL;
2975 break;
2976 case Intrinsic::fshr:
2977 // FSHR has same costs so don't duplicate.
2978 ISD = ISD::FSHL;
2979 if (Args[0] == Args[1])
2980 ISD = ISD::ROTR;
2981 break;
2982 }
2983
2984 if (ISD != ISD::DELETED_NODE) {
2985 // Legalize the type.
2986 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
2987 MVT MTy = LT.second;
2988
2989 // Attempt to lookup cost.
2990 if (ST->hasAVX512())
2991 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
2992 return LT.first * Entry->Cost;
2993
2994 if (ST->hasXOP())
2995 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
2996 return LT.first * Entry->Cost;
2997
2998 if (ST->is64Bit())
2999 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
3000 return LT.first * Entry->Cost;
3001
3002 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
3003 return LT.first * Entry->Cost;
3004 }
3005
3006 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
3007}
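
The only subtlety in the funnel-shift handling above is the Args[0] == Args[1] check: a funnel shift whose two data operands are the same value is exactly a rotate, which is why the cost switches from ISD::FSHL to ISD::ROTL/ROTR. A minimal sketch of that equivalence; fshl32 here is a hand-written model of a 32-bit funnel shift, not an LLVM API.

    #include <cstdint>
    #include <cstdio>

    // Funnel shift left: shift the concatenation (Hi:Lo) left by Amt bits and
    // keep the high 32 bits of the result.
    uint32_t fshl32(uint32_t Hi, uint32_t Lo, unsigned Amt) {
      Amt &= 31;
      return Amt ? (Hi << Amt) | (Lo >> (32 - Amt)) : Hi;
    }

    int main() {
      uint32_t X = 0x80000001u;
      // With both data operands equal, the funnel shift is a rotate, which is
      // why the cost model above uses the ROTL/ROTR table entries instead.
      std::printf("0x%08x\n", (unsigned)fshl32(X, X, 1)); // prints 0x00000003 == rotl(X, 1)
      return 0;
    }
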
3008
3009int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
3010 static const CostTblEntry SLMCostTbl[] = {
3011 { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 },
3012 { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 },
3013 { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 },
3014 { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 7 }
3015 };
3016
3017   assert(Val->isVectorTy() && "This must be a vector type");
3018 Type *ScalarType = Val->getScalarType();
3019 int RegisterFileMoveCost = 0;
3020
3021 if (Index != -1U && (Opcode == Instruction::ExtractElement ||
3022 Opcode == Instruction::InsertElement)) {
3023 // Legalize the type.
3024 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
3025
3026 // This type is legalized to a scalar type.
3027 if (!LT.second.isVector())
3028 return 0;
3029
3030 // The type may be split. Normalize the index to the new type.
3031 unsigned NumElts = LT.second.getVectorNumElements();
3032 unsigned SubNumElts = NumElts;
3033 Index = Index % NumElts;
3034
3035 // For >128-bit vectors, we need to extract higher 128-bit subvectors.
3036 // For inserts, we also need to insert the subvector back.
3037 if (LT.second.getSizeInBits() > 128) {
3038       assert((LT.second.getSizeInBits() % 128) == 0 && "Illegal vector");
3039 unsigned NumSubVecs = LT.second.getSizeInBits() / 128;
3040 SubNumElts = NumElts / NumSubVecs;
3041 if (SubNumElts <= Index) {
3042 RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
3043 Index %= SubNumElts;
3044 }
3045 }
3046
3047 if (Index == 0) {
3048 // Floating point scalars are already located in index #0.
3049       // Many insertions to #0 can fold away for scalar fp-ops, so let's assume
3050       // that is true for all of them.
3051 if (ScalarType->isFloatingPointTy())
3052 return RegisterFileMoveCost;
3053
3054 // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
3055 if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
3056 return 1 + RegisterFileMoveCost;
3057 }
3058
3059 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3060     assert(ISD && "Unexpected vector opcode");
3061 MVT MScalarTy = LT.second.getScalarType();
3062 if (ST->isSLM())
3063 if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
3064 return Entry->Cost + RegisterFileMoveCost;
3065
3066 // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
3067 if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
3068 (MScalarTy.isInteger() && ST->hasSSE41()))
3069 return 1 + RegisterFileMoveCost;
3070
3071 // Assume insertps is relatively cheap on all targets.
3072 if (MScalarTy == MVT::f32 && ST->hasSSE41() &&
3073 Opcode == Instruction::InsertElement)
3074 return 1 + RegisterFileMoveCost;
3075
3076 // For extractions we just need to shuffle the element to index 0, which
3077 // should be very cheap (assume cost = 1). For insertions we need to shuffle
3078     // the element to its destination. In both cases we must handle the
3079 // subvector move(s).
3080 // If the vector type is already less than 128-bits then don't reduce it.
3081 // TODO: Under what circumstances should we shuffle using the full width?
3082 int ShuffleCost = 1;
3083 if (Opcode == Instruction::InsertElement) {
3084 auto *SubTy = cast<VectorType>(Val);
3085 EVT VT = TLI->getValueType(DL, Val);
3086 if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
3087 SubTy = FixedVectorType::get(ScalarType, SubNumElts);
3088 ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, 0, SubTy);
3089 }
3090 int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
3091 return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
3092 }
3093
3094 // Add to the base cost if we know that the extracted element of a vector is
3095 // destined to be moved to and used in the integer register file.
3096 if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy())
3097 RegisterFileMoveCost += 1;
3098
3099 return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
3100}
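
As an illustration of the subvector bookkeeping above, the following standalone sketch replays the arithmetic for extracting element 5 of a v8i32 on an AVX2-class target. The constants mirror the code above, but the result is only indicative, not a guaranteed cost.

    #include <cstdio>

    // Replay of the subvector bookkeeping for extractelement of index 5
    // from a v8i32 on an AVX2-class target (illustrative constants only).
    int main() {
      unsigned NumElts = 8, Index = 5, RegisterFileMoveCost = 0;
      unsigned NumSubVecs = 256 / 128;            // one 256-bit register, two 128-bit halves
      unsigned SubNumElts = NumElts / NumSubVecs; // 4 elements per 128-bit half
      if (Index >= SubNumElts) {                  // element lives in the upper half
        RegisterFileMoveCost += 1;                // one extract of the upper subvector
        Index %= SubNumElts;                      // index 5 becomes index 1 in that half
      }
      // i32 scalar with SSE4.1 available: the pextrd itself is assumed to cost 1.
      std::printf("total cost = %u\n", 1 + RegisterFileMoveCost); // total cost = 2
      return 0;
    }
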
3101
3102unsigned X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
3103 const APInt &DemandedElts,
3104 bool Insert, bool Extract) {
3105 unsigned Cost = 0;
3106
3107   // For insertions, an ISD::BUILD_VECTOR style vector initialization can be much
3108 // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT.
3109 if (Insert) {
3110 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
3111 MVT MScalarTy = LT.second.getScalarType();
3112
3113 if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
3114 (MScalarTy.isInteger() && ST->hasSSE41()) ||
3115 (MScalarTy == MVT::f32 && ST->hasSSE41())) {
3116       // For types we can insert directly, insertion into 128-bit subvectors is
3117 // cheap, followed by a cheap chain of concatenations.
3118 if (LT.second.getSizeInBits() <= 128) {
3119 Cost +=
3120 BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, false);
3121 } else {
3122         // In each 128-bit lane, if at least one index is demanded but not
3123         // all indices are demanded and this 128-bit lane is not the first
3124         // 128-bit lane of the legalized vector, then this lane needs an
3125         // extracti128; if at least one index in the lane is demanded, the
3126         // lane also needs an inserti128.
3127
3128         // The following cases will help build a better understanding:
3129         // Assume we insert several elements into a v8i32 vector on AVX2.
3130         // Case #1: inserting into the 1st index needs vpinsrd + inserti128.
3131         // Case #2: inserting into the 5th index needs extracti128 + vpinsrd +
3132         // inserti128.
3133         // Case #3: inserting into indices 4,5,6,7 needs 4*vpinsrd + inserti128.
3134 unsigned Num128Lanes = LT.second.getSizeInBits() / 128 * LT.first;
3135 unsigned NumElts = LT.second.getVectorNumElements() * LT.first;
3136 APInt WidenedDemandedElts = DemandedElts.zextOrSelf(NumElts);
3137 unsigned Scale = NumElts / Num128Lanes;
3138 // We iterate each 128-lane, and check if we need a
3139 // extracti128/inserti128 for this 128-lane.
3140 for (unsigned I = 0; I < NumElts; I += Scale) {
3141 APInt Mask = WidenedDemandedElts.getBitsSet(NumElts, I, I + Scale);
3142 APInt MaskedDE = Mask & WidenedDemandedElts;
3143 unsigned Population = MaskedDE.countPopulation();
3144 Cost += (Population > 0 && Population != Scale &&
3145 I % LT.second.getVectorNumElements() != 0);
3146 Cost += Population > 0;
3147 }
3148 Cost += DemandedElts.countPopulation();
3149
3150 // For vXf32 cases, insertion into the 0'th index in each v4f32
3151 // 128-bit vector is free.
3152 // NOTE: This assumes legalization widens vXf32 vectors.
3153 if (MScalarTy == MVT::f32)
3154 for (unsigned i = 0, e = cast<FixedVectorType>(Ty)->getNumElements();
3155 i < e; i += 4)
3156 if (DemandedElts[i])
3157 Cost--;
3158 }
3159 } else if (LT.second.isVector()) {
3160 // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
3161 // integer element as a SCALAR_TO_VECTOR, then we build the vector as a
3162 // series of UNPCK followed by CONCAT_VECTORS - all of these can be
3163 // considered cheap.
3164 if (Ty->isIntOrIntVectorTy())
3165 Cost += DemandedElts.countPopulation();
3166
3167 // Get the smaller of the legalized or original pow2-extended number of
3168 // vector elements, which represents the number of unpacks we'll end up
3169 // performing.
3170 unsigned NumElts = LT.second.getVectorNumElements();
3171 unsigned Pow2Elts =
3172 PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements());
3173 Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
3174 }
3175 }
3176
3177 // TODO: Use default extraction for now, but we should investigate extending this
3178 // to handle repeated subvector extraction.
3179 if (Extract)
3180 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, false, Extract);
3181
3182 return Cost;
3183}
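
To make the 128-bit-lane accounting above concrete, the sketch below replays Case #2 from the comment (one element inserted at index 5 of a v8i32 on AVX2). It is a self-contained approximation of the loop over lanes, not the LLVM code itself.

    #include <cstdio>

    // Case #2: one element inserted at index 5 of a v8i32 on AVX2
    // (two 128-bit lanes of four i32 each). Illustrative only.
    int main() {
      const unsigned NumElts = 8, Scale = 4;      // elements per 128-bit lane
      bool Demanded[NumElts] = {false, false, false, false, false, true, false, false};
      unsigned Cost = 0;
      for (unsigned I = 0; I < NumElts; I += Scale) {
        unsigned Population = 0;
        for (unsigned J = I; J < I + Scale; ++J)
          Population += Demanded[J];
        Cost += (Population > 0 && Population != Scale && I != 0); // extracti128
        Cost += (Population > 0);                                  // inserti128
      }
      Cost += 1;                                  // one vpinsrd for the demanded element
      std::printf("cost = %u\n", Cost);           // prints: cost = 3
      return 0;
    }
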
3184
3185int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
3186 MaybeAlign Alignment, unsigned AddressSpace,
3187 TTI::TargetCostKind CostKind,
3188 const Instruction *I) {
3189 // TODO: Handle other cost kinds.
3190 if (CostKind != TTI::TCK_RecipThroughput) {
  20. Assuming 'CostKind' is not equal to TCK_RecipThroughput
  21. Taking true branch
3191 if (isa_and_nonnull<StoreInst>(I)) {
  22. Assuming 'I' is a 'StoreInst'
  23. Taking true branch
3192 Value *Ptr = I->getOperand(1);
  24. Called C++ object pointer is null
3193 // Store instruction with index and scale costs 2 Uops.
3194 // Check the preceding GEP to identify non-const indices.
3195 if (auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
3196 if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
3197 return TTI::TCC_Basic * 2;
3198 }
3199 }
3200 return TTI::TCC_Basic;
3201 }
3202
3203 // Handle non-power-of-two vectors such as <3 x float>
3204 if (auto *VTy = dyn_cast<FixedVectorType>(Src)) {
3205 unsigned NumElem = VTy->getNumElements();
3206
3207 // Handle a few common cases:
3208 // <3 x float>
3209 if (NumElem == 3 && VTy->getScalarSizeInBits() == 32)
3210 // Cost = 64 bit store + extract + 32 bit store.
3211 return 3;
3212
3213 // <3 x double>
3214 if (NumElem == 3 && VTy->getScalarSizeInBits() == 64)
3215 // Cost = 128 bit store + unpack + 64 bit store.
3216 return 3;
3217
3218 // Assume that all other non-power-of-two numbers are scalarized.
3219 if (!isPowerOf2_32(NumElem)) {
3220 APInt DemandedElts = APInt::getAllOnesValue(NumElem);
3221 int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment,
3222 AddressSpace, CostKind);
3223 int SplitCost = getScalarizationOverhead(VTy, DemandedElts,
3224 Opcode == Instruction::Load,
3225 Opcode == Instruction::Store);
3226 return NumElem * Cost + SplitCost;
3227 }
3228 }
3229
3230 // Type legalization can't handle structs
3231 if (TLI->getValueType(DL, Src, true) == MVT::Other)
3232 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
3233 CostKind);
3234
3235 // Legalize the type.
3236 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
3237   assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
3238          "Invalid Opcode");
3239
3240 // Each load/store unit costs 1.
3241 int Cost = LT.first * 1;
3242
3243 // This isn't exactly right. We're using slow unaligned 32-byte accesses as a
3244 // proxy for a double-pumped AVX memory interface such as on Sandybridge.
3245 if (LT.second.getStoreSize() == 32 && ST->isUnalignedMem32Slow())
3246 Cost *= 2;
3247
3248 return Cost;
3249}
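
The analyzer path above assumes 'I' may still be null when line 3192 dereferences it, even though isa_and_nonnull<StoreInst>(I) only returns true for a non-null store; the checker simply does not model that helper precisely, so the report appears to be a false positive. If one wanted to make the null check visible to the analyzer without changing behaviour, a sketch along the following lines (using only calls already present in lines 3191-3198) would do it. This is an illustration, not the upstream fix.

      // Sketch: same logic as lines 3191-3198, but with the null check spelled
      // out so the analyzer tracks it (isa_and_nonnull already implies it).
      if (I && isa<StoreInst>(I)) {
        Value *Ptr = I->getOperand(1);
        // Store instruction with index and scale costs 2 Uops.
        if (auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
          if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
            return TTI::TCC_Basic * 2;
        }
      }
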
3250
3251int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
3252 Align Alignment, unsigned AddressSpace,
3253 TTI::TargetCostKind CostKind) {
3254 bool IsLoad = (Instruction::Load == Opcode);
3255 bool IsStore = (Instruction::Store == Opcode);
3256
3257 auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy);
3258 if (!SrcVTy)
3259     // To calculate the scalar cost, take the regular cost without the mask.
3260 return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind);
3261
3262 unsigned NumElem = SrcVTy->getNumElements();
3263 auto *MaskTy =
3264 FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
3265 if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) ||
3266 (IsStore && !isLegalMaskedStore(SrcVTy, Alignment)) ||
3267 !isPowerOf2_32(NumElem)) {
3268 // Scalarization
3269 APInt DemandedElts = APInt::getAllOnesValue(NumElem);
3270 int MaskSplitCost =
3271 getScalarizationOverhead(MaskTy, DemandedElts, false, true);
3272 int ScalarCompareCost = getCmpSelInstrCost(
3273 Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
3274 CmpInst::BAD_ICMP_PREDICATE, CostKind);
3275 int BranchCost = getCFInstrCost(Instruction::Br, CostKind);
3276 int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
3277 int ValueSplitCost =
3278 getScalarizationOverhead(SrcVTy, DemandedElts, IsLoad, IsStore);
3279 int MemopCost =
3280 NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
3281 Alignment, AddressSpace, CostKind);
3282 return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
3283 }
3284
3285 // Legalize the type.
3286 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy);
3287 auto VT = TLI->getValueType(DL, SrcVTy);
3288 int Cost = 0;
3289 if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
3290 LT.second.getVectorNumElements() == NumElem)
3291 // Promotion requires expand/truncate for data and a shuffle for mask.
3292 Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, 0, nullptr) +
3293 getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, 0, nullptr);
3294
3295 else if (LT.second.getVectorNumElements() > NumElem) {
3296 auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
3297 LT.second.getVectorNumElements());
3298     // Expanding requires filling the mask with zeroes.
3299 Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy);
3300 }
3301
3302 // Pre-AVX512 - each maskmov load costs 2 + store costs ~8.
3303 if (!ST->hasAVX512())
3304 return Cost + LT.first * (IsLoad ? 2 : 8);
3305
3306   // AVX-512 masked load/store is cheaper.
3307 return Cost + LT.first;
3308}
3309
3310int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
3311 const SCEV *Ptr) {
3312 // Address computations in vectorized code with non-consecutive addresses will
3313 // likely result in more instructions compared to scalar code where the
3314 // computation can more often be merged into the index mode. The resulting
3315 // extra micro-ops can significantly decrease throughput.
3316 const unsigned NumVectorInstToHideOverhead = 10;
3317
3318 // Cost modeling of Strided Access Computation is hidden by the indexing
3319   // modes of X86 regardless of the stride value. We don't believe that there
3320   // is a difference between constant strided access in general and a constant
3321   // stride value which is less than or equal to 64.
3322 // Even in the case of (loop invariant) stride whose value is not known at
3323 // compile time, the address computation will not incur more than one extra
3324 // ADD instruction.
3325 if (Ty->isVectorTy() && SE) {
3326 if (!BaseT::isStridedAccess(Ptr))
3327 return NumVectorInstToHideOverhead;
3328 if (!BaseT::getConstantStrideStep(SE, Ptr))
3329 return 1;
3330 }
3331
3332 return BaseT::getAddressComputationCost(Ty, SE, Ptr);
3333}
3334
3335int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
3336 bool IsPairwise,
3337 TTI::TargetCostKind CostKind) {
3338 // Just use the default implementation for pair reductions.
3339 if (IsPairwise)
3340 return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise, CostKind);
3341
3342   // We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
3343   // and use that as the cost.
3344
3345 static const CostTblEntry SLMCostTblNoPairWise[] = {
3346 { ISD::FADD, MVT::v2f64, 3 },
3347 { ISD::ADD, MVT::v2i64, 5 },
3348 };
3349
3350 static const CostTblEntry SSE2CostTblNoPairWise[] = {
3351 { ISD::FADD, MVT::v2f64, 2 },
3352 { ISD::FADD, MVT::v4f32, 4 },
3353 { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
3354 { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
3355 { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
3356 { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3".
3357 { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3".
3358 { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
3359 { ISD::ADD, MVT::v2i8, 2 },
3360 { ISD::ADD, MVT::v4i8, 2 },
3361 { ISD::ADD, MVT::v8i8, 2 },
3362 { ISD::ADD, MVT::v16i8, 3 },
3363 };
3364
3365 static const CostTblEntry AVX1CostTblNoPairWise[] = {
3366 { ISD::FADD, MVT::v4f64, 3 },
3367 { ISD::FADD, MVT::v4f32, 3 },
3368 { ISD::FADD, MVT::v8f32, 4 },
3369 { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
3370 { ISD::ADD, MVT::v4i64, 3 },
3371 { ISD::ADD, MVT::v8i32, 5 },
3372 { ISD::ADD, MVT::v16i16, 5 },
3373 { ISD::ADD, MVT::v32i8, 4 },
3374 };
3375
3376 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3377   assert(ISD && "Invalid opcode");
3378
3379 // Before legalizing the type, give a chance to look up illegal narrow types
3380 // in the table.
3381 // FIXME: Is there a better way to do this?
3382 EVT VT = TLI->getValueType(DL, ValTy);
3383 if (VT.isSimple()) {
3384 MVT MTy = VT.getSimpleVT();
3385 if (ST->isSLM())
3386 if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy))
3387 return Entry->Cost;
3388
3389 if (ST->hasAVX())
3390 if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
3391 return Entry->Cost;
3392
3393 if (ST->hasSSE2())
3394 if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
3395 return Entry->Cost;
3396 }
3397
3398 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
3399
3400 MVT MTy = LT.second;
3401
3402 auto *ValVTy = cast<FixedVectorType>(ValTy);
3403
3404 unsigned ArithmeticCost = 0;
3405 if (LT.first != 1 && MTy.isVector() &&
3406 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
3407 // Type needs to be split. We need LT.first - 1 arithmetic ops.
3408 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
3409 MTy.getVectorNumElements());
3410 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
3411 ArithmeticCost *= LT.first - 1;
3412 }
3413
3414 if (ST->isSLM())
3415 if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy))
3416 return ArithmeticCost + Entry->Cost;
3417
3418 if (ST->hasAVX())
3419 if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
3420 return ArithmeticCost + Entry->Cost;
3421
3422 if (ST->hasSSE2())
3423 if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
3424 return ArithmeticCost + Entry->Cost;
3425
3426 // FIXME: These assume a naive kshift+binop lowering, which is probably
3427 // conservative in most cases.
3428 static const CostTblEntry AVX512BoolReduction[] = {
3429 { ISD::AND, MVT::v2i1, 3 },
3430 { ISD::AND, MVT::v4i1, 5 },
3431 { ISD::AND, MVT::v8i1, 7 },
3432 { ISD::AND, MVT::v16i1, 9 },
3433 { ISD::AND, MVT::v32i1, 11 },
3434 { ISD::AND, MVT::v64i1, 13 },
3435 { ISD::OR, MVT::v2i1, 3 },
3436 { ISD::OR, MVT::v4i1, 5 },
3437 { ISD::OR, MVT::v8i1, 7 },
3438 { ISD::OR, MVT::v16i1, 9 },
3439 { ISD::OR, MVT::v32i1, 11 },
3440 { ISD::OR, MVT::v64i1, 13 },
3441 };
3442
3443 static const CostTblEntry AVX2BoolReduction[] = {
3444 { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp
3445 { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp
3446 { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp
3447 { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp
3448 };
3449
3450 static const CostTblEntry AVX1BoolReduction[] = {
3451 { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp
3452 { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp
3453 { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
3454 { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
3455 { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp
3456 { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp
3457 { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
3458 { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
3459 };
3460
3461 static const CostTblEntry SSE2BoolReduction[] = {
3462 { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp
3463 { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp
3464 { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp
3465 { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp
3466 { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp
3467 { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp
3468 { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp
3469 { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp
3470 };
3471
3472 // Handle bool allof/anyof patterns.
3473 if (ValVTy->getElementType()->isIntegerTy(1)) {
3474 unsigned ArithmeticCost = 0;
3475 if (LT.first != 1 && MTy.isVector() &&
3476 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
3477 // Type needs to be split. We need LT.first - 1 arithmetic ops.
3478 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
3479 MTy.getVectorNumElements());
3480 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
3481 ArithmeticCost *= LT.first - 1;
3482 }
3483
3484 if (ST->hasAVX512())
3485 if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy))
3486 return ArithmeticCost + Entry->Cost;
3487 if (ST->hasAVX2())
3488 if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
3489 return ArithmeticCost + Entry->Cost;
3490 if (ST->hasAVX())
3491 if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
3492 return ArithmeticCost + Entry->Cost;
3493 if (ST->hasSSE2())
3494 if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
3495 return ArithmeticCost + Entry->Cost;
3496
3497 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, IsPairwise,
3498 CostKind);
3499 }
3500
3501 unsigned NumVecElts = ValVTy->getNumElements();
3502 unsigned ScalarSize = ValVTy->getScalarSizeInBits();
3503
3504 // Special case power of 2 reductions where the scalar type isn't changed
3505 // by type legalization.
3506 if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
3507 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, IsPairwise,
3508 CostKind);
3509
3510 unsigned ReductionCost = 0;
3511
3512 auto *Ty = ValVTy;
3513 if (LT.first != 1 && MTy.isVector() &&
3514 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
3515 // Type needs to be split. We need LT.first - 1 arithmetic ops.
3516 Ty = FixedVectorType::get(ValVTy->getElementType(),
3517 MTy.getVectorNumElements());
3518 ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
3519 ReductionCost *= LT.first - 1;
3520 NumVecElts = MTy.getVectorNumElements();
3521 }
3522
3523 // Now handle reduction with the legal type, taking into account size changes
3524 // at each level.
3525 while (NumVecElts > 1) {
3526 // Determine the size of the remaining vector we need to reduce.
3527 unsigned Size = NumVecElts * ScalarSize;
3528 NumVecElts /= 2;
3529 // If we're reducing from 256/512 bits, use an extract_subvector.
3530 if (Size > 128) {
3531 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
3532 ReductionCost +=
3533 getShuffleCost(TTI::SK_ExtractSubvector, Ty, NumVecElts, SubTy);
3534 Ty = SubTy;
3535 } else if (Size == 128) {
3536 // Reducing from 128 bits is a permute of v2f64/v2i64.
3537 FixedVectorType *ShufTy;
3538 if (ValVTy->isFloatingPointTy())
3539 ShufTy =
3540 FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2);
3541 else
3542 ShufTy =
3543 FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
3544 ReductionCost +=
3545 getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr);
3546 } else if (Size == 64) {
3547 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
3548 FixedVectorType *ShufTy;
3549 if (ValVTy->isFloatingPointTy())
3550 ShufTy =
3551 FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4);
3552 else
3553 ShufTy =
3554 FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
3555 ReductionCost +=
3556 getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr);
3557 } else {
3558 // Reducing from smaller size is a shift by immediate.
3559 auto *ShiftTy = FixedVectorType::get(
3560 Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size);
3561 ReductionCost += getArithmeticInstrCost(
3562 Instruction::LShr, ShiftTy, CostKind,
3563 TargetTransformInfo::OK_AnyValue,
3564 TargetTransformInfo::OK_UniformConstantValue,
3565 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
3566 }
3567
3568 // Add the arithmetic op for this level.
3569 ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind);
3570 }
3571
3572 // Add the final extract element to the cost.
3573 return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
3574}
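
The halving loop above can be traced by hand for a v8i32 add reduction: 256 bits -> extract_subvector + add, 128 bits -> permute + add, 64 bits -> shuffle + add, then one final extractelement. The standalone sketch below only counts those steps; the per-step costs still depend on the subtarget tables.

    #include <cstdio>

    // Trace of the halving loop for a v8i32 add reduction: counts the
    // shuffle-style step and the binop at each level, not their table costs.
    int main() {
      unsigned NumVecElts = 8, ScalarSize = 32;
      unsigned ShuffleSteps = 0, ArithSteps = 0;
      while (NumVecElts > 1) {
        unsigned Size = NumVecElts * ScalarSize; // 256, then 128, then 64
        NumVecElts /= 2;
        ShuffleSteps += 1; // extract_subvector (>128), permute (==128) or shuffle/shift (smaller)
        ArithSteps += 1;   // the add for this level
        (void)Size;
      }
      // prints: 3 shuffle steps + 3 adds + 1 final extractelement
      std::printf("%u shuffle steps + %u adds + 1 final extractelement\n",
                  ShuffleSteps, ArithSteps);
      return 0;
    }
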
3575
3576int X86TTIImpl::getMinMaxCost(Type *Ty, Type *CondTy, bool IsUnsigned) {
3577 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
3578
3579 MVT MTy = LT.second;
3580
3581 int ISD;
3582 if (Ty->isIntOrIntVectorTy()) {
3583 ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN;
3584 } else {
3585     assert(Ty->isFPOrFPVectorTy() &&
3586            "Expected float point or integer vector type.");
3587 ISD = ISD::FMINNUM;
3588 }
3589
3590 static const CostTblEntry SSE1CostTbl[] = {
3591 {ISD::FMINNUM, MVT::v4f32, 1},
3592 };
3593
3594 static const CostTblEntry SSE2CostTbl[] = {
3595 {ISD::FMINNUM, MVT::v2f64, 1},
3596 {ISD::SMIN, MVT::v8i16, 1},
3597 {ISD::UMIN, MVT::v16i8, 1},
3598 };
3599
3600 static const CostTblEntry SSE41CostTbl[] = {
3601 {ISD::SMIN, MVT::v4i32, 1},
3602 {ISD::UMIN, MVT::v4i32, 1},
3603 {ISD::UMIN, MVT::v8i16, 1},
3604 {ISD::SMIN, MVT::v16i8, 1},
3605 };
3606
3607 static const CostTblEntry SSE42CostTbl[] = {
3608 {ISD::UMIN, MVT::v2i64, 3}, // xor+pcmpgtq+blendvpd
3609 };
3610
3611 static const CostTblEntry AVX1CostTbl[] = {
3612 {ISD::FMINNUM, MVT::v8f32, 1},
3613 {ISD::FMINNUM, MVT::v4f64, 1},
3614 {ISD::SMIN, MVT::v8i32, 3},
3615 {ISD::UMIN, MVT::v8i32, 3},
3616 {ISD::SMIN, MVT::v16i16, 3},
3617 {ISD::UMIN, MVT::v16i16, 3},
3618 {ISD::SMIN, MVT::v32i8, 3},
3619 {ISD::UMIN, MVT::v32i8, 3},
3620 };
3621
3622 static const CostTblEntry AVX2CostTbl[] = {
3623 {ISD::SMIN, MVT::v8i32, 1},
3624 {ISD::UMIN, MVT::v8i32, 1},
3625 {ISD::SMIN, MVT::v16i16, 1},
3626 {ISD::UMIN, MVT::v16i16, 1},
3627 {ISD::SMIN, MVT::v32i8, 1},
3628 {ISD::UMIN, MVT::v32i8, 1},
3629 };
3630
3631 static const CostTblEntry AVX512CostTbl[] = {
3632 {ISD::FMINNUM, MVT::v16f32, 1},
3633 {ISD::FMINNUM, MVT::v8f64, 1},
3634 {ISD::SMIN, MVT::v2i64, 1},
3635 {ISD::UMIN, MVT::v2i64, 1},
3636 {ISD::SMIN, MVT::v4i64, 1},
3637 {ISD::UMIN, MVT::v4i64, 1},
3638 {ISD::SMIN, MVT::v8i64, 1},
3639 {ISD::UMIN, MVT::v8i64, 1},
3640 {ISD::SMIN, MVT::v16i32, 1},
3641 {ISD::UMIN, MVT::v16i32, 1},
3642 };
3643
3644 static const CostTblEntry AVX512BWCostTbl[] = {
3645 {ISD::SMIN, MVT::v32i16, 1},
3646 {ISD::UMIN, MVT::v32i16, 1},
3647 {ISD::SMIN, MVT::v64i8, 1},
3648 {ISD::UMIN, MVT::v64i8, 1},
3649 };
3650
3651 // If we have a native MIN/MAX instruction for this type, use it.
3652 if (ST->hasBWI())
3653 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
3654 return LT.first * Entry->Cost;
3655
3656 if (ST->hasAVX512())
3657 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
3658 return LT.first * Entry->Cost;
3659
3660 if (ST->hasAVX2())
3661 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
3662 return LT.first * Entry->Cost;
3663
3664 if (ST->hasAVX())
3665 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
3666 return LT.first * Entry->Cost;
3667
3668 if (ST->hasSSE42())
3669 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
3670 return LT.first * Entry->Cost;
3671
3672 if (ST->hasSSE41())
3673 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
3674 return LT.first * Entry->Cost;
3675
3676 if (ST->hasSSE2())
3677 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
3678 return LT.first * Entry->Cost;
3679
3680 if (ST->hasSSE1())
3681 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
3682 return LT.first * Entry->Cost;
3683
3684 unsigned CmpOpcode;
3685 if (Ty->isFPOrFPVectorTy()) {
3686 CmpOpcode = Instruction::FCmp;
3687 } else {
3688     assert(Ty->isIntOrIntVectorTy() &&
3689            "expecting floating point or integer type for min/max reduction");
3690 CmpOpcode = Instruction::ICmp;
3691 }
3692
3693 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3694 // Otherwise fall back to cmp+select.
3695 return getCmpSelInstrCost(CmpOpcode, Ty, CondTy, CmpInst::BAD_ICMP_PREDICATE,
3696 CostKind) +
3697 getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
3698 CmpInst::BAD_ICMP_PREDICATE, CostKind);
3699}
3700
3701int X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy,
3702 bool IsPairwise, bool IsUnsigned,
3703 TTI::TargetCostKind CostKind) {
3704 // Just use the default implementation for pair reductions.
3705 if (IsPairwise)
3706 return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned,
3707 CostKind);
3708
3709 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
3710
3711 MVT MTy = LT.second;
3712
3713 int ISD;
3714 if (ValTy->isIntOrIntVectorTy()) {
3715 ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN;
3716 } else {
3717     assert(ValTy->isFPOrFPVectorTy() &&
3718            "Expected float point or integer vector type.");
3719 ISD = ISD::FMINNUM;
3720 }
3721
3722   // We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
3723   // and use that as the cost.
3724
3725 static const CostTblEntry SSE2CostTblNoPairWise[] = {
3726 {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw
3727 {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw
3728 {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw
3729 };
3730
3731 static const CostTblEntry SSE41CostTblNoPairWise[] = {
3732 {ISD::SMIN, MVT::v2i16, 3}, // same as sse2
3733 {ISD::SMIN, MVT::v4i16, 5}, // same as sse2
3734 {ISD::UMIN, MVT::v2i16, 5}, // same as sse2
3735 {ISD::UMIN, MVT::v4i16, 7}, // same as sse2
3736 {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor
3737 {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax
3738 {ISD::SMIN, MVT::v2i8, 3}, // pminsb
3739 {ISD::SMIN, MVT::v4i8, 5}, // pminsb
3740 {ISD::SMIN, MVT::v8i8, 7}, // pminsb
3741 {ISD::SMIN, MVT::v16i8, 6},
3742 {ISD::UMIN, MVT::v2i8, 3}, // same as sse2
3743 {ISD::UMIN, MVT::v4i8, 5}, // same as sse2
3744 {ISD::UMIN, MVT::v8i8, 7}, // same as sse2
3745 {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax
3746 };
3747
3748 static const CostTblEntry AVX1CostTblNoPairWise[] = {
3749 {ISD::SMIN, MVT::v16i16, 6},
3750 {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax
3751 {ISD::SMIN, MVT::v32i8, 8},
3752 {ISD::UMIN, MVT::v32i8, 8},
3753 };
3754
3755 static const CostTblEntry AVX512BWCostTblNoPairWise[] = {
3756 {ISD::SMIN, MVT::v32i16, 8},
3757 {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax
3758 {ISD::SMIN, MVT::v64i8, 10},
3759 {ISD::UMIN, MVT::v64i8, 10},
3760 };
3761
3762 // Before legalizing the type, give a chance to look up illegal narrow types
3763 // in the table.
3764 // FIXME: Is there a better way to do this?
3765 EVT VT = TLI->getValueType(DL, ValTy);
3766 if (VT.isSimple()) {
3767 MVT MTy = VT.getSimpleVT();
3768 if (ST->hasBWI())
3769 if (const auto *Entry = CostTableLookup(AVX512BWCostTblNoPairWise, ISD, MTy))
3770 return Entry->Cost;
3771
3772 if (ST->hasAVX())
3773 if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
3774 return Entry->Cost;
3775
3776 if (ST->hasSSE41())
3777 if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy))
3778 return Entry->Cost;
3779
3780 if (ST->hasSSE2())
3781 if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
3782 return Entry->Cost;
3783 }
3784
3785 auto *ValVTy = cast<FixedVectorType>(ValTy);
3786 unsigned NumVecElts = ValVTy->getNumElements();
3787
3788 auto *Ty = ValVTy;
3789 unsigned MinMaxCost = 0;
3790 if (LT.first != 1 && MTy.isVector() &&
3791 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
3792     // Type needs to be split. We need LT.first - 1 operations.
3793 Ty = FixedVectorType::get(ValVTy->getElementType(),
3794 MTy.getVectorNumElements());
3795 auto *SubCondTy = FixedVectorType::get(CondTy->getElementType(),
3796 MTy.getVectorNumElements());
3797 MinMaxCost = getMinMaxCost(Ty, SubCondTy, IsUnsigned);
3798 MinMaxCost *= LT.first - 1;
3799 NumVecElts = MTy.getVectorNumElements();
3800 }
3801
3802 if (ST->hasBWI())
3803 if (const auto *Entry = CostTableLookup(AVX512BWCostTblNoPairWise, ISD, MTy))
3804 return MinMaxCost + Entry->Cost;
3805
3806 if (ST->hasAVX())
3807 if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
3808 return MinMaxCost + Entry->Cost;
3809
3810 if (ST->hasSSE41())
3811 if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy))
3812 return MinMaxCost + Entry->Cost;
3813
3814 if (ST->hasSSE2())
3815 if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
3816 return MinMaxCost + Entry->Cost;
3817
3818 unsigned ScalarSize = ValTy->getScalarSizeInBits();
3819
3820 // Special case power of 2 reductions where the scalar type isn't changed
3821 // by type legalization.
3822 if (!isPowerOf2_32(ValVTy->getNumElements()) ||
3823 ScalarSize != MTy.getScalarSizeInBits())
3824 return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned,
3825 CostKind);
3826
3827 // Now handle reduction with the legal type, taking into account size changes
3828 // at each level.
3829 while (NumVecElts > 1) {
3830 // Determine the size of the remaining vector we need to reduce.
3831 unsigned Size = NumVecElts * ScalarSize;
3832 NumVecElts /= 2;
3833 // If we're reducing from 256/512 bits, use an extract_subvector.
3834 if (Size > 128) {
3835 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
3836 MinMaxCost +=
3837 getShuffleCost(TTI::SK_ExtractSubvector, Ty, NumVecElts, SubTy);
3838 Ty = SubTy;
3839 } else if (Size == 128) {
3840 // Reducing from 128 bits is a permute of v2f64/v2i64.
3841 VectorType *ShufTy;
3842 if (ValTy->isFloatingPointTy())
3843 ShufTy =
3844 FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2);
3845 else
3846 ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
3847 MinMaxCost +=
3848 getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr);
3849 } else if (Size == 64) {
3850 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
3851 FixedVectorType *ShufTy;
3852 if (ValTy->isFloatingPointTy())
3853 ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4);
3854 else
3855 ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
3856 MinMaxCost +=
3857 getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr);
3858 } else {
3859 // Reducing from smaller size is a shift by immediate.
3860 auto *ShiftTy = FixedVectorType::get(
3861 Type::getIntNTy(ValTy->getContext(), Size), 128 / Size);
3862 MinMaxCost += getArithmeticInstrCost(
3863 Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput,
3864 TargetTransformInfo::OK_AnyValue,
3865 TargetTransformInfo::OK_UniformConstantValue,
3866 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
3867 }
3868
3869 // Add the arithmetic op for this level.
3870 auto *SubCondTy =
3871 FixedVectorType::get(CondTy->getElementType(), Ty->getNumElements());
3872 MinMaxCost += getMinMaxCost(Ty, SubCondTy, IsUnsigned);
3873 }
3874
3875 // Add the final extract element to the cost.
3876 return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
3877}
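The loop above walks the reduction tree: the remaining width picks the kind of shuffle (extract_subvector above 128 bits, a permute at 128/64 bits, a shift by immediate below that), and every level adds one min/max. As a rough illustration of how those pieces add up, the standalone sketch below mirrors only the accumulation; reductionCost and the unit costs are made-up names and numbers, not LLVM's, which come from the real getShuffleCost/getMinMaxCost/getArithmeticInstrCost queries.

#include <cstdio>

// Illustrative per-step costs; the real values come from the TTI hooks.
constexpr unsigned ExtractSubvectorCost = 1; // 256/512-bit -> extract_subvector
constexpr unsigned PermuteCost = 1;          // 128/64-bit  -> permute/shuffle
constexpr unsigned ShiftCost = 1;            // < 64-bit    -> shift by immediate
constexpr unsigned MinMaxOpCost = 1;         // min/max at each level

// Mirrors the shape of the loop in getMinMaxReductionCost: halve the element
// count each level and pick the "shuffle" flavour from the remaining width.
unsigned reductionCost(unsigned NumVecElts, unsigned ScalarSize) {
  unsigned Cost = 0;
  while (NumVecElts > 1) {
    unsigned Size = NumVecElts * ScalarSize;
    NumVecElts /= 2;
    if (Size > 128)
      Cost += ExtractSubvectorCost;
    else if (Size == 128 || Size == 64)
      Cost += PermuteCost;
    else
      Cost += ShiftCost;
    Cost += MinMaxOpCost; // the min/max op for this level
  }
  return Cost + 1; // final extractelement
}

int main() {
  // e.g. 16 x 32-bit elements: four halving levels plus the final extract.
  std::printf("v16i32 reduction sketch cost: %u\n", reductionCost(16, 32));
}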
3878
3879/// Calculate the cost of materializing a 64-bit value. This helper
3880/// method might only calculate a fraction of a larger immediate. Therefore it
3881/// is valid to return a cost of ZERO.
3882int X86TTIImpl::getIntImmCost(int64_t Val) {
3883 if (Val == 0)
3884 return TTI::TCC_Free;
3885
3886 if (isInt<32>(Val))
3887 return TTI::TCC_Basic;
3888
3889 return 2 * TTI::TCC_Basic;
3890}
3891
3892int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
3893 TTI::TargetCostKind CostKind) {
3894 assert(Ty->isIntegerTy());
3895
3896 unsigned BitSize = Ty->getPrimitiveSizeInBits();
3897 if (BitSize == 0)
3898 return ~0U;
3899
3900 // Never hoist constants larger than 128bit, because this might lead to
3901 // incorrect code generation or assertions in codegen.
3902 // Fixme: Create a cost model for types larger than i128 once the codegen
3903 // issues have been fixed.
3904 if (BitSize > 128)
3905 return TTI::TCC_Free;
3906
3907 if (Imm == 0)
3908 return TTI::TCC_Free;
3909
3910 // Sign-extend all constants to a multiple of 64-bit.
3911 APInt ImmVal = Imm;
3912 if (BitSize % 64 != 0)
3913 ImmVal = Imm.sext(alignTo(BitSize, 64));
3914
3915 // Split the constant into 64-bit chunks and calculate the cost for each
3916 // chunk.
3917 int Cost = 0;
3918 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
3919 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
3920 int64_t Val = Tmp.getSExtValue();
3921 Cost += getIntImmCost(Val);
3922 }
3923 // We need at least one instruction to materialize the constant.
3924 return std::max(1, Cost);
3925}
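The function splits a wide constant into sign-extended 64-bit chunks and sums a per-chunk materialization cost, clamped to at least one instruction. The sketch below mirrors that composition on a plain vector of chunks; chunkCost and wideImmCost are hypothetical helpers, and TCC_Free/TCC_Basic are assumed to be 0 and 1, following the shape of getIntImmCost(int64_t) above rather than reproducing LLVM's API.

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// Simplified stand-ins for TTI::TCC_Free / TTI::TCC_Basic.
constexpr int TCC_Free = 0;
constexpr int TCC_Basic = 1;

// Per-chunk cost, mirroring X86TTIImpl::getIntImmCost(int64_t).
int chunkCost(int64_t Val) {
  if (Val == 0)
    return TCC_Free;
  if (Val >= INT32_MIN && Val <= INT32_MAX)
    return TCC_Basic;   // fits in a sign-extended imm32
  return 2 * TCC_Basic; // needs a full 64-bit materialization
}

// A wide immediate represented as little-endian 64-bit chunks (already
// sign-extended to a multiple of 64 bits, as the APInt code does).
int wideImmCost(const std::vector<int64_t> &Chunks) {
  int Cost = 0;
  for (int64_t C : Chunks)
    Cost += chunkCost(C);
  return std::max(1, Cost); // at least one instruction overall
}

int main() {
  // An i128 whose low chunk does not fit in imm32 and whose high chunk is 0.
  std::printf("cost = %d\n", wideImmCost({INT64_C(0x100000000), 0}));
}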
3926
3927int X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
3928 const APInt &Imm, Type *Ty,
3929 TTI::TargetCostKind CostKind,
3930 Instruction *Inst) {
3931 assert(Ty->isIntegerTy());
3932
3933 unsigned BitSize = Ty->getPrimitiveSizeInBits();
3934 // There is no cost model for constants with a bit size of 0. Return TCC_Free
3935 // here, so that constant hoisting will ignore this constant.
3936 if (BitSize == 0)
3937 return TTI::TCC_Free;
3938
3939 unsigned ImmIdx = ~0U;
3940 switch (Opcode) {
3941 default:
3942 return TTI::TCC_Free;
3943 case Instruction::GetElementPtr:
3944 // Always hoist the base address of a GetElementPtr. This prevents the
3945 // creation of new constants for every base constant that gets constant
3946 // folded with the offset.
3947 if (Idx == 0)
3948 return 2 * TTI::TCC_Basic;
3949 return TTI::TCC_Free;
3950 case Instruction::Store:
3951 ImmIdx = 0;
3952 break;
3953 case Instruction::ICmp:
3954 // This is an imperfect hack to prevent constant hoisting of
3955 // compares that might be trying to check if a 64-bit value fits in
3956 // 32-bits. The backend can optimize these cases using a right shift by 32.
3957 // Ideally we would check the compare predicate here. There are also other
3958 // similar immediates the backend can use shifts for.
3959 if (Idx == 1 && Imm.getBitWidth() == 64) {
3960 uint64_t ImmVal = Imm.getZExtValue();
3961 if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
3962 return TTI::TCC_Free;
3963 }
3964 ImmIdx = 1;
3965 break;
3966 case Instruction::And:
3967 // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
3968 // by using a 32-bit operation with implicit zero extension. Detect such
3969 // immediates here as the normal path expects bit 31 to be sign extended.
3970 if (Idx == 1 && Imm.getBitWidth() == 64 && isUInt<32>(Imm.getZExtValue()))
3971 return TTI::TCC_Free;
3972 ImmIdx = 1;
3973 break;
3974 case Instruction::Add:
3975 case Instruction::Sub:
3976 // For add/sub, we can use the opposite instruction for INT32_MIN.
3977 if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.getZExtValue() == 0x80000000)
3978 return TTI::TCC_Free;
3979 ImmIdx = 1;
3980 break;
3981 case Instruction::UDiv:
3982 case Instruction::SDiv:
3983 case Instruction::URem:
3984 case Instruction::SRem:
3985 // Division by constant is typically expanded later into a different
3986 // instruction sequence. This completely changes the constants.
3987 // Report them as "free" to stop ConstantHoist from marking them as opaque.
3988 return TTI::TCC_Free;
3989 case Instruction::Mul:
3990 case Instruction::Or:
3991 case Instruction::Xor:
3992 ImmIdx = 1;
3993 break;
3994 // Always return TCC_Free for the shift value of a shift instruction.
3995 case Instruction::Shl:
3996 case Instruction::LShr:
3997 case Instruction::AShr:
3998 if (Idx == 1)
3999 return TTI::TCC_Free;
4000 break;
4001 case Instruction::Trunc:
4002 case Instruction::ZExt:
4003 case Instruction::SExt:
4004 case Instruction::IntToPtr:
4005 case Instruction::PtrToInt:
4006 case Instruction::BitCast:
4007 case Instruction::PHI:
4008 case Instruction::Call:
4009 case Instruction::Select:
4010 case Instruction::Ret:
4011 case Instruction::Load:
4012 break;
4013 }
4014
4015 if (Idx == ImmIdx) {
4016 int NumConstants = divideCeil(BitSize, 64);
4017 int Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
4018 return (Cost <= NumConstants * TTI::TCC_Basic)
4019 ? static_cast<int>(TTI::TCC_Free)
4020 : Cost;
4021 }
4022
4023 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
4024}
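The tail of getIntImmCostInst reports an immediate as free when its materialization cost does not exceed one basic op per 64-bit chunk, on the assumption that such constants fold into the using instruction. Below is a minimal sketch of that comparison; immCostAtUse is a hypothetical helper and TCC_Free/TCC_Basic are again assumed to be 0 and 1.

#include <cstdio>

constexpr int TCC_Free = 0;
constexpr int TCC_Basic = 1;

// Decide whether an immediate used directly by an instruction should be
// reported as free (i.e. expected to fold into the instruction encoding).
int immCostAtUse(unsigned BitSize, int MaterializeCost) {
  unsigned NumConstants = (BitSize + 63) / 64; // divideCeil(BitSize, 64)
  return (MaterializeCost <= int(NumConstants) * TCC_Basic)
             ? TCC_Free
             : MaterializeCost;
}

int main() {
  std::printf("%d\n", immCostAtUse(64, 1)); // cheap imm: reported free
  std::printf("%d\n", immCostAtUse(64, 2)); // needs movabs: reported as 2
}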
4025
4026int X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
4027 const APInt &Imm, Type *Ty,
4028 TTI::TargetCostKind CostKind) {
4029 assert(Ty->isIntegerTy());
4030
4031 unsigned BitSize = Ty->getPrimitiveSizeInBits();
4032 // There is no cost model for constants with a bit size of 0. Return TCC_Free
4033 // here, so that constant hoisting will ignore this constant.
4034 if (BitSize == 0)
4035 return TTI::TCC_Free;
4036
4037 switch (IID) {
4038 default:
4039 return TTI::TCC_Free;
4040 case Intrinsic::sadd_with_overflow:
4041 case Intrinsic::uadd_with_overflow:
4042 case Intrinsic::ssub_with_overflow:
4043 case Intrinsic::usub_with_overflow:
4044 case Intrinsic::smul_with_overflow:
4045 case Intrinsic::umul_with_overflow:
4046 if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
4047 return TTI::TCC_Free;
4048 break;
4049 case Intrinsic::experimental_stackmap:
4050 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
4051 return TTI::TCC_Free;
4052 break;
4053 case Intrinsic::experimental_patchpoint_void:
4054 case Intrinsic::experimental_patchpoint_i64:
4055 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
4056 return TTI::TCC_Free;
4057 break;
4058 }
4059 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
4060}
4061
4062unsigned
4063X86TTIImpl::getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) {
4064 if (CostKind != TTI::TCK_RecipThroughput)
4065 return Opcode == Instruction::PHI ? 0 : 1;
4066 // Branches are assumed to be predicted.
4067 return CostKind == TTI::TCK_RecipThroughput ? 0 : 1;
4068}
4069
4070int X86TTIImpl::getGatherOverhead() const {
4071 // Some CPUs have more overhead for gather. The specified overhead is relative
4072 // to the Load operation. "2" is the number provided by Intel architects. This
4073 // parameter is used for cost estimation of Gather Op and comparison with
4074 // other alternatives.
4075 // TODO: Remove the explicit hasAVX512()? That would mean we would only
4076 // enable gather with a -march.
4077 if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather()))
4078 return 2;
4079
4080 return 1024;
4081}
4082
4083int X86TTIImpl::getScatterOverhead() const {
4084 if (ST->hasAVX512())
4085 return 2;
4086
4087 return 1024;
4088}
4089
4090// Return an average cost of Gather / Scatter instruction, maybe improved later.
4091// FIXME: Add TargetCostKind support.
4092int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, const Value *Ptr,
4093 Align Alignment, unsigned AddressSpace) {
4094
4095 assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
4096 unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
4097
4098 // Try to reduce index size from 64 bit (default for GEP)
4099 // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
4100 // operation will use 16 x 64 indices which do not fit in a zmm and need
4101 // to be split. Also check that the base pointer is the same for all lanes,
4102 // and that there's at most one variable index.
4103 auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) {
4104 unsigned IndexSize = DL.getPointerSizeInBits();
4105 const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
4106 if (IndexSize < 64 || !GEP)
4107 return IndexSize;
4108
4109 unsigned NumOfVarIndices = 0;
4110 const Value *Ptrs = GEP->getPointerOperand();
4111 if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
4112 return IndexSize;
4113 for (unsigned i = 1; i < GEP->getNumOperands(); ++i) {
4114 if (isa<Constant>(GEP->getOperand(i)))
4115 continue;
4116 Type *IndxTy = GEP->getOperand(i)->getType();
4117 if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy))
4118 IndxTy = IndexVTy->getElementType();
4119 if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
4120 !isa<SExtInst>(GEP->getOperand(i))) ||
4121 ++NumOfVarIndices > 1)
4122 return IndexSize; // 64
4123 }
4124 return (unsigned)32;
4125 };
4126
4127 // Trying to reduce IndexSize to 32 bits for vector 16.
4128 // By default the IndexSize is equal to pointer size.
4129 unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
4130 ? getIndexSizeInBits(Ptr, DL)
4131 : DL.getPointerSizeInBits();
4132
4133 auto *IndexVTy = FixedVectorType::get(
4134 IntegerType::get(SrcVTy->getContext(), IndexSize), VF);
4135 std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy);
4136 std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy);
4137 int SplitFactor = std::max(IdxsLT.first, SrcLT.first);
4138 if (SplitFactor > 1) {
4139 // Handle splitting of vector of pointers
4140 auto *SplitSrcTy =
4141 FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
4142 return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment,
4143 AddressSpace);
4144 }
4145
4146 // The gather / scatter cost is given by Intel architects. It is a rough
4147 // number since we are looking at one instruction at a time.
4148 const int GSOverhead = (Opcode == Instruction::Load)
4149 ? getGatherOverhead()
4150 : getScatterOverhead();
4151 return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
4152 MaybeAlign(Alignment), AddressSpace,
4153 TTI::TCK_RecipThroughput);
4154}
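Putting the pieces together, the vector gather/scatter cost is a fixed overhead (2 when the hardware gathers quickly, an effectively prohibitive 1024 otherwise) plus one scalar memory op per lane, recursing when legalization splits the vector. The sketch below mirrors that shape; gsVectorCost and the per-lane cost of 1 are illustrative assumptions, not the real getMemoryOpCost query.

#include <cstdio>

constexpr int FastGSOverhead = 2;    // the number quoted from Intel architects
constexpr int SlowGSOverhead = 1024; // effectively disables gather/scatter
constexpr int ScalarMemOpCost = 1;   // illustrative per-lane load/store cost

// Mirrors the shape of getGSVectorCost: split first, then overhead + VF lanes.
int gsVectorCost(unsigned VF, unsigned SplitFactor, bool FastGather) {
  if (SplitFactor > 1)
    return SplitFactor * gsVectorCost(VF / SplitFactor, 1, FastGather);
  int Overhead = FastGather ? FastGSOverhead : SlowGSOverhead;
  return Overhead + int(VF) * ScalarMemOpCost;
}

int main() {
  std::printf("v16, no split, fast gather:   %d\n", gsVectorCost(16, 1, true));
  std::printf("v16, split in two, fast:      %d\n", gsVectorCost(16, 2, true));
}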
4155
4156/// Return the cost of full scalarization of gather / scatter operation.
4157///
4158/// Opcode - Load or Store instruction.
4159/// SrcVTy - The type of the data vector that should be gathered or scattered.
4160/// VariableMask - The mask is non-constant at compile time.
4161/// Alignment - Alignment for one element.
4162/// AddressSpace - pointer[s] address space.
4163///
4164/// FIXME: Add TargetCostKind support.
4165int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
4166 bool VariableMask, Align Alignment,
4167 unsigned AddressSpace) {
4168 unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
4169 APInt DemandedElts = APInt::getAllOnesValue(VF);
4170 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4171
4172 int MaskUnpackCost = 0;
4173 if (VariableMask) {
4174 auto *MaskTy =
4175 FixedVectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
4176 MaskUnpackCost =
4177 getScalarizationOverhead(MaskTy, DemandedElts, false, true);
4178 int ScalarCompareCost = getCmpSelInstrCost(
4179 Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()), nullptr,
4180 CmpInst::BAD_ICMP_PREDICATE, CostKind);
4181 int BranchCost = getCFInstrCost(Instruction::Br, CostKind);
4182 MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
4183 }
4184
4185 // The cost of the scalar loads/stores.
4186 int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
4187 MaybeAlign(Alignment), AddressSpace,
4188 CostKind);
4189
4190 int InsertExtractCost = 0;
4191 if (Opcode == Instruction::Load)
4192 for (unsigned i = 0; i < VF; ++i)
4193 // Add the cost of inserting each scalar load into the vector
4194 InsertExtractCost +=
4195 getVectorInstrCost(Instruction::InsertElement, SrcVTy, i);
4196 else
4197 for (unsigned i = 0; i < VF; ++i)
4198 // Add the cost of extracting each element out of the data vector
4199 InsertExtractCost +=
4200 getVectorInstrCost(Instruction::ExtractElement, SrcVTy, i);
4201
4202 return MemoryOpCost + MaskUnpackCost + InsertExtractCost;
4203}
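When the operation has to be scalarized, every lane pays for its own load or store, its own insert or extract, and, with a variable mask, a compare plus branch; the mask itself adds a scalarization overhead. A hedged sketch of that sum follows, with all unit costs assumed to be 1 and gsScalarCost a hypothetical stand-in for the function above.

#include <cstdio>

// Illustrative unit costs standing in for the real TTI queries.
constexpr int ScalarMemOpCost = 1;
constexpr int InsertExtractCost = 1;
constexpr int BranchCost = 1;
constexpr int CompareCost = 1;
constexpr int MaskUnpackBase = 1; // scalarization overhead of the i1 mask

// Mirrors getGSScalarCost: every lane gets its own load/store, its own
// insert/extract, and (with a variable mask) its own compare + branch.
int gsScalarCost(unsigned VF, bool VariableMask) {
  int MaskCost =
      VariableMask ? MaskUnpackBase + int(VF) * (BranchCost + CompareCost) : 0;
  int MemCost = int(VF) * ScalarMemOpCost;
  int VecCost = int(VF) * InsertExtractCost;
  return MaskCost + MemCost + VecCost;
}

int main() {
  std::printf("v8, constant mask: %d\n", gsScalarCost(8, false));
  std::printf("v8, variable mask: %d\n", gsScalarCost(8, true));
}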
4204
4205/// Calculate the cost of Gather / Scatter operation
4206int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
4207 const Value *Ptr, bool VariableMask,
4208 Align Alignment,
4209 TTI::TargetCostKind CostKind,
4210 const Instruction *I = nullptr) {
4211 if (CostKind != TTI::TCK_RecipThroughput) {
4212 if ((Opcode == Instruction::Load &&
4213 isLegalMaskedGather(SrcVTy, Align(Alignment))) ||
4214 (Opcode == Instruction::Store &&
4215 isLegalMaskedScatter(SrcVTy, Align(Alignment))))
4216 return 1;
4217 return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask,
4218 Alignment, CostKind, I);
4219 }
4220
4221 assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
4222 unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
4223 PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
4224 if (!PtrTy && Ptr->getType()->isVectorTy())
4225 PtrTy = dyn_cast<PointerType>(
4226 cast<VectorType>(Ptr->getType())->getElementType());
4227 assert(PtrTy && "Unexpected type for Ptr argument");
4228 unsigned AddressSpace = PtrTy->getAddressSpace();
4229
4230 bool Scalarize = false;
4231 if ((Opcode == Instruction::Load &&
4232 !isLegalMaskedGather(SrcVTy, Align(Alignment))) ||
4233 (Opcode == Instruction::Store &&
4234 !isLegalMaskedScatter(SrcVTy, Align(Alignment))))
4235 Scalarize = true;
4236 // Gather / Scatter for vector 2 is not profitable on KNL / SKX.
4237 // A vector-4 gather/scatter instruction does not exist on KNL.
4238 // We can extend it to 8 elements, but zeroing upper bits of
4239 // the mask vector will add more instructions. Right now we give the scalar
4240 // cost of vector-4 for KNL. TODO: Check whether the gather/scatter instruction
4241 // is better in the VariableMask case.
4242 if (ST->hasAVX512() && (VF == 2 || (VF == 4 && !ST->hasVLX())))
4243 Scalarize = true;
4244
4245 if (Scalarize)
4246 return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment,
4247 AddressSpace);
4248
4249 return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
4250}
4251
4252bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
4253 TargetTransformInfo::LSRCost &C2) {
4254 // X86 specific here are "instruction number 1st priority".
4255 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
4256 C1.NumIVMuls, C1.NumBaseAdds,
4257 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
4258 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
4259 C2.NumIVMuls, C2.NumBaseAdds,
4260 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
4261}
4262
4263bool X86TTIImpl::canMacroFuseCmp() {
4264 return ST->hasMacroFusion() || ST->hasBranchFusion();
4265}
4266
4267bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
4268 if (!ST->hasAVX())
4269 return false;
4270
4271 // The backend can't handle a single element vector.
4272 if (isa<VectorType>(DataTy) &&
4273 cast<FixedVectorType>(DataTy)->getNumElements() == 1)
4274 return false;
4275 Type *ScalarTy = DataTy->getScalarType();
4276
4277 if (ScalarTy->isPointerTy())
4278 return true;
4279
4280 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
4281 return true;
4282
4283 if (!ScalarTy->isIntegerTy())
4284 return false;
4285
4286 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
4287 return IntWidth == 32 || IntWidth == 64 ||
4288 ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
4289}
4290
4291bool X86TTIImpl::isLegalMaskedStore(Type *DataType, Align Alignment) {
4292 return isLegalMaskedLoad(DataType, Alignment);
4293}
4294
4295bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) {
4296 unsigned DataSize = DL.getTypeStoreSize(DataType);
4297 // The only supported nontemporal loads are for aligned vectors of 16 or 32
4298 // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2
4299 // (the equivalent stores only require AVX).
4300 if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
4301 return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2();
4302
4303 return false;
4304}
4305
4306bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) {
4307 unsigned DataSize = DL.getTypeStoreSize(DataType);
4308
4309 // SSE4A supports nontemporal stores of float and double at arbitrary
4310 // alignment.
4311 if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
4312 return true;
4313
4314 // Besides the SSE4A subtarget exception above, only aligned stores are
4315 // available nontemporally on any other subtarget. And only stores with a size
4316 // of 4..32 bytes (powers of 2 only) are permitted.
4317 if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
4318 !isPowerOf2_32(DataSize))
4319 return false;
4320
4321 // 32-byte vector nontemporal stores are supported by AVX (the equivalent
4322 // loads require AVX2).
4323 if (DataSize == 32)
4324 return ST->hasAVX();
4325 else if (DataSize == 16)
4326 return ST->hasSSE1();
4327 return true;
4328}
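The legality rule above is a chain of simple checks: SSE4A permits unaligned scalar float/double nontemporal stores, everything else needs a naturally aligned power-of-two size between 4 and 32 bytes, and the 16/32-byte cases additionally require SSE1/AVX. The predicate below restates those checks with plain bools in place of the subtarget queries; isLegalNTStoreSketch is a hypothetical name, not an LLVM API.

#include <cstdio>

// Mirrors the shape of isLegalNTStore: SSE4A allows unaligned scalar f32/f64
// NT stores; everything else needs a naturally aligned power-of-two size of
// 4..32 bytes, with 16/32-byte sizes additionally gated on SSE1/AVX.
bool isLegalNTStoreSketch(unsigned DataSize, unsigned Alignment,
                          bool IsScalarFP, bool HasSSE4A, bool HasAVX,
                          bool HasSSE1) {
  if (HasSSE4A && IsScalarFP)
    return true;
  bool IsPow2 = DataSize != 0 && (DataSize & (DataSize - 1)) == 0;
  if (Alignment < DataSize || DataSize < 4 || DataSize > 32 || !IsPow2)
    return false;
  if (DataSize == 32)
    return HasAVX;
  if (DataSize == 16)
    return HasSSE1;
  return true;
}

int main() {
  std::printf("%d\n", isLegalNTStoreSketch(32, 32, false, false, true, true));
  std::printf("%d\n", isLegalNTStoreSketch(16, 8, false, false, true, true));
}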
4329
4330bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) {
4331 if (!isa<VectorType>(DataTy))
4332 return false;
4333
4334 if (!ST->hasAVX512())
4335 return false;
4336
4337 // The backend can't handle a single element vector.
4338 if (cast<FixedVectorType>(DataTy)->getNumElements() == 1)
4339 return false;
4340
4341 Type *ScalarTy = cast<VectorType>(DataTy)->getElementType();
4342
4343 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
4344 return true;
4345
4346 if (!ScalarTy->isIntegerTy())
4347 return false;
4348
4349 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
4350 return IntWidth == 32 || IntWidth == 64 ||
4351 ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
4352}
4353
4354bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy) {
4355 return isLegalMaskedExpandLoad(DataTy);
4356}
4357
4358bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
4359 // Some CPUs have better gather performance than others.
4360 // TODO: Remove the explicit ST->hasAVX512()? That would mean we would only
4361 // enable gather with a -march.
4362 if (!(ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2())))
4363 return false;
4364
4365 // This function is called now in two cases: from the Loop Vectorizer
4366 // and from the Scalarizer.
4367 // When the Loop Vectorizer asks about legality of the feature,
4368 // the vectorization factor is not calculated yet. The Loop Vectorizer
4369 // sends a scalar type and the decision is based on the width of the
4370 // scalar element.
4371 // Later on, the cost model will estimate the usage of this intrinsic based on
4372 // the vector type.
4373 // The Scalarizer asks again about legality. It sends a vector type.
4374 // In this case we can reject non-power-of-2 vectors.
4375 // We also reject single element vectors as the type legalizer can't
4376 // scalarize it.
4377 if (auto *DataVTy = dyn_cast<FixedVectorType>(DataTy)) {
4378 unsigned NumElts = DataVTy->getNumElements();
4379 if (NumElts == 1)
4380 return false;
4381 }
4382 Type *ScalarTy = DataTy->getScalarType();
4383 if (ScalarTy->isPointerTy())
4384 return true;
4385
4386 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
4387 return true;
4388
4389 if (!ScalarTy->isIntegerTy())
4390 return false;
4391
4392 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
4393 return IntWidth == 32 || IntWidth == 64;
4394}
4395
4396bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) {
4397 // AVX2 doesn't support scatter
4398 if (!ST->hasAVX512())
4399 return false;
4400 return isLegalMaskedGather(DataType, Alignment);
4401}
4402
4403bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
4404 EVT VT = TLI->getValueType(DL, DataType);
4405 return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
4406}
4407
4408bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
4409 return false;
4410}
4411
4412bool X86TTIImpl::areInlineCompatible(const Function *Caller,
4413 const Function *Callee) const {
4414 const TargetMachine &TM = getTLI()->getTargetMachine();
4415
4416 // Work this as a subsetting of subtarget features.
4417 const FeatureBitset &CallerBits =
4418 TM.getSubtargetImpl(*Caller)->getFeatureBits();
4419 const FeatureBitset &CalleeBits =
4420 TM.getSubtargetImpl(*Callee)->getFeatureBits();
4421
4422 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
4423 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
4424 return (RealCallerBits & RealCalleeBits) == RealCalleeBits;
4425}
4426
4427bool X86TTIImpl::areFunctionArgsABICompatible(
4428 const Function *Caller, const Function *Callee,
4429 SmallPtrSetImpl<Argument *> &Args) const {
4430 if (!BaseT::areFunctionArgsABICompatible(Caller, Callee, Args))
4431 return false;
4432
4433 // If we get here, we know the target features match. If one function
4434 // considers 512-bit vectors legal and the other does not, consider them
4435 // incompatible.
4436 const TargetMachine &TM = getTLI()->getTargetMachine();
4437
4438 if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
4439 TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs())
4440 return true;
4441
4442 // Consider the arguments compatible if they aren't vectors or aggregates.
4443 // FIXME: Look at the size of vectors.
4444 // FIXME: Look at the element types of aggregates to see if there are vectors.
4445 // FIXME: The API of this function seems intended to allow arguments
4446 // to be removed from the set, but the caller doesn't check if the set
4447 // becomes empty so that may not work in practice.
4448 return llvm::none_of(Args, [](Argument *A) {
4449 auto *EltTy = cast<PointerType>(A->getType())->getElementType();
4450 return EltTy->isVectorTy() || EltTy->isAggregateType();
4451 });
4452}
4453
4454X86TTIImpl::TTI::MemCmpExpansionOptions
4455X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
4456 TTI::MemCmpExpansionOptions Options;
4457 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
4458 Options.NumLoadsPerBlock = 2;
4459 // All GPR and vector loads can be unaligned.
4460 Options.AllowOverlappingLoads = true;
4461 if (IsZeroCmp) {
4462 // Only enable vector loads for equality comparison. Right now the vector
4463 // version is not as fast for three way compare (see #33329).
4464 const unsigned PreferredWidth = ST->getPreferVectorWidth();
4465 if (PreferredWidth >= 512 && ST->hasAVX512()) Options.LoadSizes.push_back(64);
4466 if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32);
4467 if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
4468 }
4469 if (ST->is64Bit()) {
4470 Options.LoadSizes.push_back(8);
4471 }
4472 Options.LoadSizes.push_back(4);
4473 Options.LoadSizes.push_back(2);
4474 Options.LoadSizes.push_back(1);
4475 return Options;
4476}
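enableMemCmpExpansion builds its list of allowed load sizes from wide to narrow: vector widths only for equality (zero) compares and only up to the preferred vector width, then 8 bytes on 64-bit targets, then 4/2/1. The sketch below reproduces that selection with feature flags passed as plain bools; memcmpLoadSizes is a hypothetical helper, not an LLVM API.

#include <cstdio>
#include <vector>

// Mirrors how enableMemCmpExpansion picks its load sizes: vector widths only
// for equality compares, gated on the preferred vector width, then the GPR
// sizes. Feature flags are plain bools here instead of subtarget queries.
std::vector<unsigned> memcmpLoadSizes(bool IsZeroCmp, unsigned PreferredWidth,
                                      bool HasAVX512, bool HasAVX, bool HasSSE2,
                                      bool Is64Bit) {
  std::vector<unsigned> Sizes;
  if (IsZeroCmp) {
    if (PreferredWidth >= 512 && HasAVX512) Sizes.push_back(64);
    if (PreferredWidth >= 256 && HasAVX)    Sizes.push_back(32);
    if (PreferredWidth >= 128 && HasSSE2)   Sizes.push_back(16);
  }
  if (Is64Bit) Sizes.push_back(8);
  Sizes.push_back(4);
  Sizes.push_back(2);
  Sizes.push_back(1);
  return Sizes;
}

int main() {
  // Equality compare, 256-bit preferred width, AVX but no AVX-512: 32 16 8 4 2 1.
  for (unsigned S : memcmpLoadSizes(true, 256, false, true, true, true))
    std::printf("%u ", S);
  std::printf("\n");
}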
4477
4478bool X86TTIImpl::enableInterleavedAccessVectorization() {
4479 // TODO: We expect this to be beneficial regardless of arch,
4480 // but there are currently some unexplained performance artifacts on Atom.
4481 // As a temporary solution, disable on Atom.
4482 return !(ST->isAtom());
4483}
4484
4485// Get estimation for interleaved load/store operations for AVX2.
4486// \p Factor is the interleaved-access factor (stride) - number of
4487// (interleaved) elements in the group.
4488// \p Indices contains the indices for a strided load: when the
4489// interleaved load has gaps they indicate which elements are used.
4490// If Indices is empty (or if the number of indices is equal to the size
4491// of the interleaved-access as given in \p Factor) the access has no gaps.
4492//
4493// As opposed to AVX-512, AVX2 does not have generic shuffles that allow
4494// computing the cost using a generic formula as a function of generic
4495// shuffles. We therefore use a lookup table instead, filled according to
4496// the instruction sequences that codegen currently generates.
4497int X86TTIImpl::getInterleavedMemoryOpCostAVX2(
4498 unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
4499 ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
4500 TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
4501
4502 if (UseMaskForCond || UseMaskForGaps)
4503 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
4504 Alignment, AddressSpace, CostKind,
4505 UseMaskForCond, UseMaskForGaps);
4506
4507 // We currently support only fully-interleaved groups, with no gaps.
4508 // TODO: Support also strided loads (interleaved-groups with gaps).
4509 if (Indices.size() && Indices.size() != Factor)
4510 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
4511 Alignment, AddressSpace,
4512 CostKind);
4513
4514 // VecTy for interleave memop is <VF*Factor x Elt>.
4515 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
4516 // VecTy = <12 x i32>.
4517 MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
4518
4519 // This function can be called with VecTy=<6xi128>, Factor=3, in which case
4520 // the VF=2, while v2i128 is an unsupported MVT vector type
4521 // (see MachineValueType.h::getVectorVT()).
4522 if (!LegalVT.isVector())
4523 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
4524 Alignment, AddressSpace,
4525 CostKind);
4526
4527 unsigned VF = VecTy->getNumElements() / Factor;
4528 Type *ScalarTy = VecTy->getElementType();
4529
4530 // Calculate the number of memory operations (NumOfMemOps), required
4531 // for load/store the VecTy.
4532 unsigned VecTySize = DL.getTypeStoreSize(VecTy);
4533 unsigned LegalVTSize = LegalVT.getStoreSize();
4534 unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
4535
4536 // Get the cost of one memory operation.
4537 auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
4538 LegalVT.getVectorNumElements());
4539 unsigned MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy,
4540 MaybeAlign(Alignment), AddressSpace,
4541 CostKind);
4542
4543 auto *VT = FixedVectorType::get(ScalarTy, VF);
4544 EVT ETy = TLI->getValueType(DL, VT);
4545 if (!ETy.isSimple())
4546 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
4547 Alignment, AddressSpace,
4548 CostKind);
4549
4550 // TODO: Complete for other data-types and strides.
4551 // Each combination of Stride, ElementTy and VF results in a different
4552 // sequence; The cost tables are therefore accessed with:
4553 // Factor (stride) and VectorType=VFxElemType.
4554 // The Cost accounts only for the shuffle sequence;
4555 // The cost of the loads/stores is accounted for separately.
4556 //
4557 static const CostTblEntry AVX2InterleavedLoadTbl[] = {
4558 { 2, MVT::v4i64, 6 }, //(load 8i64 and) deinterleave into 2 x 4i64
4559 { 2, MVT::v4f64, 6 }, //(load 8f64 and) deinterleave into 2 x 4f64
4560
4561 { 3, MVT::v2i8, 10 }, //(load 6i8 and) deinterleave into 3 x 2i8
4562 { 3, MVT::v4i8, 4 }, //(load 12i8 and) deinterleave into 3 x 4i8
4563 { 3, MVT::v8i8, 9 }, //(load 24i8 and) deinterleave into 3 x 8i8
4564 { 3, MVT::v16i8, 11}, //(load 48i8 and) deinterleave into 3 x 16i8
4565 { 3, MVT::v32i8, 13}, //(load 96i8 and) deinterleave into 3 x 32i8
4566 { 3, MVT::v8f32, 17 }, //(load 24f32 and)deinterleave into 3 x 8f32
4567
4568 { 4, MVT::v2i8, 12 }, //(load 8i8 and) deinterleave into 4 x 2i8
4569 { 4, MVT::v4i8, 4 }, //(load 16i8 and) deinterleave into 4 x 4i8
4570 { 4, MVT::v8i8, 20 }, //(load 32i8 and) deinterleave into 4 x 8i8
4571 { 4, MVT::v16i8, 39 }, //(load 64i8 and) deinterleave into 4 x 16i8
4572 { 4, MVT::v32i8, 80 }, //(load 128i8 and) deinterleave into 4 x 32i8
4573
4574 { 8, MVT::v8f32, 40 } //(load 64f32 and)deinterleave into 8 x 8f32
4575 };
4576
4577 static const CostTblEntry AVX2InterleavedStoreTbl[] = {
4578 { 2, MVT::v4i64, 6 }, //interleave into 2 x 4i64 into 8i64 (and store)
4579 { 2, MVT::v4f64, 6 }, //interleave into 2 x 4f64 into 8f64 (and store)
4580
4581 { 3, MVT::v2i8, 7 }, //interleave 3 x 2i8 into 6i8 (and store)
4582 { 3, MVT::v4i8, 8 }, //interleave 3 x 4i8 into 12i8 (and store)
4583 { 3, MVT::v8i8, 11 }, //interleave 3 x 8i8 into 24i8 (and store)
4584 { 3, MVT::v16i8, 11 }, //interleave 3 x 16i8 into 48i8 (and store)
4585 { 3, MVT::v32i8, 13 }, //interleave 3 x 32i8 into 96i8 (and store)
4586
4587 { 4, MVT::v2i8, 12 }, //interleave 4 x 2i8 into 8i8 (and store)
4588 { 4, MVT::v4i8, 9 }, //interleave 4 x 4i8 into 16i8 (and store)
4589 { 4, MVT::v8i8, 10 }, //interleave 4 x 8i8 into 32i8 (and store)
4590 { 4, MVT::v16i8, 10 }, //interleave 4 x 16i8 into 64i8 (and store)
4591 { 4, MVT::v32i8, 12 } //interleave 4 x 32i8 into 128i8 (and store)
4592 };
4593
4594 if (Opcode == Instruction::Load) {
4595 if (const auto *Entry =
4596 CostTableLookup(AVX2InterleavedLoadTbl, Factor, ETy.getSimpleVT()))
4597 return NumOfMemOps * MemOpCost + Entry->Cost;
4598 } else {
4599 assert(Opcode == Instruction::Store &&
4600 "Expected Store Instruction at this point");
4601 if (const auto *Entry =
4602 CostTableLookup(AVX2InterleavedStoreTbl, Factor, ETy.getSimpleVT()))
4603 return NumOfMemOps * MemOpCost + Entry->Cost;
4604 }
4605
4606 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
4607 Alignment, AddressSpace, CostKind);
4608}
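For the AVX2 path the total is simply the number of legalized memory operations times the per-operation cost, plus the table entry covering the shuffle sequence. A small sketch of that arithmetic, using made-up unit costs and a hypothetical interleavedCostAVX2 helper:

#include <cstdio>

// Mirrors the AVX2 path: total cost = (#legalized memory ops) * per-op cost
// + a table entry for the shuffle sequence. All constants are illustrative.
int interleavedCostAVX2(unsigned VecTyStoreSize, unsigned LegalVTStoreSize,
                        int MemOpCost, int ShuffleTblEntryCost) {
  unsigned NumOfMemOps =
      (VecTyStoreSize + LegalVTStoreSize - 1) / LegalVTStoreSize; // ceil div
  return int(NumOfMemOps) * MemOpCost + ShuffleTblEntryCost;
}

int main() {
  // e.g. a 96-byte interleaved group legalized into 32-byte ops, with a table
  // entry of 13 for the deinterleave shuffles (the factor-3 v32i8 row above).
  std::printf("%d\n", interleavedCostAVX2(96, 32, 1, 13));
}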
4609
4610// Get estimation for interleaved load/store operations and strided load.
4611// \p Indices contains indices for strided load.
4612// \p Factor - the factor of interleaving.
4613// AVX-512 provides 3-src shuffles that significantly reduces the cost.
4614int X86TTIImpl::getInterleavedMemoryOpCostAVX512(
4615 unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
4616 ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
4617 TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
4618
4619 if (UseMaskForCond || UseMaskForGaps)
4620 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
4621 Alignment, AddressSpace, CostKind,
4622 UseMaskForCond, UseMaskForGaps);
4623
4624 // VecTy for interleave memop is <VF*Factor x Elt>.
4625 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
4626 // VecTy = <12 x i32>.
4627
4628 // Calculate the number of memory operations (NumOfMemOps), required
4629 // for load/store the VecTy.
4630 MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
4631 unsigned VecTySize = DL.getTypeStoreSize(VecTy);
4632 unsigned LegalVTSize = LegalVT.getStoreSize();
4633 unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
4634
4635 // Get the cost of one memory operation.
4636 auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
4637 LegalVT.getVectorNumElements());
4638 unsigned MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy,
4639 MaybeAlign(Alignment), AddressSpace,
4640 CostKind);
4641
4642 unsigned VF = VecTy->getNumElements() / Factor;
4643 MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);
4644
4645 if (Opcode == Instruction::Load) {
4646 // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
4647 // contain the cost of the optimized shuffle sequence that the
4648 // X86InterleavedAccess pass will generate.
4649 // The cost of loads and stores are computed separately from the table.
4650
4651 // X86InterleavedAccess supports only the following interleaved-access groups.
4652 static const CostTblEntry AVX512InterleavedLoadTbl[] = {
4653 {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8
4654 {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8
4655 {3, MVT::v64i8, 22}, //(load 192i8 and) deinterleave into 3 x 64i8
4656 };
4657
4658 if (const auto *Entry =
4659 CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
4660 return NumOfMemOps * MemOpCost + Entry->Cost;
4661 // If an entry does not exist, fall back to the default implementation.
4662
4663 // Kind of shuffle depends on number of loaded values.
4664 // If we load the entire data in one register, we can use a 1-src shuffle.
4665 // Otherwise, we'll merge 2 sources in each operation.
4666 TTI::ShuffleKind ShuffleKind =
4667 (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
4668
4669 unsigned ShuffleCost =
4670 getShuffleCost(ShuffleKind, SingleMemOpTy, 0, nullptr);
4671
4672 unsigned NumOfLoadsInInterleaveGrp =
4673 Indices.size() ? Indices.size() : Factor;
4674 auto *ResultTy = FixedVectorType::get(VecTy->getElementType(),
4675 VecTy->getNumElements() / Factor);
4676 unsigned NumOfResults =
4677 getTLI()->getTypeLegalizationCost(DL, ResultTy).first *
4678 NumOfLoadsInInterleaveGrp;
4679
4680 // About half of the loads may be folded into shuffles when we have only
4681 // one result. If we have more than one result, we do not fold loads at all.
4682 unsigned NumOfUnfoldedLoads =
4683 NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
4684
4685 // Get a number of shuffle operations per result.
4686 unsigned NumOfShufflesPerResult =
4687 std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
4688
4689 // The SK_PermuteTwoSrc shuffle clobbers one of the source operands.
4690 // When we have more than one destination, we need additional instructions
4691 // to keep sources.
4692 unsigned NumOfMoves = 0;
4693 if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
4694 NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
4695
4696 int Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
4697 NumOfUnfoldedLoads * MemOpCost + NumOfMoves;
4698
4699 return Cost;
4700 }
4701
4702 // Store.
4703 assert(Opcode == Instruction::Store &&
4704 "Expected Store Instruction at this point");
4705 // X86InterleavedAccess supports only the following interleaved-access groups.
4706 static const CostTblEntry AVX512InterleavedStoreTbl[] = {
4707 {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
4708 {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
4709 {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 96i8 (and store)
4710
4711 {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store)
4712 {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
4713 {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
4714 {4, MVT::v64i8, 24} // interleave 4 x 64i8 into 256i8 (and store)
4715 };
4716
4717 if (const auto *Entry =
4718 CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
4719 return NumOfMemOps * MemOpCost + Entry->Cost;
4720 // If an entry does not exist, fall back to the default implementation.
4721
4722 // There are no strided stores at the moment, and a store can't be folded
4723 // into a shuffle.
4724 unsigned NumOfSources = Factor; // The number of values to be merged.
4725 unsigned ShuffleCost =
4726 getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, 0, nullptr);
4727 unsigned NumOfShufflesPerStore = NumOfSources - 1;
4728
4729 // The SK_PermuteTwoSrc shuffle clobbers one of the source operands.
4730 // We need additional instructions to keep sources.
4731 unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
4732 int Cost = NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
4733 NumOfMoves;
4734 return Cost;
4735}
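When no table entry matches, the store fallback above charges each legalized memory op plus Factor - 1 two-source shuffles per store, and adds moves to preserve the shuffle sources that get clobbered. The sketch below restates that formula; interleavedStoreCostAVX512 and the unit costs are illustrative assumptions, not LLVM's values.

#include <cstdio>

// Mirrors the AVX-512 store fallback when no table entry matches:
// each memory op is preceded by (Factor - 1) two-source shuffles, plus the
// extra moves needed to preserve clobbered shuffle sources.
int interleavedStoreCostAVX512(unsigned NumOfMemOps, unsigned Factor,
                               int MemOpCost, int ShuffleCost) {
  unsigned NumOfShufflesPerStore = Factor - 1;
  unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
  return int(NumOfMemOps) *
             (MemOpCost + int(NumOfShufflesPerStore) * ShuffleCost) +
         int(NumOfMoves);
}

int main() {
  // Factor 3, two legalized memory ops, unit costs for the memory op/shuffle.
  std::printf("%d\n", interleavedStoreCostAVX512(2, 3, 1, 1));
}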
4736
4737int X86TTIImpl::getInterleavedMemoryOpCost(
4738 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
4739 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
4740 bool UseMaskForCond, bool UseMaskForGaps) {
4741 auto isSupportedOnAVX512 = [](Type *VecTy, bool HasBW) {
4742 Type *EltTy = cast<VectorType>(VecTy)->getElementType();
4743 if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
4744 EltTy->isIntegerTy(32) || EltTy->isPointerTy())
4745 return true;
4746 if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8))
4747 return HasBW;
4748 return false;
4749 };
4750 if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI()))
1. Calling 'X86Subtarget::hasAVX512'
4. Returning from 'X86Subtarget::hasAVX512'
4751 return getInterleavedMemoryOpCostAVX512(
4752 Opcode, cast<FixedVectorType>(VecTy), Factor, Indices, Alignment,
4753 AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
4754 if (ST->hasAVX2())
5. Calling 'X86Subtarget::hasAVX2'
8. Returning from 'X86Subtarget::hasAVX2'
9. Taking false branch
4755 return getInterleavedMemoryOpCostAVX2(
4756 Opcode, cast<FixedVectorType>(VecTy), Factor, Indices, Alignment,
4757 AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
4758
4759 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
10. Calling 'BasicTTIImplBase::getInterleavedMemoryOpCost'
4760 Alignment, AddressSpace, CostKind,
4761 UseMaskForCond, UseMaskForGaps);
4762}

/build/llvm-toolchain-snapshot-12~++20210105111114+53a341a61d1f/llvm/lib/Target/X86/X86Subtarget.h

1//===-- X86Subtarget.h - Define Subtarget for the X86 ----------*- C++ -*--===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file declares the X86 specific subclass of TargetSubtargetInfo.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_LIB_TARGET_X86_X86SUBTARGET_H
14#define LLVM_LIB_TARGET_X86_X86SUBTARGET_H
15
16#include "X86FrameLowering.h"
17#include "X86ISelLowering.h"
18#include "X86InstrInfo.h"
19#include "X86SelectionDAGInfo.h"
20#include "llvm/ADT/Triple.h"
21#include "llvm/CodeGen/TargetSubtargetInfo.h"
22#include "llvm/IR/CallingConv.h"
23#include <climits>
24#include <memory>
25
26#define GET_SUBTARGETINFO_HEADER
27#include "X86GenSubtargetInfo.inc"
28
29namespace llvm {
30
31class CallLowering;
32class GlobalValue;
33class InstructionSelector;
34class LegalizerInfo;
35class RegisterBankInfo;
36class StringRef;
37class TargetMachine;
38
39/// The X86 backend supports a number of different styles of PIC.
40///
41namespace PICStyles {
42
43enum class Style {
44 StubPIC, // Used on i386-darwin in pic mode.
45 GOT, // Used on 32-bit ELF when in pic mode.
46 RIPRel, // Used on X86-64 when in pic mode.
47 None // Set when not in pic mode.
48};
49
50} // end namespace PICStyles
51
52class X86Subtarget final : public X86GenSubtargetInfo {
53 // NOTE: Do not add anything new to this list. Coarse, CPU name based flags
54 // are not a good idea. We should be migrating away from these.
55 enum X86ProcFamilyEnum {
56 Others,
57 IntelAtom,
58 IntelSLM
59 };
60
61 enum X86SSEEnum {
62 NoSSE, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F
63 };
64
65 enum X863DNowEnum {
66 NoThreeDNow, MMX, ThreeDNow, ThreeDNowA
67 };
68
69 /// X86 processor family: Intel Atom, and others
70 X86ProcFamilyEnum X86ProcFamily = Others;
71
72 /// Which PIC style to use
73 PICStyles::Style PICStyle;
74
75 const TargetMachine &TM;
76
77 /// SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F, or none supported.
78 X86SSEEnum X86SSELevel = NoSSE;
79
80 /// MMX, 3DNow, 3DNow Athlon, or none supported.
81 X863DNowEnum X863DNowLevel = NoThreeDNow;
82
83 /// True if the processor supports X87 instructions.
84 bool HasX87 = false;
85
86 /// True if the processor supports CMPXCHG8B.
87 bool HasCmpxchg8b = false;
88
89 /// True if this processor has NOPL instruction
90 /// (generally pentium pro+).
91 bool HasNOPL = false;
92
93 /// True if this processor has conditional move instructions
94 /// (generally pentium pro+).
95 bool HasCMov = false;
96
97 /// True if the processor supports X86-64 instructions.
98 bool HasX86_64 = false;
99
100 /// True if the processor supports POPCNT.
101 bool HasPOPCNT = false;
102
103 /// True if the processor supports SSE4A instructions.
104 bool HasSSE4A = false;
105
106 /// Target has AES instructions
107 bool HasAES = false;
108 bool HasVAES = false;
109
110 /// Target has FXSAVE/FXRESTOR instructions
111 bool HasFXSR = false;
112
113 /// Target has XSAVE instructions
114 bool HasXSAVE = false;
115
116 /// Target has XSAVEOPT instructions
117 bool HasXSAVEOPT = false;
118
119 /// Target has XSAVEC instructions
120 bool HasXSAVEC = false;
121
122 /// Target has XSAVES instructions
123 bool HasXSAVES = false;
124
125 /// Target has carry-less multiplication
126 bool HasPCLMUL = false;
127 bool HasVPCLMULQDQ = false;
128
129 /// Target has Galois Field Arithmetic instructions
130 bool HasGFNI = false;
131
132 /// Target has 3-operand fused multiply-add
133 bool HasFMA = false;
134
135 /// Target has 4-operand fused multiply-add
136 bool HasFMA4 = false;
137
138 /// Target has XOP instructions
139 bool HasXOP = false;
140
141 /// Target has TBM instructions.
142 bool HasTBM = false;
143
144 /// Target has LWP instructions
145 bool HasLWP = false;
146
147 /// True if the processor has the MOVBE instruction.
148 bool HasMOVBE = false;
149
150 /// True if the processor has the RDRAND instruction.
151 bool HasRDRAND = false;
152
153 /// Processor has 16-bit floating point conversion instructions.
154 bool HasF16C = false;
155
156 /// Processor has FS/GS base instructions.
157 bool HasFSGSBase = false;
158
159 /// Processor has LZCNT instruction.
160 bool HasLZCNT = false;
161
162 /// Processor has BMI1 instructions.
163 bool HasBMI = false;
164
165 /// Processor has BMI2 instructions.
166 bool HasBMI2 = false;
167
168 /// Processor has VBMI instructions.
169 bool HasVBMI = false;
170
171 /// Processor has VBMI2 instructions.
172 bool HasVBMI2 = false;
173
174 /// Processor has Integer Fused Multiply Add
175 bool HasIFMA = false;
176
177 /// Processor has RTM instructions.
178 bool HasRTM = false;
179
180 /// Processor has ADX instructions.
181 bool HasADX = false;
182
183 /// Processor has SHA instructions.
184 bool HasSHA = false;
185
186 /// Processor has PRFCHW instructions.
187 bool HasPRFCHW = false;
188
189 /// Processor has RDSEED instructions.
190 bool HasRDSEED = false;
191
192 /// Processor has LAHF/SAHF instructions in 64-bit mode.
193 bool HasLAHFSAHF64 = false;
194
195 /// Processor has MONITORX/MWAITX instructions.
196 bool HasMWAITX = false;
197
198 /// Processor has Cache Line Zero instruction
199 bool HasCLZERO = false;
200
201 /// Processor has Cache Line Demote instruction
202 bool HasCLDEMOTE = false;
203
204 /// Processor has MOVDIRI instruction (direct store integer).
205 bool HasMOVDIRI = false;
206
207 /// Processor has MOVDIR64B instruction (direct store 64 bytes).
208 bool HasMOVDIR64B = false;
209
210 /// Processor has ptwrite instruction.
211 bool HasPTWRITE = false;
212
213 /// Processor has Prefetch with intent to Write instruction
214 bool HasPREFETCHWT1 = false;
215
216 /// True if SHLD instructions are slow.
217 bool IsSHLDSlow = false;
218
219 /// True if the PMULLD instruction is slow compared to PMULLW/PMULHW and
220 /// PMULUDQ.
221 bool IsPMULLDSlow = false;
222
223 /// True if the PMADDWD instruction is slow compared to PMULLD.
224 bool IsPMADDWDSlow = false;
225
226 /// True if unaligned memory accesses of 16-bytes are slow.
227 bool IsUAMem16Slow = false;
228
229 /// True if unaligned memory accesses of 32-bytes are slow.
230 bool IsUAMem32Slow = false;
231
232 /// True if SSE operations can have unaligned memory operands.
233 /// This may require setting a configuration bit in the processor.
234 bool HasSSEUnalignedMem = false;
235
236 /// True if this processor has the CMPXCHG16B instruction;
237 /// this is true for most x86-64 chips, but not the first AMD chips.
238 bool HasCmpxchg16b = false;
239
240 /// True if the LEA instruction should be used for adjusting
241 /// the stack pointer. This is an optimization for Intel Atom processors.
242 bool UseLeaForSP = false;
243
244 /// True if POPCNT instruction has a false dependency on the destination register.
245 bool HasPOPCNTFalseDeps = false;
246
247 /// True if LZCNT/TZCNT instructions have a false dependency on the destination register.
248 bool HasLZCNTFalseDeps = false;
249
250 /// True if its preferable to combine to a single shuffle using a variable
251 /// mask over multiple fixed shuffles.
252 bool HasFastVariableShuffle = false;
253
254 /// True if vzeroupper instructions should be inserted after code that uses
255 /// ymm or zmm registers.
256 bool InsertVZEROUPPER = false;
257
258 /// True if there is no performance penalty for writing NOPs with up to
259 /// 7 bytes.
260 bool HasFast7ByteNOP = false;
261
262 /// True if there is no performance penalty for writing NOPs with up to
263 /// 11 bytes.
264 bool HasFast11ByteNOP = false;
265
266 /// True if there is no performance penalty for writing NOPs with up to
267 /// 15 bytes.
268 bool HasFast15ByteNOP = false;
269
270 /// True if gather is reasonably fast. This is true for Skylake client and
271 /// all AVX-512 CPUs.
272 bool HasFastGather = false;
273
274 /// True if hardware SQRTSS instruction is at least as fast (latency) as
275 /// RSQRTSS followed by a Newton-Raphson iteration.
276 bool HasFastScalarFSQRT = false;
277
278 /// True if hardware SQRTPS/VSQRTPS instructions are at least as fast
279 /// (throughput) as RSQRTPS/VRSQRTPS followed by a Newton-Raphson iteration.
280 bool HasFastVectorFSQRT = false;
281
282 /// True if 8-bit divisions are significantly faster than
283 /// 32-bit divisions and should be used when possible.
284 bool HasSlowDivide32 = false;
285
286 /// True if 32-bit divides are significantly faster than
287 /// 64-bit divisions and should be used when possible.
288 bool HasSlowDivide64 = false;
289
290 /// True if LZCNT instruction is fast.
291 bool HasFastLZCNT = false;
292
293 /// True if SHLD based rotate is fast.
294 bool HasFastSHLDRotate = false;
295
296 /// True if the processor supports macrofusion.
297 bool HasMacroFusion = false;
298
299 /// True if the processor supports branch fusion.
300 bool HasBranchFusion = false;
301
302 /// True if the processor has enhanced REP MOVSB/STOSB.
303 bool HasERMSB = false;
304
305 /// True if the processor has fast short REP MOV.
306 bool HasFSRM = false;
307
308 /// True if the short functions should be padded to prevent
309 /// a stall when returning too early.
310 bool PadShortFunctions = false;
311
312 /// True if two memory operand instructions should use a temporary register
313 /// instead.
314 bool SlowTwoMemOps = false;
315
316 /// True if the LEA instruction inputs have to be ready at address generation
317 /// (AG) time.
318 bool LEAUsesAG = false;
319
320 /// True if the LEA instruction with certain arguments is slow
321 bool SlowLEA = false;
322
323 /// True if the LEA instruction has all three source operands: base, index,
324 /// and offset or if the LEA instruction uses base and index registers where
325 /// the base is EBP, RBP, or R13.
326 bool Slow3OpsLEA = false;
327
328 /// True if INC and DEC instructions are slow when writing to flags
329 bool SlowIncDec = false;
330
331 /// Processor has AVX-512 PreFetch Instructions
332 bool HasPFI = false;
333
334 /// Processor has AVX-512 Exponential and Reciprocal Instructions
335 bool HasERI = false;
336
337 /// Processor has AVX-512 Conflict Detection Instructions
338 bool HasCDI = false;
339
340 /// Processor has AVX-512 population count Instructions
341 bool HasVPOPCNTDQ = false;
342
343 /// Processor has AVX-512 Doubleword and Quadword instructions
344 bool HasDQI = false;
345
346 /// Processor has AVX-512 Byte and Word instructions
347 bool HasBWI = false;
348
349 /// Processor has AVX-512 Vector Length eXtensions
350 bool HasVLX = false;
351
352 /// Processor has PKU extensions
353 bool HasPKU = false;
354
355 /// Processor has AVX-512 Vector Neural Network Instructions
356 bool HasVNNI = false;
357
358 /// Processor has AVX Vector Neural Network Instructions
359 bool HasAVXVNNI = false;
360
361 /// Processor has AVX-512 bfloat16 floating-point extensions
362 bool HasBF16 = false;
363
364 /// Processor supports ENQCMD instructions
365 bool HasENQCMD = false;
366
367 /// Processor has AVX-512 Bit Algorithms instructions
368 bool HasBITALG = false;
369
370 /// Processor has AVX-512 vp2intersect instructions
371 bool HasVP2INTERSECT = false;
372
373 /// Processor supports CET SHSTK - Control-Flow Enforcement Technology
374 /// using Shadow Stack
375 bool HasSHSTK = false;
376
377 /// Processor supports Invalidate Process-Context Identifier
378 bool HasINVPCID = false;
379
380 /// Processor has Software Guard Extensions
381 bool HasSGX = false;
382
383 /// Processor supports Flush Cache Line instruction
384 bool HasCLFLUSHOPT = false;
385
386 /// Processor supports Cache Line Write Back instruction
387 bool HasCLWB = false;
388
389 /// Processor supports Write Back No Invalidate instruction
390 bool HasWBNOINVD = false;
391
392 /// Processor supports RDPID instruction
393 bool HasRDPID = false;
394
395 /// Processor supports WaitPKG instructions
396 bool HasWAITPKG = false;
397
398 /// Processor supports PCONFIG instruction
399 bool HasPCONFIG = false;
400
401 /// Processor supports key locker instructions
402 bool HasKL = false;
403
404 /// Processor supports key locker wide instructions
405 bool HasWIDEKL = false;
406
407 /// Processor supports HRESET instruction
408 bool HasHRESET = false;
409
410 /// Processor supports SERIALIZE instruction
411 bool HasSERIALIZE = false;
412
413 /// Processor supports TSXLDTRK instruction
414 bool HasTSXLDTRK = false;
415
416 /// Processor has AMX support
417 bool HasAMXTILE = false;
418 bool HasAMXBF16 = false;
419 bool HasAMXINT8 = false;
420
421 /// Processor supports User Level Interrupt instructions
422 bool HasUINTR = false;
423
424 /// Processor has a single uop BEXTR implementation.
425 bool HasFastBEXTR = false;
426
427 /// Try harder to combine to horizontal vector ops if they are fast.
428 bool HasFastHorizontalOps = false;
429
430 /// Prefer a left/right scalar logical shifts pair over a shift+and pair.
431 bool HasFastScalarShiftMasks = false;
432
433 /// Prefer a left/right vector logical shifts pair over a shift+and pair.
434 bool HasFastVectorShiftMasks = false;
435
436 /// Use a retpoline thunk rather than indirect calls to block speculative
437 /// execution.
438 bool UseRetpolineIndirectCalls = false;
439
440 /// Use a retpoline thunk or remove any indirect branch to block speculative
441 /// execution.
442 bool UseRetpolineIndirectBranches = false;
443
444 /// Deprecated flag, query `UseRetpolineIndirectCalls` and
445 /// `UseRetpolineIndirectBranches` instead.
446 bool DeprecatedUseRetpoline = false;
447
448 /// When using a retpoline thunk, call an externally provided thunk rather
449 /// than emitting one inside the compiler.
450 bool UseRetpolineExternalThunk = false;
451
452 /// Prevent generation of indirect call/branch instructions from memory,
453 /// and force all indirect call/branch instructions from a register to be
454 /// preceded by an LFENCE. Also decompose RET instructions into a
455 /// POP+LFENCE+JMP sequence.
456 bool UseLVIControlFlowIntegrity = false;
457
458 /// Enable Speculative Execution Side Effect Suppression
459 bool UseSpeculativeExecutionSideEffectSuppression = false;
460
461 /// Insert LFENCE instructions to prevent data speculatively injected into
462 /// loads from being used maliciously.
463 bool UseLVILoadHardening = false;
464
465 /// Use software floating point for code generation.
466 bool UseSoftFloat = false;
467
468 /// Use alias analysis during code generation.
469 bool UseAA = false;
470
471 /// The minimum alignment known to hold of the stack frame on
472 /// entry to the function and which must be maintained by every function.
473 Align stackAlignment = Align(4);
474
475 Align TileConfigAlignment = Align(4);
476
477 /// Max. memset / memcpy size that is turned into rep/movs, rep/stos ops.
478 ///
479 // FIXME: this is a known good value for Yonah. How about others?
480 unsigned MaxInlineSizeThreshold = 128;
481
482 /// Indicates target prefers 128 bit instructions.
483 bool Prefer128Bit = false;
484
485 /// Indicates target prefers 256 bit instructions.
486 bool Prefer256Bit = false;
487
488 /// Indicates target prefers AVX512 mask registers.
489 bool PreferMaskRegisters = false;
490
491 /// Use Goldmont specific floating point div/sqrt costs.
492 bool UseGLMDivSqrtCosts = false;
493
494 /// What processor and OS we're targeting.
495 Triple TargetTriple;
496
497 /// GlobalISel related APIs.
498 std::unique_ptr<CallLowering> CallLoweringInfo;
499 std::unique_ptr<LegalizerInfo> Legalizer;
500 std::unique_ptr<RegisterBankInfo> RegBankInfo;
501 std::unique_ptr<InstructionSelector> InstSelector;
502
503private:
504 /// Override the stack alignment.
505 MaybeAlign StackAlignOverride;
506
507 /// Preferred vector width from function attribute.
508 unsigned PreferVectorWidthOverride;
509
510 /// Resolved preferred vector width from function attribute and subtarget
511 /// features.
512  unsigned PreferVectorWidth = UINT32_MAX;
513
514 /// Required vector width from function attribute.
515 unsigned RequiredVectorWidth;
516
517 /// True if compiling for 64-bit, false for 16-bit or 32-bit.
518 bool In64BitMode = false;
519
520 /// True if compiling for 32-bit, false for 16-bit or 64-bit.
521 bool In32BitMode = false;
522
523 /// True if compiling for 16-bit, false for 32-bit or 64-bit.
524 bool In16BitMode = false;
525
526 X86SelectionDAGInfo TSInfo;
527 // Ordering here is important. X86InstrInfo initializes X86RegisterInfo which
528 // X86TargetLowering needs.
529 X86InstrInfo InstrInfo;
530 X86TargetLowering TLInfo;
531 X86FrameLowering FrameLowering;
532
533public:
534 /// This constructor initializes the data members to match that
535 /// of the specified triple.
536 ///
537 X86Subtarget(const Triple &TT, StringRef CPU, StringRef TuneCPU, StringRef FS,
538 const X86TargetMachine &TM, MaybeAlign StackAlignOverride,
539 unsigned PreferVectorWidthOverride,
540 unsigned RequiredVectorWidth);
541
542 const X86TargetLowering *getTargetLowering() const override {
543 return &TLInfo;
544 }
545
546 const X86InstrInfo *getInstrInfo() const override { return &InstrInfo; }
547
548 const X86FrameLowering *getFrameLowering() const override {
549 return &FrameLowering;
550 }
551
552 const X86SelectionDAGInfo *getSelectionDAGInfo() const override {
553 return &TSInfo;
554 }
555
556 const X86RegisterInfo *getRegisterInfo() const override {
557 return &getInstrInfo()->getRegisterInfo();
558 }
559
560 unsigned getTileConfigSize() const { return 64; }
561 Align getTileConfigAlignment() const { return TileConfigAlignment; }
562
563 /// Returns the minimum alignment known to hold of the
564 /// stack frame on entry to the function and which must be maintained by every
565 /// function for this subtarget.
566 Align getStackAlignment() const { return stackAlignment; }
567
568 /// Returns the maximum memset / memcpy size
569 /// that still makes it profitable to inline the call.
570 unsigned getMaxInlineSizeThreshold() const { return MaxInlineSizeThreshold; }
571
572  /// ParseSubtargetFeatures - Parses the feature string, setting the specified
573  /// subtarget options. The definition of this function is auto-generated by tblgen.
574 void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
575
576 /// Methods used by Global ISel
577 const CallLowering *getCallLowering() const override;
578 InstructionSelector *getInstructionSelector() const override;
579 const LegalizerInfo *getLegalizerInfo() const override;
580 const RegisterBankInfo *getRegBankInfo() const override;
581
582private:
583 /// Initialize the full set of dependencies so we can use an initializer
584 /// list for X86Subtarget.
585 X86Subtarget &initializeSubtargetDependencies(StringRef CPU,
586 StringRef TuneCPU,
587 StringRef FS);
588 void initSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
589
590public:
591 /// Is this x86_64? (disregarding specific ABI / programming model)
592 bool is64Bit() const {
593 return In64BitMode;
594 }
595
596 bool is32Bit() const {
597 return In32BitMode;
598 }
599
600 bool is16Bit() const {
601 return In16BitMode;
602 }
603
604 /// Is this x86_64 with the ILP32 programming model (x32 ABI)?
605 bool isTarget64BitILP32() const {
606 return In64BitMode && (TargetTriple.getEnvironment() == Triple::GNUX32 ||
607 TargetTriple.isOSNaCl());
608 }
609
610 /// Is this x86_64 with the LP64 programming model (standard AMD64, no x32)?
611 bool isTarget64BitLP64() const {
612 return In64BitMode && (TargetTriple.getEnvironment() != Triple::GNUX32 &&
613 !TargetTriple.isOSNaCl());
614 }
615
616 PICStyles::Style getPICStyle() const { return PICStyle; }
617 void setPICStyle(PICStyles::Style Style) { PICStyle = Style; }
618
619 bool hasX87() const { return HasX87; }
620 bool hasCmpxchg8b() const { return HasCmpxchg8b; }
621 bool hasNOPL() const { return HasNOPL; }
622 // SSE codegen depends on cmovs, and all SSE1+ processors support them.
623 // All 64-bit processors support cmov.
624 bool hasCMov() const { return HasCMov || X86SSELevel >= SSE1 || is64Bit(); }
625 bool hasSSE1() const { return X86SSELevel >= SSE1; }
626 bool hasSSE2() const { return X86SSELevel >= SSE2; }
627 bool hasSSE3() const { return X86SSELevel >= SSE3; }
628 bool hasSSSE3() const { return X86SSELevel >= SSSE3; }
629 bool hasSSE41() const { return X86SSELevel >= SSE41; }
630 bool hasSSE42() const { return X86SSELevel >= SSE42; }
631 bool hasAVX() const { return X86SSELevel >= AVX; }
632 bool hasAVX2() const { return X86SSELevel >= AVX2; }
6. Assuming field 'X86SSELevel' is < AVX2
7. Returning zero, which participates in a condition later
633 bool hasAVX512() const { return X86SSELevel >= AVX512F; }
2. Assuming field 'X86SSELevel' is < AVX512F
3. Returning zero, which participates in a condition later
634 bool hasInt256() const { return hasAVX2(); }
635 bool hasSSE4A() const { return HasSSE4A; }
636 bool hasMMX() const { return X863DNowLevel >= MMX; }
637 bool has3DNow() const { return X863DNowLevel >= ThreeDNow; }
638 bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; }
639 bool hasPOPCNT() const { return HasPOPCNT; }
640 bool hasAES() const { return HasAES; }
641 bool hasVAES() const { return HasVAES; }
642 bool hasFXSR() const { return HasFXSR; }
643 bool hasXSAVE() const { return HasXSAVE; }
644 bool hasXSAVEOPT() const { return HasXSAVEOPT; }
645 bool hasXSAVEC() const { return HasXSAVEC; }
646 bool hasXSAVES() const { return HasXSAVES; }
647 bool hasPCLMUL() const { return HasPCLMUL; }
648 bool hasVPCLMULQDQ() const { return HasVPCLMULQDQ; }
649 bool hasGFNI() const { return HasGFNI; }
650  // Prefer FMA4 to FMA - it's better for commutation/memory folding and
651 // has equal or better performance on all supported targets.
652 bool hasFMA() const { return HasFMA; }
653 bool hasFMA4() const { return HasFMA4; }
654 bool hasAnyFMA() const { return hasFMA() || hasFMA4(); }
655 bool hasXOP() const { return HasXOP; }
656 bool hasTBM() const { return HasTBM; }
657 bool hasLWP() const { return HasLWP; }
658 bool hasMOVBE() const { return HasMOVBE; }
659 bool hasRDRAND() const { return HasRDRAND; }
660 bool hasF16C() const { return HasF16C; }
661 bool hasFSGSBase() const { return HasFSGSBase; }
662 bool hasLZCNT() const { return HasLZCNT; }
663 bool hasBMI() const { return HasBMI; }
664 bool hasBMI2() const { return HasBMI2; }
665 bool hasVBMI() const { return HasVBMI; }
666 bool hasVBMI2() const { return HasVBMI2; }
667 bool hasIFMA() const { return HasIFMA; }
668 bool hasRTM() const { return HasRTM; }
669 bool hasADX() const { return HasADX; }
670 bool hasSHA() const { return HasSHA; }
671 bool hasPRFCHW() const { return HasPRFCHW; }
672 bool hasPREFETCHWT1() const { return HasPREFETCHWT1; }
673 bool hasPrefetchW() const {
674 // The PREFETCHW instruction was added with 3DNow but later CPUs gave it
675 // its own CPUID bit as part of deprecating 3DNow. Intel eventually added
676 // it and KNL has another that prefetches to L2 cache. We assume the
677 // L1 version exists if the L2 version does.
678 return has3DNow() || hasPRFCHW() || hasPREFETCHWT1();
679 }
680 bool hasSSEPrefetch() const {
681    // We implicitly enable these when we have a write prefetch supporting cache
682 // level OR if we have prfchw, but don't already have a read prefetch from
683 // 3dnow.
684 return hasSSE1() || (hasPRFCHW() && !has3DNow()) || hasPREFETCHWT1();
685 }
686 bool hasRDSEED() const { return HasRDSEED; }
687 bool hasLAHFSAHF() const { return HasLAHFSAHF64 || !is64Bit(); }
688 bool hasMWAITX() const { return HasMWAITX; }
689 bool hasCLZERO() const { return HasCLZERO; }
690 bool hasCLDEMOTE() const { return HasCLDEMOTE; }
691 bool hasMOVDIRI() const { return HasMOVDIRI; }
692 bool hasMOVDIR64B() const { return HasMOVDIR64B; }
693 bool hasPTWRITE() const { return HasPTWRITE; }
694 bool isSHLDSlow() const { return IsSHLDSlow; }
695 bool isPMULLDSlow() const { return IsPMULLDSlow; }
696 bool isPMADDWDSlow() const { return IsPMADDWDSlow; }
697 bool isUnalignedMem16Slow() const { return IsUAMem16Slow; }
698 bool isUnalignedMem32Slow() const { return IsUAMem32Slow; }
699 bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; }
700 bool hasCmpxchg16b() const { return HasCmpxchg16b && is64Bit(); }
701 bool useLeaForSP() const { return UseLeaForSP; }
702 bool hasPOPCNTFalseDeps() const { return HasPOPCNTFalseDeps; }
703 bool hasLZCNTFalseDeps() const { return HasLZCNTFalseDeps; }
704 bool hasFastVariableShuffle() const {
705 return HasFastVariableShuffle;
706 }
707 bool insertVZEROUPPER() const { return InsertVZEROUPPER; }
708 bool hasFastGather() const { return HasFastGather; }
709 bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; }
710 bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; }
711 bool hasFastLZCNT() const { return HasFastLZCNT; }
712 bool hasFastSHLDRotate() const { return HasFastSHLDRotate; }
713 bool hasFastBEXTR() const { return HasFastBEXTR; }
714 bool hasFastHorizontalOps() const { return HasFastHorizontalOps; }
715 bool hasFastScalarShiftMasks() const { return HasFastScalarShiftMasks; }
716 bool hasFastVectorShiftMasks() const { return HasFastVectorShiftMasks; }
717 bool hasMacroFusion() const { return HasMacroFusion; }
718 bool hasBranchFusion() const { return HasBranchFusion; }
719 bool hasERMSB() const { return HasERMSB; }
720 bool hasFSRM() const { return HasFSRM; }
721 bool hasSlowDivide32() const { return HasSlowDivide32; }
722 bool hasSlowDivide64() const { return HasSlowDivide64; }
723 bool padShortFunctions() const { return PadShortFunctions; }
724 bool slowTwoMemOps() const { return SlowTwoMemOps; }
725 bool LEAusesAG() const { return LEAUsesAG; }
726 bool slowLEA() const { return SlowLEA; }
727 bool slow3OpsLEA() const { return Slow3OpsLEA; }
728 bool slowIncDec() const { return SlowIncDec; }
729 bool hasCDI() const { return HasCDI; }
730 bool hasVPOPCNTDQ() const { return HasVPOPCNTDQ; }
731 bool hasPFI() const { return HasPFI; }
732 bool hasERI() const { return HasERI; }
733 bool hasDQI() const { return HasDQI; }
734 bool hasBWI() const { return HasBWI; }
735 bool hasVLX() const { return HasVLX; }
736 bool hasPKU() const { return HasPKU; }
737 bool hasVNNI() const { return HasVNNI; }
738 bool hasBF16() const { return HasBF16; }
739 bool hasVP2INTERSECT() const { return HasVP2INTERSECT; }
740 bool hasBITALG() const { return HasBITALG; }
741 bool hasSHSTK() const { return HasSHSTK; }
742 bool hasCLFLUSHOPT() const { return HasCLFLUSHOPT; }
743 bool hasCLWB() const { return HasCLWB; }
744 bool hasWBNOINVD() const { return HasWBNOINVD; }
745 bool hasRDPID() const { return HasRDPID; }
746 bool hasWAITPKG() const { return HasWAITPKG; }
747 bool hasPCONFIG() const { return HasPCONFIG; }
748 bool hasSGX() const { return HasSGX; }
749 bool hasINVPCID() const { return HasINVPCID; }
750 bool hasENQCMD() const { return HasENQCMD; }
751 bool hasKL() const { return HasKL; }
752 bool hasWIDEKL() const { return HasWIDEKL; }
753 bool hasHRESET() const { return HasHRESET; }
754 bool hasSERIALIZE() const { return HasSERIALIZE; }
755 bool hasTSXLDTRK() const { return HasTSXLDTRK; }
756 bool hasUINTR() const { return HasUINTR; }
757 bool useRetpolineIndirectCalls() const { return UseRetpolineIndirectCalls; }
758 bool useRetpolineIndirectBranches() const {
759 return UseRetpolineIndirectBranches;
760 }
761 bool hasAVXVNNI() const { return HasAVXVNNI; }
762 bool hasAMXTILE() const { return HasAMXTILE; }
763 bool hasAMXBF16() const { return HasAMXBF16; }
764 bool hasAMXINT8() const { return HasAMXINT8; }
765 bool useRetpolineExternalThunk() const { return UseRetpolineExternalThunk; }
766
767 // These are generic getters that OR together all of the thunk types
768 // supported by the subtarget. Therefore useIndirectThunk*() will return true
769 // if any respective thunk feature is enabled.
770 bool useIndirectThunkCalls() const {
771 return useRetpolineIndirectCalls() || useLVIControlFlowIntegrity();
772 }
773 bool useIndirectThunkBranches() const {
774 return useRetpolineIndirectBranches() || useLVIControlFlowIntegrity();
775 }
776
777 bool preferMaskRegisters() const { return PreferMaskRegisters; }
778 bool useGLMDivSqrtCosts() const { return UseGLMDivSqrtCosts; }
779 bool useLVIControlFlowIntegrity() const { return UseLVIControlFlowIntegrity; }
780 bool useLVILoadHardening() const { return UseLVILoadHardening; }
781 bool useSpeculativeExecutionSideEffectSuppression() const {
782 return UseSpeculativeExecutionSideEffectSuppression;
783 }
784
785 unsigned getPreferVectorWidth() const { return PreferVectorWidth; }
786 unsigned getRequiredVectorWidth() const { return RequiredVectorWidth; }
787
788 // Helper functions to determine when we should allow widening to 512-bit
789 // during codegen.
790 // TODO: Currently we're always allowing widening on CPUs without VLX,
791 // because for many cases we don't have a better option.
792 bool canExtendTo512DQ() const {
793 return hasAVX512() && (!hasVLX() || getPreferVectorWidth() >= 512);
794 }
795 bool canExtendTo512BW() const {
796 return hasBWI() && canExtendTo512DQ();
797 }
798
799 // If there are no 512-bit vectors and we prefer not to use 512-bit registers,
800 // disable them in the legalizer.
801 bool useAVX512Regs() const {
802 return hasAVX512() && (canExtendTo512DQ() || RequiredVectorWidth > 256);
803 }
804
805 bool useBWIRegs() const {
806 return hasBWI() && useAVX512Regs();
807 }
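
The four predicates above compose into the 512-bit register widening decision. The following standalone sketch mirrors just that boolean chain with plain bools in place of the real subtarget feature state; the struct name and the example values are illustrative only, not part of the listing.

#include <iostream>

// Illustrative only: reproduces the chain hasAVX512 -> canExtendTo512DQ
// -> useAVX512Regs -> useBWIRegs using plain parameters.
struct WidenQuery {
  bool HasAVX512, HasVLX, HasBWI;
  unsigned PreferVectorWidth, RequiredVectorWidth;

  bool canExtendTo512DQ() const {
    return HasAVX512 && (!HasVLX || PreferVectorWidth >= 512);
  }
  bool useAVX512Regs() const {
    return HasAVX512 && (canExtendTo512DQ() || RequiredVectorWidth > 256);
  }
  bool useBWIRegs() const { return HasBWI && useAVX512Regs(); }
};

int main() {
  // An AVX512+VLX+BWI query with prefer-vector-width resolved to 512:
  // widening to 512-bit DQ and BW registers is allowed.
  WidenQuery Q{true, true, true, 512, 0};
  std::cout << Q.useAVX512Regs() << ' ' << Q.useBWIRegs() << '\n'; // 1 1
}

With PreferVectorWidth lowered to 256 and RequiredVectorWidth left at 0, both queries flip to 0, which is the "prefer not to use 512-bit registers" case mentioned in the comment above useAVX512Regs().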
808
809 bool isXRaySupported() const override { return is64Bit(); }
810
811 /// TODO: to be removed later and replaced with suitable properties
812 bool isAtom() const { return X86ProcFamily == IntelAtom; }
813 bool isSLM() const { return X86ProcFamily == IntelSLM; }
814 bool useSoftFloat() const { return UseSoftFloat; }
815 bool useAA() const override { return UseAA; }
816
817 /// Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
818 /// no-sse2). There isn't any reason to disable it if the target processor
819 /// supports it.
820 bool hasMFence() const { return hasSSE2() || is64Bit(); }
821
822 const Triple &getTargetTriple() const { return TargetTriple; }
823
824 bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
825 bool isTargetFreeBSD() const { return TargetTriple.isOSFreeBSD(); }
826 bool isTargetDragonFly() const { return TargetTriple.isOSDragonFly(); }
827 bool isTargetSolaris() const { return TargetTriple.isOSSolaris(); }
828 bool isTargetPS4() const { return TargetTriple.isPS4CPU(); }
829
830 bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
831 bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); }
832 bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
833
834 bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
835 bool isTargetKFreeBSD() const { return TargetTriple.isOSKFreeBSD(); }
836 bool isTargetGlibc() const { return TargetTriple.isOSGlibc(); }
837 bool isTargetAndroid() const { return TargetTriple.isAndroid(); }
838 bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); }
839 bool isTargetNaCl32() const { return isTargetNaCl() && !is64Bit(); }
840 bool isTargetNaCl64() const { return isTargetNaCl() && is64Bit(); }
841 bool isTargetMCU() const { return TargetTriple.isOSIAMCU(); }
842 bool isTargetFuchsia() const { return TargetTriple.isOSFuchsia(); }
843
844 bool isTargetWindowsMSVC() const {
845 return TargetTriple.isWindowsMSVCEnvironment();
846 }
847
848 bool isTargetWindowsCoreCLR() const {
849 return TargetTriple.isWindowsCoreCLREnvironment();
850 }
851
852 bool isTargetWindowsCygwin() const {
853 return TargetTriple.isWindowsCygwinEnvironment();
854 }
855
856 bool isTargetWindowsGNU() const {
857 return TargetTriple.isWindowsGNUEnvironment();
858 }
859
860 bool isTargetWindowsItanium() const {
861 return TargetTriple.isWindowsItaniumEnvironment();
862 }
863
864 bool isTargetCygMing() const { return TargetTriple.isOSCygMing(); }
865
866 bool isOSWindows() const { return TargetTriple.isOSWindows(); }
867
868 bool isTargetWin64() const { return In64BitMode && isOSWindows(); }
869
870 bool isTargetWin32() const { return !In64BitMode && isOSWindows(); }
871
872 bool isPICStyleGOT() const { return PICStyle == PICStyles::Style::GOT; }
873 bool isPICStyleRIPRel() const { return PICStyle == PICStyles::Style::RIPRel; }
874
875 bool isPICStyleStubPIC() const {
876 return PICStyle == PICStyles::Style::StubPIC;
877 }
878
879 bool isPositionIndependent() const;
880
881 bool isCallingConvWin64(CallingConv::ID CC) const {
882 switch (CC) {
883 // On Win64, all these conventions just use the default convention.
884 case CallingConv::C:
885 case CallingConv::Fast:
886 case CallingConv::Tail:
887 case CallingConv::Swift:
888 case CallingConv::X86_FastCall:
889 case CallingConv::X86_StdCall:
890 case CallingConv::X86_ThisCall:
891 case CallingConv::X86_VectorCall:
892 case CallingConv::Intel_OCL_BI:
893 return isTargetWin64();
894 // This convention allows using the Win64 convention on other targets.
895 case CallingConv::Win64:
896 return true;
897 // This convention allows using the SysV convention on Windows targets.
898 case CallingConv::X86_64_SysV:
899 return false;
900 // Otherwise, who knows what this is.
901 default:
902 return false;
903 }
904 }
905
906 /// Classify a global variable reference for the current subtarget according
907 /// to how we should reference it in a non-pcrel context.
908 unsigned char classifyLocalReference(const GlobalValue *GV) const;
909
910 unsigned char classifyGlobalReference(const GlobalValue *GV,
911 const Module &M) const;
912 unsigned char classifyGlobalReference(const GlobalValue *GV) const;
913
914 /// Classify a global function reference for the current subtarget.
915 unsigned char classifyGlobalFunctionReference(const GlobalValue *GV,
916 const Module &M) const;
917 unsigned char classifyGlobalFunctionReference(const GlobalValue *GV) const;
918
919 /// Classify a blockaddress reference for the current subtarget according to
920 /// how we should reference it in a non-pcrel context.
921 unsigned char classifyBlockAddressReference() const;
922
923 /// Return true if the subtarget allows calls to immediate address.
924 bool isLegalToCallImmediateAddr() const;
925
926 /// If we are using indirect thunks, we need to expand indirectbr to avoid it
927 /// lowering to an actual indirect jump.
928 bool enableIndirectBrExpand() const override {
929 return useIndirectThunkBranches();
930 }
931
932 /// Enable the MachineScheduler pass for all X86 subtargets.
933 bool enableMachineScheduler() const override { return true; }
934
935 bool enableEarlyIfConversion() const override;
936
937 void getPostRAMutations(std::vector<std::unique_ptr<ScheduleDAGMutation>>
938 &Mutations) const override;
939
940 AntiDepBreakMode getAntiDepBreakMode() const override {
941 return TargetSubtargetInfo::ANTIDEP_CRITICAL;
942 }
943
944 bool enableAdvancedRASplitCost() const override { return true; }
945};
946
947} // end namespace llvm
948
949#endif // LLVM_LIB_TARGET_X86_X86SUBTARGET_H

/build/llvm-toolchain-snapshot-12~++20210105111114+53a341a61d1f/llvm/include/llvm/CodeGen/BasicTTIImpl.h

1//===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This file provides a helper that implements much of the TTI interface in
11/// terms of the target-independent code generator and TargetLowering
12/// interfaces.
13//
14//===----------------------------------------------------------------------===//
15
16#ifndef LLVM_CODEGEN_BASICTTIIMPL_H
17#define LLVM_CODEGEN_BASICTTIIMPL_H
18
19#include "llvm/ADT/APInt.h"
20#include "llvm/ADT/ArrayRef.h"
21#include "llvm/ADT/BitVector.h"
22#include "llvm/ADT/SmallPtrSet.h"
23#include "llvm/ADT/SmallVector.h"
24#include "llvm/Analysis/LoopInfo.h"
25#include "llvm/Analysis/TargetTransformInfo.h"
26#include "llvm/Analysis/TargetTransformInfoImpl.h"
27#include "llvm/CodeGen/ISDOpcodes.h"
28#include "llvm/CodeGen/TargetLowering.h"
29#include "llvm/CodeGen/TargetSubtargetInfo.h"
30#include "llvm/CodeGen/ValueTypes.h"
31#include "llvm/IR/BasicBlock.h"
32#include "llvm/IR/Constant.h"
33#include "llvm/IR/Constants.h"
34#include "llvm/IR/DataLayout.h"
35#include "llvm/IR/DerivedTypes.h"
36#include "llvm/IR/InstrTypes.h"
37#include "llvm/IR/Instruction.h"
38#include "llvm/IR/Instructions.h"
39#include "llvm/IR/Intrinsics.h"
40#include "llvm/IR/Operator.h"
41#include "llvm/IR/Type.h"
42#include "llvm/IR/Value.h"
43#include "llvm/Support/Casting.h"
44#include "llvm/Support/CommandLine.h"
45#include "llvm/Support/ErrorHandling.h"
46#include "llvm/Support/MachineValueType.h"
47#include "llvm/Support/MathExtras.h"
48#include <algorithm>
49#include <cassert>
50#include <cstdint>
51#include <limits>
52#include <utility>
53
54namespace llvm {
55
56class Function;
57class GlobalValue;
58class LLVMContext;
59class ScalarEvolution;
60class SCEV;
61class TargetMachine;
62
63extern cl::opt<unsigned> PartialUnrollingThreshold;
64
65/// Base class which can be used to help build a TTI implementation.
66///
67/// This class provides as much implementation of the TTI interface as is
68/// possible using the target independent parts of the code generator.
69///
70/// In order to subclass it, your class must implement a getST() method to
71/// return the subtarget, and a getTLI() method to return the target lowering.
72/// We need these methods implemented in the derived class so that this class
73/// doesn't have to duplicate storage for them.
74template <typename T>
75class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
76private:
77 using BaseT = TargetTransformInfoImplCRTPBase<T>;
78 using TTI = TargetTransformInfo;
79
80 /// Helper function to access this as a T.
81 T *thisT() { return static_cast<T *>(this); }
82
83 /// Estimate a cost of Broadcast as an extract and sequence of insert
84 /// operations.
85 unsigned getBroadcastShuffleOverhead(FixedVectorType *VTy) {
86 unsigned Cost = 0;
87 // Broadcast cost is equal to the cost of extracting the zero'th element
88 // plus the cost of inserting it into every element of the result vector.
89 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, 0);
90
91 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
92 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i);
93 }
94 return Cost;
95 }
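
As a concrete illustration of the formula above, the sketch below prices a broadcast of a 4-element vector assuming, purely for the example, a flat unit cost per insert or extract; the helper name and the unit-cost assumption are not from the source.

#include <iostream>

// Hypothetical flat per-element cost, standing in for getVectorInstrCost().
constexpr unsigned UnitInsertExtractCost = 1;

// Broadcast = 1 extract of element 0 + one insert per result element.
unsigned broadcastShuffleOverhead(unsigned NumElts) {
  return UnitInsertExtractCost /* extract lane 0 */ +
         NumElts * UnitInsertExtractCost /* insert into every lane */;
}

int main() {
  std::cout << broadcastShuffleOverhead(4) << '\n'; // 5 for a <4 x float>
}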
96
97 /// Estimate a cost of shuffle as a sequence of extract and insert
98 /// operations.
99 unsigned getPermuteShuffleOverhead(FixedVectorType *VTy) {
100 unsigned Cost = 0;
101    // Shuffle cost is equal to the cost of extracting each element from its argument
102    // plus the cost of inserting them into the result vector.
103
104    // e.g. <4 x float> has a mask of <0,5,2,7>, i.e. we need to extract from
105    // index 0 of first vector, index 1 of second vector, index 2 of first
106 // vector and finally index 3 of second vector and insert them at index
107 // <0,1,2,3> of result vector.
108 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
109 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i);
110 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, i);
111 }
112 return Cost;
113 }
114
115 /// Estimate a cost of subvector extraction as a sequence of extract and
116 /// insert operations.
117 unsigned getExtractSubvectorOverhead(VectorType *VTy, int Index,
118 FixedVectorType *SubVTy) {
119    assert(VTy && SubVTy &&
120           "Can only extract subvectors from vectors");
121 int NumSubElts = SubVTy->getNumElements();
122    assert((!isa<FixedVectorType>(VTy) ||
123            (Index + NumSubElts) <=
124                (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
125           "SK_ExtractSubvector index out of range");
126
127 unsigned Cost = 0;
128    // Subvector extraction cost is equal to the cost of extracting each element from
129 // the source type plus the cost of inserting them into the result vector
130 // type.
131 for (int i = 0; i != NumSubElts; ++i) {
132 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
133 i + Index);
134 Cost +=
135 thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy, i);
136 }
137 return Cost;
138 }
139
140 /// Estimate a cost of subvector insertion as a sequence of extract and
141 /// insert operations.
142 unsigned getInsertSubvectorOverhead(VectorType *VTy, int Index,
143 FixedVectorType *SubVTy) {
144    assert(VTy && SubVTy &&
145           "Can only insert subvectors into vectors");
146 int NumSubElts = SubVTy->getNumElements();
147    assert((!isa<FixedVectorType>(VTy) ||
148            (Index + NumSubElts) <=
149                (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
150           "SK_InsertSubvector index out of range");
151
152 unsigned Cost = 0;
153    // Subvector insertion cost is equal to the cost of extracting each element from
154 // the source type plus the cost of inserting them into the result vector
155 // type.
156 for (int i = 0; i != NumSubElts; ++i) {
157 Cost +=
158 thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy, i);
159 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
160 i + Index);
161 }
162 return Cost;
163 }
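
Both subvector helpers above reduce to NumSubElts pairs of extract+insert. A minimal sketch, again assuming a flat unit cost per element move (an assumption for illustration, not the target's real cost):

#include <iostream>

// NumSubElts element moves, each one extract from the source plus one insert
// into the destination, priced at an assumed unit cost apiece.
unsigned subvectorShuffleOverhead(unsigned NumSubElts) {
  const unsigned UnitCost = 1; // stand-in for getVectorInstrCost()
  return NumSubElts * (UnitCost /* extract */ + UnitCost /* insert */);
}

int main() {
  // Extracting a <2 x i32> subvector out of an <8 x i32> vector: 2 * 2 = 4.
  std::cout << subvectorShuffleOverhead(2) << '\n';
}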
164
165 /// Local query method delegates up to T which *must* implement this!
166 const TargetSubtargetInfo *getST() const {
167 return static_cast<const T *>(this)->getST();
168 }
169
170 /// Local query method delegates up to T which *must* implement this!
171 const TargetLoweringBase *getTLI() const {
172 return static_cast<const T *>(this)->getTLI();
173 }
174
175 static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) {
176 switch (M) {
177 case TTI::MIM_Unindexed:
178 return ISD::UNINDEXED;
179 case TTI::MIM_PreInc:
180 return ISD::PRE_INC;
181 case TTI::MIM_PreDec:
182 return ISD::PRE_DEC;
183 case TTI::MIM_PostInc:
184 return ISD::POST_INC;
185 case TTI::MIM_PostDec:
186 return ISD::POST_DEC;
187 }
188    llvm_unreachable("Unexpected MemIndexedMode");
189 }
190
191protected:
192 explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
193 : BaseT(DL) {}
194 virtual ~BasicTTIImplBase() = default;
195
196 using TargetTransformInfoImplBase::DL;
197
198public:
199 /// \name Scalar TTI Implementations
200 /// @{
201 bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth,
202 unsigned AddressSpace, unsigned Alignment,
203 bool *Fast) const {
204 EVT E = EVT::getIntegerVT(Context, BitWidth);
205 return getTLI()->allowsMisalignedMemoryAccesses(
206 E, AddressSpace, Alignment, MachineMemOperand::MONone, Fast);
207 }
208
209 bool hasBranchDivergence() { return false; }
210
211 bool useGPUDivergenceAnalysis() { return false; }
212
213 bool isSourceOfDivergence(const Value *V) { return false; }
214
215 bool isAlwaysUniform(const Value *V) { return false; }
216
217 unsigned getFlatAddressSpace() {
218 // Return an invalid address space.
219 return -1;
220 }
221
222 bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
223 Intrinsic::ID IID) const {
224 return false;
225 }
226
227 bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
228 return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS);
229 }
230
231 unsigned getAssumedAddrSpace(const Value *V) const {
232 return getTLI()->getTargetMachine().getAssumedAddrSpace(V);
233 }
234
235 Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
236 Value *NewV) const {
237 return nullptr;
238 }
239
240 bool isLegalAddImmediate(int64_t imm) {
241 return getTLI()->isLegalAddImmediate(imm);
242 }
243
244 bool isLegalICmpImmediate(int64_t imm) {
245 return getTLI()->isLegalICmpImmediate(imm);
246 }
247
248 bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
249 bool HasBaseReg, int64_t Scale,
250 unsigned AddrSpace, Instruction *I = nullptr) {
251 TargetLoweringBase::AddrMode AM;
252 AM.BaseGV = BaseGV;
253 AM.BaseOffs = BaseOffset;
254 AM.HasBaseReg = HasBaseReg;
255 AM.Scale = Scale;
256 return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
257 }
258
259 bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty,
260 const DataLayout &DL) const {
261 EVT VT = getTLI()->getValueType(DL, Ty);
262 return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT);
263 }
264
265 bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty,
266 const DataLayout &DL) const {
267 EVT VT = getTLI()->getValueType(DL, Ty);
268 return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT);
269 }
270
271 bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2) {
272 return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
273 }
274
275 bool isNumRegsMajorCostOfLSR() {
276 return TargetTransformInfoImplBase::isNumRegsMajorCostOfLSR();
277 }
278
279 bool isProfitableLSRChainElement(Instruction *I) {
280 return TargetTransformInfoImplBase::isProfitableLSRChainElement(I);
281 }
282
283 int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
284 bool HasBaseReg, int64_t Scale, unsigned AddrSpace) {
285 TargetLoweringBase::AddrMode AM;
286 AM.BaseGV = BaseGV;
287 AM.BaseOffs = BaseOffset;
288 AM.HasBaseReg = HasBaseReg;
289 AM.Scale = Scale;
290 return getTLI()->getScalingFactorCost(DL, AM, Ty, AddrSpace);
291 }
292
293 bool isTruncateFree(Type *Ty1, Type *Ty2) {
294 return getTLI()->isTruncateFree(Ty1, Ty2);
295 }
296
297 bool isProfitableToHoist(Instruction *I) {
298 return getTLI()->isProfitableToHoist(I);
299 }
300
301 bool useAA() const { return getST()->useAA(); }
302
303 bool isTypeLegal(Type *Ty) {
304 EVT VT = getTLI()->getValueType(DL, Ty);
305 return getTLI()->isTypeLegal(VT);
306 }
307
308 unsigned getRegUsageForType(Type *Ty) {
309 return getTLI()->getTypeLegalizationCost(DL, Ty).first;
310 }
311
312 int getGEPCost(Type *PointeeType, const Value *Ptr,
313 ArrayRef<const Value *> Operands) {
314 return BaseT::getGEPCost(PointeeType, Ptr, Operands);
315 }
316
317 unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
318 unsigned &JumpTableSize,
319 ProfileSummaryInfo *PSI,
320 BlockFrequencyInfo *BFI) {
321 /// Try to find the estimated number of clusters. Note that the number of
322 /// clusters identified in this function could be different from the actual
323  /// numbers found in lowering. This function ignores switches that are
324 /// lowered with a mix of jump table / bit test / BTree. This function was
325 /// initially intended to be used when estimating the cost of switch in
326 /// inline cost heuristic, but it's a generic cost model to be used in other
327 /// places (e.g., in loop unrolling).
328 unsigned N = SI.getNumCases();
329 const TargetLoweringBase *TLI = getTLI();
330 const DataLayout &DL = this->getDataLayout();
331
332 JumpTableSize = 0;
333 bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());
334
335 // Early exit if both a jump table and bit test are not allowed.
336 if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N))
337 return N;
338
339 APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
340 APInt MinCaseVal = MaxCaseVal;
341 for (auto CI : SI.cases()) {
342 const APInt &CaseVal = CI.getCaseValue()->getValue();
343 if (CaseVal.sgt(MaxCaseVal))
344 MaxCaseVal = CaseVal;
345 if (CaseVal.slt(MinCaseVal))
346 MinCaseVal = CaseVal;
347 }
348
349 // Check if suitable for a bit test
350 if (N <= DL.getIndexSizeInBits(0u)) {
351 SmallPtrSet<const BasicBlock *, 4> Dests;
352 for (auto I : SI.cases())
353 Dests.insert(I.getCaseSuccessor());
354
355 if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
356 DL))
357 return 1;
358 }
359
360 // Check if suitable for a jump table.
361 if (IsJTAllowed) {
362 if (N < 2 || N < TLI->getMinimumJumpTableEntries())
363 return N;
364 uint64_t Range =
365 (MaxCaseVal - MinCaseVal)
366 .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
367 // Check whether a range of clusters is dense enough for a jump table
368 if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) {
369 JumpTableSize = Range;
370 return 1;
371 }
372 }
373 return N;
374 }
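
The function above estimates whether the cases collapse into one cluster (bit test or jump table) or stay as N separate clusters. The sketch below keeps only the jump-table density part of that decision; the density threshold, the minimum table size, and the omission of the bit-test and profile checks are assumptions, since the real decision is delegated to TargetLowering.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Simplified stand-in: returns 1 if the cases look dense enough for a single
// jump table, otherwise the raw case count.
unsigned estimateCaseClusters(const std::vector<int64_t> &CaseVals,
                              unsigned MinJumpTableEntries = 4,
                              unsigned MinDensityPercent = 40) {
  unsigned N = CaseVals.size();
  if (N < 2 || N < MinJumpTableEntries)
    return N;
  auto MM = std::minmax_element(CaseVals.begin(), CaseVals.end());
  uint64_t Range = uint64_t(*MM.second - *MM.first) + 1;
  // Dense enough: one table covering the whole case range.
  if (uint64_t(N) * 100 >= Range * MinDensityPercent)
    return 1;
  return N;
}

int main() {
  std::cout << estimateCaseClusters({0, 1, 2, 3, 4, 5}) << '\n';    // 1
  std::cout << estimateCaseClusters({0, 100, 2000, 30000}) << '\n'; // 4
}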
375
376 bool shouldBuildLookupTables() {
377 const TargetLoweringBase *TLI = getTLI();
378 return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
379 TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
380 }
381
382 bool haveFastSqrt(Type *Ty) {
383 const TargetLoweringBase *TLI = getTLI();
384 EVT VT = TLI->getValueType(DL, Ty);
385 return TLI->isTypeLegal(VT) &&
386 TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
387 }
388
389 bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
390 return true;
391 }
392
393 unsigned getFPOpCost(Type *Ty) {
394 // Check whether FADD is available, as a proxy for floating-point in
395 // general.
396 const TargetLoweringBase *TLI = getTLI();
397 EVT VT = TLI->getValueType(DL, Ty);
398 if (TLI->isOperationLegalOrCustomOrPromote(ISD::FADD, VT))
399 return TargetTransformInfo::TCC_Basic;
400 return TargetTransformInfo::TCC_Expensive;
401 }
402
403 unsigned getInliningThresholdMultiplier() { return 1; }
404
405 int getInlinerVectorBonusPercent() { return 150; }
406
407 void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
408 TTI::UnrollingPreferences &UP) {
409 // This unrolling functionality is target independent, but to provide some
410 // motivation for its intended use, for x86:
411
412 // According to the Intel 64 and IA-32 Architectures Optimization Reference
413 // Manual, Intel Core models and later have a loop stream detector (and
414 // associated uop queue) that can benefit from partial unrolling.
415 // The relevant requirements are:
416 // - The loop must have no more than 4 (8 for Nehalem and later) branches
417 // taken, and none of them may be calls.
418 // - The loop can have no more than 18 (28 for Nehalem and later) uops.
419
420 // According to the Software Optimization Guide for AMD Family 15h
421 // Processors, models 30h-4fh (Steamroller and later) have a loop predictor
422 // and loop buffer which can benefit from partial unrolling.
423 // The relevant requirements are:
424 // - The loop must have fewer than 16 branches
425 // - The loop must have less than 40 uops in all executed loop branches
426
427 // The number of taken branches in a loop is hard to estimate here, and
428 // benchmarking has revealed that it is better not to be conservative when
429 // estimating the branch count. As a result, we'll ignore the branch limits
430 // until someone finds a case where it matters in practice.
431
432 unsigned MaxOps;
433 const TargetSubtargetInfo *ST = getST();
434 if (PartialUnrollingThreshold.getNumOccurrences() > 0)
435 MaxOps = PartialUnrollingThreshold;
436 else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
437 MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
438 else
439 return;
440
441 // Scan the loop: don't unroll loops with calls.
442 for (BasicBlock *BB : L->blocks()) {
443 for (Instruction &I : *BB) {
444 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
445 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
446 if (!thisT()->isLoweredToCall(F))
447 continue;
448 }
449
450 return;
451 }
452 }
453 }
454
455 // Enable runtime and partial unrolling up to the specified size.
456 // Enable using trip count upper bound to unroll loops.
457 UP.Partial = UP.Runtime = UP.UpperBound = true;
458 UP.PartialThreshold = MaxOps;
459
460 // Avoid unrolling when optimizing for size.
461 UP.OptSizeThreshold = 0;
462 UP.PartialOptSizeThreshold = 0;
463
464 // Set number of instructions optimized when "back edge"
465 // becomes "fall through" to default value of 2.
466 UP.BEInsns = 2;
467 }
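
The threshold selection above is a small decision ladder: an explicit partial-unrolling-threshold flag wins, otherwise the scheduler model's loop micro-op buffer size, otherwise partial unrolling stays off. A minimal sketch of just that ladder; the function name and the optional-style parameter are illustrative.

#include <iostream>
#include <optional>

// Returns the partial-unrolling op budget, or nullopt when partial unrolling
// should stay disabled. Mirrors only the selection order shown above.
std::optional<unsigned>
partialUnrollBudget(std::optional<unsigned> CmdLineThreshold,
                    unsigned LoopMicroOpBufferSize) {
  if (CmdLineThreshold)
    return CmdLineThreshold;      // explicit flag wins
  if (LoopMicroOpBufferSize > 0)
    return LoopMicroOpBufferSize; // per-CPU scheduler model value
  return std::nullopt;            // no budget: leave UP untouched
}

int main() {
  std::cout << partialUnrollBudget(std::nullopt, 28).value_or(0) << '\n'; // 28
  std::cout << partialUnrollBudget(50, 28).value_or(0) << '\n';           // 50
}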
468
469 void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
470 TTI::PeelingPreferences &PP) {
471 PP.PeelCount = 0;
472 PP.AllowPeeling = true;
473 PP.AllowLoopNestsPeeling = false;
474 PP.PeelProfiledIterations = true;
475 }
476
477 bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
478 AssumptionCache &AC,
479 TargetLibraryInfo *LibInfo,
480 HardwareLoopInfo &HWLoopInfo) {
481 return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
482 }
483
484 bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
485 AssumptionCache &AC, TargetLibraryInfo *TLI,
486 DominatorTree *DT,
487 const LoopAccessInfo *LAI) {
488 return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
489 }
490
491 bool emitGetActiveLaneMask() {
492 return BaseT::emitGetActiveLaneMask();
493 }
494
495 Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
496 IntrinsicInst &II) {
497 return BaseT::instCombineIntrinsic(IC, II);
498 }
499
500 Optional<Value *> simplifyDemandedUseBitsIntrinsic(InstCombiner &IC,
501 IntrinsicInst &II,
502 APInt DemandedMask,
503 KnownBits &Known,
504 bool &KnownBitsComputed) {
505 return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known,
506 KnownBitsComputed);
507 }
508
509 Optional<Value *> simplifyDemandedVectorEltsIntrinsic(
510 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
511 APInt &UndefElts2, APInt &UndefElts3,
512 std::function<void(Instruction *, unsigned, APInt, APInt &)>
513 SimplifyAndSetOp) {
514 return BaseT::simplifyDemandedVectorEltsIntrinsic(
515 IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3,
516 SimplifyAndSetOp);
517 }
518
519 int getInstructionLatency(const Instruction *I) {
520 if (isa<LoadInst>(I))
521 return getST()->getSchedModel().DefaultLoadLatency;
522
523 return BaseT::getInstructionLatency(I);
524 }
525
526 virtual Optional<unsigned>
527 getCacheSize(TargetTransformInfo::CacheLevel Level) const {
528 return Optional<unsigned>(
529 getST()->getCacheSize(static_cast<unsigned>(Level)));
530 }
531
532 virtual Optional<unsigned>
533 getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const {
534 Optional<unsigned> TargetResult =
535 getST()->getCacheAssociativity(static_cast<unsigned>(Level));
536
537 if (TargetResult)
538 return TargetResult;
539
540 return BaseT::getCacheAssociativity(Level);
541 }
542
543 virtual unsigned getCacheLineSize() const {
544 return getST()->getCacheLineSize();
545 }
546
547 virtual unsigned getPrefetchDistance() const {
548 return getST()->getPrefetchDistance();
549 }
550
551 virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses,
552 unsigned NumStridedMemAccesses,
553 unsigned NumPrefetches,
554 bool HasCall) const {
555 return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
556 NumPrefetches, HasCall);
557 }
558
559 virtual unsigned getMaxPrefetchIterationsAhead() const {
560 return getST()->getMaxPrefetchIterationsAhead();
561 }
562
563 virtual bool enableWritePrefetching() const {
564 return getST()->enableWritePrefetching();
565 }
566
567 /// @}
568
569 /// \name Vector TTI Implementations
570 /// @{
571
572 unsigned getRegisterBitWidth(bool Vector) const { return 32; }
573
574 Optional<unsigned> getMaxVScale() const { return None; }
575
576 /// Estimate the overhead of scalarizing an instruction. Insert and Extract
577 /// are set if the demanded result elements need to be inserted and/or
578 /// extracted from vectors.
579 unsigned getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts,
580 bool Insert, bool Extract) {
581 /// FIXME: a bitfield is not a reasonable abstraction for talking about
582 /// which elements are needed from a scalable vector
583 auto *Ty = cast<FixedVectorType>(InTy);
584
585    assert(DemandedElts.getBitWidth() == Ty->getNumElements() &&
586           "Vector size mismatch");
587
588 unsigned Cost = 0;
589
590 for (int i = 0, e = Ty->getNumElements(); i < e; ++i) {
591 if (!DemandedElts[i])
592 continue;
593 if (Insert)
594 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty, i);
595 if (Extract)
596 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
597 }
598
599 return Cost;
600 }
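
With a flat unit cost per insert or extract (an assumption for illustration), the loop above collapses to popcount(DemandedElts) * (Insert + Extract). A standalone sketch using a plain bitmask instead of APInt:

#include <bitset>
#include <iostream>

// Assumed unit-cost version of the DemandedElts loop above: every demanded
// lane pays one insert and/or one extract.
unsigned scalarizationOverhead(unsigned long long DemandedElts, bool Insert,
                               bool Extract) {
  unsigned Lanes = std::bitset<64>(DemandedElts).count();
  return Lanes * ((Insert ? 1u : 0u) + (Extract ? 1u : 0u));
}

int main() {
  // Lanes 0 and 2 of a vector demanded, both inserted and extracted: 2 * 2 = 4.
  std::cout << scalarizationOverhead(0b101, true, true) << '\n';
}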
601
602 /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
603 unsigned getScalarizationOverhead(VectorType *InTy, bool Insert,
604 bool Extract) {
605 auto *Ty = cast<FixedVectorType>(InTy);
606
607 APInt DemandedElts = APInt::getAllOnesValue(Ty->getNumElements());
608 return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
609 }
610
611 /// Estimate the overhead of scalarizing an instruction's unique
612 /// non-constant operands. The types of the arguments are ordinarily
613 /// scalar, in which case the costs are multiplied with VF.
614 unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
615 unsigned VF) {
616 unsigned Cost = 0;
617 SmallPtrSet<const Value*, 4> UniqueOperands;
618 for (const Value *A : Args) {
619 // Disregard things like metadata arguments.
620 Type *Ty = A->getType();
621 if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() &&
622 !Ty->isPtrOrPtrVectorTy())
623 continue;
624
625 if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
626 auto *VecTy = dyn_cast<VectorType>(Ty);
627 if (VecTy) {
628 // If A is a vector operand, VF should be 1 or correspond to A.
629        assert((VF == 1 ||
630                VF == cast<FixedVectorType>(VecTy)->getNumElements()) &&
631               "Vector argument does not match VF");
632 }
633 else
634 VecTy = FixedVectorType::get(Ty, VF);
635
636 Cost += getScalarizationOverhead(VecTy, false, true);
637 }
638 }
639
640 return Cost;
641 }
642
643 unsigned getScalarizationOverhead(VectorType *InTy,
644 ArrayRef<const Value *> Args) {
645 auto *Ty = cast<FixedVectorType>(InTy);
646
647 unsigned Cost = 0;
648
649 Cost += getScalarizationOverhead(Ty, true, false);
650 if (!Args.empty())
651 Cost += getOperandsScalarizationOverhead(Args, Ty->getNumElements());
652 else
653 // When no information on arguments is provided, we add the cost
654 // associated with one argument as a heuristic.
655 Cost += getScalarizationOverhead(Ty, false, true);
656
657 return Cost;
658 }
659
660 unsigned getMaxInterleaveFactor(unsigned VF) { return 1; }
661
662 unsigned getArithmeticInstrCost(
663 unsigned Opcode, Type *Ty,
664 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
665 TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
666 TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
667 TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
668 TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
669 ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
670 const Instruction *CxtI = nullptr) {
671 // Check if any of the operands are vector operands.
672 const TargetLoweringBase *TLI = getTLI();
673 int ISD = TLI->InstructionOpcodeToISD(Opcode);
674    assert(ISD && "Invalid opcode");
675
676 // TODO: Handle more cost kinds.
677 if (CostKind != TTI::TCK_RecipThroughput)
678 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind,
679 Opd1Info, Opd2Info,
680 Opd1PropInfo, Opd2PropInfo,
681 Args, CxtI);
682
683 std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
684
685 bool IsFloat = Ty->isFPOrFPVectorTy();
686 // Assume that floating point arithmetic operations cost twice as much as
687 // integer operations.
688 unsigned OpCost = (IsFloat ? 2 : 1);
689
690 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
691 // The operation is legal. Assume it costs 1.
692 // TODO: Once we have extract/insert subvector cost we need to use them.
693 return LT.first * OpCost;
694 }
695
696 if (!TLI->isOperationExpand(ISD, LT.second)) {
697 // If the operation is custom lowered, then assume that the code is twice
698 // as expensive.
699 return LT.first * 2 * OpCost;
700 }
701
702 // Else, assume that we need to scalarize this op.
703 // TODO: If one of the types get legalized by splitting, handle this
704 // similarly to what getCastInstrCost() does.
705 if (auto *VTy = dyn_cast<VectorType>(Ty)) {
706 unsigned Num = cast<FixedVectorType>(VTy)->getNumElements();
707 unsigned Cost = thisT()->getArithmeticInstrCost(
708 Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
709 Opd1PropInfo, Opd2PropInfo, Args, CxtI);
710      // Return the cost of multiple scalar invocations plus the cost of
711 // inserting and extracting the values.
712 return getScalarizationOverhead(VTy, Args) + Num * Cost;
713 }
714
715 // We don't know anything about this scalar instruction.
716 return OpCost;
717 }
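
The body above prices an arithmetic op in three tiers: legal or promoted, custom-lowered (twice as expensive), or scalarized. The sketch below reproduces only that tiering with a plain enum; the legalization factor, per-lane cost, and scalarization overhead are passed in as hypothetical inputs rather than queried from TargetLowering (and in the real code the per-lane cost is a recursive scalar-type query).

#include <iostream>

enum class Action { Legal, Custom, Expand }; // stand-in for the TLI legality query

// NumParts ~ LT.first (type-legalization split factor); IsFloat doubles the
// base op cost; NumElts and ScalarizationOverhead feed the Expand tier.
unsigned arithmeticCost(Action A, unsigned NumParts, bool IsFloat,
                        unsigned NumElts, unsigned ScalarizationOverhead) {
  unsigned OpCost = IsFloat ? 2 : 1;
  if (A == Action::Legal)
    return NumParts * OpCost;
  if (A == Action::Custom)
    return NumParts * 2 * OpCost;
  // Expand: one scalar op per lane plus the insert/extract traffic.
  return ScalarizationOverhead + NumElts * OpCost;
}

int main() {
  std::cout << arithmeticCost(Action::Legal, 2, true, 8, 0) << '\n';   // 4
  std::cout << arithmeticCost(Action::Expand, 1, false, 4, 8) << '\n'; // 12
}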
718
719 unsigned getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index,
720 VectorType *SubTp) {
721
722 switch (Kind) {
723 case TTI::SK_Broadcast:
724 return getBroadcastShuffleOverhead(cast<FixedVectorType>(Tp));
725 case TTI::SK_Select:
726 case TTI::SK_Reverse:
727 case TTI::SK_Transpose:
728 case TTI::SK_PermuteSingleSrc:
729 case TTI::SK_PermuteTwoSrc:
730 return getPermuteShuffleOverhead(cast<FixedVectorType>(Tp));
731 case TTI::SK_ExtractSubvector:
732 return getExtractSubvectorOverhead(Tp, Index,
733 cast<FixedVectorType>(SubTp));
734 case TTI::SK_InsertSubvector:
735 return getInsertSubvectorOverhead(Tp, Index,
736 cast<FixedVectorType>(SubTp));
737 }
738    llvm_unreachable("Unknown TTI::ShuffleKind");
740
741 unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
742 TTI::CastContextHint CCH,
743 TTI::TargetCostKind CostKind,
744 const Instruction *I = nullptr) {
745 if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0)
746 return 0;
747
748 const TargetLoweringBase *TLI = getTLI();
749 int ISD = TLI->InstructionOpcodeToISD(Opcode);
750    assert(ISD && "Invalid opcode");
751 std::pair<unsigned, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, Src);
752 std::pair<unsigned, MVT> DstLT = TLI->getTypeLegalizationCost(DL, Dst);
753
754 TypeSize SrcSize = SrcLT.second.getSizeInBits();
755 TypeSize DstSize = DstLT.second.getSizeInBits();
756 bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy();
757 bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy();
758
759 switch (Opcode) {
760 default:
761 break;
762 case Instruction::Trunc:
763 // Check for NOOP conversions.
764 if (TLI->isTruncateFree(SrcLT.second, DstLT.second))
765 return 0;
766      LLVM_FALLTHROUGH;
767 case Instruction::BitCast:
768 // Bitcast between types that are legalized to the same type are free and
769 // assume int to/from ptr of the same size is also free.
770 if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst &&
771 SrcSize == DstSize)
772 return 0;
773 break;
774 case Instruction::FPExt:
775 if (I && getTLI()->isExtFree(I))
776 return 0;
777 break;
778 case Instruction::ZExt:
779 if (TLI->isZExtFree(SrcLT.second, DstLT.second))
780 return 0;
781      LLVM_FALLTHROUGH;
782 case Instruction::SExt:
783 if (I && getTLI()->isExtFree(I))
784 return 0;
785
786 // If this is a zext/sext of a load, return 0 if the corresponding
787 // extending load exists on target.
788 if (CCH == TTI::CastContextHint::Normal) {
789 EVT ExtVT = EVT::getEVT(Dst);
790 EVT LoadVT = EVT::getEVT(Src);
791 unsigned LType =
792 ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
793 if (TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
794 return 0;
795 }
796 break;
797 case Instruction::AddrSpaceCast:
798 if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(),
799 Dst->getPointerAddressSpace()))
800 return 0;
801 break;
802 }
803
804 auto *SrcVTy = dyn_cast<VectorType>(Src);
805 auto *DstVTy = dyn_cast<VectorType>(Dst);
806
807 // If the cast is marked as legal (or promote) then assume low cost.
808 if (SrcLT.first == DstLT.first &&
809 TLI->isOperationLegalOrPromote(ISD, DstLT.second))
810 return SrcLT.first;
811
812 // Handle scalar conversions.
813 if (!SrcVTy && !DstVTy) {
814 // Just check the op cost. If the operation is legal then assume it costs
815 // 1.
816 if (!TLI->isOperationExpand(ISD, DstLT.second))
817 return 1;
818
819      // Assume that illegal scalar instructions are expensive.
820 return 4;
821 }
822
823 // Check vector-to-vector casts.
824 if (DstVTy && SrcVTy) {
825 // If the cast is between same-sized registers, then the check is simple.
826 if (SrcLT.first == DstLT.first && SrcSize == DstSize) {
827
828 // Assume that Zext is done using AND.
829 if (Opcode == Instruction::ZExt)
830 return SrcLT.first;
831
832 // Assume that sext is done using SHL and SRA.
833 if (Opcode == Instruction::SExt)
834 return SrcLT.first * 2;
835
836 // Just check the op cost. If the operation is legal then assume it
837 // costs 1 and multiply by the type-legalization
838 // overhead.
839 if (!TLI->isOperationExpand(ISD, DstLT.second))
840 return SrcLT.first * 1;
841 }
842
843 // If we are legalizing by splitting, query the concrete TTI for the cost
844 // of casting the original vector twice. We also need to factor in the
845 // cost of the split itself. Count that as 1, to be consistent with
846 // TLI->getTypeLegalizationCost().
847 bool SplitSrc =
848 TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
849 TargetLowering::TypeSplitVector;
850 bool SplitDst =
851 TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
852 TargetLowering::TypeSplitVector;
853 if ((SplitSrc || SplitDst) &&
854 cast<FixedVectorType>(SrcVTy)->getNumElements() > 1 &&
855 cast<FixedVectorType>(DstVTy)->getNumElements() > 1) {
856 Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy);
857 Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy);
858 T *TTI = static_cast<T *>(this);
859 // If both types need to be split then the split is free.
860 unsigned SplitCost =
861 (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0;
862 return SplitCost +
863 (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH,
864 CostKind, I));
865 }
866
867 // In other cases where the source or destination are illegal, assume
868 // the operation will get scalarized.
869 unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements();
870 unsigned Cost = thisT()->getCastInstrCost(
871 Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I);
872
873 // Return the cost of multiple scalar invocation plus the cost of
874 // inserting and extracting the values.
875 return getScalarizationOverhead(DstVTy, true, true) + Num * Cost;
876 }
877
878 // We already handled vector-to-vector and scalar-to-scalar conversions.
879 // This is where we handle bitcasts between vectors and scalars. We
880 // need to assume that the conversion is scalarized in one way or
881 // another.
882 if (Opcode == Instruction::BitCast) {
883 // Illegal bitcasts are done by storing and loading from a stack slot.
884 return (SrcVTy ? getScalarizationOverhead(SrcVTy, false, true) : 0) +
885 (DstVTy ? getScalarizationOverhead(DstVTy, true, false) : 0);
886 }
887
888 llvm_unreachable("Unhandled cast");
889 }
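
To make the split-vector branch of getCastInstrCost above concrete, here is a small standalone sketch in plain C++ (not the LLVM API): the function name and the sample operand costs are hypothetical, but the arithmetic mirrors the code: one optional split cost plus two recursive casts on the half-width vectors.

#include <iostream>

// Mirrors the split branch of getCastInstrCost: the split itself is free only
// when both the source and the destination need splitting.
unsigned splitCastCostSketch(bool SplitSrc, bool SplitDst,
                             unsigned VectorSplitCost, unsigned HalfCastCost) {
  unsigned SplitCost = (!SplitSrc || !SplitDst) ? VectorSplitCost : 0;
  return SplitCost + 2 * HalfCastCost;
}

int main() {
  // E.g. only the destination splits: split cost 1, half-cast cost 3 -> 7.
  std::cout << splitCastCostSketch(false, true, 1, 3) << "\n";
}
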
890
891 unsigned getExtractWithExtendCost(unsigned Opcode, Type *Dst,
892 VectorType *VecTy, unsigned Index) {
893 return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy,
894 Index) +
895 thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(),
896 TTI::CastContextHint::None, TTI::TCK_RecipThroughput);
897 }
898
899 unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) {
900 return BaseT::getCFInstrCost(Opcode, CostKind);
901 }
902
903 unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
904 CmpInst::Predicate VecPred,
905 TTI::TargetCostKind CostKind,
906 const Instruction *I = nullptr) {
907 const TargetLoweringBase *TLI = getTLI();
908 int ISD = TLI->InstructionOpcodeToISD(Opcode);
909 assert(ISD && "Invalid opcode");
910
911 // TODO: Handle other cost kinds.
912 if (CostKind != TTI::TCK_RecipThroughput)
913 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
914 I);
915
916 // Selects on vectors are actually vector selects.
917 if (ISD == ISD::SELECT) {
918 assert(CondTy && "CondTy must exist");
919 if (CondTy->isVectorTy())
920 ISD = ISD::VSELECT;
921 }
922 std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
923
924 if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
925 !TLI->isOperationExpand(ISD, LT.second)) {
926 // The operation is legal. Assume it costs 1. Multiply
927 // by the type-legalization overhead.
928 return LT.first * 1;
929 }
930
931 // Otherwise, assume that the cast is scalarized.
932 // TODO: If one of the types get legalized by splitting, handle this
933 // similarly to what getCastInstrCost() does.
934 if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) {
935 unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements();
936 if (CondTy)
937 CondTy = CondTy->getScalarType();
938 unsigned Cost = thisT()->getCmpSelInstrCost(
939 Opcode, ValVTy->getScalarType(), CondTy, VecPred, CostKind, I);
940
941 // Return the cost of multiple scalar invocation plus the cost of
942 // inserting and extracting the values.
943 return getScalarizationOverhead(ValVTy, true, false) + Num * Cost;
944 }
945
946 // Unknown scalar opcode.
947 return 1;
948 }
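
A minimal sketch (hypothetical per-op costs, not the LLVM API) of the scalarization fallback in getCmpSelInstrCost above: the vector compare/select is priced as the insert/extract overhead plus one scalar compare/select per element.

#include <iostream>

// Mirrors the vector fallback: overhead + Num * scalar cost.
unsigned scalarizedCmpSelSketch(unsigned NumElts, unsigned ScalarCmpSelCost,
                                unsigned InsertOverhead) {
  return InsertOverhead + NumElts * ScalarCmpSelCost;
}

int main() {
  // E.g. 8 elements, scalar cmp/select cost 1, insertion overhead 8 -> 16.
  std::cout << scalarizedCmpSelSketch(8, 1, 8) << "\n";
}
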
949
950 unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
951 std::pair<unsigned, MVT> LT =
952 getTLI()->getTypeLegalizationCost(DL, Val->getScalarType());
953
954 return LT.first;
955 }
956
957 unsigned getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
958 unsigned AddressSpace,
959 TTI::TargetCostKind CostKind,
960 const Instruction *I = nullptr) {
961 assert(!Src->isVoidTy() && "Invalid type");
962 // Assume types, such as structs, are expensive.
963 if (getTLI()->getValueType(DL, Src, true) == MVT::Other)
964 return 4;
965 std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(DL, Src);
966
967 // Assuming that all loads of legal types cost 1.
968 unsigned Cost = LT.first;
969 if (CostKind != TTI::TCK_RecipThroughput)
970 return Cost;
971
972 if (Src->isVectorTy() &&
973 // In practice it's not currently possible to have a change in lane
974 // length for extending loads or truncating stores so both types should
975 // have the same scalable property.
976 TypeSize::isKnownLT(Src->getPrimitiveSizeInBits(),
977 LT.second.getSizeInBits())) {
978 // This is a vector load that legalizes to a larger type than the vector
979 // itself. Unless the corresponding extending load or truncating store is
980 // legal, then this will scalarize.
981 TargetLowering::LegalizeAction LA = TargetLowering::Expand;
982 EVT MemVT = getTLI()->getValueType(DL, Src);
983 if (Opcode == Instruction::Store)
984 LA = getTLI()->getTruncStoreAction(LT.second, MemVT);
985 else
986 LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT);
987
988 if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
989 // This is a vector load/store for some illegal type that is scalarized.
990 // We must account for the cost of building or decomposing the vector.
991 Cost += getScalarizationOverhead(cast<VectorType>(Src),
992 Opcode != Instruction::Store,
993 Opcode == Instruction::Store);
994 }
995 }
996
997 return Cost;
998 }
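
A minimal sketch (hypothetical inputs, not the LLVM API) of the decision in getMemoryOpCost above: the cost is one unit per legalized register, plus scalarization overhead when the vector legalizes to a wider type and no extending load or truncating store is available.

#include <iostream>

// Mirrors the shape of getMemoryOpCost above.
unsigned memoryOpCostSketch(unsigned NumLegalOps, bool LegalizesWider,
                            bool ExtLoadOrTruncStoreLegal,
                            unsigned ScalarizationOverhead) {
  unsigned Cost = NumLegalOps;
  if (LegalizesWider && !ExtLoadOrTruncStoreLegal)
    Cost += ScalarizationOverhead;   // build or decompose the vector
  return Cost;
}

int main() {
  // E.g. a load that widens with no legal extending load: 1 + 8 = 9.
  std::cout << memoryOpCostSketch(1, true, false, 8) << "\n";
}
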
999
1000 unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
1001 const Value *Ptr, bool VariableMask,
1002 Align Alignment, TTI::TargetCostKind CostKind,
1003 const Instruction *I = nullptr) {
1004 auto *VT = cast<FixedVectorType>(DataTy);
1005 // Assume the target does not have support for gather/scatter operations
1006 // and provide a rough estimate.
1007 //
1008 // First, compute the cost of extracting the individual addresses and the
1009 // individual memory operations.
1010 int LoadCost =
1011 VT->getNumElements() *
1012 (getVectorInstrCost(
1013 Instruction::ExtractElement,
1014 FixedVectorType::get(PointerType::get(VT->getElementType(), 0),
1015 VT->getNumElements()),
1016 -1) +
1017 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind));
1018
1019 // Next, compute the cost of packing the result in a vector.
1020 int PackingCost = getScalarizationOverhead(VT, Opcode != Instruction::Store,
1021 Opcode == Instruction::Store);
1022
1023 int ConditionalCost = 0;
1024 if (VariableMask) {
1025 // Compute the cost of conditionally executing the memory operations with
1026 // variable masks. This includes extracting the individual conditions, plus
1027 // the branches and PHIs used to combine the results.
1028 // NOTE: Estimating the cost of conditionally executing the memory
1029 // operations accurately is quite difficult and the current solution
1030 // provides a very rough estimate only.
1031 ConditionalCost =
1032 VT->getNumElements() *
1033 (getVectorInstrCost(
1034 Instruction::ExtractElement,
1035 FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()),
1036 VT->getNumElements()),
1037 -1) +
1038 getCFInstrCost(Instruction::Br, CostKind) +
1039 getCFInstrCost(Instruction::PHI, CostKind));
1040 }
1041
1042 return LoadCost + PackingCost + ConditionalCost;
1043 }
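
The estimate returned above decomposes additively; the following standalone sketch (hypothetical per-op costs, not the LLVM API) reproduces the same arithmetic.

#include <iostream>

// LoadCost + PackingCost + ConditionalCost, as in getGatherScatterOpCost.
unsigned gatherScatterSketch(unsigned NumElts, unsigned AddrExtractCost,
                             unsigned MemOpCost, unsigned PackingCost,
                             bool VariableMask, unsigned MaskExtractCost,
                             unsigned BrCost, unsigned PhiCost) {
  unsigned LoadCost = NumElts * (AddrExtractCost + MemOpCost);
  unsigned CondCost =
      VariableMask ? NumElts * (MaskExtractCost + BrCost + PhiCost) : 0;
  return LoadCost + PackingCost + CondCost;
}

int main() {
  // E.g. 4 elements, extract 1, memop 1, packing 8, constant mask -> 16.
  std::cout << gatherScatterSketch(4, 1, 1, 8, false, 1, 1, 1) << "\n";
}
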
1044
1045 unsigned getInterleavedMemoryOpCost(
1046 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1047 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1048 bool UseMaskForCond = false, bool UseMaskForGaps = false) {
1049 auto *VT = cast<FixedVectorType>(VecTy);
11) 'VecTy' is a 'FixedVectorType'
1050
1051 unsigned NumElts = VT->getNumElements();
1052 assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
12) Assuming 'Factor' is > 1
13) Assuming the condition is true
14) '?' condition is true
1053
1054 unsigned NumSubElts = NumElts / Factor;
1055 auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts);
1056
1057 // Firstly, the cost of load/store operation.
1058 unsigned Cost;
1059 if (UseMaskForCond || UseMaskForGaps)
15) Assuming 'UseMaskForCond' is false
16) Assuming 'UseMaskForGaps' is false
17) Taking false branch
1060 Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment,
1061 AddressSpace, CostKind);
1062 else
1063 Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace,
18) Passing null pointer value via 6th parameter 'I'
19) Calling 'X86TTIImpl::getMemoryOpCost'
1064 CostKind);
1065
1066 // Legalize the vector type, and get the legalized and unlegalized type
1067 // sizes.
1068 MVT VecTyLT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
1069 unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy);
1070 unsigned VecTyLTSize = VecTyLT.getStoreSize();
1071
1072 // Return the ceiling of dividing A by B.
1073 auto ceil = [](unsigned A, unsigned B) { return (A + B - 1) / B; };
1074
1075 // Scale the cost of the memory operation by the fraction of legalized
1076 // instructions that will actually be used. We shouldn't account for the
1077 // cost of dead instructions since they will be removed.
1078 //
1079 // E.g., An interleaved load of factor 8:
1080 // %vec = load <16 x i64>, <16 x i64>* %ptr
1081 // %v0 = shufflevector %vec, undef, <0, 8>
1082 //
1083 // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be
1084 // used (those corresponding to elements [0:1] and [8:9] of the unlegalized
1085 // type). The other loads are unused.
1086 //
1087 // We only scale the cost of loads since interleaved store groups aren't
1088 // allowed to have gaps.
1089 if (Opcode == Instruction::Load && VecTySize > VecTyLTSize) {
1090 // The number of loads of a legal type it will take to represent a load
1091 // of the unlegalized vector type.
1092 unsigned NumLegalInsts = ceil(VecTySize, VecTyLTSize);
1093
1094 // The number of elements of the unlegalized type that correspond to a
1095 // single legal instruction.
1096 unsigned NumEltsPerLegalInst = ceil(NumElts, NumLegalInsts);
1097
1098 // Determine which legal instructions will be used.
1099 BitVector UsedInsts(NumLegalInsts, false);
1100 for (unsigned Index : Indices)
1101 for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
1102 UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);
1103
1104 // Scale the cost of the load by the fraction of legal instructions that
1105 // will be used.
1106 Cost *= UsedInsts.count() / NumLegalInsts;
1107 }
1108
1109 // Then add the cost of the interleave operation.
1110 if (Opcode == Instruction::Load) {
1111 // The interleave cost is similar to extracting the sub vectors' elements
1112 // from the wide vector and inserting them into the sub vectors.
1113 //
1114 // E.g. An interleaved load of factor 2 (with one member of index 0):
1115 // %vec = load <8 x i32>, <8 x i32>* %ptr
1116 // %v0 = shuffle %vec, undef, <0, 2, 4, 6> ; Index 0
1117 // The cost is estimated as extract elements at 0, 2, 4, 6 from the
1118 // <8 x i32> vector and insert them into a <4 x i32> vector.
1119
1120 assert(Indices.size() <= Factor &&
1121 "Interleaved memory op has too many members");
1122
1123 for (unsigned Index : Indices) {
1124 assert(Index < Factor && "Invalid index for interleaved memory op");
1125
1126 // Extract elements from loaded vector for each sub vector.
1127 for (unsigned i = 0; i < NumSubElts; i++)
1128 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VT,
1129 Index + i * Factor);
1130 }
1131
1132 unsigned InsSubCost = 0;
1133 for (unsigned i = 0; i < NumSubElts; i++)
1134 InsSubCost +=
1135 thisT()->getVectorInstrCost(Instruction::InsertElement, SubVT, i);
1136
1137 Cost += Indices.size() * InsSubCost;
1138 } else {
1139 // The interleave cost is to extract all elements from the sub vectors
1140 // and insert them into the wide vector.
1141 //
1142 // E.g. An interleaved store of factor 2:
1143 // %v0_v1 = shuffle %v0, %v1, <0, 4, 1, 5, 2, 6, 3, 7>
1144 // store <8 x i32> %interleaved.vec, <8 x i32>* %ptr
1145 // The cost is estimated as extract all elements from both <4 x i32>
1146 // vectors and insert into the <8 x i32> vector.
1147
1148 unsigned ExtSubCost = 0;
1149 for (unsigned i = 0; i < NumSubElts; i++)
1150 ExtSubCost +=
1151 thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVT, i);
1152 Cost += ExtSubCost * Factor;
1153
1154 for (unsigned i = 0; i < NumElts; i++)
1155 Cost += static_cast<T *>(this)
1156 ->getVectorInstrCost(Instruction::InsertElement, VT, i);
1157 }
1158
1159 if (!UseMaskForCond)
1160 return Cost;
1161
1162 Type *I8Type = Type::getInt8Ty(VT->getContext());
1163 auto *MaskVT = FixedVectorType::get(I8Type, NumElts);
1164 SubVT = FixedVectorType::get(I8Type, NumSubElts);
1165
1166 // The Mask shuffling cost is to extract all the elements of the Mask
1167 // and insert each of them Factor times into the wide vector:
1168 //
1169 // E.g. an interleaved group with factor 3:
1170 // %mask = icmp ult <8 x i32> %vec1, %vec2
1171 // %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
1172 // <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
1173 // The cost is estimated as extract all mask elements from the <8xi1> mask
1174 // vector and insert them factor times into the <24xi1> shuffled mask
1175 // vector.
1176 for (unsigned i = 0; i < NumSubElts; i++)
1177 Cost +=
1178 thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVT, i);
1179
1180 for (unsigned i = 0; i < NumElts; i++)
1181 Cost +=
1182 thisT()->getVectorInstrCost(Instruction::InsertElement, MaskVT, i);
1183
1184 // The Gaps mask is invariant and created outside the loop, therefore the
1185 // cost of creating it is not accounted for here. However if we have both
1186 // a MaskForGaps and some other mask that guards the execution of the
1187 // memory access, we need to account for the cost of And-ing the two masks
1188 // inside the loop.
1189 if (UseMaskForGaps)
1190 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT,
1191 CostKind);
1192
1193 return Cost;
1194 }
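
To make the load-scaling comment in getInterleavedMemoryOpCost concrete, the following standalone sketch (plain C++, not the LLVM API) reproduces the UsedInsts bookkeeping for the comment's example: a <16 x i64> interleaved load of factor 8 legalized into 8 loads, with a single member at index 0; only 2 of the 8 legal loads end up used.

#include <iostream>
#include <set>

unsigned countUsedLegalLoads() {
  const unsigned NumElts = 16, Factor = 8, NumSubElts = NumElts / Factor;
  const unsigned NumLegalInsts = 8;
  const unsigned NumEltsPerLegalInst =
      (NumElts + NumLegalInsts - 1) / NumLegalInsts;   // ceil(16 / 8) == 2
  std::set<unsigned> UsedInsts;
  for (unsigned Index : {0u})                          // Indices = {0}
    for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
      UsedInsts.insert((Index + Elt * Factor) / NumEltsPerLegalInst);
  return UsedInsts.size();
}

int main() {
  std::cout << countUsedLegalLoads() << " of 8 legal loads used\n";  // 2 of 8
}
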
1195
1196 /// Get intrinsic cost based on arguments.
1197 unsigned getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1198 TTI::TargetCostKind CostKind) {
1199 // Check for generically free intrinsics.
1200 if (BaseT::getIntrinsicInstrCost(ICA, CostKind) == 0)
1201 return 0;
1202
1203 // Assume that target intrinsics are cheap.
1204 Intrinsic::ID IID = ICA.getID();
1205 if (Function::isTargetIntrinsic(IID))
1206 return TargetTransformInfo::TCC_Basic;
1207
1208 if (ICA.isTypeBasedOnly())
1209 return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
1210
1211 Type *RetTy = ICA.getReturnType();
1212
1213 ElementCount VF = ICA.getVectorFactor();
1214 ElementCount RetVF =
1215 (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount()
1216 : ElementCount::getFixed(1));
1217 assert((RetVF.isScalar() || VF.isScalar()) &&
1218 "VF > 1 and RetVF is a vector type");
1219 const IntrinsicInst *I = ICA.getInst();
1220 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
1221 FastMathFlags FMF = ICA.getFlags();
1222 switch (IID) {
1223 default:
1224 break;
1225
1226 case Intrinsic::cttz:
1227 // FIXME: If necessary, this should go in target-specific overrides.
1228 if (VF.isScalar() && RetVF.isScalar() &&
1229 getTLI()->isCheapToSpeculateCttz())
1230 return TargetTransformInfo::TCC_Basic;
1231 break;
1232
1233 case Intrinsic::ctlz:
1234 // FIXME: If necessary, this should go in target-specific overrides.
1235 if (VF.isScalar() && RetVF.isScalar() &&
1236 getTLI()->isCheapToSpeculateCtlz())
1237 return TargetTransformInfo::TCC_Basic;
1238 break;
1239
1240 case Intrinsic::memcpy:
1241 return thisT()->getMemcpyCost(ICA.getInst());
1242
1243 case Intrinsic::masked_scatter: {
1244 assert(VF.isScalar() && "Can't vectorize types here.");
1245 const Value *Mask = Args[3];
1246 bool VarMask = !isa<Constant>(Mask);
1247 Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue();
1248 return thisT()->getGatherScatterOpCost(Instruction::Store,
1249 Args[0]->getType(), Args[1],
1250 VarMask, Alignment, CostKind, I);
1251 }
1252 case Intrinsic::masked_gather: {
1253 assert(VF.isScalar() && "Can't vectorize types here.");
1254 const Value *Mask = Args[2];
1255 bool VarMask = !isa<Constant>(Mask);
1256 Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue();
1257 return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0],
1258 VarMask, Alignment, CostKind, I);
1259 }
1260 case Intrinsic::experimental_vector_extract: {
1261 // FIXME: Handle case where a scalable vector is extracted from a scalable
1262 // vector
1263 if (isa<ScalableVectorType>(RetTy))
1264 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1265 unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue();
1266 return thisT()->getShuffleCost(TTI::SK_ExtractSubvector,
1267 cast<VectorType>(Args[0]->getType()),
1268 Index, cast<VectorType>(RetTy));
1269 }
1270 case Intrinsic::experimental_vector_insert: {
1271 // FIXME: Handle case where a scalable vector is inserted into a scalable
1272 // vector
1273 if (isa<ScalableVectorType>(Args[1]->getType()))
1274 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1275 unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
1276 return thisT()->getShuffleCost(
1277 TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()), Index,
1278 cast<VectorType>(Args[1]->getType()));
1279 }
1280 case Intrinsic::vector_reduce_add:
1281 case Intrinsic::vector_reduce_mul:
1282 case Intrinsic::vector_reduce_and:
1283 case Intrinsic::vector_reduce_or:
1284 case Intrinsic::vector_reduce_xor:
1285 case Intrinsic::vector_reduce_smax:
1286 case Intrinsic::vector_reduce_smin:
1287 case Intrinsic::vector_reduce_fmax:
1288 case Intrinsic::vector_reduce_fmin:
1289 case Intrinsic::vector_reduce_umax:
1290 case Intrinsic::vector_reduce_umin: {
1291 if (isa<ScalableVectorType>(RetTy))
1292 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1293 IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, 1, I);
1294 return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1295 }
1296 case Intrinsic::vector_reduce_fadd:
1297 case Intrinsic::vector_reduce_fmul: {
1298 if (isa<ScalableVectorType>(RetTy))
1299 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1300 IntrinsicCostAttributes Attrs(
1301 IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, 1, I);
1302 return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1303 }
1304 case Intrinsic::fshl:
1305 case Intrinsic::fshr: {
1306 if (isa<ScalableVectorType>(RetTy))
1307 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1308 const Value *X = Args[0];
1309 const Value *Y = Args[1];
1310 const Value *Z = Args[2];
1311 TTI::OperandValueProperties OpPropsX, OpPropsY, OpPropsZ, OpPropsBW;
1312 TTI::OperandValueKind OpKindX = TTI::getOperandInfo(X, OpPropsX);
1313 TTI::OperandValueKind OpKindY = TTI::getOperandInfo(Y, OpPropsY);
1314 TTI::OperandValueKind OpKindZ = TTI::getOperandInfo(Z, OpPropsZ);
1315 TTI::OperandValueKind OpKindBW = TTI::OK_UniformConstantValue;
1316 OpPropsBW = isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2
1317 : TTI::OP_None;
1318 // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
1319 // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
1320 unsigned Cost = 0;
1321 Cost +=
1322 thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
1323 Cost +=
1324 thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
1325 Cost += thisT()->getArithmeticInstrCost(
1326 BinaryOperator::Shl, RetTy, CostKind, OpKindX, OpKindZ, OpPropsX);
1327 Cost += thisT()->getArithmeticInstrCost(
1328 BinaryOperator::LShr, RetTy, CostKind, OpKindY, OpKindZ, OpPropsY);
1329 // Non-constant shift amounts require a modulo.
1330 if (OpKindZ != TTI::OK_UniformConstantValue &&
1331 OpKindZ != TTI::OK_NonUniformConstantValue)
1332 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy,
1333 CostKind, OpKindZ, OpKindBW,
1334 OpPropsZ, OpPropsBW);
1335 // For non-rotates (X != Y) we must add shift-by-zero handling costs.
1336 if (X != Y) {
1337 Type *CondTy = RetTy->getWithNewBitWidth(1);
1338 Cost +=
1339 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1340 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1341 Cost +=
1342 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1343 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1344 }
1345 return Cost;
1346 }
1347 }
1348 // TODO: Handle the remaining intrinsic with scalable vector type
1349 if (isa<ScalableVectorType>(RetTy))
1350 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1351
1352 // Assume that we need to scalarize this intrinsic.
1353 SmallVector<Type *, 4> Types;
1354 for (const Value *Op : Args) {
1355 Type *OpTy = Op->getType();
1356 assert(VF.isScalar() || !OpTy->isVectorTy());
1357 Types.push_back(VF.isScalar()
1358 ? OpTy
1359 : FixedVectorType::get(OpTy, VF.getKnownMinValue()));
1360 }
1361
1362 if (VF.isVector() && !RetTy->isVoidTy())
1363 RetTy = FixedVectorType::get(RetTy, VF.getKnownMinValue());
1364
1365 // Compute the scalarization overhead based on Args for a vector
1366 // intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while
1367 // CostModel will pass a vector RetTy and VF is 1.
1368 unsigned ScalarizationCost = std::numeric_limits<unsigned>::max();
1369 if (RetVF.isVector() || VF.isVector()) {
1370 ScalarizationCost = 0;
1371 if (!RetTy->isVoidTy())
1372 ScalarizationCost +=
1373 getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
1374 ScalarizationCost +=
1375 getOperandsScalarizationOverhead(Args, VF.getKnownMinValue());
1376 }
1377
1378 IntrinsicCostAttributes Attrs(IID, RetTy, Types, FMF, ScalarizationCost, I);
1379 return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1380 }
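
The fshl/fshr case in getIntrinsicInstrCost above sums a fixed expansion; a minimal sketch (hypothetical per-op costs, not the LLVM API) of that composition:

#include <iostream>

// or + sub + shl + lshr, plus urem for non-constant shift amounts and an
// icmp/select pair when X != Y (i.e. not a rotate), as in the fshl/fshr case.
unsigned funnelShiftCostSketch(unsigned OrC, unsigned SubC, unsigned ShlC,
                               unsigned LShrC, bool ConstantShiftAmt,
                               unsigned URemC, bool IsRotate, unsigned ICmpC,
                               unsigned SelC) {
  unsigned Cost = OrC + SubC + ShlC + LShrC;
  if (!ConstantShiftAmt)
    Cost += URemC;            // shift amount needs (Z % BW)
  if (!IsRotate)
    Cost += ICmpC + SelC;     // shift-by-zero handling
  return Cost;
}

int main() {
  // E.g. a rotate with a constant amount and unit op costs -> 4.
  std::cout << funnelShiftCostSketch(1, 1, 1, 1, true, 1, true, 1, 1) << "\n";
}
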
1381
1382 /// Get intrinsic cost based on argument types.
1383 /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the
1384 /// cost of scalarizing the arguments and the return value will be computed
1385 /// based on types.
1386 unsigned getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1387 TTI::TargetCostKind CostKind) {
1388 Intrinsic::ID IID = ICA.getID();
1389 Type *RetTy = ICA.getReturnType();
1390 const SmallVectorImpl<Type *> &Tys = ICA.getArgTypes();
1391 FastMathFlags FMF = ICA.getFlags();
1392 unsigned ScalarizationCostPassed = ICA.getScalarizationCost();
1393 bool SkipScalarizationCost = ICA.skipScalarizationCost();
1394
1395 VectorType *VecOpTy = nullptr;
1396 if (!Tys.empty()) {
1397 // The vector reduction operand is operand 0 except for fadd/fmul.
1398 // Their operand 0 is a scalar start value, so the vector op is operand 1.
1399 unsigned VecTyIndex = 0;
1400 if (IID == Intrinsic::vector_reduce_fadd ||
1401 IID == Intrinsic::vector_reduce_fmul)
1402 VecTyIndex = 1;
1403 assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes");
1404 VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]);
1405 }
1406
1407 // Library call cost - other than size, make it expensive.
1408 unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10;
1409 SmallVector<unsigned, 2> ISDs;
1410 switch (IID) {
1411 default: {
1412 // Assume that we need to scalarize this intrinsic.
1413 unsigned ScalarizationCost = ScalarizationCostPassed;
1414 unsigned ScalarCalls = 1;
1415 Type *ScalarRetTy = RetTy;
1416 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
1417 if (!SkipScalarizationCost)
1418 ScalarizationCost = getScalarizationOverhead(RetVTy, true, false);
1419 ScalarCalls = std::max(ScalarCalls,
1420 cast<FixedVectorType>(RetVTy)->getNumElements());
1421 ScalarRetTy = RetTy->getScalarType();
1422 }
1423 SmallVector<Type *, 4> ScalarTys;
1424 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1425 Type *Ty = Tys[i];
1426 if (auto *VTy = dyn_cast<VectorType>(Ty)) {
1427 if (!SkipScalarizationCost)
1428 ScalarizationCost += getScalarizationOverhead(VTy, false, true);
1429 ScalarCalls = std::max(ScalarCalls,
1430 cast<FixedVectorType>(VTy)->getNumElements());
1431 Ty = Ty->getScalarType();
1432 }
1433 ScalarTys.push_back(Ty);
1434 }
1435 if (ScalarCalls == 1)
1436 return 1; // Return cost of a scalar intrinsic. Assume it to be cheap.
1437
1438 IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF);
1439 unsigned ScalarCost =
1440 thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind);
1441
1442 return ScalarCalls * ScalarCost + ScalarizationCost;
1443 }
1444 // Look for intrinsics that can be lowered directly or turned into a scalar
1445 // intrinsic call.
1446 case Intrinsic::sqrt:
1447 ISDs.push_back(ISD::FSQRT);
1448 break;
1449 case Intrinsic::sin:
1450 ISDs.push_back(ISD::FSIN);
1451 break;
1452 case Intrinsic::cos:
1453 ISDs.push_back(ISD::FCOS);
1454 break;
1455 case Intrinsic::exp:
1456 ISDs.push_back(ISD::FEXP);
1457 break;
1458 case Intrinsic::exp2:
1459 ISDs.push_back(ISD::FEXP2);
1460 break;
1461 case Intrinsic::log:
1462 ISDs.push_back(ISD::FLOG);
1463 break;
1464 case Intrinsic::log10:
1465 ISDs.push_back(ISD::FLOG10);
1466 break;
1467 case Intrinsic::log2:
1468 ISDs.push_back(ISD::FLOG2);
1469 break;
1470 case Intrinsic::fabs:
1471 ISDs.push_back(ISD::FABS);
1472 break;
1473 case Intrinsic::canonicalize:
1474 ISDs.push_back(ISD::FCANONICALIZE);
1475 break;
1476 case Intrinsic::minnum:
1477 ISDs.push_back(ISD::FMINNUM);
1478 break;
1479 case Intrinsic::maxnum:
1480 ISDs.push_back(ISD::FMAXNUM);
1481 break;
1482 case Intrinsic::minimum:
1483 ISDs.push_back(ISD::FMINIMUM);
1484 break;
1485 case Intrinsic::maximum:
1486 ISDs.push_back(ISD::FMAXIMUM);
1487 break;
1488 case Intrinsic::copysign:
1489 ISDs.push_back(ISD::FCOPYSIGN);
1490 break;
1491 case Intrinsic::floor:
1492 ISDs.push_back(ISD::FFLOOR);
1493 break;
1494 case Intrinsic::ceil:
1495 ISDs.push_back(ISD::FCEIL);
1496 break;
1497 case Intrinsic::trunc:
1498 ISDs.push_back(ISD::FTRUNC);
1499 break;
1500 case Intrinsic::nearbyint:
1501 ISDs.push_back(ISD::FNEARBYINT);
1502 break;
1503 case Intrinsic::rint:
1504 ISDs.push_back(ISD::FRINT);
1505 break;
1506 case Intrinsic::round:
1507 ISDs.push_back(ISD::FROUND);
1508 break;
1509 case Intrinsic::roundeven:
1510 ISDs.push_back(ISD::FROUNDEVEN);
1511 break;
1512 case Intrinsic::pow:
1513 ISDs.push_back(ISD::FPOW);
1514 break;
1515 case Intrinsic::fma:
1516 ISDs.push_back(ISD::FMA);
1517 break;
1518 case Intrinsic::fmuladd:
1519 ISDs.push_back(ISD::FMA);
1520 break;
1521 case Intrinsic::experimental_constrained_fmuladd:
1522 ISDs.push_back(ISD::STRICT_FMA);
1523 break;
1524 // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
1525 case Intrinsic::lifetime_start:
1526 case Intrinsic::lifetime_end:
1527 case Intrinsic::sideeffect:
1528 case Intrinsic::pseudoprobe:
1529 return 0;
1530 case Intrinsic::masked_store: {
1531 Type *Ty = Tys[0];
1532 Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
1533 return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0,
1534 CostKind);
1535 }
1536 case Intrinsic::masked_load: {
1537 Type *Ty = RetTy;
1538 Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
1539 return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0,
1540 CostKind);
1541 }
1542 case Intrinsic::vector_reduce_add:
1543 return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy,
1544 /*IsPairwiseForm=*/false,
1545 CostKind);
1546 case Intrinsic::vector_reduce_mul:
1547 return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy,
1548 /*IsPairwiseForm=*/false,
1549 CostKind);
1550 case Intrinsic::vector_reduce_and:
1551 return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy,
1552 /*IsPairwiseForm=*/false,
1553 CostKind);
1554 case Intrinsic::vector_reduce_or:
1555 return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy,
1556 /*IsPairwiseForm=*/false,
1557 CostKind);
1558 case Intrinsic::vector_reduce_xor:
1559 return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy,
1560 /*IsPairwiseForm=*/false,
1561 CostKind);
1562 case Intrinsic::vector_reduce_fadd:
1563 // FIXME: Add new flag for cost of strict reductions.
1564 return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy,
1565 /*IsPairwiseForm=*/false,
1566 CostKind);
1567 case Intrinsic::vector_reduce_fmul:
1568 // FIXME: Add new flag for cost of strict reductions.
1569 return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy,
1570 /*IsPairwiseForm=*/false,
1571 CostKind);
1572 case Intrinsic::vector_reduce_smax:
1573 case Intrinsic::vector_reduce_smin:
1574 case Intrinsic::vector_reduce_fmax:
1575 case Intrinsic::vector_reduce_fmin:
1576 return thisT()->getMinMaxReductionCost(
1577 VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)),
1578 /*IsPairwiseForm=*/false,
1579 /*IsUnsigned=*/false, CostKind);
1580 case Intrinsic::vector_reduce_umax:
1581 case Intrinsic::vector_reduce_umin:
1582 return thisT()->getMinMaxReductionCost(
1583 VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)),
1584 /*IsPairwiseForm=*/false,
1585 /*IsUnsigned=*/true, CostKind);
1586 case Intrinsic::abs:
1587 case Intrinsic::smax:
1588 case Intrinsic::smin:
1589 case Intrinsic::umax:
1590 case Intrinsic::umin: {
1591 // abs(X) = select(icmp(X,0),X,sub(0,X))
1592 // minmax(X,Y) = select(icmp(X,Y),X,Y)
1593 Type *CondTy = RetTy->getWithNewBitWidth(1);
1594 unsigned Cost = 0;
1595 // TODO: Ideally getCmpSelInstrCost would accept an icmp condition code.
1596 Cost +=
1597 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1598 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1599 Cost +=
1600 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1601 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1602 // TODO: Should we add an OperandValueProperties::OP_Zero property?
1603 if (IID == Intrinsic::abs)
1604 Cost += thisT()->getArithmeticInstrCost(
1605 BinaryOperator::Sub, RetTy, CostKind, TTI::OK_UniformConstantValue);
1606 return Cost;
1607 }
1608 case Intrinsic::sadd_sat:
1609 case Intrinsic::ssub_sat: {
1610 Type *CondTy = RetTy->getWithNewBitWidth(1);
1611
1612 Type *OpTy = StructType::create({RetTy, CondTy});
1613 Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat
1614 ? Intrinsic::sadd_with_overflow
1615 : Intrinsic::ssub_with_overflow;
1616
1617 // SatMax -> Overflow && SumDiff < 0
1618 // SatMin -> Overflow && SumDiff >= 0
1619 unsigned Cost = 0;
1620 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
1621 ScalarizationCostPassed);
1622 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
1623 Cost +=
1624 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1625 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1626 Cost += 2 * thisT()->getCmpSelInstrCost(
1627 BinaryOperator::Select, RetTy, CondTy,
1628 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1629 return Cost;
1630 }
1631 case Intrinsic::uadd_sat:
1632 case Intrinsic::usub_sat: {
1633 Type *CondTy = RetTy->getWithNewBitWidth(1);
1634
1635 Type *OpTy = StructType::create({RetTy, CondTy});
1636 Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat
1637 ? Intrinsic::uadd_with_overflow
1638 : Intrinsic::usub_with_overflow;
1639
1640 unsigned Cost = 0;
1641 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
1642 ScalarizationCostPassed);
1643 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
1644 Cost +=
1645 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1646 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1647 return Cost;
1648 }
1649 case Intrinsic::smul_fix:
1650 case Intrinsic::umul_fix: {
1651 unsigned ExtSize = RetTy->getScalarSizeInBits() * 2;
1652 Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize);
1653
1654 unsigned ExtOp =
1655 IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt;
1656 TTI::CastContextHint CCH = TTI::CastContextHint::None;
1657
1658 unsigned Cost = 0;
1659 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind);
1660 Cost +=
1661 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
1662 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy,
1663 CCH, CostKind);
1664 Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, RetTy,
1665 CostKind, TTI::OK_AnyValue,
1666 TTI::OK_UniformConstantValue);
1667 Cost += thisT()->getArithmeticInstrCost(Instruction::Shl, RetTy, CostKind,
1668 TTI::OK_AnyValue,
1669 TTI::OK_UniformConstantValue);
1670 Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind);
1671 return Cost;
1672 }
1673 case Intrinsic::sadd_with_overflow:
1674 case Intrinsic::ssub_with_overflow: {
1675 Type *SumTy = RetTy->getContainedType(0);
1676 Type *OverflowTy = RetTy->getContainedType(1);
1677 unsigned Opcode = IID == Intrinsic::sadd_with_overflow
1678 ? BinaryOperator::Add
1679 : BinaryOperator::Sub;
1680
1681 // LHSSign -> LHS >= 0
1682 // RHSSign -> RHS >= 0
1683 // SumSign -> Sum >= 0
1684 //
1685 // Add:
1686 // Overflow -> (LHSSign == RHSSign) && (LHSSign != SumSign)
1687 // Sub:
1688 // Overflow -> (LHSSign != RHSSign) && (LHSSign != SumSign)
1689 unsigned Cost = 0;
1690 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
1691 Cost += 3 * thisT()->getCmpSelInstrCost(
1692 Instruction::ICmp, SumTy, OverflowTy,
1693 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1694 Cost += 2 * thisT()->getCmpSelInstrCost(
1695 Instruction::Select, OverflowTy, OverflowTy,
1696 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1697 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, OverflowTy,
1698 CostKind);
1699 return Cost;
1700 }
1701 case Intrinsic::uadd_with_overflow:
1702 case Intrinsic::usub_with_overflow: {
1703 Type *SumTy = RetTy->getContainedType(0);
1704 Type *OverflowTy = RetTy->getContainedType(1);
1705 unsigned Opcode = IID == Intrinsic::uadd_with_overflow
1706 ? BinaryOperator::Add
1707 : BinaryOperator::Sub;
1708
1709 unsigned Cost = 0;
1710 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
1711 Cost +=
1712 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, OverflowTy,
1713 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1714 return Cost;
1715 }
1716 case Intrinsic::smul_with_overflow:
1717 case Intrinsic::umul_with_overflow: {
1718 Type *MulTy = RetTy->getContainedType(0);
1719 Type *OverflowTy = RetTy->getContainedType(1);
1720 unsigned ExtSize = MulTy->getScalarSizeInBits() * 2;
1721 Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize);
1722
1723 unsigned ExtOp =
1724 IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt;
1725 TTI::CastContextHint CCH = TTI::CastContextHint::None;
1726
1727 unsigned Cost = 0;
1728 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind);
1729 Cost +=
1730 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
1731 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy,
1732 CCH, CostKind);
1733 Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, MulTy,
1734 CostKind, TTI::OK_AnyValue,
1735 TTI::OK_UniformConstantValue);
1736
1737 if (IID == Intrinsic::smul_with_overflow)
1738 Cost += thisT()->getArithmeticInstrCost(Instruction::AShr, MulTy,
1739 CostKind, TTI::OK_AnyValue,
1740 TTI::OK_UniformConstantValue);
1741
1742 Cost +=
1743 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, MulTy, OverflowTy,
1744 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1745 return Cost;
1746 }
1747 case Intrinsic::ctpop:
1748 ISDs.push_back(ISD::CTPOP);
1749 // In case of legalization use TCC_Expensive. This is cheaper than a
1750 // library call but still not a cheap instruction.
1751 SingleCallCost = TargetTransformInfo::TCC_Expensive;
1752 break;
1753 // FIXME: ctlz, cttz, ...
1754 case Intrinsic::bswap:
1755 ISDs.push_back(ISD::BSWAP);
1756 break;
1757 case Intrinsic::bitreverse:
1758 ISDs.push_back(ISD::BITREVERSE);
1759 break;
1760 }
1761
1762 const TargetLoweringBase *TLI = getTLI();
1763 std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
1764
1765 SmallVector<unsigned, 2> LegalCost;
1766 SmallVector<unsigned, 2> CustomCost;
1767 for (unsigned ISD : ISDs) {
1768 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
1769 if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() &&
1770 TLI->isFAbsFree(LT.second)) {
1771 return 0;
1772 }
1773
1774 // The operation is legal. Assume it costs 1.
1775 // If the type is split to multiple registers, assume that there is some
1776 // overhead to this.
1777 // TODO: Once we have extract/insert subvector cost we need to use them.
1778 if (LT.first > 1)
1779 LegalCost.push_back(LT.first * 2);
1780 else
1781 LegalCost.push_back(LT.first * 1);
1782 } else if (!TLI->isOperationExpand(ISD, LT.second)) {
1783 // If the operation is custom lowered then assume
1784 // that the code is twice as expensive.
1785 CustomCost.push_back(LT.first * 2);
1786 }
1787 }
1788
1789 auto *MinLegalCostI = std::min_element(LegalCost.begin(), LegalCost.end());
1790 if (MinLegalCostI != LegalCost.end())
1791 return *MinLegalCostI;
1792
1793 auto MinCustomCostI =
1794 std::min_element(CustomCost.begin(), CustomCost.end());
1795 if (MinCustomCostI != CustomCost.end())
1796 return *MinCustomCostI;
1797
1798 // If we can't lower fmuladd into an FMA estimate the cost as a floating
1799 // point mul followed by an add.
1800 if (IID == Intrinsic::fmuladd)
1801 return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy,
1802 CostKind) +
1803 thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy,
1804 CostKind);
1805 if (IID == Intrinsic::experimental_constrained_fmuladd) {
1806 IntrinsicCostAttributes FMulAttrs(
1807 Intrinsic::experimental_constrained_fmul, RetTy, Tys);
1808 IntrinsicCostAttributes FAddAttrs(
1809 Intrinsic::experimental_constrained_fadd, RetTy, Tys);
1810 return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) +
1811 thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind);
1812 }
1813
1814 // Else, assume that we need to scalarize this intrinsic. For math builtins
1815 // this will emit a costly libcall, adding call overhead and spills. Make it
1816 // very expensive.
1817 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
1818 unsigned ScalarizationCost = SkipScalarizationCost ?
1819 ScalarizationCostPassed : getScalarizationOverhead(RetVTy, true, false);
1820
1821 unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements();
1822 SmallVector<Type *, 4> ScalarTys;
1823 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1824 Type *Ty = Tys[i];
1825 if (Ty->isVectorTy())
1826 Ty = Ty->getScalarType();
1827 ScalarTys.push_back(Ty);
1828 }
1829 IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF);
1830 unsigned ScalarCost = thisT()->getIntrinsicInstrCost(Attrs, CostKind);
1831 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1832 if (auto *VTy = dyn_cast<VectorType>(Tys[i])) {
1833 if (!ICA.skipScalarizationCost())
1834 ScalarizationCost += getScalarizationOverhead(VTy, false, true);
1835 ScalarCalls = std::max(ScalarCalls,
1836 cast<FixedVectorType>(VTy)->getNumElements());
1837 }
1838 }
1839 return ScalarCalls * ScalarCost + ScalarizationCost;
1840 }
1841
1842 // This is going to be turned into a library call, make it expensive.
1843 return SingleCallCost;
1844 }
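
As one worked example from the switch above, the saturating add/sub cases compose an overflow intrinsic with compare/select fix-ups; the sketch below (hypothetical per-op costs, not the LLVM API) shows that composition.

#include <iostream>

// sadd_sat/ssub_sat: overflow intrinsic + icmp + 2 selects.
// uadd_sat/usub_sat: overflow intrinsic + 1 select.
unsigned saturatingCostSketch(bool Signed, unsigned OverflowIntrinsicCost,
                              unsigned ICmpCost, unsigned SelectCost) {
  unsigned Cost = OverflowIntrinsicCost;
  if (Signed)
    Cost += ICmpCost + 2 * SelectCost;
  else
    Cost += SelectCost;
  return Cost;
}

int main() {
  // E.g. overflow op 2, icmp 1, select 1: signed -> 5, unsigned -> 3.
  std::cout << saturatingCostSketch(true, 2, 1, 1) << " "
            << saturatingCostSketch(false, 2, 1, 1) << "\n";
}
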
1845
1846 /// Compute a cost of the given call instruction.
1847 ///
1848 /// Compute the cost of calling function F with return type RetTy and
1849 /// argument types Tys. F might be nullptr; in this case the cost of an
1850 /// arbitrary call with the specified signature will be returned.
1851 /// This is used, for instance, when we estimate call of a vector
1852 /// counterpart of the given function.
1853 /// \param F Called function, might be nullptr.
1854 /// \param RetTy Return value types.
1855 /// \param Tys Argument types.
1856 /// \returns The cost of Call instruction.
1857 unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys,
1858 TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency) {
1859 return 10;
1860 }
1861
1862 unsigned getNumberOfParts(Type *Tp) {
1863 std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(DL, Tp);
1864 return LT.first;
1865 }
1866
1867 unsigned getAddressComputationCost(Type *Ty, ScalarEvolution *,
1868 const SCEV *) {
1869 return 0;
1870 }
1871
1872 /// Try to calculate arithmetic and shuffle op costs for reduction operations.
1873 /// We're assuming that reduction operations are performed in the following way:
1874 /// 1. Non-pairwise reduction
1875 /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
1876 /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n, i32 undef, ..., i32 undef>
1877 /// \----------------v-------------/ \----------v------------/
1878 /// n/2 elements n/2 elements
1879 /// %red1 = op <n x t> %val, <n x t> val1
1880 /// After this operation we have a vector %red1 where only the first n/2
1881 /// elements are meaningful, the second n/2 elements are undefined and can be
1882 /// dropped. All other operations are actually working with the vector of
1883 /// length n/2, not n, though the real vector length is still n.
1884 /// %val2 = shufflevector<n x t> %red1, <n x t> %undef,
1885 /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2, i32 undef, ..., i32 undef>
1886 /// \----------------v-------------/ \----------v------------/
1887 /// n/4 elements 3*n/4 elements
1888 /// %red2 = op <n x t> %red1, <n x t> val2 - working with the vector of
1889 /// length n/2, the resulting vector has length n/4 etc.
1890 /// 2. Pairwise reduction:
1891 /// Everything is the same except for an additional shuffle operation which
1892 /// is used to produce operands for pairwise kind of reductions.
1893 /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
1894 /// <n x i32> <i32 0, i32 2, ..., i32 n-2, i32 undef, ..., i32 undef>
1895 /// \-------------v----------/ \----------v------------/
1896 /// n/2 elements n/2 elements
1897 /// %val2 = shufflevector<n x t> %val, <n x t> %undef,
1898 /// <n x i32> <i32 1, i32 3, ..., i32 n-1, i32 undef, ..., i32 undef>
1899 /// \-------------v----------/ \----------v------------/
1900 /// n/2 elements n/2 elements
1901 /// %red1 = op <n x t> %val1, <n x t> val2
1902 /// Again, the operation is performed on <n x t> vector, but the resulting
1903 /// vector %red1 is <n/2 x t> vector.
1904 ///
1905 /// The cost model should take into account that the actual length of the
1906 /// vector is reduced on each iteration.
1907 unsigned getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
1908 bool IsPairwise,
1909 TTI::TargetCostKind CostKind) {
1910 Type *ScalarTy = Ty->getElementType();
1911 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
1912 unsigned NumReduxLevels = Log2_32(NumVecElts);
1913 unsigned ArithCost = 0;
1914 unsigned ShuffleCost = 0;
1915 std::pair<unsigned, MVT> LT =
1916 thisT()->getTLI()->getTypeLegalizationCost(DL, Ty);
1917 unsigned LongVectorCount = 0;
1918 unsigned MVTLen =
1919 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
1920 while (NumVecElts > MVTLen) {
1921 NumVecElts /= 2;
1922 VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
1923 // Assume the pairwise shuffles add a cost.
1924 ShuffleCost +=
1925 (IsPairwise + 1) * thisT()->getShuffleCost(TTI::SK_ExtractSubvector,
1926 Ty, NumVecElts, SubTy);
1927 ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind);
1928 Ty = SubTy;
1929 ++LongVectorCount;
1930 }
1931
1932 NumReduxLevels -= LongVectorCount;
1933
1934 // The minimal length of the vector is limited by the real length of vector
1935 // operations performed on the current platform. That's why several final
1936 // reduction operations are performed on the vectors with the same
1937 // architecture-dependent length.
1938
1939 // Non-pairwise reductions need one shuffle per reduction level. Pairwise
1940 // reductions need two shuffles on every level but the last one. On that
1941 // level one of the shuffles is <0, u, u, ...> which is identity.
1942 unsigned NumShuffles = NumReduxLevels;
1943 if (IsPairwise && NumReduxLevels >= 1)
1944 NumShuffles += NumReduxLevels - 1;
1945 ShuffleCost += NumShuffles *
1946 thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, 0, Ty);
1947 ArithCost += NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty);
1948 return ShuffleCost + ArithCost +
1949 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
1950 }
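
A standalone sketch (hypothetical per-op costs, not the LLVM API) of the bookkeeping in getArithmeticReductionCost above: split down to the legal vector length first, then pay per remaining reduction level, and finally extract element 0.

#include <iostream>

unsigned reductionCostSketch(unsigned NumVecElts, unsigned MVTLen,
                             bool IsPairwise, unsigned ShuffleCost,
                             unsigned ArithOpCost, unsigned ExtractCost) {
  unsigned NumReduxLevels = 0;
  for (unsigned N = NumVecElts; N > 1; N /= 2)
    ++NumReduxLevels;                        // Log2_32(NumVecElts)
  unsigned LongVectorCount = 0, Cost = 0;
  while (NumVecElts > MVTLen) {              // split down to the legal length
    NumVecElts /= 2;
    Cost += (IsPairwise + 1) * ShuffleCost + ArithOpCost;
    ++LongVectorCount;
  }
  NumReduxLevels -= LongVectorCount;
  unsigned NumShuffles = NumReduxLevels;
  if (IsPairwise && NumReduxLevels >= 1)
    NumShuffles += NumReduxLevels - 1;
  Cost += NumShuffles * ShuffleCost + NumReduxLevels * ArithOpCost;
  return Cost + ExtractCost;                 // final extractelement
}

int main() {
  // E.g. 16 elements, legal length 4, non-pairwise, unit costs -> 9.
  std::cout << reductionCostSketch(16, 4, false, 1, 1, 1) << "\n";
}
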
1951
1952 /// Try to calculate op costs for min/max reduction operations.
1953 /// \param CondTy Conditional type for the Select instruction.
1954 unsigned getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
1955 bool IsPairwise, bool IsUnsigned,
1956 TTI::TargetCostKind CostKind) {
1957 Type *ScalarTy = Ty->getElementType();
1958 Type *ScalarCondTy = CondTy->getElementType();
1959 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
1960 unsigned NumReduxLevels = Log2_32(NumVecElts);
1961 unsigned CmpOpcode;
1962 if (Ty->isFPOrFPVectorTy()) {
1963 CmpOpcode = Instruction::FCmp;
1964 } else {
1965 assert(Ty->isIntOrIntVectorTy() &&
1966 "expecting floating point or integer type for min/max reduction");
1967 CmpOpcode = Instruction::ICmp;
1968 }
1969 unsigned MinMaxCost = 0;
1970 unsigned ShuffleCost = 0;
1971 std::pair<unsigned, MVT> LT =
1972 thisT()->getTLI()->getTypeLegalizationCost(DL, Ty);
1973 unsigned LongVectorCount = 0;
1974 unsigned MVTLen =
1975 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
1976 while (NumVecElts > MVTLen) {
1977 NumVecElts /= 2;
1978 auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
1979 CondTy = FixedVectorType::get(ScalarCondTy, NumVecElts);
1980
1981 // Assume the pairwise shuffles add a cost.
1982 ShuffleCost +=
1983 (IsPairwise + 1) * thisT()->getShuffleCost(TTI::SK_ExtractSubvector,
1984 Ty, NumVecElts, SubTy);
1985 MinMaxCost +=
1986 thisT()->getCmpSelInstrCost(CmpOpcode, SubTy, CondTy,
1987 CmpInst::BAD_ICMP_PREDICATE, CostKind) +
1988 thisT()->getCmpSelInstrCost(Instruction::Select, SubTy, CondTy,
1989 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1990 Ty = SubTy;
1991 ++LongVectorCount;
1992 }
1993
1994 NumReduxLevels -= LongVectorCount;
1995
1996 // The minimal length of the vector is limited by the real length of vector
1997 // operations performed on the current platform. That's why several final
1998 // reduction operations are performed on the vectors with the same
1999 // architecture-dependent length.
2000
2001 // Non-pairwise reductions need one shuffle per reduction level. Pairwise
2002 // reductions need two shuffles on every level but the last one. On that
2003 // level one of the shuffles is <0, u, u, ...> which is identity.
2004 unsigned NumShuffles = NumReduxLevels;
2005 if (IsPairwise && NumReduxLevels >= 1)
2006 NumShuffles += NumReduxLevels - 1;
2007 ShuffleCost += NumShuffles *
2008 thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, 0, Ty);
2009 MinMaxCost +=
2010 NumReduxLevels *
2011 (thisT()->getCmpSelInstrCost(CmpOpcode, Ty, CondTy,
2012 CmpInst::BAD_ICMP_PREDICATE, CostKind) +
2013 thisT()->getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
2014 CmpInst::BAD_ICMP_PREDICATE, CostKind));
2015 // The last min/max should be in vector registers and we counted it above.
2016 // So just need a single extractelement.
2017 return ShuffleCost + MinMaxCost +
2018 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
2019 }
2020
2021 unsigned getVectorSplitCost() { return 1; }
2022
2023 /// @}
2024};
2025
2026/// Concrete BasicTTIImpl that can be used if no further customization
2027/// is needed.
2028class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
2029 using BaseT = BasicTTIImplBase<BasicTTIImpl>;
2030
2031 friend class BasicTTIImplBase<BasicTTIImpl>;
2032
2033 const TargetSubtargetInfo *ST;
2034 const TargetLoweringBase *TLI;
2035
2036 const TargetSubtargetInfo *getST() const { return ST; }
2037 const TargetLoweringBase *getTLI() const { return TLI; }
2038
2039public:
2040 explicit BasicTTIImpl(const TargetMachine *TM, const Function &F);
2041};
2042
2043} // end namespace llvm
2044
2045#endif // LLVM_CODEGEN_BASICTTIIMPL_H