Bug Summary

File: llvm/lib/Target/X86/X86TargetTransformInfo.cpp
Warning: line 3278, column 15
Division by zero
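
The flagged statement at line 3278 lies outside the excerpt reproduced below, so only the general shape of a core.DivideZero finding can be illustrated here. The following minimal C++ sketch is purely hypothetical (the function and parameter names are illustrative assumptions, not the code at line 3278): the checker reports a division once it finds a path on which the divisor can be zero, and an explicit guard on that path removes the report.

  // Hypothetical illustration of a core.DivideZero finding; NOT the code at line 3278.
  int averagePerElementCost(int TotalCost, int NumElts) {
    // If any caller can reach this point with NumElts == 0, the analyzer
    // follows that path and reports "Division by zero" at the '/' operator.
    return TotalCost / NumElts;
  }

  int guardedAveragePerElementCost(int TotalCost, int NumElts) {
    // An explicit guard removes the zero-divisor state from the path,
    // so no warning is emitted for this division.
    if (NumElts == 0)
      return TotalCost;
    return TotalCost / NumElts;
  }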

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name X86TargetTransformInfo.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-13~++20210705111146+4aaf87875039/build-llvm/lib/Target/X86 -resource-dir /usr/lib/llvm-13/lib/clang/13.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-13~++20210705111146+4aaf87875039/build-llvm/lib/Target/X86 -I /build/llvm-toolchain-snapshot-13~++20210705111146+4aaf87875039/llvm/lib/Target/X86 -I /build/llvm-toolchain-snapshot-13~++20210705111146+4aaf87875039/build-llvm/include -I /build/llvm-toolchain-snapshot-13~++20210705111146+4aaf87875039/llvm/include -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-13/lib/clang/13.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-13~++20210705111146+4aaf87875039/build-llvm/lib/Target/X86 -fdebug-prefix-map=/build/llvm-toolchain-snapshot-13~++20210705111146+4aaf87875039=. -ferror-limit 19 -fvisibility hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2021-07-06-042412-29666-1 -x c++ /build/llvm-toolchain-snapshot-13~++20210705111146+4aaf87875039/llvm/lib/Target/X86/X86TargetTransformInfo.cpp

/build/llvm-toolchain-snapshot-13~++20210705111146+4aaf87875039/llvm/lib/Target/X86/X86TargetTransformInfo.cpp

1//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements a TargetTransformInfo analysis pass specific to the
10/// X86 target machine. It uses the target's detailed information to provide
11/// more precise answers to certain TTI queries, while letting the target
12/// independent and default TTI implementations handle the rest.
13///
14//===----------------------------------------------------------------------===//
15/// About the Cost Model numbers used below, it's necessary to say the following:
16/// the numbers correspond to some "generic" X86 CPU rather than to a
17/// concrete CPU model. Usually the numbers correspond to the CPU where the
18/// feature first appeared. For example, if we do Subtarget.hasSSE42() in
19/// the lookups below, the cost is based on Nehalem, as that was the first CPU
20/// to support that feature level and thus most likely has the worst-case cost.
21/// Some examples of other technologies/CPUs:
22/// SSE 3 - Pentium4 / Athlon64
23/// SSE 4.1 - Penryn
24/// SSE 4.2 - Nehalem
25/// AVX - Sandy Bridge
26/// AVX2 - Haswell
27/// AVX-512 - Xeon Phi / Skylake
28/// And some examples of instruction target dependent costs (latency)
29///                     divss    sqrtss   rsqrtss
30///   AMD K7            11-16    19       3
31///   Piledriver        9-24     13-15    5
32///   Jaguar            14       16       2
33///   Pentium II,III    18       30       2
34///   Nehalem           7-14     7-18     3
35///   Haswell           10-13    11       5
36/// TODO: Develop and implement the target dependent cost model and
37/// specialize cost numbers for different Cost Model Targets such as throughput,
38/// code size, latency and uop count.
39//===----------------------------------------------------------------------===//
40
41#include "X86TargetTransformInfo.h"
42#include "llvm/Analysis/TargetTransformInfo.h"
43#include "llvm/CodeGen/BasicTTIImpl.h"
44#include "llvm/CodeGen/CostTable.h"
45#include "llvm/CodeGen/TargetLowering.h"
46#include "llvm/IR/IntrinsicInst.h"
47#include "llvm/Support/Debug.h"
48
49using namespace llvm;
50
51#define DEBUG_TYPE "x86tti"
52
53//===----------------------------------------------------------------------===//
54//
55// X86 cost model.
56//
57//===----------------------------------------------------------------------===//
58
59TargetTransformInfo::PopcntSupportKind
60X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
61 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
62 // TODO: Currently the __builtin_popcount() implementation using SSE3
63 // instructions is inefficient. Once the problem is fixed, we should
64 // call ST->hasSSE3() instead of ST->hasPOPCNT().
65 return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
66}
67
68llvm::Optional<unsigned> X86TTIImpl::getCacheSize(
69 TargetTransformInfo::CacheLevel Level) const {
70 switch (Level) {
71 case TargetTransformInfo::CacheLevel::L1D:
72 // - Penryn
73 // - Nehalem
74 // - Westmere
75 // - Sandy Bridge
76 // - Ivy Bridge
77 // - Haswell
78 // - Broadwell
79 // - Skylake
80 // - Kabylake
81 return 32 * 1024; // 32 KByte
82 case TargetTransformInfo::CacheLevel::L2D:
83 // - Penryn
84 // - Nehalem
85 // - Westmere
86 // - Sandy Bridge
87 // - Ivy Bridge
88 // - Haswell
89 // - Broadwell
90 // - Skylake
91 // - Kabylake
92 return 256 * 1024; // 256 KByte
93 }
94
95 llvm_unreachable("Unknown TargetTransformInfo::CacheLevel")::llvm::llvm_unreachable_internal("Unknown TargetTransformInfo::CacheLevel"
, "/build/llvm-toolchain-snapshot-13~++20210705111146+4aaf87875039/llvm/lib/Target/X86/X86TargetTransformInfo.cpp"
, 95)
;
96}
97
98llvm::Optional<unsigned> X86TTIImpl::getCacheAssociativity(
99 TargetTransformInfo::CacheLevel Level) const {
100 // - Penryn
101 // - Nehalem
102 // - Westmere
103 // - Sandy Bridge
104 // - Ivy Bridge
105 // - Haswell
106 // - Broadwell
107 // - Skylake
108 // - Kabylake
109 switch (Level) {
110 case TargetTransformInfo::CacheLevel::L1D:
111 LLVM_FALLTHROUGH;
112 case TargetTransformInfo::CacheLevel::L2D:
113 return 8;
114 }
115
116 llvm_unreachable("Unknown TargetTransformInfo::CacheLevel")::llvm::llvm_unreachable_internal("Unknown TargetTransformInfo::CacheLevel"
, "/build/llvm-toolchain-snapshot-13~++20210705111146+4aaf87875039/llvm/lib/Target/X86/X86TargetTransformInfo.cpp"
, 116)
;
117}
118
119unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
120 bool Vector = (ClassID == 1);
121 if (Vector && !ST->hasSSE1())
122 return 0;
123
124 if (ST->is64Bit()) {
125 if (Vector && ST->hasAVX512())
126 return 32;
127 return 16;
128 }
129 return 8;
130}
131
132TypeSize
133X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
134 unsigned PreferVectorWidth = ST->getPreferVectorWidth();
135 switch (K) {
136 case TargetTransformInfo::RGK_Scalar:
137 return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
138 case TargetTransformInfo::RGK_FixedWidthVector:
139 if (ST->hasAVX512() && PreferVectorWidth >= 512)
140 return TypeSize::getFixed(512);
141 if (ST->hasAVX() && PreferVectorWidth >= 256)
142 return TypeSize::getFixed(256);
143 if (ST->hasSSE1() && PreferVectorWidth >= 128)
144 return TypeSize::getFixed(128);
145 return TypeSize::getFixed(0);
146 case TargetTransformInfo::RGK_ScalableVector:
147 return TypeSize::getScalable(0);
148 }
149
150 llvm_unreachable("Unsupported register kind")::llvm::llvm_unreachable_internal("Unsupported register kind"
, "/build/llvm-toolchain-snapshot-13~++20210705111146+4aaf87875039/llvm/lib/Target/X86/X86TargetTransformInfo.cpp"
, 150)
;
151}
152
153unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
154 return getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
155 .getFixedSize();
156}
157
158unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
159 // If the loop will not be vectorized, don't interleave the loop.
160 // Let the regular unroller unroll the loop, which saves the overflow
161 // check and memory check cost.
162 if (VF == 1)
163 return 1;
164
165 if (ST->isAtom())
166 return 1;
167
168 // Sandybridge and Haswell have multiple execution ports and pipelined
169 // vector units.
170 if (ST->hasAVX())
171 return 4;
172
173 return 2;
174}
175
176InstructionCost X86TTIImpl::getArithmeticInstrCost(
177 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
178 TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
179 TTI::OperandValueProperties Opd1PropInfo,
180 TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
181 const Instruction *CxtI) {
182 // TODO: Handle more cost kinds.
183 if (CostKind != TTI::TCK_RecipThroughput)
184 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
185 Op2Info, Opd1PropInfo,
186 Opd2PropInfo, Args, CxtI);
187
188 // vXi8 multiplications are always promoted to vXi16.
189 if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
190 Ty->getScalarSizeInBits() == 8) {
191 Type *WideVecTy =
192 VectorType::getExtendedElementVectorType(cast<VectorType>(Ty));
193 return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
194 TargetTransformInfo::CastContextHint::None,
195 CostKind) +
196 getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
197 TargetTransformInfo::CastContextHint::None,
198 CostKind) +
199 getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info,
200 Opd1PropInfo, Opd2PropInfo);
201 }
202
203 // Legalize the type.
204 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
205
206 int ISD = TLI->InstructionOpcodeToISD(Opcode);
207 assert(ISD && "Invalid opcode");
208
209 static const CostTblEntry GLMCostTable[] = {
210 { ISD::FDIV, MVT::f32, 18 }, // divss
211 { ISD::FDIV, MVT::v4f32, 35 }, // divps
212 { ISD::FDIV, MVT::f64, 33 }, // divsd
213 { ISD::FDIV, MVT::v2f64, 65 }, // divpd
214 };
215
216 if (ST->useGLMDivSqrtCosts())
217 if (const auto *Entry = CostTableLookup(GLMCostTable, ISD,
218 LT.second))
219 return LT.first * Entry->Cost;
220
221 static const CostTblEntry SLMCostTable[] = {
222 { ISD::MUL, MVT::v4i32, 11 }, // pmulld
223 { ISD::MUL, MVT::v8i16, 2 }, // pmullw
224 { ISD::FMUL, MVT::f64, 2 }, // mulsd
225 { ISD::FMUL, MVT::v2f64, 4 }, // mulpd
226 { ISD::FMUL, MVT::v4f32, 2 }, // mulps
227 { ISD::FDIV, MVT::f32, 17 }, // divss
228 { ISD::FDIV, MVT::v4f32, 39 }, // divps
229 { ISD::FDIV, MVT::f64, 32 }, // divsd
230 { ISD::FDIV, MVT::v2f64, 69 }, // divpd
231 { ISD::FADD, MVT::v2f64, 2 }, // addpd
232 { ISD::FSUB, MVT::v2f64, 2 }, // subpd
233 // v2i64/v4i64 mul is custom lowered as a series of long:
234 // multiplies(3), shifts(3) and adds(2)
235 // slm muldq version throughput is 2 and addq throughput 4
236 // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
237 // 2X4 (addq throughput) = 17
238 { ISD::MUL, MVT::v2i64, 17 },
239 // slm addq\subq throughput is 4
240 { ISD::ADD, MVT::v2i64, 4 },
241 { ISD::SUB, MVT::v2i64, 4 },
242 };
243
244 if (ST->isSLM()) {
245 if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) {
246 // Check if the operands can be shrunk into a smaller datatype.
247 bool Op1Signed = false;
248 unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
249 bool Op2Signed = false;
250 unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
251
252 bool SignedMode = Op1Signed || Op2Signed;
253 unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
254
255 if (OpMinSize <= 7)
256 return LT.first * 3; // pmullw/sext
257 if (!SignedMode && OpMinSize <= 8)
258 return LT.first * 3; // pmullw/zext
259 if (OpMinSize <= 15)
260 return LT.first * 5; // pmullw/pmulhw/pshuf
261 if (!SignedMode && OpMinSize <= 16)
262 return LT.first * 5; // pmullw/pmulhw/pshuf
263 }
264
265 if (const auto *Entry = CostTableLookup(SLMCostTable, ISD,
266 LT.second)) {
267 return LT.first * Entry->Cost;
268 }
269 }
270
271 if ((ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
272 ISD == ISD::UREM) &&
273 (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
274 Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
275 Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
276 if (ISD == ISD::SDIV || ISD == ISD::SREM) {
277 // On X86, vector signed division by a constant power of two is
278 // normally expanded to the sequence SRA + SRL + ADD + SRA.
279 // The OperandValue properties may not be the same as that of the previous
280 // operation; conservatively assume OP_None.
281 InstructionCost Cost =
282 2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, Op1Info,
283 Op2Info, TargetTransformInfo::OP_None,
284 TargetTransformInfo::OP_None);
285 Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info,
286 Op2Info,
287 TargetTransformInfo::OP_None,
288 TargetTransformInfo::OP_None);
289 Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind, Op1Info,
290 Op2Info,
291 TargetTransformInfo::OP_None,
292 TargetTransformInfo::OP_None);
293
294 if (ISD == ISD::SREM) {
295 // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
296 Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info,
297 Op2Info);
298 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info,
299 Op2Info);
300 }
301
302 return Cost;
303 }
304
305 // Vector unsigned division/remainder will be simplified to shifts/masks.
306 if (ISD == ISD::UDIV)
307 return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
308 Op1Info, Op2Info,
309 TargetTransformInfo::OP_None,
310 TargetTransformInfo::OP_None);
311
312 else // UREM
313 return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
314 Op1Info, Op2Info,
315 TargetTransformInfo::OP_None,
316 TargetTransformInfo::OP_None);
317 }
318
319 static const CostTblEntry AVX512BWUniformConstCostTable[] = {
320 { ISD::SHL, MVT::v64i8, 2 }, // psllw + pand.
321 { ISD::SRL, MVT::v64i8, 2 }, // psrlw + pand.
322 { ISD::SRA, MVT::v64i8, 4 }, // psrlw, pand, pxor, psubb.
323 };
324
325 if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
326 ST->hasBWI()) {
327 if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD,
328 LT.second))
329 return LT.first * Entry->Cost;
330 }
331
332 static const CostTblEntry AVX512UniformConstCostTable[] = {
333 { ISD::SRA, MVT::v2i64, 1 },
334 { ISD::SRA, MVT::v4i64, 1 },
335 { ISD::SRA, MVT::v8i64, 1 },
336
337 { ISD::SHL, MVT::v64i8, 4 }, // psllw + pand.
338 { ISD::SRL, MVT::v64i8, 4 }, // psrlw + pand.
339 { ISD::SRA, MVT::v64i8, 8 }, // psrlw, pand, pxor, psubb.
340
341 { ISD::SDIV, MVT::v16i32, 6 }, // pmuludq sequence
342 { ISD::SREM, MVT::v16i32, 8 }, // pmuludq+mul+sub sequence
343 { ISD::UDIV, MVT::v16i32, 5 }, // pmuludq sequence
344 { ISD::UREM, MVT::v16i32, 7 }, // pmuludq+mul+sub sequence
345 };
346
347 if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
348 ST->hasAVX512()) {
349 if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD,
350 LT.second))
351 return LT.first * Entry->Cost;
352 }
353
354 static const CostTblEntry AVX2UniformConstCostTable[] = {
355 { ISD::SHL, MVT::v32i8, 2 }, // psllw + pand.
356 { ISD::SRL, MVT::v32i8, 2 }, // psrlw + pand.
357 { ISD::SRA, MVT::v32i8, 4 }, // psrlw, pand, pxor, psubb.
358
359 { ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle.
360
361 { ISD::SDIV, MVT::v8i32, 6 }, // pmuludq sequence
362 { ISD::SREM, MVT::v8i32, 8 }, // pmuludq+mul+sub sequence
363 { ISD::UDIV, MVT::v8i32, 5 }, // pmuludq sequence
364 { ISD::UREM, MVT::v8i32, 7 }, // pmuludq+mul+sub sequence
365 };
366
367 if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
368 ST->hasAVX2()) {
369 if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD,
370 LT.second))
371 return LT.first * Entry->Cost;
372 }
373
374 static const CostTblEntry SSE2UniformConstCostTable[] = {
375 { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand.
376 { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand.
377 { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb.
378
379 { ISD::SHL, MVT::v32i8, 4+2 }, // 2*(psllw + pand) + split.
380 { ISD::SRL, MVT::v32i8, 4+2 }, // 2*(psrlw + pand) + split.
381 { ISD::SRA, MVT::v32i8, 8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.
382
383 { ISD::SDIV, MVT::v8i32, 12+2 }, // 2*pmuludq sequence + split.
384 { ISD::SREM, MVT::v8i32, 16+2 }, // 2*pmuludq+mul+sub sequence + split.
385 { ISD::SDIV, MVT::v4i32, 6 }, // pmuludq sequence
386 { ISD::SREM, MVT::v4i32, 8 }, // pmuludq+mul+sub sequence
387 { ISD::UDIV, MVT::v8i32, 10+2 }, // 2*pmuludq sequence + split.
388 { ISD::UREM, MVT::v8i32, 14+2 }, // 2*pmuludq+mul+sub sequence + split.
389 { ISD::UDIV, MVT::v4i32, 5 }, // pmuludq sequence
390 { ISD::UREM, MVT::v4i32, 7 }, // pmuludq+mul+sub sequence
391 };
392
393 // XOP has faster vXi8 shifts.
394 if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
395 ST->hasSSE2() && !ST->hasXOP()) {
396 if (const auto *Entry =
397 CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
398 return LT.first * Entry->Cost;
399 }
400
401 static const CostTblEntry AVX512BWConstCostTable[] = {
402 { ISD::SDIV, MVT::v64i8, 14 }, // 2*ext+2*pmulhw sequence
403 { ISD::SREM, MVT::v64i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
404 { ISD::UDIV, MVT::v64i8, 14 }, // 2*ext+2*pmulhw sequence
405 { ISD::UREM, MVT::v64i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
406 { ISD::SDIV, MVT::v32i16, 6 }, // vpmulhw sequence
407 { ISD::SREM, MVT::v32i16, 8 }, // vpmulhw+mul+sub sequence
408 { ISD::UDIV, MVT::v32i16, 6 }, // vpmulhuw sequence
409 { ISD::UREM, MVT::v32i16, 8 }, // vpmulhuw+mul+sub sequence
410 };
411
412 if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
413 Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
414 ST->hasBWI()) {
415 if (const auto *Entry =
416 CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
417 return LT.first * Entry->Cost;
418 }
419
420 static const CostTblEntry AVX512ConstCostTable[] = {
421 { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
422 { ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence
423 { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
424 { ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence
425 { ISD::SDIV, MVT::v64i8, 28 }, // 4*ext+4*pmulhw sequence
426 { ISD::SREM, MVT::v64i8, 32 }, // 4*ext+4*pmulhw+mul+sub sequence
427 { ISD::UDIV, MVT::v64i8, 28 }, // 4*ext+4*pmulhw sequence
428 { ISD::UREM, MVT::v64i8, 32 }, // 4*ext+4*pmulhw+mul+sub sequence
429 { ISD::SDIV, MVT::v32i16, 12 }, // 2*vpmulhw sequence
430 { ISD::SREM, MVT::v32i16, 16 }, // 2*vpmulhw+mul+sub sequence
431 { ISD::UDIV, MVT::v32i16, 12 }, // 2*vpmulhuw sequence
432 { ISD::UREM, MVT::v32i16, 16 }, // 2*vpmulhuw+mul+sub sequence
433 };
434
435 if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
436 Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
437 ST->hasAVX512()) {
438 if (const auto *Entry =
439 CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
440 return LT.first * Entry->Cost;
441 }
442
443 static const CostTblEntry AVX2ConstCostTable[] = {
444 { ISD::SDIV, MVT::v32i8, 14 }, // 2*ext+2*pmulhw sequence
445 { ISD::SREM, MVT::v32i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
446 { ISD::UDIV, MVT::v32i8, 14 }, // 2*ext+2*pmulhw sequence
447 { ISD::UREM, MVT::v32i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
448 { ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence
449 { ISD::SREM, MVT::v16i16, 8 }, // vpmulhw+mul+sub sequence
450 { ISD::UDIV, MVT::v16i16, 6 }, // vpmulhuw sequence
451 { ISD::UREM, MVT::v16i16, 8 }, // vpmulhuw+mul+sub sequence
452 { ISD::SDIV, MVT::v8i32, 15 }, // vpmuldq sequence
453 { ISD::SREM, MVT::v8i32, 19 }, // vpmuldq+mul+sub sequence
454 { ISD::UDIV, MVT::v8i32, 15 }, // vpmuludq sequence
455 { ISD::UREM, MVT::v8i32, 19 }, // vpmuludq+mul+sub sequence
456 };
457
458 if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
459 Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
460 ST->hasAVX2()) {
461 if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
462 return LT.first * Entry->Cost;
463 }
464
465 static const CostTblEntry SSE2ConstCostTable[] = {
466 { ISD::SDIV, MVT::v32i8, 28+2 }, // 4*ext+4*pmulhw sequence + split.
467 { ISD::SREM, MVT::v32i8, 32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
468 { ISD::SDIV, MVT::v16i8, 14 }, // 2*ext+2*pmulhw sequence
469 { ISD::SREM, MVT::v16i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
470 { ISD::UDIV, MVT::v32i8, 28+2 }, // 4*ext+4*pmulhw sequence + split.
471 { ISD::UREM, MVT::v32i8, 32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
472 { ISD::UDIV, MVT::v16i8, 14 }, // 2*ext+2*pmulhw sequence
473 { ISD::UREM, MVT::v16i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
474 { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
475 { ISD::SREM, MVT::v16i16, 16+2 }, // 2*pmulhw+mul+sub sequence + split.
476 { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence
477 { ISD::SREM, MVT::v8i16, 8 }, // pmulhw+mul+sub sequence
478 { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split.
479 { ISD::UREM, MVT::v16i16, 16+2 }, // 2*pmulhuw+mul+sub sequence + split.
480 { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence
481 { ISD::UREM, MVT::v8i16, 8 }, // pmulhuw+mul+sub sequence
482 { ISD::SDIV, MVT::v8i32, 38+2 }, // 2*pmuludq sequence + split.
483 { ISD::SREM, MVT::v8i32, 48+2 }, // 2*pmuludq+mul+sub sequence + split.
484 { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence
485 { ISD::SREM, MVT::v4i32, 24 }, // pmuludq+mul+sub sequence
486 { ISD::UDIV, MVT::v8i32, 30+2 }, // 2*pmuludq sequence + split.
487 { ISD::UREM, MVT::v8i32, 40+2 }, // 2*pmuludq+mul+sub sequence + split.
488 { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence
489 { ISD::UREM, MVT::v4i32, 20 }, // pmuludq+mul+sub sequence
490 };
491
492 if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
493 Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
494 ST->hasSSE2()) {
495 // pmuldq sequence.
496 if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
497 return LT.first * 32;
498 if (ISD == ISD::SREM && LT.second == MVT::v8i32 && ST->hasAVX())
499 return LT.first * 38;
500 if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
501 return LT.first * 15;
502 if (ISD == ISD::SREM && LT.second == MVT::v4i32 && ST->hasSSE41())
503 return LT.first * 20;
504
505 if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
506 return LT.first * Entry->Cost;
507 }
508
509 static const CostTblEntry AVX512BWShiftCostTable[] = {
510 { ISD::SHL, MVT::v16i8, 4 }, // extend/vpsllvw/pack sequence.
511 { ISD::SRL, MVT::v16i8, 4 }, // extend/vpsrlvw/pack sequence.
512 { ISD::SRA, MVT::v16i8, 4 }, // extend/vpsravw/pack sequence.
513 { ISD::SHL, MVT::v32i8, 4 }, // extend/vpsllvw/pack sequence.
514 { ISD::SRL, MVT::v32i8, 4 }, // extend/vpsrlvw/pack sequence.
515 { ISD::SRA, MVT::v32i8, 6 }, // extend/vpsravw/pack sequence.
516 { ISD::SHL, MVT::v64i8, 6 }, // extend/vpsllvw/pack sequence.
517 { ISD::SRL, MVT::v64i8, 7 }, // extend/vpsrlvw/pack sequence.
518 { ISD::SRA, MVT::v64i8, 15 }, // extend/vpsravw/pack sequence.
519
520 { ISD::SHL, MVT::v8i16, 1 }, // vpsllvw
521 { ISD::SRL, MVT::v8i16, 1 }, // vpsrlvw
522 { ISD::SRA, MVT::v8i16, 1 }, // vpsravw
523 { ISD::SHL, MVT::v16i16, 1 }, // vpsllvw
524 { ISD::SRL, MVT::v16i16, 1 }, // vpsrlvw
525 { ISD::SRA, MVT::v16i16, 1 }, // vpsravw
526 { ISD::SHL, MVT::v32i16, 1 }, // vpsllvw
527 { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw
528 { ISD::SRA, MVT::v32i16, 1 }, // vpsravw
529 };
530
531 if (ST->hasBWI())
532 if (const auto *Entry = CostTableLookup(AVX512BWShiftCostTable, ISD, LT.second))
533 return LT.first * Entry->Cost;
534
535 static const CostTblEntry AVX2UniformCostTable[] = {
536 // Uniform splats are cheaper for the following instructions.
537 { ISD::SHL, MVT::v16i16, 1 }, // psllw.
538 { ISD::SRL, MVT::v16i16, 1 }, // psrlw.
539 { ISD::SRA, MVT::v16i16, 1 }, // psraw.
540 { ISD::SHL, MVT::v32i16, 2 }, // 2*psllw.
541 { ISD::SRL, MVT::v32i16, 2 }, // 2*psrlw.
542 { ISD::SRA, MVT::v32i16, 2 }, // 2*psraw.
543
544 { ISD::SHL, MVT::v8i32, 1 }, // pslld
545 { ISD::SRL, MVT::v8i32, 1 }, // psrld
546 { ISD::SRA, MVT::v8i32, 1 }, // psrad
547 { ISD::SHL, MVT::v4i64, 1 }, // psllq
548 { ISD::SRL, MVT::v4i64, 1 }, // psrlq
549 };
550
551 if (ST->hasAVX2() &&
552 ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
553 (Op2Info == TargetTransformInfo::OK_UniformValue))) {
554 if (const auto *Entry =
555 CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
556 return LT.first * Entry->Cost;
557 }
558
559 static const CostTblEntry SSE2UniformCostTable[] = {
560 // Uniform splats are cheaper for the following instructions.
561 { ISD::SHL, MVT::v8i16, 1 }, // psllw.
562 { ISD::SHL, MVT::v4i32, 1 }, // pslld
563 { ISD::SHL, MVT::v2i64, 1 }, // psllq.
564
565 { ISD::SRL, MVT::v8i16, 1 }, // psrlw.
566 { ISD::SRL, MVT::v4i32, 1 }, // psrld.
567 { ISD::SRL, MVT::v2i64, 1 }, // psrlq.
568
569 { ISD::SRA, MVT::v8i16, 1 }, // psraw.
570 { ISD::SRA, MVT::v4i32, 1 }, // psrad.
571 };
572
573 if (ST->hasSSE2() &&
574 ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
575 (Op2Info == TargetTransformInfo::OK_UniformValue))) {
576 if (const auto *Entry =
577 CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
578 return LT.first * Entry->Cost;
579 }
580
581 static const CostTblEntry AVX512DQCostTable[] = {
582 { ISD::MUL, MVT::v2i64, 2 }, // pmullq
583 { ISD::MUL, MVT::v4i64, 2 }, // pmullq
584 { ISD::MUL, MVT::v8i64, 2 } // pmullq
585 };
586
587 // Look for AVX512DQ lowering tricks for custom cases.
588 if (ST->hasDQI())
589 if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
590 return LT.first * Entry->Cost;
591
592 static const CostTblEntry AVX512BWCostTable[] = {
593 { ISD::SHL, MVT::v64i8, 11 }, // vpblendvb sequence.
594 { ISD::SRL, MVT::v64i8, 11 }, // vpblendvb sequence.
595 { ISD::SRA, MVT::v64i8, 24 }, // vpblendvb sequence.
596 };
597
598 // Look for AVX512BW lowering tricks for custom cases.
599 if (ST->hasBWI())
600 if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
601 return LT.first * Entry->Cost;
602
603 static const CostTblEntry AVX512CostTable[] = {
604 { ISD::SHL, MVT::v4i32, 1 },
605 { ISD::SRL, MVT::v4i32, 1 },
606 { ISD::SRA, MVT::v4i32, 1 },
607 { ISD::SHL, MVT::v8i32, 1 },
608 { ISD::SRL, MVT::v8i32, 1 },
609 { ISD::SRA, MVT::v8i32, 1 },
610 { ISD::SHL, MVT::v16i32, 1 },
611 { ISD::SRL, MVT::v16i32, 1 },
612 { ISD::SRA, MVT::v16i32, 1 },
613
614 { ISD::SHL, MVT::v2i64, 1 },
615 { ISD::SRL, MVT::v2i64, 1 },
616 { ISD::SHL, MVT::v4i64, 1 },
617 { ISD::SRL, MVT::v4i64, 1 },
618 { ISD::SHL, MVT::v8i64, 1 },
619 { ISD::SRL, MVT::v8i64, 1 },
620
621 { ISD::SRA, MVT::v2i64, 1 },
622 { ISD::SRA, MVT::v4i64, 1 },
623 { ISD::SRA, MVT::v8i64, 1 },
624
625 { ISD::MUL, MVT::v16i32, 1 }, // pmulld (Skylake from agner.org)
626 { ISD::MUL, MVT::v8i32, 1 }, // pmulld (Skylake from agner.org)
627 { ISD::MUL, MVT::v4i32, 1 }, // pmulld (Skylake from agner.org)
628 { ISD::MUL, MVT::v8i64, 6 }, // 3*pmuludq/3*shift/2*add
629
630 { ISD::FNEG, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
631 { ISD::FADD, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
632 { ISD::FSUB, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
633 { ISD::FMUL, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
634 { ISD::FDIV, MVT::f64, 4 }, // Skylake from http://www.agner.org/
635 { ISD::FDIV, MVT::v2f64, 4 }, // Skylake from http://www.agner.org/
636 { ISD::FDIV, MVT::v4f64, 8 }, // Skylake from http://www.agner.org/
637 { ISD::FDIV, MVT::v8f64, 16 }, // Skylake from http://www.agner.org/
638
639 { ISD::FNEG, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
640 { ISD::FADD, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
641 { ISD::FSUB, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
642 { ISD::FMUL, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
643 { ISD::FDIV, MVT::f32, 3 }, // Skylake from http://www.agner.org/
644 { ISD::FDIV, MVT::v4f32, 3 }, // Skylake from http://www.agner.org/
645 { ISD::FDIV, MVT::v8f32, 5 }, // Skylake from http://www.agner.org/
646 { ISD::FDIV, MVT::v16f32, 10 }, // Skylake from http://www.agner.org/
647 };
648
649 if (ST->hasAVX512())
650 if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
651 return LT.first * Entry->Cost;
652
653 static const CostTblEntry AVX2ShiftCostTable[] = {
654 // Shifts on vXi64/vXi32 on AVX2 are legal even though we declare them as
655 // custom so that we can detect the cases where the shift amount is a scalar one.
656 { ISD::SHL, MVT::v4i32, 2 }, // vpsllvd (Haswell from agner.org)
657 { ISD::SRL, MVT::v4i32, 2 }, // vpsrlvd (Haswell from agner.org)
658 { ISD::SRA, MVT::v4i32, 2 }, // vpsravd (Haswell from agner.org)
659 { ISD::SHL, MVT::v8i32, 2 }, // vpsllvd (Haswell from agner.org)
660 { ISD::SRL, MVT::v8i32, 2 }, // vpsrlvd (Haswell from agner.org)
661 { ISD::SRA, MVT::v8i32, 2 }, // vpsravd (Haswell from agner.org)
662 { ISD::SHL, MVT::v2i64, 1 }, // vpsllvq (Haswell from agner.org)
663 { ISD::SRL, MVT::v2i64, 1 }, // vpsrlvq (Haswell from agner.org)
664 { ISD::SHL, MVT::v4i64, 1 }, // vpsllvq (Haswell from agner.org)
665 { ISD::SRL, MVT::v4i64, 1 }, // vpsrlvq (Haswell from agner.org)
666 };
667
668 if (ST->hasAVX512()) {
669 if (ISD == ISD::SHL && LT.second == MVT::v32i16 &&
670 (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
671 Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
672 // On AVX512, a packed v32i16 shift left by a constant build_vector
673 // is lowered into a vector multiply (vpmullw).
674 return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
675 Op1Info, Op2Info,
676 TargetTransformInfo::OP_None,
677 TargetTransformInfo::OP_None);
678 }
679
680 // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
681 if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
682 if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
683 (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
684 Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
685 // On AVX2, a packed v16i16 shift left by a constant build_vector
686 // is lowered into a vector multiply (vpmullw).
687 return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
688 Op1Info, Op2Info,
689 TargetTransformInfo::OP_None,
690 TargetTransformInfo::OP_None);
691
692 if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
693 return LT.first * Entry->Cost;
694 }
695
696 static const CostTblEntry XOPShiftCostTable[] = {
697 // 128bit shifts take 1cy, but right shifts require negation beforehand.
698 { ISD::SHL, MVT::v16i8, 1 },
699 { ISD::SRL, MVT::v16i8, 2 },
700 { ISD::SRA, MVT::v16i8, 2 },
701 { ISD::SHL, MVT::v8i16, 1 },
702 { ISD::SRL, MVT::v8i16, 2 },
703 { ISD::SRA, MVT::v8i16, 2 },
704 { ISD::SHL, MVT::v4i32, 1 },
705 { ISD::SRL, MVT::v4i32, 2 },
706 { ISD::SRA, MVT::v4i32, 2 },
707 { ISD::SHL, MVT::v2i64, 1 },
708 { ISD::SRL, MVT::v2i64, 2 },
709 { ISD::SRA, MVT::v2i64, 2 },
710 // 256bit shifts require splitting if AVX2 didn't catch them above.
711 { ISD::SHL, MVT::v32i8, 2+2 },
712 { ISD::SRL, MVT::v32i8, 4+2 },
713 { ISD::SRA, MVT::v32i8, 4+2 },
714 { ISD::SHL, MVT::v16i16, 2+2 },
715 { ISD::SRL, MVT::v16i16, 4+2 },
716 { ISD::SRA, MVT::v16i16, 4+2 },
717 { ISD::SHL, MVT::v8i32, 2+2 },
718 { ISD::SRL, MVT::v8i32, 4+2 },
719 { ISD::SRA, MVT::v8i32, 4+2 },
720 { ISD::SHL, MVT::v4i64, 2+2 },
721 { ISD::SRL, MVT::v4i64, 4+2 },
722 { ISD::SRA, MVT::v4i64, 4+2 },
723 };
724
725 // Look for XOP lowering tricks.
726 if (ST->hasXOP()) {
727 // If the right shift is constant then we'll fold the negation so
728 // it's as cheap as a left shift.
729 int ShiftISD = ISD;
730 if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) &&
731 (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
732 Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
733 ShiftISD = ISD::SHL;
734 if (const auto *Entry =
735 CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
736 return LT.first * Entry->Cost;
737 }
738
739 static const CostTblEntry SSE2UniformShiftCostTable[] = {
740 // Uniform splats are cheaper for the following instructions.
741 { ISD::SHL, MVT::v16i16, 2+2 }, // 2*psllw + split.
742 { ISD::SHL, MVT::v8i32, 2+2 }, // 2*pslld + split.
743 { ISD::SHL, MVT::v4i64, 2+2 }, // 2*psllq + split.
744
745 { ISD::SRL, MVT::v16i16, 2+2 }, // 2*psrlw + split.
746 { ISD::SRL, MVT::v8i32, 2+2 }, // 2*psrld + split.
747 { ISD::SRL, MVT::v4i64, 2+2 }, // 2*psrlq + split.
748
749 { ISD::SRA, MVT::v16i16, 2+2 }, // 2*psraw + split.
750 { ISD::SRA, MVT::v8i32, 2+2 }, // 2*psrad + split.
751 { ISD::SRA, MVT::v2i64, 4 }, // 2*psrad + shuffle.
752 { ISD::SRA, MVT::v4i64, 8+2 }, // 2*(2*psrad + shuffle) + split.
753 };
754
755 if (ST->hasSSE2() &&
756 ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
757 (Op2Info == TargetTransformInfo::OK_UniformValue))) {
758
759 // Handle AVX2 uniform v4i64 ISD::SRA, it's not worth a table.
760 if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2())
761 return LT.first * 4; // 2*psrad + shuffle.
762
763 if (const auto *Entry =
764 CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second))
765 return LT.first * Entry->Cost;
766 }
767
768 if (ISD == ISD::SHL &&
769 Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
770 MVT VT = LT.second;
771 // Vector shift left by non uniform constant can be lowered
772 // into vector multiply.
773 if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
774 ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
775 ISD = ISD::MUL;
776 }
777
778 static const CostTblEntry AVX2CostTable[] = {
779 { ISD::SHL, MVT::v16i8, 6 }, // vpblendvb sequence.
780 { ISD::SHL, MVT::v32i8, 6 }, // vpblendvb sequence.
781 { ISD::SHL, MVT::v64i8, 12 }, // 2*vpblendvb sequence.
782 { ISD::SHL, MVT::v8i16, 5 }, // extend/vpsrlvd/pack sequence.
783 { ISD::SHL, MVT::v16i16, 7 }, // extend/vpsrlvd/pack sequence.
784 { ISD::SHL, MVT::v32i16, 14 }, // 2*extend/vpsrlvd/pack sequence.
785
786 { ISD::SRL, MVT::v16i8, 6 }, // vpblendvb sequence.
787 { ISD::SRL, MVT::v32i8, 6 }, // vpblendvb sequence.
788 { ISD::SRL, MVT::v64i8, 12 }, // 2*vpblendvb sequence.
789 { ISD::SRL, MVT::v8i16, 5 }, // extend/vpsrlvd/pack sequence.
790 { ISD::SRL, MVT::v16i16, 7 }, // extend/vpsrlvd/pack sequence.
791 { ISD::SRL, MVT::v32i16, 14 }, // 2*extend/vpsrlvd/pack sequence.
792
793 { ISD::SRA, MVT::v16i8, 17 }, // vpblendvb sequence.
794 { ISD::SRA, MVT::v32i8, 17 }, // vpblendvb sequence.
795 { ISD::SRA, MVT::v64i8, 34 }, // 2*vpblendvb sequence.
796 { ISD::SRA, MVT::v8i16, 5 }, // extend/vpsravd/pack sequence.
797 { ISD::SRA, MVT::v16i16, 7 }, // extend/vpsravd/pack sequence.
798 { ISD::SRA, MVT::v32i16, 14 }, // 2*extend/vpsravd/pack sequence.
799 { ISD::SRA, MVT::v2i64, 2 }, // srl/xor/sub sequence.
800 { ISD::SRA, MVT::v4i64, 2 }, // srl/xor/sub sequence.
801
802 { ISD::SUB, MVT::v32i8, 1 }, // psubb
803 { ISD::ADD, MVT::v32i8, 1 }, // paddb
804 { ISD::SUB, MVT::v16i16, 1 }, // psubw
805 { ISD::ADD, MVT::v16i16, 1 }, // paddw
806 { ISD::SUB, MVT::v8i32, 1 }, // psubd
807 { ISD::ADD, MVT::v8i32, 1 }, // paddd
808 { ISD::SUB, MVT::v4i64, 1 }, // psubq
809 { ISD::ADD, MVT::v4i64, 1 }, // paddq
810
811 { ISD::MUL, MVT::v16i16, 1 }, // pmullw
812 { ISD::MUL, MVT::v8i32, 2 }, // pmulld (Haswell from agner.org)
813 { ISD::MUL, MVT::v4i64, 6 }, // 3*pmuludq/3*shift/2*add
814
815 { ISD::FNEG, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
816 { ISD::FNEG, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
817 { ISD::FADD, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
818 { ISD::FADD, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
819 { ISD::FSUB, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
820 { ISD::FSUB, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
821 { ISD::FMUL, MVT::f64, 1 }, // Haswell from http://www.agner.org/
822 { ISD::FMUL, MVT::v2f64, 1 }, // Haswell from http://www.agner.org/
823 { ISD::FMUL, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
824 { ISD::FMUL, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
825
826 { ISD::FDIV, MVT::f32, 7 }, // Haswell from http://www.agner.org/
827 { ISD::FDIV, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
828 { ISD::FDIV, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
829 { ISD::FDIV, MVT::f64, 14 }, // Haswell from http://www.agner.org/
830 { ISD::FDIV, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
831 { ISD::FDIV, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
832 };
833
834 // Look for AVX2 lowering tricks for custom cases.
835 if (ST->hasAVX2())
836 if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
837 return LT.first * Entry->Cost;
838
839 static const CostTblEntry AVX1CostTable[] = {
840 // We don't have to scalarize unsupported ops. We can issue two half-sized
841 // operations and we only need to extract the upper YMM half.
842 // Two ops + 1 extract + 1 insert = 4.
843 { ISD::MUL, MVT::v16i16, 4 },
844 { ISD::MUL, MVT::v8i32, 5 }, // BTVER2 from http://www.agner.org/
845 { ISD::MUL, MVT::v4i64, 12 },
846
847 { ISD::SUB, MVT::v32i8, 4 },
848 { ISD::ADD, MVT::v32i8, 4 },
849 { ISD::SUB, MVT::v16i16, 4 },
850 { ISD::ADD, MVT::v16i16, 4 },
851 { ISD::SUB, MVT::v8i32, 4 },
852 { ISD::ADD, MVT::v8i32, 4 },
853 { ISD::SUB, MVT::v4i64, 4 },
854 { ISD::ADD, MVT::v4i64, 4 },
855
856 { ISD::SHL, MVT::v16i8, 10 }, // pblendvb sequence.
857 { ISD::SHL, MVT::v32i8, 22 }, // pblendvb sequence + split.
858 { ISD::SHL, MVT::v8i16, 6 }, // pblendvb sequence.
859 { ISD::SHL, MVT::v16i16, 13 }, // pblendvb sequence + split.
860 { ISD::SHL, MVT::v4i32, 3 }, // pslld/paddd/cvttps2dq/pmulld
861 { ISD::SHL, MVT::v8i32, 9 }, // pslld/paddd/cvttps2dq/pmulld + split
862 { ISD::SHL, MVT::v2i64, 2 }, // Shift each lane + blend.
863 { ISD::SHL, MVT::v4i64, 6 }, // Shift each lane + blend + split.
864
865 { ISD::SRL, MVT::v16i8, 11 }, // pblendvb sequence.
866 { ISD::SRL, MVT::v32i8, 23 }, // pblendvb sequence + split.
867 { ISD::SRL, MVT::v8i16, 13 }, // pblendvb sequence.
868 { ISD::SRL, MVT::v16i16, 28 }, // pblendvb sequence + split.
869 { ISD::SRL, MVT::v4i32, 6 }, // Shift each lane + blend.
870 { ISD::SRL, MVT::v8i32, 14 }, // Shift each lane + blend + split.
871 { ISD::SRL, MVT::v2i64, 2 }, // Shift each lane + blend.
872 { ISD::SRL, MVT::v4i64, 6 }, // Shift each lane + blend + split.
873
874 { ISD::SRA, MVT::v16i8, 21 }, // pblendvb sequence.
875 { ISD::SRA, MVT::v32i8, 44 }, // pblendvb sequence + split.
876 { ISD::SRA, MVT::v8i16, 13 }, // pblendvb sequence.
877 { ISD::SRA, MVT::v16i16, 28 }, // pblendvb sequence + split.
878 { ISD::SRA, MVT::v4i32, 6 }, // Shift each lane + blend.
879 { ISD::SRA, MVT::v8i32, 14 }, // Shift each lane + blend + split.
880 { ISD::SRA, MVT::v2i64, 5 }, // Shift each lane + blend.
881 { ISD::SRA, MVT::v4i64, 12 }, // Shift each lane + blend + split.
882
883 { ISD::FNEG, MVT::v4f64, 2 }, // BTVER2 from http://www.agner.org/
884 { ISD::FNEG, MVT::v8f32, 2 }, // BTVER2 from http://www.agner.org/
885
886 { ISD::FMUL, MVT::f64, 2 }, // BTVER2 from http://www.agner.org/
887 { ISD::FMUL, MVT::v2f64, 2 }, // BTVER2 from http://www.agner.org/
888 { ISD::FMUL, MVT::v4f64, 4 }, // BTVER2 from http://www.agner.org/
889
890 { ISD::FDIV, MVT::f32, 14 }, // SNB from http://www.agner.org/
891 { ISD::FDIV, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
892 { ISD::FDIV, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
893 { ISD::FDIV, MVT::f64, 22 }, // SNB from http://www.agner.org/
894 { ISD::FDIV, MVT::v2f64, 22 }, // SNB from http://www.agner.org/
895 { ISD::FDIV, MVT::v4f64, 44 }, // SNB from http://www.agner.org/
896 };
897
898 if (ST->hasAVX())
899 if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
900 return LT.first * Entry->Cost;
901
902 static const CostTblEntry SSE42CostTable[] = {
903 { ISD::FADD, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
904 { ISD::FADD, MVT::f32, 1 }, // Nehalem from http://www.agner.org/
905 { ISD::FADD, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
906 { ISD::FADD, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
907
908 { ISD::FSUB, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
909 { ISD::FSUB, MVT::f32 , 1 }, // Nehalem from http://www.agner.org/
910 { ISD::FSUB, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
911 { ISD::FSUB, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
912
913 { ISD::FMUL, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
914 { ISD::FMUL, MVT::f32, 1 }, // Nehalem from http://www.agner.org/
915 { ISD::FMUL, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
916 { ISD::FMUL, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
917
918 { ISD::FDIV, MVT::f32, 14 }, // Nehalem from http://www.agner.org/
919 { ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
920 { ISD::FDIV, MVT::f64, 22 }, // Nehalem from http://www.agner.org/
921 { ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/
922
923 { ISD::MUL, MVT::v2i64, 6 } // 3*pmuludq/3*shift/2*add
924 };
925
926 if (ST->hasSSE42())
927 if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
928 return LT.first * Entry->Cost;
929
930 static const CostTblEntry SSE41CostTable[] = {
931 { ISD::SHL, MVT::v16i8, 11 }, // pblendvb sequence.
932 { ISD::SHL, MVT::v8i16, 14 }, // pblendvb sequence.
933 { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld
934
935 { ISD::SRL, MVT::v16i8, 12 }, // pblendvb sequence.
936 { ISD::SRL, MVT::v8i16, 14 }, // pblendvb sequence.
937 { ISD::SRL, MVT::v4i32, 11 }, // Shift each lane + blend.
938
939 { ISD::SRA, MVT::v16i8, 24 }, // pblendvb sequence.
940 { ISD::SRA, MVT::v8i16, 14 }, // pblendvb sequence.
941 { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend.
942
943 { ISD::MUL, MVT::v4i32, 2 } // pmulld (Nehalem from agner.org)
944 };
945
946 if (ST->hasSSE41())
947 if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
948 return LT.first * Entry->Cost;
949
950 static const CostTblEntry SSE2CostTable[] = {
951 // We don't correctly identify costs of casts because they are marked as
952 // custom.
953 { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence.
954 { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence.
955 { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul.
956 { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence.
957
958 { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence.
959 { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence.
960 { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend.
961 { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence.
962
963 { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence.
964 { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence.
965 { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend.
966 { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence.
967
968 { ISD::MUL, MVT::v8i16, 1 }, // pmullw
969 { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle
970 { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add
971
972 { ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/
973 { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/
974 { ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/
975 { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/
976
977 { ISD::FNEG, MVT::f32, 1 }, // Pentium IV from http://www.agner.org/
978 { ISD::FNEG, MVT::f64, 1 }, // Pentium IV from http://www.agner.org/
979 { ISD::FNEG, MVT::v4f32, 1 }, // Pentium IV from http://www.agner.org/
980 { ISD::FNEG, MVT::v2f64, 1 }, // Pentium IV from http://www.agner.org/
981
982 { ISD::FADD, MVT::f32, 2 }, // Pentium IV from http://www.agner.org/
983 { ISD::FADD, MVT::f64, 2 }, // Pentium IV from http://www.agner.org/
984
985 { ISD::FSUB, MVT::f32, 2 }, // Pentium IV from http://www.agner.org/
986 { ISD::FSUB, MVT::f64, 2 }, // Pentium IV from http://www.agner.org/
987 };
988
989 if (ST->hasSSE2())
990 if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
991 return LT.first * Entry->Cost;
992
993 static const CostTblEntry SSE1CostTable[] = {
994 { ISD::FDIV, MVT::f32, 17 }, // Pentium III from http://www.agner.org/
995 { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/
996
997 { ISD::FNEG, MVT::f32, 2 }, // Pentium III from http://www.agner.org/
998 { ISD::FNEG, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/
999
1000 { ISD::FADD, MVT::f32, 1 }, // Pentium III from http://www.agner.org/
1001 { ISD::FADD, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/
1002
1003 { ISD::FSUB, MVT::f32, 1 }, // Pentium III from http://www.agner.org/
1004 { ISD::FSUB, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/
1005 };
1006
1007 if (ST->hasSSE1())
1008 if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
1009 return LT.first * Entry->Cost;
1010
1011 static const CostTblEntry X64CostTbl[] = { // 64-bit targets
1012 { ISD::ADD, MVT::i64, 1 }, // Core (Merom) from http://www.agner.org/
1013 { ISD::SUB, MVT::i64, 1 }, // Core (Merom) from http://www.agner.org/
1014 };
1015
1016 if (ST->is64Bit())
1017 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
1018 return LT.first * Entry->Cost;
1019
1020 static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
1021 { ISD::ADD, MVT::i8, 1 }, // Pentium III from http://www.agner.org/
1022 { ISD::ADD, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
1023 { ISD::ADD, MVT::i32, 1 }, // Pentium III from http://www.agner.org/
1024
1025 { ISD::SUB, MVT::i8, 1 }, // Pentium III from http://www.agner.org/
1026 { ISD::SUB, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
1027 { ISD::SUB, MVT::i32, 1 }, // Pentium III from http://www.agner.org/
1028 };
1029
1030 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
1031 return LT.first * Entry->Cost;
1032
1033 // It is not a good idea to vectorize division. We have to scalarize it and
1034 // in the process we will often end up having to spill regular
1035 // registers. The overhead of division is going to dominate most kernels
1036 // anyways so try hard to prevent vectorization of division - it is
1037 // generally a bad idea. Assume somewhat arbitrarily that we have to be able
1038 // to hide "20 cycles" for each lane.
1039 if (LT.second.isVector() && (ISD == ISD::SDIV || ISD == ISD::SREM ||
1040 ISD == ISD::UDIV || ISD == ISD::UREM)) {
1041 InstructionCost ScalarCost = getArithmeticInstrCost(
1042 Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info,
1043 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
1044 return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
1045 }
1046
1047 // Fallback to the default implementation.
1048 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info);
1049}
1050
1051InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1052 VectorType *BaseTp,
1053 ArrayRef<int> Mask, int Index,
1054 VectorType *SubTp) {
1055 // 64-bit packed float vectors (v2f32) are widened to type v4f32.
1056 // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
1057 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, BaseTp);
1058
1059 Kind = improveShuffleKindFromMask(Kind, Mask);
1060 // Treat Transpose as 2-op shuffles - there's no difference in lowering.
1061 if (Kind == TTI::SK_Transpose)
1062 Kind = TTI::SK_PermuteTwoSrc;
1063
1064 // For Broadcasts we are splatting the first element from the first input
1065 // register, so only need to reference that input and all the output
1066 // registers are the same.
1067 if (Kind == TTI::SK_Broadcast)
1068 LT.first = 1;
1069
1070 // Subvector extractions are free if they start at the beginning of a
1071 // vector and cheap if the subvectors are aligned.
1072 if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
1073 int NumElts = LT.second.getVectorNumElements();
1074 if ((Index % NumElts) == 0)
1075 return 0;
1076 std::pair<InstructionCost, MVT> SubLT =
1077 TLI->getTypeLegalizationCost(DL, SubTp);
1078 if (SubLT.second.isVector()) {
1079 int NumSubElts = SubLT.second.getVectorNumElements();
1080 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1081 return SubLT.first;
1082 // Handle some cases for widening legalization. For now we only handle
1083 // cases where the original subvector was naturally aligned and evenly
1084 // fit in its legalized subvector type.
1085 // FIXME: Remove some of the alignment restrictions.
1086 // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
1087 // vectors.
1088 int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
1089 if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
1090 (NumSubElts % OrigSubElts) == 0 &&
1091 LT.second.getVectorElementType() ==
1092 SubLT.second.getVectorElementType() &&
1093 LT.second.getVectorElementType().getSizeInBits() ==
1094 BaseTp->getElementType()->getPrimitiveSizeInBits()) {
1095 assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
1096 "Unexpected number of elements!");
1097 auto *VecTy = FixedVectorType::get(BaseTp->getElementType(),
1098 LT.second.getVectorNumElements());
1099 auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
1100 SubLT.second.getVectorNumElements());
1101 int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
1102 InstructionCost ExtractCost = getShuffleCost(
1103 TTI::SK_ExtractSubvector, VecTy, None, ExtractIndex, SubTy);
1104
1105 // If the original size is 32-bits or more, we can use pshufd. Otherwise
1106 // if we have SSSE3 we can use pshufb.
1107 if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
1108 return ExtractCost + 1; // pshufd or pshufb
1109
1110 assert(SubTp->getPrimitiveSizeInBits() == 16 &&
1111 "Unexpected vector size");
1112
1113 return ExtractCost + 2; // worst case pshufhw + pshufd
1114 }
1115 }
1116 }
1117
1118 // Subvector insertions are cheap if the subvectors are aligned.
1119 // Note that in general, the insertion starting at the beginning of a vector
1120 // isn't free, because we need to preserve the rest of the wide vector.
1121 if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
1122 int NumElts = LT.second.getVectorNumElements();
1123 std::pair<InstructionCost, MVT> SubLT =
1124 TLI->getTypeLegalizationCost(DL, SubTp);
1125 if (SubLT.second.isVector()) {
1126 int NumSubElts = SubLT.second.getVectorNumElements();
1127 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1128 return SubLT.first;
1129 }
1130 }
1131
1132 // Handle some common (illegal) sub-vector types as they are often very cheap
1133 // to shuffle even on targets without PSHUFB.
1134 EVT VT = TLI->getValueType(DL, BaseTp);
1135 if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
1136 !ST->hasSSSE3()) {
1137 static const CostTblEntry SSE2SubVectorShuffleTbl[] = {
1138 {TTI::SK_Broadcast, MVT::v4i16, 1}, // pshuflw
1139 {TTI::SK_Broadcast, MVT::v2i16, 1}, // pshuflw
1140 {TTI::SK_Broadcast, MVT::v8i8, 2}, // punpck/pshuflw
1141 {TTI::SK_Broadcast, MVT::v4i8, 2}, // punpck/pshuflw
1142 {TTI::SK_Broadcast, MVT::v2i8, 1}, // punpck
1143
1144 {TTI::SK_Reverse, MVT::v4i16, 1}, // pshuflw
1145 {TTI::SK_Reverse, MVT::v2i16, 1}, // pshuflw
1146 {TTI::SK_Reverse, MVT::v4i8, 3}, // punpck/pshuflw/packus
1147 {TTI::SK_Reverse, MVT::v2i8, 1}, // punpck
1148
1149 {TTI::SK_PermuteTwoSrc, MVT::v4i16, 2}, // punpck/pshuflw
1150 {TTI::SK_PermuteTwoSrc, MVT::v2i16, 2}, // punpck/pshuflw
1151 {TTI::SK_PermuteTwoSrc, MVT::v8i8, 7}, // punpck/pshuflw
1152 {TTI::SK_PermuteTwoSrc, MVT::v4i8, 4}, // punpck/pshuflw
1153 {TTI::SK_PermuteTwoSrc, MVT::v2i8, 2}, // punpck
1154
1155 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw
1156 {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw
1157 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 5}, // punpck/pshuflw
1158 {TTI::SK_PermuteSingleSrc, MVT::v4i8, 3}, // punpck/pshuflw
1159 {TTI::SK_PermuteSingleSrc, MVT::v2i8, 1}, // punpck
1160 };
1161
1162 if (ST->hasSSE2())
1163 if (const auto *Entry =
1164 CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
1165 return Entry->Cost;
1166 }
1167
1168 // We are going to permute multiple sources and the result will be in multiple
1169 // destinations. We provide an accurate cost only for splits where the element
1170 // type remains the same.
1171 if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
1172 MVT LegalVT = LT.second;
1173 if (LegalVT.isVector() &&
1174 LegalVT.getVectorElementType().getSizeInBits() ==
1175 BaseTp->getElementType()->getPrimitiveSizeInBits() &&
1176 LegalVT.getVectorNumElements() <
1177 cast<FixedVectorType>(BaseTp)->getNumElements()) {
1178
1179 unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
1180 unsigned LegalVTSize = LegalVT.getStoreSize();
1181 // Number of source vectors after legalization:
1182 unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
1183 // Number of destination vectors after legalization:
1184 InstructionCost NumOfDests = LT.first;
1185
1186 auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
1187 LegalVT.getVectorNumElements());
1188
1189 InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
1190 return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
1191 None, 0, nullptr);
1192 }
1193
1194 return BaseT::getShuffleCost(Kind, BaseTp, Mask, Index, SubTp);
1195 }
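// Worked example for the split path above: a single-source permute of
// <32 x i8> on an SSE2-only target legalizes to v16i8 with LT.first == 2, so
// VecTySize = 32, LegalVTSize = 16, NumOfSrcs = 2, NumOfDests = 2,
// NumOfShuffles = (2 - 1) * 2 = 2, and the result is twice the
// SK_PermuteTwoSrc cost of v16i8.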
1196
1197 // For 2-input shuffles, we must account for splitting the 2 inputs into many.
1198 if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
1199 // We assume that source and destination have the same vector type.
1200 InstructionCost NumOfDests = LT.first;
1201 InstructionCost NumOfShufflesPerDest = LT.first * 2 - 1;
1202 LT.first = NumOfDests * NumOfShufflesPerDest;
1203 }
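// E.g. with LT.first == 2 this gives NumOfDests = 2 and
// NumOfShufflesPerDest = 3, so LT.first becomes 6 before it scales the
// per-ISA table costs below.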
1204
1205 static const CostTblEntry AVX512VBMIShuffleTbl[] = {
1206 {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
1207 {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb
1208
1209 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
1210 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb
1211
1212 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b
1213 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b
1214 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 2} // vpermt2b
1215 };
1216
1217 if (ST->hasVBMI())
1218 if (const auto *Entry =
1219 CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
1220 return LT.first * Entry->Cost;
1221
1222 static const CostTblEntry AVX512BWShuffleTbl[] = {
1223 {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
1224 {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
1225
1226 {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw
1227 {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw
1228 {TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2
1229
1230 {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw
1231 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw
1232 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16
1233
1234 {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w
1235 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w
1236 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w
1237 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
1238
1239 {TTI::SK_Select, MVT::v32i16, 1}, // vblendmw
1240 {TTI::SK_Select, MVT::v64i8, 1}, // vblendmb
1241 };
1242
1243 if (ST->hasBWI())
1244 if (const auto *Entry =
1245 CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
1246 return LT.first * Entry->Cost;
1247
1248 static const CostTblEntry AVX512ShuffleTbl[] = {
1249 {TTI::SK_Broadcast, MVT::v8f64, 1}, // vbroadcastpd
1250 {TTI::SK_Broadcast, MVT::v16f32, 1}, // vbroadcastps
1251 {TTI::SK_Broadcast, MVT::v8i64, 1}, // vpbroadcastq
1252 {TTI::SK_Broadcast, MVT::v16i32, 1}, // vpbroadcastd
1253 {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
1254 {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
1255
1256 {TTI::SK_Reverse, MVT::v8f64, 1}, // vpermpd
1257 {TTI::SK_Reverse, MVT::v16f32, 1}, // vpermps
1258 {TTI::SK_Reverse, MVT::v8i64, 1}, // vpermq
1259 {TTI::SK_Reverse, MVT::v16i32, 1}, // vpermd
1260 {TTI::SK_Reverse, MVT::v32i16, 7}, // per mca
1261 {TTI::SK_Reverse, MVT::v64i8, 7}, // per mca
1262
1263 {TTI::SK_PermuteSingleSrc, MVT::v8f64, 1}, // vpermpd
1264 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
1265 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // vpermpd
1266 {TTI::SK_PermuteSingleSrc, MVT::v16f32, 1}, // vpermps
1267 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
1268 {TTI::SK_PermuteSingleSrc, MVT::v4f32, 1}, // vpermps
1269 {TTI::SK_PermuteSingleSrc, MVT::v8i64, 1}, // vpermq
1270 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
1271 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // vpermq
1272 {TTI::SK_PermuteSingleSrc, MVT::v16i32, 1}, // vpermd
1273 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
1274 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // vpermd
1275 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
1276
1277 {TTI::SK_PermuteTwoSrc, MVT::v8f64, 1}, // vpermt2pd
1278 {TTI::SK_PermuteTwoSrc, MVT::v16f32, 1}, // vpermt2ps
1279 {TTI::SK_PermuteTwoSrc, MVT::v8i64, 1}, // vpermt2q
1280 {TTI::SK_PermuteTwoSrc, MVT::v16i32, 1}, // vpermt2d
1281 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 1}, // vpermt2pd
1282 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 1}, // vpermt2ps
1283 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 1}, // vpermt2q
1284 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 1}, // vpermt2d
1285 {TTI::SK_PermuteTwoSrc, MVT::v2f64, 1}, // vpermt2pd
1286 {TTI::SK_PermuteTwoSrc, MVT::v4f32, 1}, // vpermt2ps
1287 {TTI::SK_PermuteTwoSrc, MVT::v2i64, 1}, // vpermt2q
1288 {TTI::SK_PermuteTwoSrc, MVT::v4i32, 1}, // vpermt2d
1289
1290 // FIXME: This just applies the type legalization cost rules above
1291 // assuming these completely split.
1292 {TTI::SK_PermuteSingleSrc, MVT::v32i16, 14},
1293 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 14},
1294 {TTI::SK_PermuteTwoSrc, MVT::v32i16, 42},
1295 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 42},
1296
1297 {TTI::SK_Select, MVT::v32i16, 1}, // vpternlogq
1298 {TTI::SK_Select, MVT::v64i8, 1}, // vpternlogq
1299 {TTI::SK_Select, MVT::v8f64, 1}, // vblendmpd
1300 {TTI::SK_Select, MVT::v16f32, 1}, // vblendmps
1301 {TTI::SK_Select, MVT::v8i64, 1}, // vblendmq
1302 {TTI::SK_Select, MVT::v16i32, 1}, // vblendmd
1303 };
1304
1305 if (ST->hasAVX512())
1306 if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
1307 return LT.first * Entry->Cost;
1308
1309 static const CostTblEntry AVX2ShuffleTbl[] = {
1310 {TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastpd
1311 {TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastps
1312 {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq
1313 {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd
1314 {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
1315 {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb
1316
1317 {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd
1318 {TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps
1319 {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq
1320 {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd
1321 {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
1322 {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb
1323
1324 {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
1325 {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb
1326
1327 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
1328 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
1329 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
1330 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
1331 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
1332 // + vpblendvb
1333 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb
1334 // + vpblendvb
1335
1336 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd
1337 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3}, // 2*vpermps + vblendps
1338 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd
1339 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd
1340 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
1341 // + vpblendvb
1342 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb
1343 // + vpblendvb
1344 };
1345
1346 if (ST->hasAVX2())
1347 if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
1348 return LT.first * Entry->Cost;
1349
1350 static const CostTblEntry XOPShuffleTbl[] = {
1351 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd
1352 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps
1353 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd
1354 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps
1355 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
1356 // + vinsertf128
1357 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm
1358 // + vinsertf128
1359
1360 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
1361 // + vinsertf128
1362 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm
1363 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm
1364 // + vinsertf128
1365 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm
1366 };
1367
1368 if (ST->hasXOP())
1369 if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
1370 return LT.first * Entry->Cost;
1371
1372 static const CostTblEntry AVX1ShuffleTbl[] = {
1373 {TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1374 {TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1375 {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1376 {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1377 {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
1378 {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128
1379
1380 {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1381 {TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1382 {TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1383 {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1384 {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
1385 // + vinsertf128
1386 {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb
1387 // + vinsertf128
1388
1389 {TTI::SK_Select, MVT::v4i64, 1}, // vblendpd
1390 {TTI::SK_Select, MVT::v4f64, 1}, // vblendpd
1391 {TTI::SK_Select, MVT::v8i32, 1}, // vblendps
1392 {TTI::SK_Select, MVT::v8f32, 1}, // vblendps
1393 {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
1394 {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor
1395
1396 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd
1397 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd
1398 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1399 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1400 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
1401 // + 2*por + vinsertf128
1402 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb
1403 // + 2*por + vinsertf128
1404
1405 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd
1406 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd
1407 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1408 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1409 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
1410 // + 4*por + vinsertf128
1411 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb
1412 // + 4*por + vinsertf128
1413 };
1414
1415 if (ST->hasAVX())
1416 if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
1417 return LT.first * Entry->Cost;
1418
1419 static const CostTblEntry SSE41ShuffleTbl[] = {
1420 {TTI::SK_Select, MVT::v2i64, 1}, // pblendw
1421 {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1422 {TTI::SK_Select, MVT::v4i32, 1}, // pblendw
1423 {TTI::SK_Select, MVT::v4f32, 1}, // blendps
1424 {TTI::SK_Select, MVT::v8i16, 1}, // pblendw
1425 {TTI::SK_Select, MVT::v16i8, 1} // pblendvb
1426 };
1427
1428 if (ST->hasSSE41())
1429 if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
1430 return LT.first * Entry->Cost;
1431
1432 static const CostTblEntry SSSE3ShuffleTbl[] = {
1433 {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
1434 {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb
1435
1436 {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
1437 {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb
1438
1439 {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
1440 {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por
1441
1442 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
1443 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
1444
1445 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
1446 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
1447 };
1448
1449 if (ST->hasSSSE3())
1450 if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
1451 return LT.first * Entry->Cost;
1452
1453 static const CostTblEntry SSE2ShuffleTbl[] = {
1454 {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd
1455 {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
1456 {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
1457 {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
1458 {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd
1459
1460 {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
1461 {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
1462 {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
1463 {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
1464 {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
1465 // + 2*pshufd + 2*unpck + packus
1466
1467 {TTI::SK_Select, MVT::v2i64, 1}, // movsd
1468 {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1469 {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
1470 {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
1471 {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por
1472
1473 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
1474 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd
1475 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
1476 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
1477 // + pshufd/unpck
1478 { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
1479 // + 2*pshufd + 2*unpck + 2*packus
1480
1481 { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd
1482 { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd
1483 { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd}
1484 { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute
1485 { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
1486 };
1487
1488 if (ST->hasSSE2())
1489 if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
1490 return LT.first * Entry->Cost;
1491
1492 static const CostTblEntry SSE1ShuffleTbl[] = {
1493 { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
1494 { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
1495 { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps
1496 { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
1497 { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
1498 };
1499
1500 if (ST->hasSSE1())
1501 if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
1502 return LT.first * Entry->Cost;
1503
1504 return BaseT::getShuffleCost(Kind, BaseTp, Mask, Index, SubTp);
1505 }
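As a rough illustration of how the tables above compose with the legalization split factor, here is a minimal standalone sketch (not part of this file; the 9 is copied from the SSE2ShuffleTbl reverse entry) of the cost returned for reversing a <32 x i8> vector on an SSE2-only target:

#include <cstdio>

int main() {
  // <32 x i8> legalizes to two v16i8 halves on SSE2, so LT.first == 2.
  const int LTFirst = 2;
  // SSE2ShuffleTbl: SK_Reverse of v16i8 costs 9
  // (2*pshuflw + 2*pshufhw + 2*pshufd + 2*unpck + packus).
  const int ReverseV16i8 = 9;
  std::printf("estimated SK_Reverse cost for v32i8 on SSE2: %d\n",
              LTFirst * ReverseV16i8);
  return 0;
}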
1506
1507 InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
1508 Type *Src,
1509 TTI::CastContextHint CCH,
1510 TTI::TargetCostKind CostKind,
1511 const Instruction *I) {
1512 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1513 assert(ISD && "Invalid opcode");
1514
1515 // TODO: Allow non-throughput costs that aren't binary.
1516 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
1517 if (CostKind != TTI::TCK_RecipThroughput)
1518 return Cost == 0 ? 0 : 1;
1519 return Cost;
1520 };
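// For the non-throughput cost kinds (TCK_Latency, TCK_CodeSize,
// TCK_SizeAndLatency) the lambda above collapses any non-zero table cost to
// 1: AdjustCost(4) is 4 for TCK_RecipThroughput but 1 otherwise, and free
// conversions stay free.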
1521
1522 // The cost tables include both specific, custom (non-legal) src/dst type
1523 // conversions and generic, legalized types. We test for customs first, before
1524 // falling back to legalization.
1525 // FIXME: Need a better design of the cost table to handle non-simple types of
1526 // potential massive combinations (elem_num x src_type x dst_type).
1527 static const TypeConversionCostTblEntry AVX512BWConversionTbl[] {
1528 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
1529 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
1530
1531 // Mask sign extend has an instruction.
1532 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 },
1533 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 },
1534 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 },
1535 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 },
1536 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 },
1537 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 },
1538 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 },
1539 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
1540 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 },
1541 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, 1 },
1542 { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, 1 },
1543
1544 // Mask zero extend is a sext + shift.
1545 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 },
1546 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 },
1547 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 },
1548 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 },
1549 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 },
1550 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 },
1551 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 },
1552 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
1553 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 },
1554 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, 2 },
1555 { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, 2 },
1556
1557 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 2 },
1558 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // widen to zmm
1559 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, // widen to zmm
1560 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // widen to zmm
1561 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // vpmovwb
1562 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // widen to zmm
1563 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, // widen to zmm
1564 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // vpmovwb
1565 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, // widen to zmm
1566 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, // widen to zmm
1567 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // vpmovwb
1568 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, // widen to zmm
1569 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, // widen to zmm
1570 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, // widen to zmm
1571 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, 2 },
1572 { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, 2 },
1573 };
1574
1575 static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
1576 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
1577 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },
1578
1579 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
1580 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },
1581
1582 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, 1 },
1583 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, 1 },
1584
1585 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 },
1586 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 },
1587 };
1588
1589 // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
1590 // 256-bit wide vectors.
1591
1592 static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
1593 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 },
1594 { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 },
1595 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 },
1596
1597 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
1598 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
1599 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
1600 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 3 }, // sext+vpslld+vptestmd
1601 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
1602 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
1603 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
1604 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 3 }, // sext+vpslld+vptestmd
1605 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // zmm vpslld+vptestmd
1606 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // zmm vpslld+vptestmd
1607 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // zmm vpslld+vptestmd
1608 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 }, // vpslld+vptestmd
1609 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // zmm vpsllq+vptestmq
1610 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // zmm vpsllq+vptestmq
1611 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, // vpsllq+vptestmq
1612 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 2 }, // vpmovdb
1613 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 2 }, // vpmovdb
1614 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 2 }, // vpmovdb
1615 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2 }, // vpmovdb
1616 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 2 }, // vpmovqb
1617 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1 }, // vpshufb
1618 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 2 }, // vpmovqb
1619 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 2 }, // vpmovqw
1620 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 }, // vpmovqd
1621 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // zmm vpmovqd
1622 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 5 },// 2*vpmovqd+concat+vpmovdb
1623
1624 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, // extend to v16i32
1625 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 8 },
1626
1627 // Sign extend is zmm vpternlogd+vptruncdb.
1628 // Zero extend is zmm broadcast load+vptruncdw.
1629 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 3 },
1630 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 4 },
1631 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 3 },
1632 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 4 },
1633 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 3 },
1634 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 4 },
1635 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 3 },
1636 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 4 },
1637
1638 // Sign extend is zmm vpternlogd+vptruncdw.
1639 // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
1640 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 3 },
1641 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 4 },
1642 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 3 },
1643 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 4 },
1644 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 3 },
1645 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 4 },
1646 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 3 },
1647 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
1648
1649 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // zmm vpternlogd
1650 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // zmm vpternlogd+psrld
1651 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // zmm vpternlogd
1652 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // zmm vpternlogd+psrld
1653 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // zmm vpternlogd
1654 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // zmm vpternlogd+psrld
1655 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // zmm vpternlogq
1656 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // zmm vpternlogq+psrlq
1657 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // zmm vpternlogq
1658 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // zmm vpternlogq+psrlq
1659
1660 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, // vpternlogd
1661 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, // vpternlogd+psrld
1662 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 }, // vpternlogq
1663 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 }, // vpternlogq+psrlq
1664
1665 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
1666 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
1667 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
1668 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
1669 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
1670 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
1671 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
1672 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
1673 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
1674 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
1675
1676 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
1677 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
1678
1679 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
1680 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
1681 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 },
1682 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 },
1683 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
1684 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 },
1685 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
1686 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
1687
1688 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
1689 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
1690 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 },
1691 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 },
1692 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
1693 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 },
1694 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
1695 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
1696 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 },
1697 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 5 },
1698
1699 { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f64, 3 },
1700 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, 3 },
1701 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 1 },
1702 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 3 },
1703 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f32, 3 },
1704 { ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f64, 3 },
1705
1706 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 },
1707 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, 3 },
1708 { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, 3 },
1709 { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 },
1710 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 3 },
1711 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, 3 },
1712 };
1713
1714 static const TypeConversionCostTblEntry AVX512BWVLConversionTbl[] {
1715 // Mask sign extend has an instruction.
1716 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 },
1717 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 },
1718 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 },
1719 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 },
1720 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 },
1721 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 },
1722 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 },
1723 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
1724 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 },
1725
1726 // Mask zero extend is a sext + shift.
1727 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 },
1728 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 },
1729 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 },
1730 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 },
1731 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 },
1732 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 },
1733 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 },
1734 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
1735 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 },
1736
1737 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 },
1738 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, // vpsllw+vptestmb
1739 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // vpsllw+vptestmw
1740 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // vpsllw+vptestmb
1741 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, // vpsllw+vptestmw
1742 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, // vpsllw+vptestmb
1743 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, // vpsllw+vptestmw
1744 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, // vpsllw+vptestmb
1745 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, // vpsllw+vptestmw
1746 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, // vpsllw+vptestmb
1747 };
1748
1749 static const TypeConversionCostTblEntry AVX512DQVLConversionTbl[] = {
1750 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
1751 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
1752 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
1753 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
1754
1755 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
1756 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
1757 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
1758 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
1759
1760 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 1 },
1761 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, 1 },
1762 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
1763 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, 1 },
1764
1765 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 1 },
1766 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 },
1767 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
1768 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 },
1769 };
1770
1771 static const TypeConversionCostTblEntry AVX512VLConversionTbl[] = {
1772 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
1773 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
1774 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
1775 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 8 }, // split+2*v8i8
1776 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
1777 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
1778 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
1779 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 8 }, // split+2*v8i16
1780 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // vpslld+vptestmd
1781 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // vpslld+vptestmd
1782 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // vpslld+vptestmd
1783 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // vpsllq+vptestmq
1784 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // vpsllq+vptestmq
1785 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // vpmovqd
1786 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 }, // vpmovqb
1787 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 }, // vpmovqw
1788 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 }, // vpmovwb
1789
1790 // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
1791 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
1792 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 5 },
1793 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 6 },
1794 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 5 },
1795 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 6 },
1796 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 5 },
1797 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 6 },
1798 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 10 },
1799 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 12 },
1800
1801 // sign extend is vpcmpeq+maskedmove+vpmovdw
1802 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
1803 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 4 },
1804 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 5 },
1805 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 4 },
1806 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 5 },
1807 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 4 },
1808 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 5 },
1809 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 10 },
1810 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 12 },
1811
1812 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // vpternlogd
1813 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // vpternlogd+psrld
1814 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // vpternlogd
1815 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // vpternlogd+psrld
1816 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // vpternlogd
1817 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // vpternlogd+psrld
1818 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // vpternlogq
1819 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // vpternlogq+psrlq
1820 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // vpternlogq
1821 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // vpternlogq+psrlq
1822
1823 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 2 },
1824 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 },
1825 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 2 },
1826 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 5 },
1827 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 },
1828 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
1829 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
1830 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
1831 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
1832 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
1833 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
1834 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 5 },
1835 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
1836 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 5 },
1837
1838 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 1 },
1839 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 1 },
1840
1841 { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 3 },
1842 { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f32, 3 },
1843
1844 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 1 },
1845 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 1 },
1846
1847 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
1848 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
1849 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 1 },
1850 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 1 },
1851 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 },
1852 };
1853
1854 static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
1855 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
1856 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
1857 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
1858 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
1859 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 1 },
1860 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 1 },
1861 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 1 },
1862 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 1 },
1863 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
1864 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
1865 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
1866 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
1867 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 1 },
1868 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 1 },
1869 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
1870 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
1871 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
1872 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
1873 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
1874 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
1875
1876 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 1 },
1877 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1 },
1878 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 },
1879 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },
1880 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2 },
1881
1882 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 3 },
1883 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 3 },
1884
1885 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, 1 },
1886 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, 1 },
1887 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 3 },
1888
1889 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 4 },
1890 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 7 },
1891 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 4 },
1892 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 7 },
1893 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 4 },
1894 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 15 },
1895
1896 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
1897 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
1898 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 3 },
1899
1900 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 },
1901 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
1902 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 2 },
1903 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
1904 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
1905 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 4 },
1906 };
1907
1908 static const TypeConversionCostTblEntry AVXConversionTbl[] = {
1909 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 6 },
1910 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
1911 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 },
1912 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
1913 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 3 },
1914 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 3 },
1915 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
1916 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
1917 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
1918 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
1919 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
1920 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
1921 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
1922 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
1923 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
1924 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
1925 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 3 },
1926 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 3 },
1927
1928 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 4 },
1929 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 5 },
1930 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 4 },
1931 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 9 },
1932 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, 11 },
1933
1934 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // and+extract+packuswb
1935 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 2 }, // and+packusdw+packuswb
1936 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 },
1937 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
1938 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 4 },
1939 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 3 }, // and+extract+2*packusdw
1940 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 },
1941 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 11 },
1942 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 9 },
1943 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 3 },
1944 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 11 },
1945
1946 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
1947 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 },
1948 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 },
1949 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
1950 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i8, 3 },
1951 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 8 },
1952 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 3 },
1953 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i16, 3 },
1954 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 },
1955 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
1956 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
1957 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 4 },
1958 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 5 },
1959 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 8 },
1960
1961 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 7 },
1962 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, 7 },
1963 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, 6 },
1964 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 2 },
1965 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 },
1966 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 5 },
1967 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
1968 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 },
1969 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 },
1970 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 4 },
1971 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 10 },
1972 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
1973 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 },
1974 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 18 },
1975 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 10 },
1976 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 },
1977 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 10 },
1978 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
1979 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 6 },
1980
1981 { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 4 },
1982 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f64, 3 },
1983 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f64, 2 },
1984 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, 2 },
1985 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 3 },
1986 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, 2 },
1987 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 5 },
1988
1989 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 5 },
1990 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 9 },
1991 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 5 },
1992 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f64, 3 },
1993 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f64, 2 },
1994 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 9 },
1995 { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f32, 4 },
1996 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 3 },
1997 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 9 },
1998 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 19 },
1999
2000 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 1 },
2001 { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 1 },
2002 };
2003
2004 static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
2005 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 2 },
2006 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 2 },
2007 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 2 },
2008 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 2 },
2009 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
2010 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
2011
2012 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i8, 1 },
2013 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i8, 1 },
2014 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i8, 1 },
2015 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i8, 1 },
2016 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 1 },
2017 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 1 },
2018 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 },
2019 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 1 },
2020 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 1 },
2021 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 1 },
2022 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
2023 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
2024 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 2 },
2025 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 2 },
2026 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
2027 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
2028 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 4 },
2029 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 4 },
2030 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i16, 1 },
2031 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i16, 1 },
2032 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 1 },
2033 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 1 },
2034 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
2035 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
2036 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
2037 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
2038 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 4 },
2039 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 4 },
2040 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
2041 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
2042
2043 // These truncates end up widening elements.
2044 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 1 }, // PMOVXZBQ
2045 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 1 }, // PMOVXZWQ
2046 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 1 }, // PMOVXZBD
2047
2048 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 1 },
2049 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 1 },
2050 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1 },
2051 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1 },
2052 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
2053 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
2054 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 3 },
2055 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 },
2056 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1 }, // PSHUFB
2057
2058 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
2059 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 1 },
2060 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
2061
2062 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 4 },
2063 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 4 },
2064 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 12 },
2065 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 22 },
2066 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 4 },
2067
2068 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 3 },
2069 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 3 },
2070
2071 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 3 },
2072 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 3 },
2073 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
2074 };
2075
2076 static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
2077 // These are somewhat magic numbers justified by comparing the
2078 // output of llvm-mca for our various supported scheduler models
2079 // and basing it off the worst case scenario.
2080 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 3 },
2081 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 4 },
2082 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 3 },
2083 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 4 },
2084 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 3 },
2085 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4 },
2086 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 8 },
2087 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 8 },
2088
2089 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 8 },
2090 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 9 },
2091 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 4 },
2092 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 4 },
2093 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 4 },
2094 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 4 },
2095 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 7 },
2096 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
2097 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 15 },
2098 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 18 },
2099
2100 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 4 },
2101 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 2 },
2102 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 3 },
2103 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
2104 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
2105 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 4 },
2106 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 1 },
2107
2108 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 },
2109 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 15 },
2110 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 4 },
2111 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 4 },
2112 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 3 },
2113 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 2 },
2114 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
2115 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 4 },
2116 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 8 },
2117
2118 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 },
2119 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 6 },
2120 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
2121 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 3 },
2122 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 },
2123 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 8 },
2124 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
2125 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 2 },
2126 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 6 },
2127 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 6 },
2128 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
2129 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
2130 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 9 },
2131 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 12 },
2132 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
2133 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 2 },
2134 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
2135 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 10 },
2136 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
2137 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
2138 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 6 },
2139 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 8 },
2140 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 3 },
2141 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 5 },
2142
2143 // These truncates are really widening elements.
2144 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 1 }, // PSHUFD
2145 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // PUNPCKLWD+DQ
2146 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // PUNPCKLBW+WD+PSHUFD
2147 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 1 }, // PUNPCKLWD
2148 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // PUNPCKLBW+WD
2149 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 1 }, // PUNPCKLBW
2150
2151 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // PAND+PACKUSWB
2152 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // PAND+PACKUSWB
2153 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // PAND+PACKUSWB
2154 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 },
2155 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 3 }, // PAND+2*PACKUSWB
2156 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 1 },
2157 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 3 },
2158 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 3 },
2159 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 },
2160 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 },
2161 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
2162 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 10 },
2163 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 4 }, // PAND+3*PACKUSWB
2164 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 2 }, // PSHUFD+PSHUFLW
2165 { ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1 }, // PSHUFD
2166 };
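// A plausible way to spot-check one of the SSE2 entries above is to run the
// assembly for the conversion in question through llvm-mca with one of the
// supported scheduler models, e.g. "llvm-mca -mcpu=btver2 conv.s" (file name
// illustrative), and compare the reported reciprocal throughput against the
// table cost.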
2167
2168 // Attempt to map directly to (simple) MVT types to let us match custom entries.
2169 EVT SrcTy = TLI->getValueType(DL, Src);
2170 EVT DstTy = TLI->getValueType(DL, Dst);
2171
2172 // The function getSimpleVT only handles simple value types.
2173 if (SrcTy.isSimple() && DstTy.isSimple()) {
2174 MVT SimpleSrcTy = SrcTy.getSimpleVT();
2175 MVT SimpleDstTy = DstTy.getSimpleVT();
2176
2177 if (ST->useAVX512Regs()) {
2178 if (ST->hasBWI())
2179 if (const auto *Entry = ConvertCostTableLookup(
2180 AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2181 return AdjustCost(Entry->Cost);
2182
2183 if (ST->hasDQI())
2184 if (const auto *Entry = ConvertCostTableLookup(
2185 AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2186 return AdjustCost(Entry->Cost);
2187
2188 if (ST->hasAVX512())
2189 if (const auto *Entry = ConvertCostTableLookup(
2190 AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2191 return AdjustCost(Entry->Cost);
2192 }
2193
2194 if (ST->hasBWI())
2195 if (const auto *Entry = ConvertCostTableLookup(
2196 AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2197 return AdjustCost(Entry->Cost);
2198
2199 if (ST->hasDQI())
2200 if (const auto *Entry = ConvertCostTableLookup(
2201 AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2202 return AdjustCost(Entry->Cost);
2203
2204 if (ST->hasAVX512())
2205 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
2206 SimpleDstTy, SimpleSrcTy))
2207 return AdjustCost(Entry->Cost);
2208
2209 if (ST->hasAVX2()) {
2210 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
2211 SimpleDstTy, SimpleSrcTy))
2212 return AdjustCost(Entry->Cost);
2213 }
2214
2215 if (ST->hasAVX()) {
2216 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
2217 SimpleDstTy, SimpleSrcTy))
2218 return AdjustCost(Entry->Cost);
2219 }
2220
2221 if (ST->hasSSE41()) {
2222 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
2223 SimpleDstTy, SimpleSrcTy))
2224 return AdjustCost(Entry->Cost);
2225 }
2226
2227 if (ST->hasSSE2()) {
2228 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
2229 SimpleDstTy, SimpleSrcTy))
2230 return AdjustCost(Entry->Cost);
2231 }
2232 }
2233
2234 // Fall back to legalized types.
2235 std::pair<InstructionCost, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
2236 std::pair<InstructionCost, MVT> LTDest =
2237 TLI->getTypeLegalizationCost(DL, Dst);
2238
2239 if (ST->useAVX512Regs()) {
2240 if (ST->hasBWI())
2241 if (const auto *Entry = ConvertCostTableLookup(
2242 AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second))
2243 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2244
2245 if (ST->hasDQI())
2246 if (const auto *Entry = ConvertCostTableLookup(
2247 AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second))
2248 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2249
2250 if (ST->hasAVX512())
2251 if (const auto *Entry = ConvertCostTableLookup(
2252 AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second))
2253 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2254 }
2255
2256 if (ST->hasBWI())
2257 if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
2258 LTDest.second, LTSrc.second))
2259 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2260
2261 if (ST->hasDQI())
2262 if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
2263 LTDest.second, LTSrc.second))
2264 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2265
2266 if (ST->hasAVX512())
2267 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
2268 LTDest.second, LTSrc.second))
2269 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2270
2271 if (ST->hasAVX2())
2272 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
2273 LTDest.second, LTSrc.second))
2274 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2275
2276 if (ST->hasAVX())
2277 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
2278 LTDest.second, LTSrc.second))
2279 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2280
2281 if (ST->hasSSE41())
2282 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
2283 LTDest.second, LTSrc.second))
2284 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2285
2286 if (ST->hasSSE2())
2287 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
2288 LTDest.second, LTSrc.second))
2289 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2290
2291 return AdjustCost(
2292 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
2293 }
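A minimal arithmetic sketch of the legalized fallback above (again not from this file; the 3 comes from the SSE2ConversionTbl v4f32/v4i32 entry): a sitofp from <8 x i32> to <8 x float> on an SSE2-only target finds no simple-type entry, legalizes both sides into two 128-bit halves, and is then charged max(LTSrc.first, LTDest.first) times the per-half entry cost.

#include <algorithm>
#include <cstdio>

int main() {
  const int LTSrcFirst = 2;  // v8i32 -> 2 x v4i32
  const int LTDestFirst = 2; // v8f32 -> 2 x v4f32
  const int EntryCost = 3;   // { ISD::SINT_TO_FP, v4f32, v4i32, 3 }
  std::printf("estimated sitofp v8i32 -> v8f32 cost on SSE2: %d\n",
              std::max(LTSrcFirst, LTDestFirst) * EntryCost);
  return 0;
}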
2294
2295 InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
2296 Type *CondTy,
2297 CmpInst::Predicate VecPred,
2298 TTI::TargetCostKind CostKind,
2299 const Instruction *I) {
2300 // TODO: Handle other cost kinds.
2301 if (CostKind != TTI::TCK_RecipThroughput)
2302 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2303 I);
2304
2305 // Legalize the type.
2306 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
2307
2308 MVT MTy = LT.second;
2309
2310 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2311 assert(ISD && "Invalid opcode");
2312
2313 unsigned ExtraCost = 0;
2314 if (I && (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp)) {
2315 // Some vector comparison predicates cost extra instructions.
2316 if (MTy.isVector() &&
2317 !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
2318 (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
2319 ST->hasBWI())) {
2320 switch (cast<CmpInst>(I)->getPredicate()) {
2321 case CmpInst::Predicate::ICMP_NE:
2322 // xor(cmpeq(x,y),-1)
2323 ExtraCost = 1;
2324 break;
2325 case CmpInst::Predicate::ICMP_SGE:
2326 case CmpInst::Predicate::ICMP_SLE:
2327 // xor(cmpgt(x,y),-1)
2328 ExtraCost = 1;
2329 break;
2330 case CmpInst::Predicate::ICMP_ULT:
2331 case CmpInst::Predicate::ICMP_UGT:
2332 // cmpgt(xor(x,signbit),xor(y,signbit))
2333 // xor(cmpeq(pmaxu(x,y),x),-1)
2334 ExtraCost = 2;
2335 break;
2336 case CmpInst::Predicate::ICMP_ULE:
2337 case CmpInst::Predicate::ICMP_UGE:
2338 if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
2339 (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
2340 // cmpeq(psubus(x,y),0)
2341 // cmpeq(pminu(x,y),x)
2342 ExtraCost = 1;
2343 } else {
2344 // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
2345 ExtraCost = 3;
2346 }
2347 break;
2348 default:
2349 break;
2350 }
2351 }
2352 }
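// Worked example (when the IR instruction is available so the predicate is
// known): an unsigned "ule" compare of <16 x i8> on a plain SSE2 target hits
// the ICMP_ULE case with a scalar size below 32, so ExtraCost is 1
// (cmpeq(pminu(x,y),x)); the same predicate on <4 x i32> without SSE4.1
// takes the 3-instruction sign-bit-flip sequence instead.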
2353
2354 static const CostTblEntry SLMCostTbl[] = {
2355 // slm pcmpeq/pcmpgt throughput is 2
2356 { ISD::SETCC, MVT::v2i64, 2 },
2357 };
2358
2359 static const CostTblEntry AVX512BWCostTbl[] = {
2360 { ISD::SETCC, MVT::v32i16, 1 },
2361 { ISD::SETCC, MVT::v64i8, 1 },
2362
2363 { ISD::SELECT, MVT::v32i16, 1 },
2364 { ISD::SELECT, MVT::v64i8, 1 },
2365 };
2366
2367 static const CostTblEntry AVX512CostTbl[] = {
2368 { ISD::SETCC, MVT::v8i64, 1 },
2369 { ISD::SETCC, MVT::v16i32, 1 },
2370 { ISD::SETCC, MVT::v8f64, 1 },
2371 { ISD::SETCC, MVT::v16f32, 1 },
2372
2373 { ISD::SELECT, MVT::v8i64, 1 },
2374 { ISD::SELECT, MVT::v16i32, 1 },
2375 { ISD::SELECT, MVT::v8f64, 1 },
2376 { ISD::SELECT, MVT::v16f32, 1 },
2377
2378 { ISD::SETCC, MVT::v32i16, 2 }, // FIXME: should probably be 4
2379 { ISD::SETCC, MVT::v64i8, 2 }, // FIXME: should probably be 4
2380
2381 { ISD::SELECT, MVT::v32i16, 2 }, // FIXME: should be 3
2382 { ISD::SELECT, MVT::v64i8, 2 }, // FIXME: should be 3
2383 };
2384
2385 static const CostTblEntry AVX2CostTbl[] = {
2386 { ISD::SETCC, MVT::v4i64, 1 },
2387 { ISD::SETCC, MVT::v8i32, 1 },
2388 { ISD::SETCC, MVT::v16i16, 1 },
2389 { ISD::SETCC, MVT::v32i8, 1 },
2390
2391 { ISD::SELECT, MVT::v4i64, 1 }, // pblendvb
2392 { ISD::SELECT, MVT::v8i32, 1 }, // pblendvb
2393 { ISD::SELECT, MVT::v16i16, 1 }, // pblendvb
2394 { ISD::SELECT, MVT::v32i8, 1 }, // pblendvb
2395 };
2396
2397 static const CostTblEntry AVX1CostTbl[] = {
2398 { ISD::SETCC, MVT::v4f64, 1 },
2399 { ISD::SETCC, MVT::v8f32, 1 },
2400 // AVX1 does not support 8-wide integer compare.
2401 { ISD::SETCC, MVT::v4i64, 4 },
2402 { ISD::SETCC, MVT::v8i32, 4 },
2403 { ISD::SETCC, MVT::v16i16, 4 },
2404 { ISD::SETCC, MVT::v32i8, 4 },
2405
2406 { ISD::SELECT, MVT::v4f64, 1 }, // vblendvpd
2407 { ISD::SELECT, MVT::v8f32, 1 }, // vblendvps
2408 { ISD::SELECT, MVT::v4i64, 1 }, // vblendvpd
2409 { ISD::SELECT, MVT::v8i32, 1 }, // vblendvps
2410 { ISD::SELECT, MVT::v16i16, 3 }, // vandps + vandnps + vorps
2411 { ISD::SELECT, MVT::v32i8, 3 }, // vandps + vandnps + vorps
2412 };
2413
2414 static const CostTblEntry SSE42CostTbl[] = {
2415 { ISD::SETCC, MVT::v2f64, 1 },
2416 { ISD::SETCC, MVT::v4f32, 1 },
2417 { ISD::SETCC, MVT::v2i64, 1 },
2418 };
2419
2420 static const CostTblEntry SSE41CostTbl[] = {
2421 { ISD::SELECT, MVT::v2f64, 1 }, // blendvpd
2422 { ISD::SELECT, MVT::v4f32, 1 }, // blendvps
2423 { ISD::SELECT, MVT::v2i64, 1 }, // pblendvb
2424 { ISD::SELECT, MVT::v4i32, 1 }, // pblendvb
2425 { ISD::SELECT, MVT::v8i16, 1 }, // pblendvb
2426 { ISD::SELECT, MVT::v16i8, 1 }, // pblendvb
2427 };
2428
2429 static const CostTblEntry SSE2CostTbl[] = {
2430 { ISD::SETCC, MVT::v2f64, 2 },
2431 { ISD::SETCC, MVT::f64, 1 },
2432 { ISD::SETCC, MVT::v2i64, 8 },
2433 { ISD::SETCC, MVT::v4i32, 1 },
2434 { ISD::SETCC, MVT::v8i16, 1 },
2435 { ISD::SETCC, MVT::v16i8, 1 },
2436
2437 { ISD::SELECT, MVT::v2f64, 3 }, // andpd + andnpd + orpd
2438 { ISD::SELECT, MVT::v2i64, 3 }, // pand + pandn + por
2439 { ISD::SELECT, MVT::v4i32, 3 }, // pand + pandn + por
2440 { ISD::SELECT, MVT::v8i16, 3 }, // pand + pandn + por
2441 { ISD::SELECT, MVT::v16i8, 3 }, // pand + pandn + por
2442 };
2443
2444 static const CostTblEntry SSE1CostTbl[] = {
2445 { ISD::SETCC, MVT::v4f32, 2 },
2446 { ISD::SETCC, MVT::f32, 1 },
2447
2448 { ISD::SELECT, MVT::v4f32, 3 }, // andps + andnps + orps
2449 };
2450
2451 if (ST->isSLM())
2452 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
2453 return LT.first * (ExtraCost + Entry->Cost);
2454
2455 if (ST->hasBWI())
2456 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
2457 return LT.first * (ExtraCost + Entry->Cost);
2458
2459 if (ST->hasAVX512())
2460 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
2461 return LT.first * (ExtraCost + Entry->Cost);
2462
2463 if (ST->hasAVX2())
2464 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
2465 return LT.first * (ExtraCost + Entry->Cost);
2466
2467 if (ST->hasAVX())
2468 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
2469 return LT.first * (ExtraCost + Entry->Cost);
2470
2471 if (ST->hasSSE42())
2472 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
2473 return LT.first * (ExtraCost + Entry->Cost);
2474
2475 if (ST->hasSSE41())
2476 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
2477 return LT.first * (ExtraCost + Entry->Cost);
2478
2479 if (ST->hasSSE2())
2480 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
2481 return LT.first * (ExtraCost + Entry->Cost);
2482
2483 if (ST->hasSSE1())
2484 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
2485 return LT.first * (ExtraCost + Entry->Cost);
2486
2487 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
2488}
2489
2490unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; }
2491
2492InstructionCost
2493X86TTIImpl::getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
2494 TTI::TargetCostKind CostKind) {
2495
2496 // Costs should match the codegen from:
2497 // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
2498 // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
2499 // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
2500 // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
2501 // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
2502
2503 // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
2504 // specialized in these tables yet.
2505 static const CostTblEntry AVX512CDCostTbl[] = {
2506 { ISD::CTLZ, MVT::v8i64, 1 },
2507 { ISD::CTLZ, MVT::v16i32, 1 },
2508 { ISD::CTLZ, MVT::v32i16, 8 },
2509 { ISD::CTLZ, MVT::v64i8, 20 },
2510 { ISD::CTLZ, MVT::v4i64, 1 },
2511 { ISD::CTLZ, MVT::v8i32, 1 },
2512 { ISD::CTLZ, MVT::v16i16, 4 },
2513 { ISD::CTLZ, MVT::v32i8, 10 },
2514 { ISD::CTLZ, MVT::v2i64, 1 },
2515 { ISD::CTLZ, MVT::v4i32, 1 },
2516 { ISD::CTLZ, MVT::v8i16, 4 },
2517 { ISD::CTLZ, MVT::v16i8, 4 },
2518 };
2519 static const CostTblEntry AVX512BWCostTbl[] = {
2520 { ISD::ABS, MVT::v32i16, 1 },
2521 { ISD::ABS, MVT::v64i8, 1 },
2522 { ISD::BITREVERSE, MVT::v8i64, 5 },
2523 { ISD::BITREVERSE, MVT::v16i32, 5 },
2524 { ISD::BITREVERSE, MVT::v32i16, 5 },
2525 { ISD::BITREVERSE, MVT::v64i8, 5 },
2526 { ISD::BSWAP, MVT::v8i64, 1 },
2527 { ISD::BSWAP, MVT::v16i32, 1 },
2528 { ISD::BSWAP, MVT::v32i16, 1 },
2529 { ISD::CTLZ, MVT::v8i64, 23 },
2530 { ISD::CTLZ, MVT::v16i32, 22 },
2531 { ISD::CTLZ, MVT::v32i16, 18 },
2532 { ISD::CTLZ, MVT::v64i8, 17 },
2533 { ISD::CTPOP, MVT::v8i64, 7 },
2534 { ISD::CTPOP, MVT::v16i32, 11 },
2535 { ISD::CTPOP, MVT::v32i16, 9 },
2536 { ISD::CTPOP, MVT::v64i8, 6 },
2537 { ISD::CTTZ, MVT::v8i64, 10 },
2538 { ISD::CTTZ, MVT::v16i32, 14 },
2539 { ISD::CTTZ, MVT::v32i16, 12 },
2540 { ISD::CTTZ, MVT::v64i8, 9 },
2541 { ISD::SADDSAT, MVT::v32i16, 1 },
2542 { ISD::SADDSAT, MVT::v64i8, 1 },
2543 { ISD::SMAX, MVT::v32i16, 1 },
2544 { ISD::SMAX, MVT::v64i8, 1 },
2545 { ISD::SMIN, MVT::v32i16, 1 },
2546 { ISD::SMIN, MVT::v64i8, 1 },
2547 { ISD::SSUBSAT, MVT::v32i16, 1 },
2548 { ISD::SSUBSAT, MVT::v64i8, 1 },
2549 { ISD::UADDSAT, MVT::v32i16, 1 },
2550 { ISD::UADDSAT, MVT::v64i8, 1 },
2551 { ISD::UMAX, MVT::v32i16, 1 },
2552 { ISD::UMAX, MVT::v64i8, 1 },
2553 { ISD::UMIN, MVT::v32i16, 1 },
2554 { ISD::UMIN, MVT::v64i8, 1 },
2555 { ISD::USUBSAT, MVT::v32i16, 1 },
2556 { ISD::USUBSAT, MVT::v64i8, 1 },
2557 };
2558 static const CostTblEntry AVX512CostTbl[] = {
2559 { ISD::ABS, MVT::v8i64, 1 },
2560 { ISD::ABS, MVT::v16i32, 1 },
2561 { ISD::ABS, MVT::v32i16, 2 }, // FIXME: include split
2562 { ISD::ABS, MVT::v64i8, 2 }, // FIXME: include split
2563 { ISD::ABS, MVT::v4i64, 1 },
2564 { ISD::ABS, MVT::v2i64, 1 },
2565 { ISD::BITREVERSE, MVT::v8i64, 36 },
2566 { ISD::BITREVERSE, MVT::v16i32, 24 },
2567 { ISD::BITREVERSE, MVT::v32i16, 10 },
2568 { ISD::BITREVERSE, MVT::v64i8, 10 },
2569 { ISD::BSWAP, MVT::v8i64, 4 },
2570 { ISD::BSWAP, MVT::v16i32, 4 },
2571 { ISD::BSWAP, MVT::v32i16, 4 },
2572 { ISD::CTLZ, MVT::v8i64, 29 },
2573 { ISD::CTLZ, MVT::v16i32, 35 },
2574 { ISD::CTLZ, MVT::v32i16, 28 },
2575 { ISD::CTLZ, MVT::v64i8, 18 },
2576 { ISD::CTPOP, MVT::v8i64, 16 },
2577 { ISD::CTPOP, MVT::v16i32, 24 },
2578 { ISD::CTPOP, MVT::v32i16, 18 },
2579 { ISD::CTPOP, MVT::v64i8, 12 },
2580 { ISD::CTTZ, MVT::v8i64, 20 },
2581 { ISD::CTTZ, MVT::v16i32, 28 },
2582 { ISD::CTTZ, MVT::v32i16, 24 },
2583 { ISD::CTTZ, MVT::v64i8, 18 },
2584 { ISD::SMAX, MVT::v8i64, 1 },
2585 { ISD::SMAX, MVT::v16i32, 1 },
2586 { ISD::SMAX, MVT::v32i16, 2 }, // FIXME: include split
2587 { ISD::SMAX, MVT::v64i8, 2 }, // FIXME: include split
2588 { ISD::SMAX, MVT::v4i64, 1 },
2589 { ISD::SMAX, MVT::v2i64, 1 },
2590 { ISD::SMIN, MVT::v8i64, 1 },
2591 { ISD::SMIN, MVT::v16i32, 1 },
2592 { ISD::SMIN, MVT::v32i16, 2 }, // FIXME: include split
2593 { ISD::SMIN, MVT::v64i8, 2 }, // FIXME: include split
2594 { ISD::SMIN, MVT::v4i64, 1 },
2595 { ISD::SMIN, MVT::v2i64, 1 },
2596 { ISD::UMAX, MVT::v8i64, 1 },
2597 { ISD::UMAX, MVT::v16i32, 1 },
2598 { ISD::UMAX, MVT::v32i16, 2 }, // FIXME: include split
2599 { ISD::UMAX, MVT::v64i8, 2 }, // FIXME: include split
2600 { ISD::UMAX, MVT::v4i64, 1 },
2601 { ISD::UMAX, MVT::v2i64, 1 },
2602 { ISD::UMIN, MVT::v8i64, 1 },
2603 { ISD::UMIN, MVT::v16i32, 1 },
2604 { ISD::UMIN, MVT::v32i16, 2 }, // FIXME: include split
2605 { ISD::UMIN, MVT::v64i8, 2 }, // FIXME: include split
2606 { ISD::UMIN, MVT::v4i64, 1 },
2607 { ISD::UMIN, MVT::v2i64, 1 },
2608 { ISD::USUBSAT, MVT::v16i32, 2 }, // pmaxud + psubd
2609 { ISD::USUBSAT, MVT::v2i64, 2 }, // pmaxuq + psubq
2610 { ISD::USUBSAT, MVT::v4i64, 2 }, // pmaxuq + psubq
2611 { ISD::USUBSAT, MVT::v8i64, 2 }, // pmaxuq + psubq
2612 { ISD::UADDSAT, MVT::v16i32, 3 }, // not + pminud + paddd
2613 { ISD::UADDSAT, MVT::v2i64, 3 }, // not + pminuq + paddq
2614 { ISD::UADDSAT, MVT::v4i64, 3 }, // not + pminuq + paddq
2615 { ISD::UADDSAT, MVT::v8i64, 3 }, // not + pminuq + paddq
2616 { ISD::SADDSAT, MVT::v32i16, 2 }, // FIXME: include split
2617 { ISD::SADDSAT, MVT::v64i8, 2 }, // FIXME: include split
2618 { ISD::SSUBSAT, MVT::v32i16, 2 }, // FIXME: include split
2619 { ISD::SSUBSAT, MVT::v64i8, 2 }, // FIXME: include split
2620 { ISD::UADDSAT, MVT::v32i16, 2 }, // FIXME: include split
2621 { ISD::UADDSAT, MVT::v64i8, 2 }, // FIXME: include split
2622 { ISD::USUBSAT, MVT::v32i16, 2 }, // FIXME: include split
2623 { ISD::USUBSAT, MVT::v64i8, 2 }, // FIXME: include split
2624 { ISD::FMAXNUM, MVT::f32, 2 },
2625 { ISD::FMAXNUM, MVT::v4f32, 2 },
2626 { ISD::FMAXNUM, MVT::v8f32, 2 },
2627 { ISD::FMAXNUM, MVT::v16f32, 2 },
2628 { ISD::FMAXNUM, MVT::f64, 2 },
2629 { ISD::FMAXNUM, MVT::v2f64, 2 },
2630 { ISD::FMAXNUM, MVT::v4f64, 2 },
2631 { ISD::FMAXNUM, MVT::v8f64, 2 },
2632 };
2633 static const CostTblEntry XOPCostTbl[] = {
2634 { ISD::BITREVERSE, MVT::v4i64, 4 },
2635 { ISD::BITREVERSE, MVT::v8i32, 4 },
2636 { ISD::BITREVERSE, MVT::v16i16, 4 },
2637 { ISD::BITREVERSE, MVT::v32i8, 4 },
2638 { ISD::BITREVERSE, MVT::v2i64, 1 },
2639 { ISD::BITREVERSE, MVT::v4i32, 1 },
2640 { ISD::BITREVERSE, MVT::v8i16, 1 },
2641 { ISD::BITREVERSE, MVT::v16i8, 1 },
2642 { ISD::BITREVERSE, MVT::i64, 3 },
2643 { ISD::BITREVERSE, MVT::i32, 3 },
2644 { ISD::BITREVERSE, MVT::i16, 3 },
2645 { ISD::BITREVERSE, MVT::i8, 3 }
2646 };
2647 static const CostTblEntry AVX2CostTbl[] = {
2648 { ISD::ABS, MVT::v4i64, 2 }, // VBLENDVPD(X,VPSUBQ(0,X),X)
2649 { ISD::ABS, MVT::v8i32, 1 },
2650 { ISD::ABS, MVT::v16i16, 1 },
2651 { ISD::ABS, MVT::v32i8, 1 },
2652 { ISD::BITREVERSE, MVT::v4i64, 5 },
2653 { ISD::BITREVERSE, MVT::v8i32, 5 },
2654 { ISD::BITREVERSE, MVT::v16i16, 5 },
2655 { ISD::BITREVERSE, MVT::v32i8, 5 },
2656 { ISD::BSWAP, MVT::v4i64, 1 },
2657 { ISD::BSWAP, MVT::v8i32, 1 },
2658 { ISD::BSWAP, MVT::v16i16, 1 },
2659 { ISD::CTLZ, MVT::v4i64, 23 },
2660 { ISD::CTLZ, MVT::v8i32, 18 },
2661 { ISD::CTLZ, MVT::v16i16, 14 },
2662 { ISD::CTLZ, MVT::v32i8, 9 },
2663 { ISD::CTPOP, MVT::v4i64, 7 },
2664 { ISD::CTPOP, MVT::v8i32, 11 },
2665 { ISD::CTPOP, MVT::v16i16, 9 },
2666 { ISD::CTPOP, MVT::v32i8, 6 },
2667 { ISD::CTTZ, MVT::v4i64, 10 },
2668 { ISD::CTTZ, MVT::v8i32, 14 },
2669 { ISD::CTTZ, MVT::v16i16, 12 },
2670 { ISD::CTTZ, MVT::v32i8, 9 },
2671 { ISD::SADDSAT, MVT::v16i16, 1 },
2672 { ISD::SADDSAT, MVT::v32i8, 1 },
2673 { ISD::SMAX, MVT::v8i32, 1 },
2674 { ISD::SMAX, MVT::v16i16, 1 },
2675 { ISD::SMAX, MVT::v32i8, 1 },
2676 { ISD::SMIN, MVT::v8i32, 1 },
2677 { ISD::SMIN, MVT::v16i16, 1 },
2678 { ISD::SMIN, MVT::v32i8, 1 },
2679 { ISD::SSUBSAT, MVT::v16i16, 1 },
2680 { ISD::SSUBSAT, MVT::v32i8, 1 },
2681 { ISD::UADDSAT, MVT::v16i16, 1 },
2682 { ISD::UADDSAT, MVT::v32i8, 1 },
2683 { ISD::UADDSAT, MVT::v8i32, 3 }, // not + pminud + paddd
2684 { ISD::UMAX, MVT::v8i32, 1 },
2685 { ISD::UMAX, MVT::v16i16, 1 },
2686 { ISD::UMAX, MVT::v32i8, 1 },
2687 { ISD::UMIN, MVT::v8i32, 1 },
2688 { ISD::UMIN, MVT::v16i16, 1 },
2689 { ISD::UMIN, MVT::v32i8, 1 },
2690 { ISD::USUBSAT, MVT::v16i16, 1 },
2691 { ISD::USUBSAT, MVT::v32i8, 1 },
2692 { ISD::USUBSAT, MVT::v8i32, 2 }, // pmaxud + psubd
2693 { ISD::FMAXNUM, MVT::v8f32, 3 }, // MAXPS + CMPUNORDPS + BLENDVPS
2694 { ISD::FMAXNUM, MVT::v4f64, 3 }, // MAXPD + CMPUNORDPD + BLENDVPD
2695 { ISD::FSQRT, MVT::f32, 7 }, // Haswell from http://www.agner.org/
2696 { ISD::FSQRT, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
2697 { ISD::FSQRT, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
2698 { ISD::FSQRT, MVT::f64, 14 }, // Haswell from http://www.agner.org/
2699 { ISD::FSQRT, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
2700 { ISD::FSQRT, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
2701 };
2702 static const CostTblEntry AVX1CostTbl[] = {
2703 { ISD::ABS, MVT::v4i64, 5 }, // VBLENDVPD(X,VPSUBQ(0,X),X)
2704 { ISD::ABS, MVT::v8i32, 3 },
2705 { ISD::ABS, MVT::v16i16, 3 },
2706 { ISD::ABS, MVT::v32i8, 3 },
2707 { ISD::BITREVERSE, MVT::v4i64, 12 }, // 2 x 128-bit Op + extract/insert
2708 { ISD::BITREVERSE, MVT::v8i32, 12 }, // 2 x 128-bit Op + extract/insert
2709 { ISD::BITREVERSE, MVT::v16i16, 12 }, // 2 x 128-bit Op + extract/insert
2710 { ISD::BITREVERSE, MVT::v32i8, 12 }, // 2 x 128-bit Op + extract/insert
2711 { ISD::BSWAP, MVT::v4i64, 4 },
2712 { ISD::BSWAP, MVT::v8i32, 4 },
2713 { ISD::BSWAP, MVT::v16i16, 4 },
2714 { ISD::CTLZ, MVT::v4i64, 48 }, // 2 x 128-bit Op + extract/insert
2715 { ISD::CTLZ, MVT::v8i32, 38 }, // 2 x 128-bit Op + extract/insert
2716 { ISD::CTLZ, MVT::v16i16, 30 }, // 2 x 128-bit Op + extract/insert
2717 { ISD::CTLZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
2718 { ISD::CTPOP, MVT::v4i64, 16 }, // 2 x 128-bit Op + extract/insert
2719 { ISD::CTPOP, MVT::v8i32, 24 }, // 2 x 128-bit Op + extract/insert
2720 { ISD::CTPOP, MVT::v16i16, 20 }, // 2 x 128-bit Op + extract/insert
2721 { ISD::CTPOP, MVT::v32i8, 14 }, // 2 x 128-bit Op + extract/insert
2722 { ISD::CTTZ, MVT::v4i64, 22 }, // 2 x 128-bit Op + extract/insert
2723 { ISD::CTTZ, MVT::v8i32, 30 }, // 2 x 128-bit Op + extract/insert
2724 { ISD::CTTZ, MVT::v16i16, 26 }, // 2 x 128-bit Op + extract/insert
2725 { ISD::CTTZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
2726 { ISD::SADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2727 { ISD::SADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2728 { ISD::SMAX, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
2729 { ISD::SMAX, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2730 { ISD::SMAX, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2731 { ISD::SMIN, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
2732 { ISD::SMIN, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2733 { ISD::SMIN, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2734 { ISD::SSUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2735 { ISD::SSUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2736 { ISD::UADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2737 { ISD::UADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2738 { ISD::UADDSAT, MVT::v8i32, 8 }, // 2 x 128-bit Op + extract/insert
2739 { ISD::UMAX, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
2740 { ISD::UMAX, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2741 { ISD::UMAX, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2742 { ISD::UMIN, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
2743 { ISD::UMIN, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2744 { ISD::UMIN, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2745 { ISD::USUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
2746 { ISD::USUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
2747 { ISD::USUBSAT, MVT::v8i32, 6 }, // 2 x 128-bit Op + extract/insert
2748 { ISD::FMAXNUM, MVT::f32, 3 }, // MAXSS + CMPUNORDSS + BLENDVPS
2749 { ISD::FMAXNUM, MVT::v4f32, 3 }, // MAXPS + CMPUNORDPS + BLENDVPS
2750 { ISD::FMAXNUM, MVT::v8f32, 5 }, // MAXPS + CMPUNORDPS + BLENDVPS + ?
2751 { ISD::FMAXNUM, MVT::f64, 3 }, // MAXSD + CMPUNORDSD + BLENDVPD
2752 { ISD::FMAXNUM, MVT::v2f64, 3 }, // MAXPD + CMPUNORDPD + BLENDVPD
2753 { ISD::FMAXNUM, MVT::v4f64, 5 }, // MAXPD + CMPUNORDPD + BLENDVPD + ?
2754 { ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/
2755 { ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
2756 { ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
2757 { ISD::FSQRT, MVT::f64, 21 }, // SNB from http://www.agner.org/
2758 { ISD::FSQRT, MVT::v2f64, 21 }, // SNB from http://www.agner.org/
2759 { ISD::FSQRT, MVT::v4f64, 43 }, // SNB from http://www.agner.org/
2760 };
2761 static const CostTblEntry GLMCostTbl[] = {
2762 { ISD::FSQRT, MVT::f32, 19 }, // sqrtss
2763 { ISD::FSQRT, MVT::v4f32, 37 }, // sqrtps
2764 { ISD::FSQRT, MVT::f64, 34 }, // sqrtsd
2765 { ISD::FSQRT, MVT::v2f64, 67 }, // sqrtpd
2766 };
2767 static const CostTblEntry SLMCostTbl[] = {
2768 { ISD::FSQRT, MVT::f32, 20 }, // sqrtss
2769 { ISD::FSQRT, MVT::v4f32, 40 }, // sqrtps
2770 { ISD::FSQRT, MVT::f64, 35 }, // sqrtsd
2771 { ISD::FSQRT, MVT::v2f64, 70 }, // sqrtpd
2772 };
2773 static const CostTblEntry SSE42CostTbl[] = {
2774 { ISD::USUBSAT, MVT::v4i32, 2 }, // pmaxud + psubd
2775 { ISD::UADDSAT, MVT::v4i32, 3 }, // not + pminud + paddd
2776 { ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/
2777 { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
2778 };
2779 static const CostTblEntry SSE41CostTbl[] = {
2780 { ISD::ABS, MVT::v2i64, 2 }, // BLENDVPD(X,PSUBQ(0,X),X)
2781 { ISD::SMAX, MVT::v4i32, 1 },
2782 { ISD::SMAX, MVT::v16i8, 1 },
2783 { ISD::SMIN, MVT::v4i32, 1 },
2784 { ISD::SMIN, MVT::v16i8, 1 },
2785 { ISD::UMAX, MVT::v4i32, 1 },
2786 { ISD::UMAX, MVT::v8i16, 1 },
2787 { ISD::UMIN, MVT::v4i32, 1 },
2788 { ISD::UMIN, MVT::v8i16, 1 },
2789 };
2790 static const CostTblEntry SSSE3CostTbl[] = {
2791 { ISD::ABS, MVT::v4i32, 1 },
2792 { ISD::ABS, MVT::v8i16, 1 },
2793 { ISD::ABS, MVT::v16i8, 1 },
2794 { ISD::BITREVERSE, MVT::v2i64, 5 },
2795 { ISD::BITREVERSE, MVT::v4i32, 5 },
2796 { ISD::BITREVERSE, MVT::v8i16, 5 },
2797 { ISD::BITREVERSE, MVT::v16i8, 5 },
2798 { ISD::BSWAP, MVT::v2i64, 1 },
2799 { ISD::BSWAP, MVT::v4i32, 1 },
2800 { ISD::BSWAP, MVT::v8i16, 1 },
2801 { ISD::CTLZ, MVT::v2i64, 23 },
2802 { ISD::CTLZ, MVT::v4i32, 18 },
2803 { ISD::CTLZ, MVT::v8i16, 14 },
2804 { ISD::CTLZ, MVT::v16i8, 9 },
2805 { ISD::CTPOP, MVT::v2i64, 7 },
2806 { ISD::CTPOP, MVT::v4i32, 11 },
2807 { ISD::CTPOP, MVT::v8i16, 9 },
2808 { ISD::CTPOP, MVT::v16i8, 6 },
2809 { ISD::CTTZ, MVT::v2i64, 10 },
2810 { ISD::CTTZ, MVT::v4i32, 14 },
2811 { ISD::CTTZ, MVT::v8i16, 12 },
2812 { ISD::CTTZ, MVT::v16i8, 9 }
2813 };
2814 static const CostTblEntry SSE2CostTbl[] = {
2815 { ISD::ABS, MVT::v2i64, 4 },
2816 { ISD::ABS, MVT::v4i32, 3 },
2817 { ISD::ABS, MVT::v8i16, 2 },
2818 { ISD::ABS, MVT::v16i8, 2 },
2819 { ISD::BITREVERSE, MVT::v2i64, 29 },
2820 { ISD::BITREVERSE, MVT::v4i32, 27 },
2821 { ISD::BITREVERSE, MVT::v8i16, 27 },
2822 { ISD::BITREVERSE, MVT::v16i8, 20 },
2823 { ISD::BSWAP, MVT::v2i64, 7 },
2824 { ISD::BSWAP, MVT::v4i32, 7 },
2825 { ISD::BSWAP, MVT::v8i16, 7 },
2826 { ISD::CTLZ, MVT::v2i64, 25 },
2827 { ISD::CTLZ, MVT::v4i32, 26 },
2828 { ISD::CTLZ, MVT::v8i16, 20 },
2829 { ISD::CTLZ, MVT::v16i8, 17 },
2830 { ISD::CTPOP, MVT::v2i64, 12 },
2831 { ISD::CTPOP, MVT::v4i32, 15 },
2832 { ISD::CTPOP, MVT::v8i16, 13 },
2833 { ISD::CTPOP, MVT::v16i8, 10 },
2834 { ISD::CTTZ, MVT::v2i64, 14 },
2835 { ISD::CTTZ, MVT::v4i32, 18 },
2836 { ISD::CTTZ, MVT::v8i16, 16 },
2837 { ISD::CTTZ, MVT::v16i8, 13 },
2838 { ISD::SADDSAT, MVT::v8i16, 1 },
2839 { ISD::SADDSAT, MVT::v16i8, 1 },
2840 { ISD::SMAX, MVT::v8i16, 1 },
2841 { ISD::SMIN, MVT::v8i16, 1 },
2842 { ISD::SSUBSAT, MVT::v8i16, 1 },
2843 { ISD::SSUBSAT, MVT::v16i8, 1 },
2844 { ISD::UADDSAT, MVT::v8i16, 1 },
2845 { ISD::UADDSAT, MVT::v16i8, 1 },
2846 { ISD::UMAX, MVT::v8i16, 2 },
2847 { ISD::UMAX, MVT::v16i8, 1 },
2848 { ISD::UMIN, MVT::v8i16, 2 },
2849 { ISD::UMIN, MVT::v16i8, 1 },
2850 { ISD::USUBSAT, MVT::v8i16, 1 },
2851 { ISD::USUBSAT, MVT::v16i8, 1 },
2852 { ISD::FMAXNUM, MVT::f64, 4 },
2853 { ISD::FMAXNUM, MVT::v2f64, 4 },
2854 { ISD::FSQRT, MVT::f64, 32 }, // Nehalem from http://www.agner.org/
2855 { ISD::FSQRT, MVT::v2f64, 32 }, // Nehalem from http://www.agner.org/
2856 };
2857 static const CostTblEntry SSE1CostTbl[] = {
2858 { ISD::FMAXNUM, MVT::f32, 4 },
2859 { ISD::FMAXNUM, MVT::v4f32, 4 },
2860 { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/
2861 { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
2862 };
2863 static const CostTblEntry BMI64CostTbl[] = { // 64-bit targets
2864 { ISD::CTTZ, MVT::i64, 1 },
2865 };
2866 static const CostTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
2867 { ISD::CTTZ, MVT::i32, 1 },
2868 { ISD::CTTZ, MVT::i16, 1 },
2869 { ISD::CTTZ, MVT::i8, 1 },
2870 };
2871 static const CostTblEntry LZCNT64CostTbl[] = { // 64-bit targets
2872 { ISD::CTLZ, MVT::i64, 1 },
2873 };
2874 static const CostTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
2875 { ISD::CTLZ, MVT::i32, 1 },
2876 { ISD::CTLZ, MVT::i16, 1 },
2877 { ISD::CTLZ, MVT::i8, 1 },
2878 };
2879 static const CostTblEntry POPCNT64CostTbl[] = { // 64-bit targets
2880 { ISD::CTPOP, MVT::i64, 1 },
2881 };
2882 static const CostTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
2883 { ISD::CTPOP, MVT::i32, 1 },
2884 { ISD::CTPOP, MVT::i16, 1 },
2885 { ISD::CTPOP, MVT::i8, 1 },
2886 };
2887 static const CostTblEntry X64CostTbl[] = { // 64-bit targets
2888 { ISD::ABS, MVT::i64, 2 }, // SUB+CMOV
2889 { ISD::BITREVERSE, MVT::i64, 14 },
2890 { ISD::BSWAP, MVT::i64, 1 },
2891 { ISD::CTLZ, MVT::i64, 4 }, // BSR+XOR or BSR+XOR+CMOV
2892 { ISD::CTTZ, MVT::i64, 3 }, // TEST+BSF+CMOV/BRANCH
2893 { ISD::CTPOP, MVT::i64, 10 },
2894 { ISD::SADDO, MVT::i64, 1 },
2895 { ISD::UADDO, MVT::i64, 1 },
2896 { ISD::UMULO, MVT::i64, 2 }, // mulq + seto
2897 };
2898 static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
2899 { ISD::ABS, MVT::i32, 2 }, // SUB+CMOV
2900 { ISD::ABS, MVT::i16, 2 }, // SUB+CMOV
2901 { ISD::BITREVERSE, MVT::i32, 14 },
2902 { ISD::BITREVERSE, MVT::i16, 14 },
2903 { ISD::BITREVERSE, MVT::i8, 11 },
2904 { ISD::BSWAP, MVT::i32, 1 },
2905 { ISD::BSWAP, MVT::i16, 1 }, // ROL
2906 { ISD::CTLZ, MVT::i32, 4 }, // BSR+XOR or BSR+XOR+CMOV
2907 { ISD::CTLZ, MVT::i16, 4 }, // BSR+XOR or BSR+XOR+CMOV
2908 { ISD::CTLZ, MVT::i8, 4 }, // BSR+XOR or BSR+XOR+CMOV
2909 { ISD::CTTZ, MVT::i32, 3 }, // TEST+BSF+CMOV/BRANCH
2910 { ISD::CTTZ, MVT::i16, 3 }, // TEST+BSF+CMOV/BRANCH
2911 { ISD::CTTZ, MVT::i8, 3 }, // TEST+BSF+CMOV/BRANCH
2912 { ISD::CTPOP, MVT::i32, 8 },
2913 { ISD::CTPOP, MVT::i16, 9 },
2914 { ISD::CTPOP, MVT::i8, 7 },
2915 { ISD::SADDO, MVT::i32, 1 },
2916 { ISD::SADDO, MVT::i16, 1 },
2917 { ISD::SADDO, MVT::i8, 1 },
2918 { ISD::UADDO, MVT::i32, 1 },
2919 { ISD::UADDO, MVT::i16, 1 },
2920 { ISD::UADDO, MVT::i8, 1 },
2921 { ISD::UMULO, MVT::i32, 2 }, // mul + seto
2922 { ISD::UMULO, MVT::i16, 2 },
2923 { ISD::UMULO, MVT::i8, 2 },
2924 };
2925
2926 Type *RetTy = ICA.getReturnType();
2927 Type *OpTy = RetTy;
2928 Intrinsic::ID IID = ICA.getID();
2929 unsigned ISD = ISD::DELETED_NODE;
2930 switch (IID) {
2931 default:
2932 break;
2933 case Intrinsic::abs:
2934 ISD = ISD::ABS;
2935 break;
2936 case Intrinsic::bitreverse:
2937 ISD = ISD::BITREVERSE;
2938 break;
2939 case Intrinsic::bswap:
2940 ISD = ISD::BSWAP;
2941 break;
2942 case Intrinsic::ctlz:
2943 ISD = ISD::CTLZ;
2944 break;
2945 case Intrinsic::ctpop:
2946 ISD = ISD::CTPOP;
2947 break;
2948 case Intrinsic::cttz:
2949 ISD = ISD::CTTZ;
2950 break;
2951 case Intrinsic::maxnum:
2952 case Intrinsic::minnum:
2953 // FMINNUM has same costs so don't duplicate.
2954 ISD = ISD::FMAXNUM;
2955 break;
2956 case Intrinsic::sadd_sat:
2957 ISD = ISD::SADDSAT;
2958 break;
2959 case Intrinsic::smax:
2960 ISD = ISD::SMAX;
2961 break;
2962 case Intrinsic::smin:
2963 ISD = ISD::SMIN;
2964 break;
2965 case Intrinsic::ssub_sat:
2966 ISD = ISD::SSUBSAT;
2967 break;
2968 case Intrinsic::uadd_sat:
2969 ISD = ISD::UADDSAT;
2970 break;
2971 case Intrinsic::umax:
2972 ISD = ISD::UMAX;
2973 break;
2974 case Intrinsic::umin:
2975 ISD = ISD::UMIN;
2976 break;
2977 case Intrinsic::usub_sat:
2978 ISD = ISD::USUBSAT;
2979 break;
2980 case Intrinsic::sqrt:
2981 ISD = ISD::FSQRT;
2982 break;
2983 case Intrinsic::sadd_with_overflow:
2984 case Intrinsic::ssub_with_overflow:
2985 // SSUBO has same costs so don't duplicate.
2986 ISD = ISD::SADDO;
2987 OpTy = RetTy->getContainedType(0);
2988 break;
2989 case Intrinsic::uadd_with_overflow:
2990 case Intrinsic::usub_with_overflow:
2991 // USUBO has same costs so don't duplicate.
2992 ISD = ISD::UADDO;
2993 OpTy = RetTy->getContainedType(0);
2994 break;
2995 case Intrinsic::umul_with_overflow:
2996 case Intrinsic::smul_with_overflow:
2997 // SMULO has same costs so don't duplicate.
2998 ISD = ISD::UMULO;
2999 OpTy = RetTy->getContainedType(0);
3000 break;
3001 }
3002
3003 if (ISD != ISD::DELETED_NODE) {
3004 // Legalize the type.
3005 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, OpTy);
3006 MVT MTy = LT.second;
3007
3008 // Attempt to lookup cost.
3009 if (ISD == ISD::BITREVERSE && ST->hasGFNI() && ST->hasSSSE3() &&
3010 MTy.isVector()) {
3011 // With PSHUFB the code is very similar for all types. If we have integer
3012 // byte operations, we just need a GF2P8AFFINEQB for vXi8. For other types
3013 // we also need a PSHUFB.
3014 unsigned Cost = MTy.getVectorElementType() == MVT::i8 ? 1 : 2;
3015
3016 // Without byte operations, we need twice as many GF2P8AFFINEQB and PSHUFB
3017 // instructions. We also need an extract and an insert.
3018 if (!(MTy.is128BitVector() || (ST->hasAVX2() && MTy.is256BitVector()) ||
3019 (ST->hasBWI() && MTy.is512BitVector())))
3020 Cost = Cost * 2 + 2;
3021
3022 return LT.first * Cost;
3023 }
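// Rough illustration of the formula above (assuming an AVX2 + GFNI target):
// a bitreverse of v32i8 has element type i8, so Cost starts at 1 (just a
// GF2P8AFFINEQB); the 256-bit vector is natively supported with AVX2, so no
// doubling applies and the result is LT.first * 1. For v8i32 on the same
// target the element type is not i8, so Cost is 2 (GF2P8AFFINEQB + PSHUFB),
// giving LT.first * 2.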
3024
3025 auto adjustTableCost = [](const CostTblEntry &Entry,
3026 InstructionCost LegalizationCost,
3027 FastMathFlags FMF) {
3028 // If there are no NANs to deal with, then these are reduced to a
3029 // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we
3030 // assume is used in the non-fast case.
3031 if (Entry.ISD == ISD::FMAXNUM || Entry.ISD == ISD::FMINNUM) {
3032 if (FMF.noNaNs())
3033 return LegalizationCost * 1;
3034 }
3035 return LegalizationCost * (int)Entry.Cost;
3036 };
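// Sketch of how the lambda behaves (e.g. on an AVX1-only target): the
// FMAXNUM f32 entry below has Cost 3 (MAXSS + CMPUNORDSS + BLENDVPS), so a
// plain llvm.maxnum.f32 is costed LegalizationCost * 3, while the same call
// with the nnan fast-math flag collapses to a single MAXSS and is costed
// LegalizationCost * 1.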
3037
3038 if (ST->useGLMDivSqrtCosts())
3039 if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
3040 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3041
3042 if (ST->isSLM())
3043 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
3044 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3045
3046 if (ST->hasCDI())
3047 if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
3048 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3049
3050 if (ST->hasBWI())
3051 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
3052 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3053
3054 if (ST->hasAVX512())
3055 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
3056 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3057
3058 if (ST->hasXOP())
3059 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
3060 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3061
3062 if (ST->hasAVX2())
3063 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
3064 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3065
3066 if (ST->hasAVX())
3067 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
3068 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3069
3070 if (ST->hasSSE42())
3071 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
3072 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3073
3074 if (ST->hasSSE41())
3075 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
3076 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3077
3078 if (ST->hasSSSE3())
3079 if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
3080 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3081
3082 if (ST->hasSSE2())
3083 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
3084 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3085
3086 if (ST->hasSSE1())
3087 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
3088 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3089
3090 if (ST->hasBMI()) {
3091 if (ST->is64Bit())
3092 if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
3093 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3094
3095 if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
3096 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3097 }
3098
3099 if (ST->hasLZCNT()) {
3100 if (ST->is64Bit())
3101 if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
3102 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3103
3104 if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
3105 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3106 }
3107
3108 if (ST->hasPOPCNT()) {
3109 if (ST->is64Bit())
3110 if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
3111 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3112
3113 if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
3114 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3115 }
3116
3117 if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) {
3118 if (const Instruction *II = ICA.getInst()) {
3119 if (II->hasOneUse() && isa<StoreInst>(II->user_back()))
3120 return TTI::TCC_Free;
3121 if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) {
3122 if (LI->hasOneUse())
3123 return TTI::TCC_Free;
3124 }
3125 }
3126 }
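// For example (assuming a target with MOVBE where it is fast), an IR pattern
// such as
//   %v = load i32, i32* %p
//   %b = call i32 @llvm.bswap.i32(i32 %v)
// where the load has a single use is treated as free here, because the byte
// swap folds into a MOVBE load.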
3127
3128 // TODO - add BMI (TZCNT) scalar handling
3129
3130 if (ST->is64Bit())
3131 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
3132 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3133
3134 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
3135 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3136 }
3137
3138 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
3139}
3140
3141InstructionCost
3142X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
3143 TTI::TargetCostKind CostKind) {
3144 if (ICA.isTypeBasedOnly())
3145 return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
3146
3147 static const CostTblEntry AVX512CostTbl[] = {
3148 { ISD::ROTL, MVT::v8i64, 1 },
3149 { ISD::ROTL, MVT::v4i64, 1 },
3150 { ISD::ROTL, MVT::v2i64, 1 },
3151 { ISD::ROTL, MVT::v16i32, 1 },
3152 { ISD::ROTL, MVT::v8i32, 1 },
3153 { ISD::ROTL, MVT::v4i32, 1 },
3154 { ISD::ROTR, MVT::v8i64, 1 },
3155 { ISD::ROTR, MVT::v4i64, 1 },
3156 { ISD::ROTR, MVT::v2i64, 1 },
3157 { ISD::ROTR, MVT::v16i32, 1 },
3158 { ISD::ROTR, MVT::v8i32, 1 },
3159 { ISD::ROTR, MVT::v4i32, 1 }
3160 };
3161 // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
3162 static const CostTblEntry XOPCostTbl[] = {
3163 { ISD::ROTL, MVT::v4i64, 4 },
3164 { ISD::ROTL, MVT::v8i32, 4 },
3165 { ISD::ROTL, MVT::v16i16, 4 },
3166 { ISD::ROTL, MVT::v32i8, 4 },
3167 { ISD::ROTL, MVT::v2i64, 1 },
3168 { ISD::ROTL, MVT::v4i32, 1 },
3169 { ISD::ROTL, MVT::v8i16, 1 },
3170 { ISD::ROTL, MVT::v16i8, 1 },
3171 { ISD::ROTR, MVT::v4i64, 6 },
3172 { ISD::ROTR, MVT::v8i32, 6 },
3173 { ISD::ROTR, MVT::v16i16, 6 },
3174 { ISD::ROTR, MVT::v32i8, 6 },
3175 { ISD::ROTR, MVT::v2i64, 2 },
3176 { ISD::ROTR, MVT::v4i32, 2 },
3177 { ISD::ROTR, MVT::v8i16, 2 },
3178 { ISD::ROTR, MVT::v16i8, 2 }
3179 };
3180 static const CostTblEntry X64CostTbl[] = { // 64-bit targets
3181 { ISD::ROTL, MVT::i64, 1 },
3182 { ISD::ROTR, MVT::i64, 1 },
3183 { ISD::FSHL, MVT::i64, 4 }
3184 };
3185 static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
3186 { ISD::ROTL, MVT::i32, 1 },
3187 { ISD::ROTL, MVT::i16, 1 },
3188 { ISD::ROTL, MVT::i8, 1 },
3189 { ISD::ROTR, MVT::i32, 1 },
3190 { ISD::ROTR, MVT::i16, 1 },
3191 { ISD::ROTR, MVT::i8, 1 },
3192 { ISD::FSHL, MVT::i32, 4 },
3193 { ISD::FSHL, MVT::i16, 4 },
3194 { ISD::FSHL, MVT::i8, 4 }
3195 };
3196
3197 Intrinsic::ID IID = ICA.getID();
3198 Type *RetTy = ICA.getReturnType();
3199 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
3200 unsigned ISD = ISD::DELETED_NODE;
3201 switch (IID) {
3202 default:
3203 break;
3204 case Intrinsic::fshl:
3205 ISD = ISD::FSHL;
3206 if (Args[0] == Args[1])
3207 ISD = ISD::ROTL;
3208 break;
3209 case Intrinsic::fshr:
3210 // FSHR has same costs so don't duplicate.
3211 ISD = ISD::FSHL;
3212 if (Args[0] == Args[1])
3213 ISD = ISD::ROTR;
3214 break;
3215 }
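// Example of the mapping above: llvm.fshl.i32(%x, %x, %c) is a rotate-left
// by %c, so it is costed via ISD::ROTL; a funnel shift with distinct first
// and second operands stays on the ISD::FSHL rows of the tables above.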
3216
3217 if (ISD != ISD::DELETED_NODE) {
3218 // Legalize the type.
3219 std::pair<InstructionCost, MVT> LT =
3220 TLI->getTypeLegalizationCost(DL, RetTy);
3221 MVT MTy = LT.second;
3222
3223 // Attempt to lookup cost.
3224 if (ST->hasAVX512())
3225 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
3226 return LT.first * Entry->Cost;
3227
3228 if (ST->hasXOP())
3229 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
3230 return LT.first * Entry->Cost;
3231
3232 if (ST->is64Bit())
3233 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
3234 return LT.first * Entry->Cost;
3235
3236 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
3237 return LT.first * Entry->Cost;
3238 }
3239
3240 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
3241}
3242
3243InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
3244 unsigned Index) {
3245 static const CostTblEntry SLMCostTbl[] = {
3246 { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 },
3247 { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 },
3248 { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 },
3249 { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 7 }
3250 };
3251
3252 assert(Val->isVectorTy() && "This must be a vector type");
21: '?' condition is true
3253 Type *ScalarType = Val->getScalarType();
3254 int RegisterFileMoveCost = 0;
3255
3256 if (Index != -1U && (Opcode == Instruction::ExtractElement ||
21.1: 'Opcode' is equal to ExtractElement
3257 Opcode == Instruction::InsertElement)) {
3258 // Legalize the type.
3259 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
3260
3261 // This type is legalized to a scalar type.
3262 if (!LT.second.isVector())
22: Calling 'MVT::isVector'
26: Returning from 'MVT::isVector'
27: Taking false branch
3263 return 0;
3264
3265 // The type may be split. Normalize the index to the new type.
3266 unsigned NumElts = LT.second.getVectorNumElements();
3267 unsigned SubNumElts = NumElts;
3268 Index = Index % NumElts;
3269
3270 // For >128-bit vectors, we need to extract higher 128-bit subvectors.
3271 // For inserts, we also need to insert the subvector back.
3272 if (LT.second.getSizeInBits() > 128) {
28: Assuming the condition is true
29: Taking true branch
3273 assert((LT.second.getSizeInBits() % 128) == 0 && "Illegal vector");
30: Assuming the condition is true
31: '?' condition is true
3274 unsigned NumSubVecs = LT.second.getSizeInBits() / 128;
3275 SubNumElts = NumElts / NumSubVecs;
32: Value assigned to 'SubNumElts'
3276 if (SubNumElts <= Index) {
33: Assuming 'SubNumElts' is <= 'Index'
34: Taking true branch
3277 RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
34.1: 'Opcode' is not equal to InsertElement
35: '?' condition is false
3278 Index %= SubNumElts;
36: Division by zero
3279 }
3280 }
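// Note on the analyzer path above: 'SubNumElts' is NumElts / NumSubVecs and,
// along this path, is only known to satisfy 'SubNumElts' <= 'Index'; if it
// were zero, the 'Index %= SubNumElts' at line 3278 would be a division by
// zero, which is the defect this report flags.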
3281
3282 if (Index == 0) {
3283 // Floating point scalars are already located in index #0.
3284 // Many insertions to #0 can fold away for scalar fp-ops, so let's assume
3285 // true for all.
3286 if (ScalarType->isFloatingPointTy())
3287 return RegisterFileMoveCost;
3288
3289 // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
3290 if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
3291 return 1 + RegisterFileMoveCost;
3292 }
3293
3294 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3295 assert(ISD && "Unexpected vector opcode");
3296 MVT MScalarTy = LT.second.getScalarType();
3297 if (ST->isSLM())
3298 if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
3299 return Entry->Cost + RegisterFileMoveCost;
3300
3301 // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
3302 if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
3303 (MScalarTy.isInteger() && ST->hasSSE41()))
3304 return 1 + RegisterFileMoveCost;
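// For instance, extracting element 5 of a v8i32 on an AVX2 target (a sketch,
// assuming a non-SLM CPU): the index is normalized to 1 within the upper
// 128-bit subvector, the subvector extraction added RegisterFileMoveCost = 1
// above, and the pextrd itself counts 1, for a total of 2.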
3305
3306 // Assume insertps is relatively cheap on all targets.
3307 if (MScalarTy == MVT::f32 && ST->hasSSE41() &&
3308 Opcode == Instruction::InsertElement)
3309 return 1 + RegisterFileMoveCost;
3310
3311 // For extractions we just need to shuffle the element to index 0, which
3312 // should be very cheap (assume cost = 1). For insertions we need to shuffle
3313 // the elements to its destination. In both cases we must handle the
3314 // subvector move(s).
3315 // If the vector type is already less than 128-bits then don't reduce it.
3316 // TODO: Under what circumstances should we shuffle using the full width?
3317 InstructionCost ShuffleCost = 1;
3318 if (Opcode == Instruction::InsertElement) {
3319 auto *SubTy = cast<VectorType>(Val);
3320 EVT VT = TLI->getValueType(DL, Val);
3321 if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
3322 SubTy = FixedVectorType::get(ScalarType, SubNumElts);
3323 ShuffleCost =
3324 getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, None, 0, SubTy);
3325 }
3326 int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
3327 return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
3328 }
3329
3330 // Add to the base cost if we know that the extracted element of a vector is
3331 // destined to be moved to and used in the integer register file.
3332 if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy())
3333 RegisterFileMoveCost += 1;
3334
3335 return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
3336}
3337
3338InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
3339 const APInt &DemandedElts,
3340 bool Insert,
3341 bool Extract) {
3342 InstructionCost Cost = 0;
3343
3344 // For insertions, a ISD::BUILD_VECTOR style vector initialization can be much
3345 // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT.
3346 if (Insert) {
3347 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
3348 MVT MScalarTy = LT.second.getScalarType();
3349
3350 if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
3351 (MScalarTy.isInteger() && ST->hasSSE41()) ||
3352 (MScalarTy == MVT::f32 && ST->hasSSE41())) {
3353 // For types we can insert directly, insertion into 128-bit sub vectors is
3354 // cheap, followed by a cheap chain of concatenations.
3355 if (LT.second.getSizeInBits() <= 128) {
3356 Cost +=
3357 BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, false);
3358 } else {
3359 // In each 128-lane, if at least one index is demanded but not all
3360 // indices are demanded and this 128-lane is not the first 128-lane of
3361 // the legalized-vector, then this 128-lane needs an extracti128; if in
3362 // each 128-lane there is at least one demanded index, this 128-lane
3363 // needs an inserti128.
3364
3365 // The following cases will help you build a better understanding:
3366 // Assume we insert several elements into a v8i32 vector in avx2,
3367 // Case#1: inserting into the 1st index needs vpinsrd + inserti128.
3368 // Case#2: inserting into 5th index needs extracti128 + vpinsrd +
3369 // inserti128.
3370 // Case#3: inserting into 4,5,6,7 index needs 4*vpinsrd + inserti128.
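// As a rough tally for Case#3 on a plain AVX2 target (v8i32 is legal, so
// CostValue is 1): the lane loop below charges 1 for the upper 128-lane's
// inserti128 (all four of its indices are demanded, so no extracti128 is
// needed), and the popcount term charges 4 for the vpinsrd's, matching the
// "4*vpinsrd + inserti128" estimate above.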
3371 const int CostValue = *LT.first.getValue();
3372 assert(CostValue >= 0 && "Negative cost!");
3373 unsigned Num128Lanes = LT.second.getSizeInBits() / 128 * CostValue;
3374 unsigned NumElts = LT.second.getVectorNumElements() * CostValue;
3375 APInt WidenedDemandedElts = DemandedElts.zextOrSelf(NumElts);
3376 unsigned Scale = NumElts / Num128Lanes;
3377 // We iterate each 128-lane, and check if we need a
3378 // extracti128/inserti128 for this 128-lane.
3379 for (unsigned I = 0; I < NumElts; I += Scale) {
3380 APInt Mask = WidenedDemandedElts.getBitsSet(NumElts, I, I + Scale);
3381 APInt MaskedDE = Mask & WidenedDemandedElts;
3382 unsigned Population = MaskedDE.countPopulation();
3383 Cost += (Population > 0 && Population != Scale &&
3384 I % LT.second.getVectorNumElements() != 0);
3385 Cost += Population > 0;
3386 }
3387 Cost += DemandedElts.countPopulation();
3388
3389 // For vXf32 cases, insertion into the 0'th index in each v4f32
3390 // 128-bit vector is free.
3391 // NOTE: This assumes legalization widens vXf32 vectors.
3392 if (MScalarTy == MVT::f32)
3393 for (unsigned i = 0, e = cast<FixedVectorType>(Ty)->getNumElements();
3394 i < e; i += 4)
3395 if (DemandedElts[i])
3396 Cost--;
3397 }
3398 } else if (LT.second.isVector()) {
3399 // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
3400 // integer element as a SCALAR_TO_VECTOR, then we build the vector as a
3401 // series of UNPCK followed by CONCAT_VECTORS - all of these can be
3402 // considered cheap.
3403 if (Ty->isIntOrIntVectorTy())
3404 Cost += DemandedElts.countPopulation();
3405
3406 // Get the smaller of the legalized or original pow2-extended number of
3407 // vector elements, which represents the number of unpacks we'll end up
3408 // performing.
3409 unsigned NumElts = LT.second.getVectorNumElements();
3410 unsigned Pow2Elts =
3411 PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements());
3412 Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
3413 }
3414 }
3415
3416 // TODO: Use default extraction for now, but we should investigate extending this
3417 // to handle repeated subvector extraction.
3418 if (Extract)
3419 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, false, Extract);
3420
3421 return Cost;
3422}
3423
3424InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
3425 MaybeAlign Alignment,
3426 unsigned AddressSpace,
3427 TTI::TargetCostKind CostKind,
3428 const Instruction *I) {
3429 // TODO: Handle other cost kinds.
3430 if (CostKind != TTI::TCK_RecipThroughput) {
3431 if (auto *SI = dyn_cast_or_null<StoreInst>(I)) {
3432 // Store instruction with index and scale costs 2 Uops.
3433 // Check the preceding GEP to identify non-const indices.
3434 if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) {
3435 if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
3436 return TTI::TCC_Basic * 2;
3437 }
3438 }
3439 return TTI::TCC_Basic;
3440 }
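// Example of the size/latency special case above: a store whose address
// comes from a GEP with a non-constant index (an indexed, scaled addressing
// mode) is charged TTI::TCC_Basic * 2; any other load/store is charged
// TTI::TCC_Basic.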
3441
3442 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
3443 "Invalid Opcode");
3444 // Type legalization can't handle structs
3445 if (TLI->getValueType(DL, Src, true) == MVT::Other)
3446 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
3447 CostKind);
3448
3449 // Legalize the type.
3450 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
3451
3452 auto *VTy = dyn_cast<FixedVectorType>(Src);
3453
3454 // Handle the simple case of non-vectors.
3455 // NOTE: this assumes that legalization never creates vector from scalars!
3456 if (!VTy || !LT.second.isVector())
3457 // Each load/store unit costs 1.
3458 return LT.first * 1;
3459
3460 bool IsLoad = Opcode == Instruction::Load;
3461
3462 Type *EltTy = VTy->getElementType();
3463
3464 const int EltTyBits = DL.getTypeSizeInBits(EltTy);
3465
3466 InstructionCost Cost = 0;
3467
3468 // Source of truth: how many elements were there in the original IR vector?
3469 const unsigned SrcNumElt = VTy->getNumElements();
3470
3471 // How far have we gotten?
3472 int NumEltRemaining = SrcNumElt;
3473 // Note that we intentionally capture by-reference, NumEltRemaining changes.
3474 auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; };
3475
3476 const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8);
3477
3478 // Note that even if we can store 64 bits of an XMM, we still operate on XMM.
3479 const unsigned XMMBits = 128;
3480 if (XMMBits % EltTyBits != 0)
3481 // Vector size must be a multiple of the element size. I.e. no padding.
3482 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
3483 CostKind);
3484 const int NumEltPerXMM = XMMBits / EltTyBits;
3485
3486 auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM);
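// Sketch of the halving loop below (assuming, say, an SSE2 target and a
// v3i32 load with 4-byte alignment): MaxLegalOpSizeBytes is 16, so the loop
// first tries a 16-byte op (fewer than four elements remain and the access
// is not 16-byte aligned, so it falls back to a smaller size), then an
// 8-byte op covering two elements, then a 4-byte op for the last element,
// accumulating one unit of cost per memory op plus insert/extract overhead
// for the narrower ops.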
3487
3488 for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0;
3489 NumEltRemaining > 0; CurrOpSizeBytes /= 2) {
3490 // How many elements would a single op deal with at once?
3491 if ((8 * CurrOpSizeBytes) % EltTyBits != 0)
3492 // Vector size must be a multiple of the element size. I.e. no padding.
3493 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
3494 CostKind);
3495 int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits;
3496
3497 assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?");
3498 assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||
3499 (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&
3500 "Unless we haven't halved the op size yet, "
3501 "we have less than two op's sized units of work left.");
3502
3503 auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM
3504 ? FixedVectorType::get(EltTy, CurrNumEltPerOp)
3505 : XMMVecTy;
3506
3507 assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&
3508 "After halving sizes, the vector elt count is no longer a multiple "
3509 "of number of elements per operation?");
3510 auto *CoalescedVecTy =
3511 CurrNumEltPerOp == 1
3512 ? CurrVecTy
3513 : FixedVectorType::get(
3514 IntegerType::get(Src->getContext(),
3515 EltTyBits * CurrNumEltPerOp),
3516 CurrVecTy->getNumElements() / CurrNumEltPerOp);
3517 assert(DL.getTypeSizeInBits(CoalescedVecTy) ==
3518 DL.getTypeSizeInBits(CurrVecTy) &&
3519 "coalescing elements doesn't change vector width.");
3520
3521 while (NumEltRemaining > 0) {
3522 assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumption?");
3523
3524 // Can we use this vector size, as per the remaining element count?
3525 // Iff the vector is naturally aligned, we can do a wide load regardless.
3526 if (NumEltRemaining < CurrNumEltPerOp &&
3527 (!IsLoad || Alignment.valueOrOne() < CurrOpSizeBytes) &&
3528 CurrOpSizeBytes != 1)
3529 break; // Try a smaller vector size.
3530
3531 bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;
3532
3533 // If we have fully processed the previous reg, we need to replenish it.
3534 if (SubVecEltsLeft == 0) {
3535 SubVecEltsLeft += CurrVecTy->getNumElements();
3536 // And that's free only for the 0'th subvector of a legalized vector.
3537 if (!Is0thSubVec)
3538 Cost += getShuffleCost(IsLoad ? TTI::ShuffleKind::SK_InsertSubvector
3539 : TTI::ShuffleKind::SK_ExtractSubvector,
3540 VTy, None, NumEltDone(), CurrVecTy);
3541 }
3542
3543 // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
3544 // for smaller widths (32/16/8) we have to insert/extract them separately.
3545 // Again, it's free for the 0'th subreg (if op is 32/64 bit wide,
3546 // but let's pretend that it is also true for 16/8 bit wide ops...)
3547 if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) {
3548 int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM;
3549 assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "");
3550 int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp;
3551 APInt DemandedElts =
3552 APInt::getBitsSet(CoalescedVecTy->getNumElements(),
3553 CoalescedVecEltIdx, CoalescedVecEltIdx + 1);
3554 assert(DemandedElts.countPopulation() == 1 && "Inserting single value");
3555 Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad,
3556 !IsLoad);
3557 }
3558
3559 // This isn't exactly right. We're using slow unaligned 32-byte accesses
3560 // as a proxy for a double-pumped AVX memory interface such as on
3561 // Sandybridge.
3562 if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
3563 Cost += 2;
3564 else
3565 Cost += 1;
3566
3567 SubVecEltsLeft -= CurrNumEltPerOp;
3568 NumEltRemaining -= CurrNumEltPerOp;
3569 Alignment = commonAlignment(Alignment.valueOrOne(), CurrOpSizeBytes);
3570 }
3571 }
3572
3573 assert(NumEltRemaining <= 0 && "Should have processed all the elements.");
3574
3575 return Cost;
3576}
3577
3578InstructionCost
3579X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment,
3580 unsigned AddressSpace,
3581 TTI::TargetCostKind CostKind) {
3582 bool IsLoad = (Instruction::Load == Opcode);
3583 bool IsStore = (Instruction::Store == Opcode);
3584
3585 auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy);
3586 if (!SrcVTy)
3587 // For a scalar type, take the regular memory cost without the mask.
3588 return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind);
3589
3590 unsigned NumElem = SrcVTy->getNumElements();
3591 auto *MaskTy =
3592 FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
3593 if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) ||
3594 (IsStore && !isLegalMaskedStore(SrcVTy, Alignment))) {
3595 // Scalarization
3596 APInt DemandedElts = APInt::getAllOnesValue(NumElem);
3597 InstructionCost MaskSplitCost =
3598 getScalarizationOverhead(MaskTy, DemandedElts, false, true);
3599 InstructionCost ScalarCompareCost = getCmpSelInstrCost(
3600 Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
3601 CmpInst::BAD_ICMP_PREDICATE, CostKind);
3602 InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
3603 InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
3604 InstructionCost ValueSplitCost =
3605 getScalarizationOverhead(SrcVTy, DemandedElts, IsLoad, IsStore);
3606 InstructionCost MemopCost =
3607 NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
3608 Alignment, AddressSpace, CostKind);
3609 return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
3610 }
3611
3612 // Legalize the type.
3613 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy);
3614 auto VT = TLI->getValueType(DL, SrcVTy);
3615 InstructionCost Cost = 0;
3616 if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
3617 LT.second.getVectorNumElements() == NumElem)
3618 // Promotion requires extend/truncate for data and a shuffle for mask.
3619 Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, None, 0, nullptr) +
3620 getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, None, 0, nullptr);
3621
3622 else if (LT.first * LT.second.getVectorNumElements() > NumElem) {
3623 auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
3624 LT.second.getVectorNumElements());
3625 // Expanding requires filling the mask with zeroes.
3626 Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, None, 0, MaskTy);
3627 }
3628
3629 // Pre-AVX512 - each maskmov load costs 2 + store costs ~8.
3630 if (!ST->hasAVX512())
3631 return Cost + LT.first * (IsLoad ? 2 : 8);
3632
3633 // AVX-512 masked load/store is cheaper
3634 return Cost + LT.first;
3635}
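As a rough sketch of the scalarization branch above (with placeholder per-element costs rather than values from the real cost tables), the total is the per-lane scalar memory cost plus the value split, mask split, and per-lane compare+branch costs.

static int scalarizedMaskedMemOpCost(int NumElem, int MaskSplitCost,
                                     int ValueSplitCost, int ScalarCmpCost,
                                     int BranchCost, int ScalarMemOpCost) {
  // Mirrors: MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost.
  int MaskCmpCost = NumElem * (BranchCost + ScalarCmpCost);
  int MemopCost = NumElem * ScalarMemOpCost;
  return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
}

// e.g. scalarizedMaskedMemOpCost(8, 8, 8, 1, 1, 1) == 40 for a hypothetical
// v8 masked load that has to be scalarized.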
3636
3637InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty,
3638 ScalarEvolution *SE,
3639 const SCEV *Ptr) {
3640 // Address computations in vectorized code with non-consecutive addresses will
3641 // likely result in more instructions compared to scalar code where the
3642 // computation can more often be merged into the index mode. The resulting
3643 // extra micro-ops can significantly decrease throughput.
3644 const unsigned NumVectorInstToHideOverhead = 10;
3645
3646 // Cost modeling of Strided Access Computation is hidden by the indexing
3647 // modes of X86 regardless of the stride value. We don't believe that there
3648 // is a difference between constant strided access in general and a constant
3649 // stride value which is less than or equal to 64.
3650 // Even in the case of (loop invariant) stride whose value is not known at
3651 // compile time, the address computation will not incur more than one extra
3652 // ADD instruction.
3653 if (Ty->isVectorTy() && SE) {
3654 if (!BaseT::isStridedAccess(Ptr))
3655 return NumVectorInstToHideOverhead;
3656 if (!BaseT::getConstantStrideStep(SE, Ptr))
3657 return 1;
3658 }
3659
3660 return BaseT::getAddressComputationCost(Ty, SE, Ptr);
3661}
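The decision above in isolation: non-strided vector addresses get the 10-instruction penalty, a stride that is loop-invariant but unknown at compile time costs one extra ADD, and a constant stride falls through to the base cost. A minimal standalone sketch, assuming scalar-evolution information is available:

static int addressComputationCost(bool IsVectorTy, bool IsStridedAccess,
                                  bool HasConstantStride, int BaseCost) {
  const int NumVectorInstToHideOverhead = 10;
  if (IsVectorTy) {
    if (!IsStridedAccess)
      return NumVectorInstToHideOverhead; // scattered addresses
    if (!HasConstantStride)
      return 1;                           // one extra ADD
  }
  return BaseCost;                        // constant stride or scalar type
}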
3662
3663InstructionCost
3664X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
3665 bool IsPairwise,
3666 TTI::TargetCostKind CostKind) {
3667 // Just use the default implementation for pair reductions.
3668 if (IsPairwise)
3669 return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise, CostKind);
3670
3671 // We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
3672 // and use that measurement as the cost.
3673
3674 static const CostTblEntry SLMCostTblNoPairWise[] = {
3675 { ISD::FADD, MVT::v2f64, 3 },
3676 { ISD::ADD, MVT::v2i64, 5 },
3677 };
3678
3679 static const CostTblEntry SSE2CostTblNoPairWise[] = {
3680 { ISD::FADD, MVT::v2f64, 2 },
3681 { ISD::FADD, MVT::v2f32, 2 },
3682 { ISD::FADD, MVT::v4f32, 4 },
3683 { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
3684 { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
3685 { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
3686 { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3".
3687 { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3".
3688 { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
3689 { ISD::ADD, MVT::v2i8, 2 },
3690 { ISD::ADD, MVT::v4i8, 2 },
3691 { ISD::ADD, MVT::v8i8, 2 },
3692 { ISD::ADD, MVT::v16i8, 3 },
3693 };
3694
3695 static const CostTblEntry AVX1CostTblNoPairWise[] = {
3696 { ISD::FADD, MVT::v4f64, 3 },
3697 { ISD::FADD, MVT::v4f32, 3 },
3698 { ISD::FADD, MVT::v8f32, 4 },
3699 { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
3700 { ISD::ADD, MVT::v4i64, 3 },
3701 { ISD::ADD, MVT::v8i32, 5 },
3702 { ISD::ADD, MVT::v16i16, 5 },
3703 { ISD::ADD, MVT::v32i8, 4 },
3704 };
3705
3706 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3707 assert(ISD && "Invalid opcode");
3708
3709 // Before legalizing the type, give a chance to look up illegal narrow types
3710 // in the table.
3711 // FIXME: Is there a better way to do this?
3712 EVT VT = TLI->getValueType(DL, ValTy);
3713 if (VT.isSimple()) {
3714 MVT MTy = VT.getSimpleVT();
3715 if (ST->isSLM())
3716 if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy))
3717 return Entry->Cost;
3718
3719 if (ST->hasAVX())
3720 if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
3721 return Entry->Cost;
3722
3723 if (ST->hasSSE2())
3724 if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
3725 return Entry->Cost;
3726 }
3727
3728 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
3729
3730 MVT MTy = LT.second;
3731
3732 auto *ValVTy = cast<FixedVectorType>(ValTy);
3733
3734 // Special case: vXi8 mul reductions are performed as vXi16.
3735 if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) {
3736 auto *WideSclTy = IntegerType::get(ValVTy->getContext(), 16);
3737 auto *WideVecTy = FixedVectorType::get(WideSclTy, ValVTy->getNumElements());
3738 return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy,
3739 TargetTransformInfo::CastContextHint::None,
3740 CostKind) +
3741 getArithmeticReductionCost(Opcode, WideVecTy, IsPairwise, CostKind);
3742 }
3743
3744 InstructionCost ArithmeticCost = 0;
3745 if (LT.first != 1 && MTy.isVector() &&
3746 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
3747 // Type needs to be split. We need LT.first - 1 arithmetic ops.
3748 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
3749 MTy.getVectorNumElements());
3750 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
3751 ArithmeticCost *= LT.first - 1;
3752 }
3753
3754 if (ST->isSLM())
3755 if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy))
3756 return ArithmeticCost + Entry->Cost;
3757
3758 if (ST->hasAVX())
3759 if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
3760 return ArithmeticCost + Entry->Cost;
3761
3762 if (ST->hasSSE2())
3763 if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
3764 return ArithmeticCost + Entry->Cost;
3765
3766 // FIXME: These assume a naive kshift+binop lowering, which is probably
3767 // conservative in most cases.
3768 static const CostTblEntry AVX512BoolReduction[] = {
3769 { ISD::AND, MVT::v2i1, 3 },
3770 { ISD::AND, MVT::v4i1, 5 },
3771 { ISD::AND, MVT::v8i1, 7 },
3772 { ISD::AND, MVT::v16i1, 9 },
3773 { ISD::AND, MVT::v32i1, 11 },
3774 { ISD::AND, MVT::v64i1, 13 },
3775 { ISD::OR, MVT::v2i1, 3 },
3776 { ISD::OR, MVT::v4i1, 5 },
3777 { ISD::OR, MVT::v8i1, 7 },
3778 { ISD::OR, MVT::v16i1, 9 },
3779 { ISD::OR, MVT::v32i1, 11 },
3780 { ISD::OR, MVT::v64i1, 13 },
3781 };
3782
3783 static const CostTblEntry AVX2BoolReduction[] = {
3784 { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp
3785 { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp
3786 { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp
3787 { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp
3788 };
3789
3790 static const CostTblEntry AVX1BoolReduction[] = {
3791 { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp
3792 { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp
3793 { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
3794 { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
3795 { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp
3796 { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp
3797 { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
3798 { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
3799 };
3800
3801 static const CostTblEntry SSE2BoolReduction[] = {
3802 { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp
3803 { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp
3804 { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp
3805 { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp
3806 { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp
3807 { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp
3808 { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp
3809 { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp
3810 };
3811
3812 // Handle bool allof/anyof patterns.
3813 if (ValVTy->getElementType()->isIntegerTy(1)) {
3814 InstructionCost ArithmeticCost = 0;
3815 if (LT.first != 1 && MTy.isVector() &&
3816 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
3817 // Type needs to be split. We need LT.first - 1 arithmetic ops.
3818 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
3819 MTy.getVectorNumElements());
3820 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
3821 ArithmeticCost *= LT.first - 1;
3822 }
3823
3824 if (ST->hasAVX512())
3825 if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy))
3826 return ArithmeticCost + Entry->Cost;
3827 if (ST->hasAVX2())
3828 if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
3829 return ArithmeticCost + Entry->Cost;
3830 if (ST->hasAVX())
3831 if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
3832 return ArithmeticCost + Entry->Cost;
3833 if (ST->hasSSE2())
3834 if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
3835 return ArithmeticCost + Entry->Cost;
3836
3837 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, IsPairwise,
3838 CostKind);
3839 }
3840
3841 unsigned NumVecElts = ValVTy->getNumElements();
3842 unsigned ScalarSize = ValVTy->getScalarSizeInBits();
3843
3844 // Special case power of 2 reductions where the scalar type isn't changed
3845 // by type legalization.
3846 if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
3847 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, IsPairwise,
3848 CostKind);
3849
3850 InstructionCost ReductionCost = 0;
3851
3852 auto *Ty = ValVTy;
3853 if (LT.first != 1 && MTy.isVector() &&
3854 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
3855 // Type needs to be split. We need LT.first - 1 arithmetic ops.
3856 Ty = FixedVectorType::get(ValVTy->getElementType(),
3857 MTy.getVectorNumElements());
3858 ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
3859 ReductionCost *= LT.first - 1;
3860 NumVecElts = MTy.getVectorNumElements();
3861 }
3862
3863 // Now handle reduction with the legal type, taking into account size changes
3864 // at each level.
3865 while (NumVecElts > 1) {
3866 // Determine the size of the remaining vector we need to reduce.
3867 unsigned Size = NumVecElts * ScalarSize;
3868 NumVecElts /= 2;
3869 // If we're reducing from 256/512 bits, use an extract_subvector.
3870 if (Size > 128) {
3871 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
3872 ReductionCost +=
3873 getShuffleCost(TTI::SK_ExtractSubvector, Ty, None, NumVecElts, SubTy);
3874 Ty = SubTy;
3875 } else if (Size == 128) {
3876 // Reducing from 128 bits is a permute of v2f64/v2i64.
3877 FixedVectorType *ShufTy;
3878 if (ValVTy->isFloatingPointTy())
3879 ShufTy =
3880 FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2);
3881 else
3882 ShufTy =
3883 FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
3884 ReductionCost +=
3885 getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr);
3886 } else if (Size == 64) {
3887 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
3888 FixedVectorType *ShufTy;
3889 if (ValVTy->isFloatingPointTy())
3890 ShufTy =
3891 FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4);
3892 else
3893 ShufTy =
3894 FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
3895 ReductionCost +=
3896 getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr);
3897 } else {
3898 // Reducing from smaller size is a shift by immediate.
3899 auto *ShiftTy = FixedVectorType::get(
3900 Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size);
3901 ReductionCost += getArithmeticInstrCost(
3902 Instruction::LShr, ShiftTy, CostKind,
3903 TargetTransformInfo::OK_AnyValue,
3904 TargetTransformInfo::OK_UniformConstantValue,
3905 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
3906 }
3907
3908 // Add the arithmetic op for this level.
3909 ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind);
3910 }
3911
3912 // Add the final extract element to the cost.
3913 return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
3914}
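The halving loop above charges one shuffle (or shift) plus one arithmetic op per level, i.e. log2(NumVecElts) levels once the type is legal, before the final extractelement. A standalone illustration of that structure; the per-level costs here are placeholders, not table values.

#include <cstdio>

static int treeReductionCost(unsigned NumVecElts, int ShuffleCost, int ArithCost) {
  int Cost = 0;
  while (NumVecElts > 1) {
    NumVecElts /= 2;                 // fold the upper half onto the lower half
    Cost += ShuffleCost + ArithCost; // one shuffle/shift + one op per level
  }
  return Cost;
}

int main() {
  // v16i8 add reduction: 4 levels before the final extractelement.
  std::printf("%d\n", treeReductionCost(16, 1, 1)); // 8
}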
3915
3916InstructionCost X86TTIImpl::getMinMaxCost(Type *Ty, Type *CondTy,
3917 bool IsUnsigned) {
3918 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
3919
3920 MVT MTy = LT.second;
3921
3922 int ISD;
3923 if (Ty->isIntOrIntVectorTy()) {
3924 ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN;
3925 } else {
3926 assert(Ty->isFPOrFPVectorTy() &&
3927        "Expected float point or integer vector type.");
3928 ISD = ISD::FMINNUM;
3929 }
3930
3931 static const CostTblEntry SSE1CostTbl[] = {
3932 {ISD::FMINNUM, MVT::v4f32, 1},
3933 };
3934
3935 static const CostTblEntry SSE2CostTbl[] = {
3936 {ISD::FMINNUM, MVT::v2f64, 1},
3937 {ISD::SMIN, MVT::v8i16, 1},
3938 {ISD::UMIN, MVT::v16i8, 1},
3939 };
3940
3941 static const CostTblEntry SSE41CostTbl[] = {
3942 {ISD::SMIN, MVT::v4i32, 1},
3943 {ISD::UMIN, MVT::v4i32, 1},
3944 {ISD::UMIN, MVT::v8i16, 1},
3945 {ISD::SMIN, MVT::v16i8, 1},
3946 };
3947
3948 static const CostTblEntry SSE42CostTbl[] = {
3949 {ISD::UMIN, MVT::v2i64, 3}, // xor+pcmpgtq+blendvpd
3950 };
3951
3952 static const CostTblEntry AVX1CostTbl[] = {
3953 {ISD::FMINNUM, MVT::v8f32, 1},
3954 {ISD::FMINNUM, MVT::v4f64, 1},
3955 {ISD::SMIN, MVT::v8i32, 3},
3956 {ISD::UMIN, MVT::v8i32, 3},
3957 {ISD::SMIN, MVT::v16i16, 3},
3958 {ISD::UMIN, MVT::v16i16, 3},
3959 {ISD::SMIN, MVT::v32i8, 3},
3960 {ISD::UMIN, MVT::v32i8, 3},
3961 };
3962
3963 static const CostTblEntry AVX2CostTbl[] = {
3964 {ISD::SMIN, MVT::v8i32, 1},
3965 {ISD::UMIN, MVT::v8i32, 1},
3966 {ISD::SMIN, MVT::v16i16, 1},
3967 {ISD::UMIN, MVT::v16i16, 1},
3968 {ISD::SMIN, MVT::v32i8, 1},
3969 {ISD::UMIN, MVT::v32i8, 1},
3970 };
3971
3972 static const CostTblEntry AVX512CostTbl[] = {
3973 {ISD::FMINNUM, MVT::v16f32, 1},
3974 {ISD::FMINNUM, MVT::v8f64, 1},
3975 {ISD::SMIN, MVT::v2i64, 1},
3976 {ISD::UMIN, MVT::v2i64, 1},
3977 {ISD::SMIN, MVT::v4i64, 1},
3978 {ISD::UMIN, MVT::v4i64, 1},
3979 {ISD::SMIN, MVT::v8i64, 1},
3980 {ISD::UMIN, MVT::v8i64, 1},
3981 {ISD::SMIN, MVT::v16i32, 1},
3982 {ISD::UMIN, MVT::v16i32, 1},
3983 };
3984
3985 static const CostTblEntry AVX512BWCostTbl[] = {
3986 {ISD::SMIN, MVT::v32i16, 1},
3987 {ISD::UMIN, MVT::v32i16, 1},
3988 {ISD::SMIN, MVT::v64i8, 1},
3989 {ISD::UMIN, MVT::v64i8, 1},
3990 };
3991
3992 // If we have a native MIN/MAX instruction for this type, use it.
3993 if (ST->hasBWI())
3994 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
3995 return LT.first * Entry->Cost;
3996
3997 if (ST->hasAVX512())
3998 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
3999 return LT.first * Entry->Cost;
4000
4001 if (ST->hasAVX2())
4002 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
4003 return LT.first * Entry->Cost;
4004
4005 if (ST->hasAVX())
4006 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
4007 return LT.first * Entry->Cost;
4008
4009 if (ST->hasSSE42())
4010 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
4011 return LT.first * Entry->Cost;
4012
4013 if (ST->hasSSE41())
4014 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
4015 return LT.first * Entry->Cost;
4016
4017 if (ST->hasSSE2())
4018 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
4019 return LT.first * Entry->Cost;
4020
4021 if (ST->hasSSE1())
4022 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
4023 return LT.first * Entry->Cost;
4024
4025 unsigned CmpOpcode;
4026 if (Ty->isFPOrFPVectorTy()) {
4027 CmpOpcode = Instruction::FCmp;
4028 } else {
4029 assert(Ty->isIntOrIntVectorTy() &&
4030        "expecting floating point or integer type for min/max reduction");
4031 CmpOpcode = Instruction::ICmp;
4032 }
4033
4034 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4035 // Otherwise fall back to cmp+select.
4036 InstructionCost Result =
4037 getCmpSelInstrCost(CmpOpcode, Ty, CondTy, CmpInst::BAD_ICMP_PREDICATE,
4038 CostKind) +
4039 getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
4040 CmpInst::BAD_ICMP_PREDICATE, CostKind);
4041 return Result;
4042}
4043
4044InstructionCost
4045X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy,
4046 bool IsPairwise, bool IsUnsigned,
4047 TTI::TargetCostKind CostKind) {
4048 // Just use the default implementation for pair reductions.
4049 if (IsPairwise)
4050 return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned,
4051 CostKind);
4052
4053 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
4054
4055 MVT MTy = LT.second;
4056
4057 int ISD;
4058 if (ValTy->isIntOrIntVectorTy()) {
4059 ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN;
4060 } else {
4061 assert(ValTy->isFPOrFPVectorTy() &&
4062        "Expected float point or integer vector type.");
4063 ISD = ISD::FMINNUM;
4064 }
4065
4066 // We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
4067 // and use that measurement as the cost.
4068
4069 static const CostTblEntry SSE2CostTblNoPairWise[] = {
4070 {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw
4071 {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw
4072 {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw
4073 };
4074
4075 static const CostTblEntry SSE41CostTblNoPairWise[] = {
4076 {ISD::SMIN, MVT::v2i16, 3}, // same as sse2
4077 {ISD::SMIN, MVT::v4i16, 5}, // same as sse2
4078 {ISD::UMIN, MVT::v2i16, 5}, // same as sse2
4079 {ISD::UMIN, MVT::v4i16, 7}, // same as sse2
4080 {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor
4081 {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax
4082 {ISD::SMIN, MVT::v2i8, 3}, // pminsb
4083 {ISD::SMIN, MVT::v4i8, 5}, // pminsb
4084 {ISD::SMIN, MVT::v8i8, 7}, // pminsb
4085 {ISD::SMIN, MVT::v16i8, 6},
4086 {ISD::UMIN, MVT::v2i8, 3}, // same as sse2
4087 {ISD::UMIN, MVT::v4i8, 5}, // same as sse2
4088 {ISD::UMIN, MVT::v8i8, 7}, // same as sse2
4089 {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax
4090 };
4091
4092 static const CostTblEntry AVX1CostTblNoPairWise[] = {
4093 {ISD::SMIN, MVT::v16i16, 6},
4094 {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax
4095 {ISD::SMIN, MVT::v32i8, 8},
4096 {ISD::UMIN, MVT::v32i8, 8},
4097 };
4098
4099 static const CostTblEntry AVX512BWCostTblNoPairWise[] = {
4100 {ISD::SMIN, MVT::v32i16, 8},
4101 {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax
4102 {ISD::SMIN, MVT::v64i8, 10},
4103 {ISD::UMIN, MVT::v64i8, 10},
4104 };
4105
4106 // Before legalizing the type, give a chance to look up illegal narrow types
4107 // in the table.
4108 // FIXME: Is there a better way to do this?
4109 EVT VT = TLI->getValueType(DL, ValTy);
4110 if (VT.isSimple()) {
4111 MVT MTy = VT.getSimpleVT();
4112 if (ST->hasBWI())
4113 if (const auto *Entry = CostTableLookup(AVX512BWCostTblNoPairWise, ISD, MTy))
4114 return Entry->Cost;
4115
4116 if (ST->hasAVX())
4117 if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
4118 return Entry->Cost;
4119
4120 if (ST->hasSSE41())
4121 if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy))
4122 return Entry->Cost;
4123
4124 if (ST->hasSSE2())
4125 if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
4126 return Entry->Cost;
4127 }
4128
4129 auto *ValVTy = cast<FixedVectorType>(ValTy);
4130 unsigned NumVecElts = ValVTy->getNumElements();
4131
4132 auto *Ty = ValVTy;
4133 InstructionCost MinMaxCost = 0;
4134 if (LT.first != 1 && MTy.isVector() &&
4135 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
4136 // Type needs to be split. We need LT.first - 1 operations.
4137 Ty = FixedVectorType::get(ValVTy->getElementType(),
4138 MTy.getVectorNumElements());
4139 auto *SubCondTy = FixedVectorType::get(CondTy->getElementType(),
4140 MTy.getVectorNumElements());
4141 MinMaxCost = getMinMaxCost(Ty, SubCondTy, IsUnsigned);
4142 MinMaxCost *= LT.first - 1;
4143 NumVecElts = MTy.getVectorNumElements();
4144 }
4145
4146 if (ST->hasBWI())
4147 if (const auto *Entry = CostTableLookup(AVX512BWCostTblNoPairWise, ISD, MTy))
4148 return MinMaxCost + Entry->Cost;
4149
4150 if (ST->hasAVX())
4151 if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
4152 return MinMaxCost + Entry->Cost;
4153
4154 if (ST->hasSSE41())
4155 if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy))
4156 return MinMaxCost + Entry->Cost;
4157
4158 if (ST->hasSSE2())
4159 if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
4160 return MinMaxCost + Entry->Cost;
4161
4162 unsigned ScalarSize = ValTy->getScalarSizeInBits();
4163
4164 // Special case power of 2 reductions where the scalar type isn't changed
4165 // by type legalization.
4166 if (!isPowerOf2_32(ValVTy->getNumElements()) ||
4167 ScalarSize != MTy.getScalarSizeInBits())
4168 return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned,
4169 CostKind);
4170
4171 // Now handle reduction with the legal type, taking into account size changes
4172 // at each level.
4173 while (NumVecElts > 1) {
4174 // Determine the size of the remaining vector we need to reduce.
4175 unsigned Size = NumVecElts * ScalarSize;
4176 NumVecElts /= 2;
4177 // If we're reducing from 256/512 bits, use an extract_subvector.
4178 if (Size > 128) {
4179 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
4180 MinMaxCost +=
4181 getShuffleCost(TTI::SK_ExtractSubvector, Ty, None, NumVecElts, SubTy);
4182 Ty = SubTy;
4183 } else if (Size == 128) {
4184 // Reducing from 128 bits is a permute of v2f64/v2i64.
4185 VectorType *ShufTy;
4186 if (ValTy->isFloatingPointTy())
4187 ShufTy =
4188 FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2);
4189 else
4190 ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
4191 MinMaxCost +=
4192 getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr);
4193 } else if (Size == 64) {
4194 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
4195 FixedVectorType *ShufTy;
4196 if (ValTy->isFloatingPointTy())
4197 ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4);
4198 else
4199 ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
4200 MinMaxCost +=
4201 getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr);
4202 } else {
4203 // Reducing from smaller size is a shift by immediate.
4204 auto *ShiftTy = FixedVectorType::get(
4205 Type::getIntNTy(ValTy->getContext(), Size), 128 / Size);
4206 MinMaxCost += getArithmeticInstrCost(
4207 Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput,
4208 TargetTransformInfo::OK_AnyValue,
4209 TargetTransformInfo::OK_UniformConstantValue,
4210 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
4211 }
4212
4213 // Add the arithmetic op for this level.
4214 auto *SubCondTy =
4215 FixedVectorType::get(CondTy->getElementType(), Ty->getNumElements());
4216 MinMaxCost += getMinMaxCost(Ty, SubCondTy, IsUnsigned);
4217 }
4218
4219 // Add the final extract element to the cost.
4220 return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
4221}
4222
4223/// Calculate the cost of materializing a 64-bit value. This helper
4224/// method might only calculate a fraction of a larger immediate. Therefore it
4225/// is valid to return a cost of ZERO.
4226InstructionCost X86TTIImpl::getIntImmCost(int64_t Val) {
4227 if (Val == 0)
4228 return TTI::TCC_Free;
4229
4230 if (isInt<32>(Val))
4231 return TTI::TCC_Basic;
4232
4233 return 2 * TTI::TCC_Basic;
4234}
4235
4236InstructionCost X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
4237 TTI::TargetCostKind CostKind) {
4238 assert(Ty->isIntegerTy());
4239
4240 unsigned BitSize = Ty->getPrimitiveSizeInBits();
4241 if (BitSize == 0)
4242 return ~0U;
4243
4244 // Never hoist constants larger than 128bit, because this might lead to
4245 // incorrect code generation or assertions in codegen.
4246 // Fixme: Create a cost model for types larger than i128 once the codegen
4247 // issues have been fixed.
4248 if (BitSize > 128)
4249 return TTI::TCC_Free;
4250
4251 if (Imm == 0)
4252 return TTI::TCC_Free;
4253
4254 // Sign-extend all constants to a multiple of 64-bit.
4255 APInt ImmVal = Imm;
4256 if (BitSize % 64 != 0)
4257 ImmVal = Imm.sext(alignTo(BitSize, 64));
4258
4259 // Split the constant into 64-bit chunks and calculate the cost for each
4260 // chunk.
4261 InstructionCost Cost = 0;
4262 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
4263 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
4264 int64_t Val = Tmp.getSExtValue();
4265 Cost += getIntImmCost(Val);
4266 }
4267 // We need at least one instruction to materialize the constant.
4268 return std::max<InstructionCost>(1, Cost);
4269}
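A self-contained illustration of the chunking above, limited to immediates of at most 128 bits so two plain 64-bit halves suffice. The per-chunk cost mirrors getIntImmCost(int64_t): free for zero, one basic cost if the chunk fits in a sign-extended 32-bit value, two otherwise; the names here are illustrative only.

#include <algorithm>
#include <cstdint>
#include <cstdio>

static int chunkCost(int64_t Val) {
  if (Val == 0)
    return 0;
  return (Val >= INT32_MIN && Val <= INT32_MAX) ? 1 : 2; // isInt<32>(Val)
}

int main() {
  // A 128-bit immediate expressed as low/high 64-bit halves.
  int64_t Lo = 0x12345678, Hi = 0;
  std::printf("%d\n", std::max(1, chunkCost(Lo) + chunkCost(Hi))); // 1
}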
4270
4271InstructionCost X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
4272 const APInt &Imm, Type *Ty,
4273 TTI::TargetCostKind CostKind,
4274 Instruction *Inst) {
4275 assert(Ty->isIntegerTy());
4276
4277 unsigned BitSize = Ty->getPrimitiveSizeInBits();
4278 // There is no cost model for constants with a bit size of 0. Return TCC_Free
4279 // here, so that constant hoisting will ignore this constant.
4280 if (BitSize == 0)
4281 return TTI::TCC_Free;
4282
4283 unsigned ImmIdx = ~0U;
4284 switch (Opcode) {
4285 default:
4286 return TTI::TCC_Free;
4287 case Instruction::GetElementPtr:
4288 // Always hoist the base address of a GetElementPtr. This prevents the
4289 // creation of new constants for every base constant that gets constant
4290 // folded with the offset.
4291 if (Idx == 0)
4292 return 2 * TTI::TCC_Basic;
4293 return TTI::TCC_Free;
4294 case Instruction::Store:
4295 ImmIdx = 0;
4296 break;
4297 case Instruction::ICmp:
4298 // This is an imperfect hack to prevent constant hoisting of
4299 // compares that might be trying to check if a 64-bit value fits in
4300 // 32-bits. The backend can optimize these cases using a right shift by 32.
4301 // Ideally we would check the compare predicate here. There are also other
4302 // similar immediates the backend can use shifts for.
4303 if (Idx == 1 && Imm.getBitWidth() == 64) {
4304 uint64_t ImmVal = Imm.getZExtValue();
4305 if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
4306 return TTI::TCC_Free;
4307 }
4308 ImmIdx = 1;
4309 break;
4310 case Instruction::And:
4311 // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
4312 // by using a 32-bit operation with implicit zero extension. Detect such
4313 // immediates here as the normal path expects bit 31 to be sign extended.
4314 if (Idx == 1 && Imm.getBitWidth() == 64 && isUInt<32>(Imm.getZExtValue()))
4315 return TTI::TCC_Free;
4316 ImmIdx = 1;
4317 break;
4318 case Instruction::Add:
4319 case Instruction::Sub:
4320 // For add/sub, we can use the opposite instruction for INT32_MIN.
4321 if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.getZExtValue() == 0x80000000)
4322 return TTI::TCC_Free;
4323 ImmIdx = 1;
4324 break;
4325 case Instruction::UDiv:
4326 case Instruction::SDiv:
4327 case Instruction::URem:
4328 case Instruction::SRem:
4329 // Division by constant is typically expanded later into a different
4330 // instruction sequence. This completely changes the constants.
4331 // Report them as "free" to stop ConstantHoist from marking them as opaque.
4332 return TTI::TCC_Free;
4333 case Instruction::Mul:
4334 case Instruction::Or:
4335 case Instruction::Xor:
4336 ImmIdx = 1;
4337 break;
4338 // Always return TCC_Free for the shift value of a shift instruction.
4339 case Instruction::Shl:
4340 case Instruction::LShr:
4341 case Instruction::AShr:
4342 if (Idx == 1)
4343 return TTI::TCC_Free;
4344 break;
4345 case Instruction::Trunc:
4346 case Instruction::ZExt:
4347 case Instruction::SExt:
4348 case Instruction::IntToPtr:
4349 case Instruction::PtrToInt:
4350 case Instruction::BitCast:
4351 case Instruction::PHI:
4352 case Instruction::Call:
4353 case Instruction::Select:
4354 case Instruction::Ret:
4355 case Instruction::Load:
4356 break;
4357 }
4358
4359 if (Idx == ImmIdx) {
4360 int NumConstants = divideCeil(BitSize, 64);
4361 InstructionCost Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
4362 return (Cost <= NumConstants * TTI::TCC_Basic)
4363 ? static_cast<int>(TTI::TCC_Free)
4364 : Cost;
4365 }
4366
4367 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
4368}
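One concrete case from the switch above: a 64-bit AND whose immediate has 32 leading zero bits can be encoded as a 32-bit operation with implicit zero extension, so it is reported as free and never hoisted. A minimal standalone check (the helper name is hypothetical):

#include <cstdint>
#include <cstdio>

static bool andImm64IsFree(uint64_t Imm) {
  return Imm <= UINT32_MAX; // equivalent to isUInt<32>(Imm)
}

int main() {
  std::printf("%d %d\n",
              andImm64IsFree(0x00000000ffffffffULL),  // 1: stays as imm32
              andImm64IsFree(0x0000000100000000ULL)); // 0: candidate to hoist
}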
4369
4370InstructionCost X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
4371 const APInt &Imm, Type *Ty,
4372 TTI::TargetCostKind CostKind) {
4373 assert(Ty->isIntegerTy());
4374
4375 unsigned BitSize = Ty->getPrimitiveSizeInBits();
4376 // There is no cost model for constants with a bit size of 0. Return TCC_Free
4377 // here, so that constant hoisting will ignore this constant.
4378 if (BitSize == 0)
4379 return TTI::TCC_Free;
4380
4381 switch (IID) {
4382 default:
4383 return TTI::TCC_Free;
4384 case Intrinsic::sadd_with_overflow:
4385 case Intrinsic::uadd_with_overflow:
4386 case Intrinsic::ssub_with_overflow:
4387 case Intrinsic::usub_with_overflow:
4388 case Intrinsic::smul_with_overflow:
4389 case Intrinsic::umul_with_overflow:
4390 if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
4391 return TTI::TCC_Free;
4392 break;
4393 case Intrinsic::experimental_stackmap:
4394 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
4395 return TTI::TCC_Free;
4396 break;
4397 case Intrinsic::experimental_patchpoint_void:
4398 case Intrinsic::experimental_patchpoint_i64:
4399 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
4400 return TTI::TCC_Free;
4401 break;
4402 }
4403 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
4404}
4405
4406InstructionCost X86TTIImpl::getCFInstrCost(unsigned Opcode,
4407 TTI::TargetCostKind CostKind,
4408 const Instruction *I) {
4409 if (CostKind != TTI::TCK_RecipThroughput)
4410 return Opcode == Instruction::PHI ? 0 : 1;
4411 // Branches are assumed to be predicted.
4412 return 0;
4413}
4414
4415int X86TTIImpl::getGatherOverhead() const {
4416 // Some CPUs have more overhead for gather. The specified overhead is relative
4417 // to the Load operation. "2" is the number provided by Intel architects. This
4418 // parameter is used for cost estimation of Gather Op and comparison with
4419 // other alternatives.
4420 // TODO: Remove the explicit hasAVX512()? That would mean we would only
4421 // enable gather with a -march.
4422 if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather()))
4423 return 2;
4424
4425 return 1024;
4426}
4427
4428int X86TTIImpl::getScatterOverhead() const {
4429 if (ST->hasAVX512())
4430 return 2;
4431
4432 return 1024;
4433}
4434
4435 // Return an average cost of a Gather / Scatter instruction; may be improved later.
4436// FIXME: Add TargetCostKind support.
4437InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy,
4438 const Value *Ptr, Align Alignment,
4439 unsigned AddressSpace) {
4440
4441 assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
4442 unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
4443
4444 // Try to reduce index size from 64 bit (default for GEP)
4445 // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
4446 // operation will use 16 x 64 indices, which do not fit in a zmm register and
4447 // need to be split. Also check that the base pointer is the same for all lanes,
4448 // and that there's at most one variable index.
4449 auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) {
4450 unsigned IndexSize = DL.getPointerSizeInBits();
4451 const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
4452 if (IndexSize < 64 || !GEP)
4453 return IndexSize;
4454
4455 unsigned NumOfVarIndices = 0;
4456 const Value *Ptrs = GEP->getPointerOperand();
4457 if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
4458 return IndexSize;
4459 for (unsigned i = 1; i < GEP->getNumOperands(); ++i) {
4460 if (isa<Constant>(GEP->getOperand(i)))
4461 continue;
4462 Type *IndxTy = GEP->getOperand(i)->getType();
4463 if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy))
4464 IndxTy = IndexVTy->getElementType();
4465 if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
4466 !isa<SExtInst>(GEP->getOperand(i))) ||
4467 ++NumOfVarIndices > 1)
4468 return IndexSize; // 64
4469 }
4470 return (unsigned)32;
4471 };
4472
4473 // Try to reduce IndexSize to 32 bits for 16-element vectors.
4474 // By default the IndexSize is equal to the pointer size.
4475 unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
4476 ? getIndexSizeInBits(Ptr, DL)
4477 : DL.getPointerSizeInBits();
4478
4479 auto *IndexVTy = FixedVectorType::get(
4480 IntegerType::get(SrcVTy->getContext(), IndexSize), VF);
4481 std::pair<InstructionCost, MVT> IdxsLT =
4482 TLI->getTypeLegalizationCost(DL, IndexVTy);
4483 std::pair<InstructionCost, MVT> SrcLT =
4484 TLI->getTypeLegalizationCost(DL, SrcVTy);
4485 InstructionCost::CostType SplitFactor =
4486 *std::max(IdxsLT.first, SrcLT.first).getValue();
4487 if (SplitFactor > 1) {
4488 // Handle splitting of vector of pointers
4489 auto *SplitSrcTy =
4490 FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
4491 return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment,
4492 AddressSpace);
4493 }
4494
4495 // The gather / scatter cost is given by Intel architects. It is a rough
4496 // number since we are looking at one instruction at a time.
4497 const int GSOverhead = (Opcode == Instruction::Load)
4498 ? getGatherOverhead()
4499 : getScatterOverhead();
4500 return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
4501 MaybeAlign(Alignment), AddressSpace,
4502 TTI::TCK_RecipThroughput);
4503}
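The final formula above in isolation: a fixed gather/scatter overhead plus one scalar memory-op cost per lane. The overhead is 2 where hardware gather/scatter is considered fast and 1024 otherwise, which effectively prices the vector form out of contention. ScalarMemOpCost below is a placeholder and the helper name is illustrative.

static int gsVectorCost(bool HasFastGatherScatter, unsigned VF,
                        int ScalarMemOpCost) {
  const int GSOverhead = HasFastGatherScatter ? 2 : 1024;
  return GSOverhead + static_cast<int>(VF) * ScalarMemOpCost;
}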
4504
4505/// Return the cost of full scalarization of gather / scatter operation.
4506///
4507/// Opcode - Load or Store instruction.
4508/// SrcVTy - The type of the data vector that should be gathered or scattered.
4509/// VariableMask - The mask is non-constant at compile time.
4510/// Alignment - Alignment for one element.
4511/// AddressSpace - pointer[s] address space.
4512///
4513/// FIXME: Add TargetCostKind support.
4514InstructionCost X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
4515 bool VariableMask, Align Alignment,
4516 unsigned AddressSpace) {
4517 unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
4518 APInt DemandedElts = APInt::getAllOnesValue(VF);
4519 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4520
4521 InstructionCost MaskUnpackCost = 0;
4522 if (VariableMask) {
4523 auto *MaskTy =
4524 FixedVectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
4525 MaskUnpackCost =
4526 getScalarizationOverhead(MaskTy, DemandedElts, false, true);
4527 InstructionCost ScalarCompareCost = getCmpSelInstrCost(
4528 Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()), nullptr,
4529 CmpInst::BAD_ICMP_PREDICATE, CostKind);
4530 InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
4531 MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
4532 }
4533
4534 // The cost of the scalar loads/stores.
4535 InstructionCost MemoryOpCost =
4536 VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
4537 MaybeAlign(Alignment), AddressSpace, CostKind);
4538
4539 InstructionCost InsertExtractCost = 0;
4540 if (Opcode == Instruction::Load)
4541 for (unsigned i = 0; i < VF; ++i)
4542 // Add the cost of inserting each scalar load into the vector
4543 InsertExtractCost +=
4544 getVectorInstrCost(Instruction::InsertElement, SrcVTy, i);
4545 else
4546 for (unsigned i = 0; i < VF; ++i)
4547 // Add the cost of extracting each element out of the data vector
4548 InsertExtractCost +=
4549 getVectorInstrCost(Instruction::ExtractElement, SrcVTy, i);
4550
4551 return MemoryOpCost + MaskUnpackCost + InsertExtractCost;
4552}
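A rough sketch of the scalarized total above, with placeholder per-lane costs: a variable mask adds a mask-lane extract plus a compare and a branch per lane, and every lane also pays a scalar load/store plus an insert (for loads) or extract (for stores).

static int gsScalarCost(unsigned VF, bool VariableMask, int ScalarMemOpCost,
                        int ScalarCmpCost, int BranchCost,
                        int InsertOrExtractCost, int MaskLaneCost) {
  int MaskUnpackCost =
      VariableMask
          ? static_cast<int>(VF) * (MaskLaneCost + BranchCost + ScalarCmpCost)
          : 0;
  int MemoryOpCost = static_cast<int>(VF) * ScalarMemOpCost;
  int InsertExtractCost = static_cast<int>(VF) * InsertOrExtractCost;
  return MemoryOpCost + MaskUnpackCost + InsertExtractCost;
}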
4553
4554/// Calculate the cost of Gather / Scatter operation
4555InstructionCost X86TTIImpl::getGatherScatterOpCost(
4556 unsigned Opcode, Type *SrcVTy, const Value *Ptr, bool VariableMask,
4557 Align Alignment, TTI::TargetCostKind CostKind,
4558 const Instruction *I = nullptr) {
4559 if (CostKind != TTI::TCK_RecipThroughput) {
4560 if ((Opcode == Instruction::Load &&
4561 isLegalMaskedGather(SrcVTy, Align(Alignment))) ||
4562 (Opcode == Instruction::Store &&
4563 isLegalMaskedScatter(SrcVTy, Align(Alignment))))
4564 return 1;
4565 return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask,
4566 Alignment, CostKind, I);
4567 }
4568
4569 assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
4570 unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
4571 PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
4572 if (!PtrTy && Ptr->getType()->isVectorTy())
4573 PtrTy = dyn_cast<PointerType>(
4574 cast<VectorType>(Ptr->getType())->getElementType());
4575 assert(PtrTy && "Unexpected type for Ptr argument");
4576 unsigned AddressSpace = PtrTy->getAddressSpace();
4577
4578 bool Scalarize = false;
4579 if ((Opcode == Instruction::Load &&
4580 !isLegalMaskedGather(SrcVTy, Align(Alignment))) ||
4581 (Opcode == Instruction::Store &&
4582 !isLegalMaskedScatter(SrcVTy, Align(Alignment))))
4583 Scalarize = true;
4584 // Gather / Scatter for vectors of 2 elements is not profitable on KNL / SKX.
4585 // A 4-element gather/scatter instruction does not exist on KNL.
4586 // We could extend it to 8 elements, but zeroing the upper bits of
4587 // the mask vector will add more instructions. Right now we give the scalar
4588 // cost of vector-4 for KNL. TODO: Check whether the gather/scatter instruction
4589 // is better in the VariableMask case.
4590 if (ST->hasAVX512() && (VF == 2 || (VF == 4 && !ST->hasVLX())))
4591 Scalarize = true;
4592
4593 if (Scalarize)
4594 return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment,
4595 AddressSpace);
4596
4597 return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
4598}
4599
4600bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
4601 TargetTransformInfo::LSRCost &C2) {
4602 // X86 specific here are "instruction number 1st priority".
4603 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
4604 C1.NumIVMuls, C1.NumBaseAdds,
4605 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
4606 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
4607 C2.NumIVMuls, C2.NumBaseAdds,
4608 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
4609}
4610
4611bool X86TTIImpl::canMacroFuseCmp() {
4612 return ST->hasMacroFusion() || ST->hasBranchFusion();
4613}
4614
4615bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
4616 if (!ST->hasAVX())
4617 return false;
4618
4619 // The backend can't handle a single element vector.
4620 if (isa<VectorType>(DataTy) &&
4621 cast<FixedVectorType>(DataTy)->getNumElements() == 1)
4622 return false;
4623 Type *ScalarTy = DataTy->getScalarType();
4624
4625 if (ScalarTy->isPointerTy())
4626 return true;
4627
4628 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
4629 return true;
4630
4631 if (!ScalarTy->isIntegerTy())
4632 return false;
4633
4634 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
4635 return IntWidth == 32 || IntWidth == 64 ||
4636 ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
4637}
4638
4639bool X86TTIImpl::isLegalMaskedStore(Type *DataType, Align Alignment) {
4640 return isLegalMaskedLoad(DataType, Alignment);
4641}
4642
4643bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) {
4644 unsigned DataSize = DL.getTypeStoreSize(DataType);
4645 // The only supported nontemporal loads are for aligned vectors of 16 or 32
4646 // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2
4647 // (the equivalent stores only require AVX).
4648 if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
4649 return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2();
4650
4651 return false;
4652}
4653
4654bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) {
4655 unsigned DataSize = DL.getTypeStoreSize(DataType);
4656
4657 // SSE4A supports nontemporal stores of float and double at arbitrary
4658 // alignment.
4659 if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
4660 return true;
4661
4662 // Besides the SSE4A subtarget exception above, only aligned stores are
4663 // available nontemporally on any other subtarget. And only stores with a size
4664 // of 4..32 bytes (powers of 2 only) are permitted.
4665 if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
4666 !isPowerOf2_32(DataSize))
4667 return false;
4668
4669 // 32-byte vector nontemporal stores are supported by AVX (the equivalent
4670 // loads require AVX2).
4671 if (DataSize == 32)
4672 return ST->hasAVX();
4673 else if (DataSize == 16)
4674 return ST->hasSSE1();
4675 return true;
4676}
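Apart from the SSE4A float/double exception, the size and alignment requirements above reduce to: aligned at least to the store size, 4 to 32 bytes, and a power of two. A standalone version of that predicate, leaving out the per-ISA checks:

static bool ntStoreSizeAndAlignOK(unsigned DataSize, unsigned Alignment) {
  auto IsPowerOf2 = [](unsigned V) { return V != 0 && (V & (V - 1)) == 0; };
  return Alignment >= DataSize && DataSize >= 4 && DataSize <= 32 &&
         IsPowerOf2(DataSize);
}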
4677
4678bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) {
4679 if (!isa<VectorType>(DataTy))
4680 return false;
4681
4682 if (!ST->hasAVX512())
4683 return false;
4684
4685 // The backend can't handle a single element vector.
4686 if (cast<FixedVectorType>(DataTy)->getNumElements() == 1)
4687 return false;
4688
4689 Type *ScalarTy = cast<VectorType>(DataTy)->getElementType();
4690
4691 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
4692 return true;
4693
4694 if (!ScalarTy->isIntegerTy())
4695 return false;
4696
4697 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
4698 return IntWidth == 32 || IntWidth == 64 ||
4699 ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
4700}
4701
4702bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy) {
4703 return isLegalMaskedExpandLoad(DataTy);
4704}
4705
4706bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
4707 // Some CPUs have better gather performance than others.
4708 // TODO: Remove the explicit ST->hasAVX512()? That would mean we would only
4709 // enable gather with a -march.
4710 if (!(ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2())))
4711 return false;
4712
4713 // This function is currently called in two cases: from the Loop Vectorizer
4714 // and from the Scalarizer.
4715 // When the Loop Vectorizer asks about legality of the feature,
4716 // the vectorization factor is not calculated yet. The Loop Vectorizer
4717 // sends a scalar type and the decision is based on the width of the
4718 // scalar element.
4719 // Later on, the cost model will estimate usage of this intrinsic based on
4720 // the vector type.
4721 // The Scalarizer asks again about legality. It sends a vector type.
4722 // In this case we can reject non-power-of-2 vectors.
4723 // We also reject single element vectors as the type legalizer can't
4724 // scalarize it.
4725 if (auto *DataVTy = dyn_cast<FixedVectorType>(DataTy)) {
4726 unsigned NumElts = DataVTy->getNumElements();
4727 if (NumElts == 1)
4728 return false;
4729 }
4730 Type *ScalarTy = DataTy->getScalarType();
4731 if (ScalarTy->isPointerTy())
4732 return true;
4733
4734 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
4735 return true;
4736
4737 if (!ScalarTy->isIntegerTy())
4738 return false;
4739
4740 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
4741 return IntWidth == 32 || IntWidth == 64;
4742}
4743
4744bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) {
4745 // AVX2 doesn't support scatter
4746 if (!ST->hasAVX512())
4747 return false;
4748 return isLegalMaskedGather(DataType, Alignment);
4749}
4750
4751bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
4752 EVT VT = TLI->getValueType(DL, DataType);
4753 return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
4754}
4755
4756bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
4757 return false;
4758}
4759
4760bool X86TTIImpl::areInlineCompatible(const Function *Caller,
4761 const Function *Callee) const {
4762 const TargetMachine &TM = getTLI()->getTargetMachine();
4763
4764 // Work this as a subsetting of subtarget features.
4765 const FeatureBitset &CallerBits =
4766 TM.getSubtargetImpl(*Caller)->getFeatureBits();
4767 const FeatureBitset &CalleeBits =
4768 TM.getSubtargetImpl(*Callee)->getFeatureBits();
4769
4770 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
4771 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
4772 return (RealCallerBits & RealCalleeBits) == RealCalleeBits;
4773}
4774
4775bool X86TTIImpl::areFunctionArgsABICompatible(
4776 const Function *Caller, const Function *Callee,
4777 SmallPtrSetImpl<Argument *> &Args) const {
4778 if (!BaseT::areFunctionArgsABICompatible(Caller, Callee, Args))
4779 return false;
4780
4781 // If we get here, we know the target features match. If one function
4782 // considers 512-bit vectors legal and the other does not, consider them
4783 // incompatible.
4784 const TargetMachine &TM = getTLI()->getTargetMachine();
4785
4786 if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
4787 TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs())
4788 return true;
4789
4790 // Consider the arguments compatible if they aren't vectors or aggregates.
4791 // FIXME: Look at the size of vectors.
4792 // FIXME: Look at the element types of aggregates to see if there are vectors.
4793 // FIXME: The API of this function seems intended to allow arguments
4794 // to be removed from the set, but the caller doesn't check if the set
4795 // becomes empty so that may not work in practice.
4796 return llvm::none_of(Args, [](Argument *A) {
4797 auto *EltTy = cast<PointerType>(A->getType())->getElementType();
4798 return EltTy->isVectorTy() || EltTy->isAggregateType();
4799 });
4800}
4801
4802X86TTIImpl::TTI::MemCmpExpansionOptions
4803X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
4804 TTI::MemCmpExpansionOptions Options;
4805 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
4806 Options.NumLoadsPerBlock = 2;
4807 // All GPR and vector loads can be unaligned.
4808 Options.AllowOverlappingLoads = true;
4809 if (IsZeroCmp) {
4810 // Only enable vector loads for equality comparison. Right now the vector
4811 // version is not as fast for three way compare (see #33329).
4812 const unsigned PreferredWidth = ST->getPreferVectorWidth();
4813 if (PreferredWidth >= 512 && ST->hasAVX512()) Options.LoadSizes.push_back(64);
4814 if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32);
4815 if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
4816 }
4817 if (ST->is64Bit()) {
4818 Options.LoadSizes.push_back(8);
4819 }
4820 Options.LoadSizes.push_back(4);
4821 Options.LoadSizes.push_back(2);
4822 Options.LoadSizes.push_back(1);
4823 return Options;
4824}
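A sketch of how the LoadSizes list built above (largest size first) is typically consumed when expanding a memcmp of N bytes; since AllowOverlappingLoads is set, the real expansion may also use overlapping loads, which can reduce the count further. The 64/32/16 entries assume AVX-512 with a 512-bit preferred vector width and 64-bit mode.

#include <cstdio>
#include <vector>

int main() {
  std::vector<unsigned> LoadSizes = {64, 32, 16, 8, 4, 2, 1};
  unsigned Remaining = 100, NumLoads = 0;
  for (unsigned Size : LoadSizes)
    while (Remaining >= Size) {
      Remaining -= Size;
      ++NumLoads;
    }
  std::printf("%u loads\n", NumLoads); // 64 + 32 + 4 bytes -> 3 loads per side
}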
4825
4826bool X86TTIImpl::enableInterleavedAccessVectorization() {
4827 // TODO: We expect this to be beneficial regardless of arch,
4828 // but there are currently some unexplained performance artifacts on Atom.
4829 // As a temporary solution, disable on Atom.
4830 return !(ST->isAtom());
4831}
4832
4833// Get estimation for interleaved load/store operations for AVX2.
4834// \p Factor is the interleaved-access factor (stride) - number of
4835// (interleaved) elements in the group.
4836// \p Indices contains the indices for a strided load: when the
4837// interleaved load has gaps they indicate which elements are used.
4838// If Indices is empty (or if the number of indices is equal to the size
4839// of the interleaved-access as given in \p Factor) the access has no gaps.
4840//
4841// As opposed to AVX-512, AVX2 does not have generic shuffles that allow
4842// computing the cost using a generic formula as a function of generic
4843// shuffles. We therefore use a lookup table instead, filled according to
4844// the instruction sequences that codegen currently generates.
4845InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX2(
4846 unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
4847 ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
4848 TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
4849
4850 if (UseMaskForCond || UseMaskForGaps)
4
Assuming 'UseMaskForCond' is false
5
Assuming 'UseMaskForGaps' is false
6
Taking false branch
4851 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
4852 Alignment, AddressSpace, CostKind,
4853 UseMaskForCond, UseMaskForGaps);
4854
4855 // We currently support only fully-interleaved groups, with no gaps.
4856 // TODO: Support also strided loads (interleaved-groups with gaps).
4857 if (Indices.size() && Indices.size() != Factor)
7
Assuming the condition is true
8
Assuming the condition is true
9
Taking true branch
4858 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
10
Calling 'BasicTTIImplBase::getInterleavedMemoryOpCost'
4859 Alignment, AddressSpace, CostKind);
4860
4861 // VecTy for interleave memop is <VF*Factor x Elt>.
4862 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
4863 // VecTy = <12 x i32>.
4864 MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
4865
4866 // This function can be called with VecTy=<6xi128>, Factor=3, in which case
4867 // the VF=2, while v2i128 is an unsupported MVT vector type
4868 // (see MachineValueType.h::getVectorVT()).
4869 if (!LegalVT.isVector())
4870 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
4871 Alignment, AddressSpace, CostKind);
4872
4873 unsigned VF = VecTy->getNumElements() / Factor;
4874 Type *ScalarTy = VecTy->getElementType();
4875 // Deduplicate entries, model floats/pointers as appropriately-sized integers.
4876 if (!ScalarTy->isIntegerTy())
4877 ScalarTy =
4878 Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy));
4879
4880 // Get the cost of all the memory operations.
4881 InstructionCost MemOpCosts = getMemoryOpCost(
4882 Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, CostKind);
4883
4884 auto *VT = FixedVectorType::get(ScalarTy, VF);
4885 EVT ETy = TLI->getValueType(DL, VT);
4886 if (!ETy.isSimple())
4887 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
4888 Alignment, AddressSpace, CostKind);
4889
4890 // TODO: Complete for other data-types and strides.
4891 // Each combination of Stride, element bit width and VF results in a different
4892 // sequence; The cost tables are therefore accessed with:
4893 // Factor (stride) and VectorType=VFxiN.
4894 // The Cost accounts only for the shuffle sequence;
4895 // The cost of the loads/stores is accounted for separately.
4896 //
4897 static const CostTblEntry AVX2InterleavedLoadTbl[] = {
4898 {2, MVT::v4i64, 6}, // (load 8i64 and) deinterleave into 2 x 4i64
4899
4900 {3, MVT::v2i8, 10}, // (load 6i8 and) deinterleave into 3 x 2i8
4901 {3, MVT::v4i8, 4}, // (load 12i8 and) deinterleave into 3 x 4i8
4902 {3, MVT::v8i8, 9}, // (load 24i8 and) deinterleave into 3 x 8i8
4903 {3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8
4904 {3, MVT::v32i8, 13}, // (load 96i8 and) deinterleave into 3 x 32i8
4905
4906 {3, MVT::v8i32, 17}, // (load 24i32 and) deinterleave into 3 x 8i32
4907
4908 {4, MVT::v2i8, 12}, // (load 8i8 and) deinterleave into 4 x 2i8
4909 {4, MVT::v4i8, 4}, // (load 16i8 and) deinterleave into 4 x 4i8
4910 {4, MVT::v8i8, 20}, // (load 32i8 and) deinterleave into 4 x 8i8
4911 {4, MVT::v16i8, 39}, // (load 64i8 and) deinterleave into 4 x 16i8
4912 {4, MVT::v32i8, 80}, // (load 128i8 and) deinterleave into 4 x 32i8
4913
4914 {8, MVT::v8i32, 40} // (load 64i32 and) deinterleave into 8 x 8i32
4915 };
4916
4917 static const CostTblEntry AVX2InterleavedStoreTbl[] = {
4918 {2, MVT::v4i64, 6}, // interleave 2 x 4i64 into 8i64 (and store)
4919
4920 {3, MVT::v2i8, 7}, // interleave 3 x 2i8 into 6i8 (and store)
4921 {3, MVT::v4i8, 8}, // interleave 3 x 4i8 into 12i8 (and store)
4922 {3, MVT::v8i8, 11}, // interleave 3 x 8i8 into 24i8 (and store)
4923 {3, MVT::v16i8, 11}, // interleave 3 x 16i8 into 48i8 (and store)
4924 {3, MVT::v32i8, 13}, // interleave 3 x 32i8 into 96i8 (and store)
4925
4926 {4, MVT::v2i8, 12}, // interleave 4 x 2i8 into 8i8 (and store)
4927 {4, MVT::v4i8, 9}, // interleave 4 x 4i8 into 16i8 (and store)
4928 {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store)
4929 {4, MVT::v16i8, 10}, // interleave 4 x 16i8 into 64i8 (and store)
4930 {4, MVT::v32i8, 12} // interleave 4 x 32i8 into 128i8 (and store)
4931 };
4932
4933 if (Opcode == Instruction::Load) {
4934 if (const auto *Entry =
4935 CostTableLookup(AVX2InterleavedLoadTbl, Factor, ETy.getSimpleVT()))
4936 return MemOpCosts + Entry->Cost;
4937 } else {
4938 assert(Opcode == Instruction::Store &&
4939        "Expected Store Instruction at this point");
4940 if (const auto *Entry =
4941 CostTableLookup(AVX2InterleavedStoreTbl, Factor, ETy.getSimpleVT()))
4942 return MemOpCosts + Entry->Cost;
4943 }
4944
4945 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
4946 Alignment, AddressSpace, CostKind);
4947}
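A worked example may help connect the table lookup above to the returned cost. The following standalone sketch (illustrative values only; it does not use the LLVM API) reproduces the arithmetic for a Factor=4 interleaved load of v16i8, assuming the memory-op cost of the wide load has already been computed as 2:

#include <cstdio>

int main() {
  // Mirrors the table entry {4, MVT::v16i8, 39} from AVX2InterleavedLoadTbl.
  unsigned TableShuffleCost = 39;
  // Assumed result of getMemoryOpCost for the wide load; the value is illustrative.
  unsigned MemOpCosts = 2;
  // The AVX2 path returns MemOpCosts + Entry->Cost when the lookup succeeds.
  unsigned Total = MemOpCosts + TableShuffleCost;
  std::printf("AVX2 interleaved load cost (Factor=4, v16i8) = %u\n", Total); // prints 41
  return 0;
}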
4948
4949// Get estimation for interleaved load/store operations and strided load.
4950// \p Indices contains indices for strided load.
4951// \p Factor - the factor of interleaving.
4952// AVX-512 provides 3-src shuffles that significantly reduce the cost.
4953InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
4954 unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
4955 ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
4956 TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
4957
4958 if (UseMaskForCond || UseMaskForGaps)
4959 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
4960 Alignment, AddressSpace, CostKind,
4961 UseMaskForCond, UseMaskForGaps);
4962
4963 // VecTy for interleave memop is <VF*Factor x Elt>.
4964 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
4965 // VecTy = <12 x i32>.
4966
4967 // Calculate the number of memory operations (NumOfMemOps), required
4968 // for load/store the VecTy.
4969 MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
4970 unsigned VecTySize = DL.getTypeStoreSize(VecTy);
4971 unsigned LegalVTSize = LegalVT.getStoreSize();
4972 unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
4973
4974 // Get the cost of one memory operation.
4975 auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
4976 LegalVT.getVectorNumElements());
4977 InstructionCost MemOpCost = getMemoryOpCost(
4978 Opcode, SingleMemOpTy, MaybeAlign(Alignment), AddressSpace, CostKind);
4979
4980 unsigned VF = VecTy->getNumElements() / Factor;
4981 MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);
4982
4983 if (Opcode == Instruction::Load) {
4984 // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
4985 // contain the cost of the optimized shuffle sequence that the
4986 // X86InterleavedAccess pass will generate.
4987 // The cost of the loads and stores is computed separately from the table.
4988
4989 // X86InterleavedAccess supports only the following interleaved-access groups.
4990 static const CostTblEntry AVX512InterleavedLoadTbl[] = {
4991 {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8
4992 {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8
4993 {3, MVT::v64i8, 22}, //(load 192i8 and) deinterleave into 3 x 64i8
4994 };
4995
4996 if (const auto *Entry =
4997 CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
4998 return NumOfMemOps * MemOpCost + Entry->Cost;
4999 // If an entry does not exist, fall back to the default implementation.
5000
5001 // The kind of shuffle depends on the number of loaded values.
5002 // If we load the entire data in one register, we can use a 1-src shuffle.
5003 // Otherwise, we'll merge 2 sources in each operation.
5004 TTI::ShuffleKind ShuffleKind =
5005 (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
5006
5007 InstructionCost ShuffleCost =
5008 getShuffleCost(ShuffleKind, SingleMemOpTy, None, 0, nullptr);
5009
5010 unsigned NumOfLoadsInInterleaveGrp =
5011 Indices.size() ? Indices.size() : Factor;
5012 auto *ResultTy = FixedVectorType::get(VecTy->getElementType(),
5013 VecTy->getNumElements() / Factor);
5014 InstructionCost NumOfResults =
5015 getTLI()->getTypeLegalizationCost(DL, ResultTy).first *
5016 NumOfLoadsInInterleaveGrp;
5017
5018 // About half of the loads may be folded into shuffles when we have only
5019 // one result. If we have more than one result, we do not fold loads at all.
5020 unsigned NumOfUnfoldedLoads =
5021 NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
5022
5023 // Get a number of shuffle operations per result.
5024 unsigned NumOfShufflesPerResult =
5025 std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
5026
5027 // The SK_PermuteTwoSrc shuffle clobbers one of the source operands.
5028 // When we have more than one destination, we need additional instructions
5029 // to keep sources.
5030 InstructionCost NumOfMoves = 0;
5031 if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
5032 NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
5033
5034 InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
5035 NumOfUnfoldedLoads * MemOpCost + NumOfMoves;
5036
5037 return Cost;
5038 }
5039
5040 // Store.
5041 assert(Opcode == Instruction::Store &&
5042        "Expected Store Instruction at this point");
5043 // X86InterleavedAccess supports only the following interleaved-access groups.
5044 static const CostTblEntry AVX512InterleavedStoreTbl[] = {
5045 {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
5046 {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
5047 {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 192i8 (and store)
5048
5049 {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store)
5050 {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
5051 {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
5052 {4, MVT::v64i8, 24} // interleave 4 x 64i8 into 256i8 (and store)
5053 };
5054
5055 if (const auto *Entry =
5056 CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
5057 return NumOfMemOps * MemOpCost + Entry->Cost;
5058 // If an entry does not exist, fall back to the default implementation.
5059
5060 // There are no strided stores at the moment, and a store can't be folded
5061 // into a shuffle.
5062 unsigned NumOfSources = Factor; // The number of values to be merged.
5063 InstructionCost ShuffleCost =
5064 getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, None, 0, nullptr);
5065 unsigned NumOfShufflesPerStore = NumOfSources - 1;
5066
5067 // The SK_PermuteTwoSrc shuffle clobbers one of the source operands.
5068 // We need additional instructions to keep sources.
5069 unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
5070 InstructionCost Cost =
5071 NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
5072 NumOfMoves;
5073 return Cost;
5074}
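To illustrate the store fall-back formula above, here is a small self-contained sketch with assumed inputs (two legal-width stores, unit memory and shuffle costs); the numbers are placeholders rather than values LLVM would actually compute:

#include <cstdio>

int main() {
  unsigned NumOfMemOps = 2;   // assumed: VecTy legalizes to two stores
  unsigned MemOpCost   = 1;   // assumed cost of one legal-width store
  unsigned ShuffleCost = 1;   // assumed cost of one SK_PermuteTwoSrc shuffle
  unsigned Factor      = 3;   // number of sources being interleaved

  unsigned NumOfShufflesPerStore = Factor - 1;                   // NumOfSources - 1
  unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2; // copies to preserve sources
  unsigned Cost = NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
                  NumOfMoves;
  std::printf("AVX-512 store fallback cost = %u\n", Cost); // prints 8
  return 0;
}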
5075
5076InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
5077 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
5078 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
5079 bool UseMaskForCond, bool UseMaskForGaps) {
5080 auto isSupportedOnAVX512 = [](Type *VecTy, bool HasBW) {
5081 Type *EltTy = cast<VectorType>(VecTy)->getElementType();
5082 if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
5083 EltTy->isIntegerTy(32) || EltTy->isPointerTy())
5084 return true;
5085 if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8))
5086 return HasBW;
5087 return false;
5088 };
5089 if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI()))
5090 return getInterleavedMemoryOpCostAVX512(
5091 Opcode, cast<FixedVectorType>(VecTy), Factor, Indices, Alignment,
5092 AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
5093 if (ST->hasAVX2())
1. Taking true branch
5094 return getInterleavedMemoryOpCostAVX2(
3. Calling 'X86TTIImpl::getInterleavedMemoryOpCostAVX2'
5095 Opcode, cast<FixedVectorType>(VecTy), Factor, Indices, Alignment,
2. 'VecTy' is a 'FixedVectorType'
5096 AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
5097
5098 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
5099 Alignment, AddressSpace, CostKind,
5100 UseMaskForCond, UseMaskForGaps);
5101}
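The dispatch above hinges on the element-type predicate captured by the isSupportedOnAVX512 lambda. The sketch below restates that predicate with a plain enum standing in for LLVM's Type; the enum and helper names are illustrative, not part of the LLVM API:

#include <cstdio>

enum class Elt { F32, F64, I64, I32, Ptr, I16, I8, Other };

// Same shape as the lambda: 32/64-bit elements (and pointers) always qualify,
// while 8/16-bit elements additionally require AVX-512BW.
static bool supportedOnAVX512(Elt E, bool HasBWI) {
  if (E == Elt::F32 || E == Elt::F64 || E == Elt::I64 || E == Elt::I32 || E == Elt::Ptr)
    return true;
  if (E == Elt::I16 || E == Elt::I8)
    return HasBWI;
  return false;
}

int main() {
  std::printf("i32, no BWI : %d\n", supportedOnAVX512(Elt::I32, false)); // 1 -> AVX-512 path
  std::printf("i8,  no BWI : %d\n", supportedOnAVX512(Elt::I8, false));  // 0 -> AVX2/base path
  std::printf("i8,  BWI    : %d\n", supportedOnAVX512(Elt::I8, true));   // 1 -> AVX-512 path
  return 0;
}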

/build/llvm-toolchain-snapshot-13~++20210705111146+4aaf87875039/llvm/include/llvm/CodeGen/BasicTTIImpl.h

1//===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This file provides a helper that implements much of the TTI interface in
11/// terms of the target-independent code generator and TargetLowering
12/// interfaces.
13//
14//===----------------------------------------------------------------------===//
15
16#ifndef LLVM_CODEGEN_BASICTTIIMPL_H
17#define LLVM_CODEGEN_BASICTTIIMPL_H
18
19#include "llvm/ADT/APInt.h"
20#include "llvm/ADT/ArrayRef.h"
21#include "llvm/ADT/BitVector.h"
22#include "llvm/ADT/SmallPtrSet.h"
23#include "llvm/ADT/SmallVector.h"
24#include "llvm/Analysis/LoopInfo.h"
25#include "llvm/Analysis/TargetTransformInfo.h"
26#include "llvm/Analysis/TargetTransformInfoImpl.h"
27#include "llvm/CodeGen/ISDOpcodes.h"
28#include "llvm/CodeGen/TargetLowering.h"
29#include "llvm/CodeGen/TargetSubtargetInfo.h"
30#include "llvm/CodeGen/ValueTypes.h"
31#include "llvm/IR/BasicBlock.h"
32#include "llvm/IR/Constant.h"
33#include "llvm/IR/Constants.h"
34#include "llvm/IR/DataLayout.h"
35#include "llvm/IR/DerivedTypes.h"
36#include "llvm/IR/InstrTypes.h"
37#include "llvm/IR/Instruction.h"
38#include "llvm/IR/Instructions.h"
39#include "llvm/IR/Intrinsics.h"
40#include "llvm/IR/Operator.h"
41#include "llvm/IR/Type.h"
42#include "llvm/IR/Value.h"
43#include "llvm/Support/Casting.h"
44#include "llvm/Support/CommandLine.h"
45#include "llvm/Support/ErrorHandling.h"
46#include "llvm/Support/MachineValueType.h"
47#include "llvm/Support/MathExtras.h"
48#include "llvm/Target/TargetMachine.h"
49#include <algorithm>
50#include <cassert>
51#include <cstdint>
52#include <limits>
53#include <utility>
54
55namespace llvm {
56
57class Function;
58class GlobalValue;
59class LLVMContext;
60class ScalarEvolution;
61class SCEV;
62class TargetMachine;
63
64extern cl::opt<unsigned> PartialUnrollingThreshold;
65
66/// Base class which can be used to help build a TTI implementation.
67///
68/// This class provides as much implementation of the TTI interface as is
69/// possible using the target independent parts of the code generator.
70///
71/// In order to subclass it, your class must implement a getST() method to
72/// return the subtarget, and a getTLI() method to return the target lowering.
73/// We need these methods implemented in the derived class so that this class
74/// doesn't have to duplicate storage for them.
75template <typename T>
76class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
77private:
78 using BaseT = TargetTransformInfoImplCRTPBase<T>;
79 using TTI = TargetTransformInfo;
80
81 /// Helper function to access this as a T.
82 T *thisT() { return static_cast<T *>(this); }
83
84 /// Estimate a cost of Broadcast as an extract and sequence of insert
85 /// operations.
86 InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy) {
87 InstructionCost Cost = 0;
88 // Broadcast cost is equal to the cost of extracting the zero'th element
89 // plus the cost of inserting it into every element of the result vector.
90 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, 0);
91
92 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
93 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i);
94 }
95 return Cost;
96 }
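The broadcast overhead therefore has a simple closed form: one extract of lane 0 plus one insert per result lane. A minimal sketch, assuming unit costs for extract and insert (real targets return different per-lane costs):

#include <cstdio>

int main() {
  unsigned NumElts     = 8; // assumed <8 x T> destination vector
  unsigned ExtractCost = 1; // assumed cost of extracting element 0
  unsigned InsertCost  = 1; // assumed cost of a single insertelement
  // Broadcast = extract lane 0 once, then insert it into every result lane.
  unsigned Cost = ExtractCost + NumElts * InsertCost;
  std::printf("broadcast shuffle overhead = %u\n", Cost); // prints 9
  return 0;
}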
97
98 /// Estimate a cost of shuffle as a sequence of extract and insert
99 /// operations.
100 InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy) {
101 InstructionCost Cost = 0;
102 // Shuffle cost is equal to the cost of extracting elements from the arguments
103 // plus the cost of inserting them into the result vector.
104
105 // e.g. <4 x float> with a mask of <0,5,2,7>: we need to extract from
106 // index 0 of the first vector, index 1 of the second vector, index 2 of the
107 // first vector and finally index 3 of the second vector, and insert them at
108 // indices <0,1,2,3> of the result vector.
109 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
110 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i);
111 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, i);
112 }
113 return Cost;
114 }
115
116 /// Estimate a cost of subvector extraction as a sequence of extract and
117 /// insert operations.
118 InstructionCost getExtractSubvectorOverhead(VectorType *VTy, int Index,
119 FixedVectorType *SubVTy) {
120 assert(VTy && SubVTy &&
121        "Can only extract subvectors from vectors");
122 int NumSubElts = SubVTy->getNumElements();
123 assert((!isa<FixedVectorType>(VTy) ||
124         (Index + NumSubElts) <=
125             (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
126        "SK_ExtractSubvector index out of range");
127
128 InstructionCost Cost = 0;
129 // Subvector extraction cost is equal to the cost of extracting element from
130 // the source type plus the cost of inserting them into the result vector
131 // type.
132 for (int i = 0; i != NumSubElts; ++i) {
133 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
134 i + Index);
135 Cost +=
136 thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy, i);
137 }
138 return Cost;
139 }
140
141 /// Estimate a cost of subvector insertion as a sequence of extract and
142 /// insert operations.
143 InstructionCost getInsertSubvectorOverhead(VectorType *VTy, int Index,
144 FixedVectorType *SubVTy) {
145 assert(VTy && SubVTy &&
146        "Can only insert subvectors into vectors");
147 int NumSubElts = SubVTy->getNumElements();
148 assert((!isa<FixedVectorType>(VTy) ||
149         (Index + NumSubElts) <=
150             (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
151        "SK_InsertSubvector index out of range");
152
153 InstructionCost Cost = 0;
154 // Subvector insertion cost is equal to the cost of extracting element from
155 // the source type plus the cost of inserting them into the result vector
156 // type.
157 for (int i = 0; i != NumSubElts; ++i) {
158 Cost +=
159 thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy, i);
160 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
161 i + Index);
162 }
163 return Cost;
164 }
165
166 /// Local query method delegates up to T which *must* implement this!
167 const TargetSubtargetInfo *getST() const {
168 return static_cast<const T *>(this)->getST();
169 }
170
171 /// Local query method delegates up to T which *must* implement this!
172 const TargetLoweringBase *getTLI() const {
173 return static_cast<const T *>(this)->getTLI();
174 }
175
176 static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) {
177 switch (M) {
178 case TTI::MIM_Unindexed:
179 return ISD::UNINDEXED;
180 case TTI::MIM_PreInc:
181 return ISD::PRE_INC;
182 case TTI::MIM_PreDec:
183 return ISD::PRE_DEC;
184 case TTI::MIM_PostInc:
185 return ISD::POST_INC;
186 case TTI::MIM_PostDec:
187 return ISD::POST_DEC;
188 }
189 llvm_unreachable("Unexpected MemIndexedMode");
190 }
191
192 InstructionCost getCommonMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
193 Align Alignment,
194 bool VariableMask,
195 bool IsGatherScatter,
196 TTI::TargetCostKind CostKind) {
197 auto *VT = cast<FixedVectorType>(DataTy);
198 // Assume the target does not have support for gather/scatter operations
199 // and provide a rough estimate.
200 //
201 // First, compute the cost of the individual memory operations.
202 InstructionCost AddrExtractCost =
203 IsGatherScatter
204 ? getVectorInstrCost(Instruction::ExtractElement,
205 FixedVectorType::get(
206 PointerType::get(VT->getElementType(), 0),
207 VT->getNumElements()),
208 -1)
209 : 0;
210 InstructionCost LoadCost =
211 VT->getNumElements() *
212 (AddrExtractCost +
213 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind));
214
215 // Next, compute the cost of packing the result in a vector.
216 InstructionCost PackingCost = getScalarizationOverhead(
217 VT, Opcode != Instruction::Store, Opcode == Instruction::Store);
218
219 InstructionCost ConditionalCost = 0;
220 if (VariableMask) {
221 // Compute the cost of conditionally executing the memory operations with
222 // variable masks. This includes extracting the individual conditions, the
223 // branches, and the PHIs that combine the results.
224 // NOTE: Estimating the cost of conditionally executing the memory
225 // operations accurately is quite difficult and the current solution
226 // provides a very rough estimate only.
227 ConditionalCost =
228 VT->getNumElements() *
229 (getVectorInstrCost(
230 Instruction::ExtractElement,
231 FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()),
232 VT->getNumElements()),
233 -1) +
234 getCFInstrCost(Instruction::Br, CostKind) +
235 getCFInstrCost(Instruction::PHI, CostKind));
236 }
237
238 return LoadCost + PackingCost + ConditionalCost;
239 }
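Put together, the estimate is LoadCost + PackingCost + ConditionalCost. The sketch below walks the same decomposition for a variable-mask gather of four elements, with assumed per-lane costs (all placeholder values, not LLVM results):

#include <cstdio>

int main() {
  unsigned NumElts       = 4;    // assumed <4 x T> data vector
  unsigned AddrExtract   = 1;    // per-lane pointer extract (gather/scatter only)
  unsigned ScalarMemOp   = 1;    // per-lane scalar load
  unsigned PackingCost   = 4;    // assumed scalarization overhead to build the result
  bool     VariableMask  = true;
  unsigned MaskExtract   = 1, BranchCost = 1, PhiCost = 1; // per-lane control flow

  unsigned LoadCost        = NumElts * (AddrExtract + ScalarMemOp);
  unsigned ConditionalCost =
      VariableMask ? NumElts * (MaskExtract + BranchCost + PhiCost) : 0;
  unsigned Total = LoadCost + PackingCost + ConditionalCost;
  std::printf("rough gather estimate = %u\n", Total); // prints 24
  return 0;
}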
240
241protected:
242 explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
243 : BaseT(DL) {}
244 virtual ~BasicTTIImplBase() = default;
245
246 using TargetTransformInfoImplBase::DL;
247
248public:
249 /// \name Scalar TTI Implementations
250 /// @{
251 bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth,
252 unsigned AddressSpace, Align Alignment,
253 bool *Fast) const {
254 EVT E = EVT::getIntegerVT(Context, BitWidth);
255 return getTLI()->allowsMisalignedMemoryAccesses(
256 E, AddressSpace, Alignment, MachineMemOperand::MONone, Fast);
257 }
258
259 bool hasBranchDivergence() { return false; }
260
261 bool useGPUDivergenceAnalysis() { return false; }
262
263 bool isSourceOfDivergence(const Value *V) { return false; }
264
265 bool isAlwaysUniform(const Value *V) { return false; }
266
267 unsigned getFlatAddressSpace() {
268 // Return an invalid address space.
269 return -1;
270 }
271
272 bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
273 Intrinsic::ID IID) const {
274 return false;
275 }
276
277 bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
278 return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS);
279 }
280
281 unsigned getAssumedAddrSpace(const Value *V) const {
282 return getTLI()->getTargetMachine().getAssumedAddrSpace(V);
283 }
284
285 Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
286 Value *NewV) const {
287 return nullptr;
288 }
289
290 bool isLegalAddImmediate(int64_t imm) {
291 return getTLI()->isLegalAddImmediate(imm);
292 }
293
294 bool isLegalICmpImmediate(int64_t imm) {
295 return getTLI()->isLegalICmpImmediate(imm);
296 }
297
298 bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
299 bool HasBaseReg, int64_t Scale,
300 unsigned AddrSpace, Instruction *I = nullptr) {
301 TargetLoweringBase::AddrMode AM;
302 AM.BaseGV = BaseGV;
303 AM.BaseOffs = BaseOffset;
304 AM.HasBaseReg = HasBaseReg;
305 AM.Scale = Scale;
306 return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
307 }
308
309 bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty,
310 const DataLayout &DL) const {
311 EVT VT = getTLI()->getValueType(DL, Ty);
312 return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT);
313 }
314
315 bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty,
316 const DataLayout &DL) const {
317 EVT VT = getTLI()->getValueType(DL, Ty);
318 return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT);
319 }
320
321 bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2) {
322 return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
323 }
324
325 bool isNumRegsMajorCostOfLSR() {
326 return TargetTransformInfoImplBase::isNumRegsMajorCostOfLSR();
327 }
328
329 bool isProfitableLSRChainElement(Instruction *I) {
330 return TargetTransformInfoImplBase::isProfitableLSRChainElement(I);
331 }
332
333 InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
334 int64_t BaseOffset, bool HasBaseReg,
335 int64_t Scale, unsigned AddrSpace) {
336 TargetLoweringBase::AddrMode AM;
337 AM.BaseGV = BaseGV;
338 AM.BaseOffs = BaseOffset;
339 AM.HasBaseReg = HasBaseReg;
340 AM.Scale = Scale;
341 return getTLI()->getScalingFactorCost(DL, AM, Ty, AddrSpace);
342 }
343
344 bool isTruncateFree(Type *Ty1, Type *Ty2) {
345 return getTLI()->isTruncateFree(Ty1, Ty2);
346 }
347
348 bool isProfitableToHoist(Instruction *I) {
349 return getTLI()->isProfitableToHoist(I);
350 }
351
352 bool useAA() const { return getST()->useAA(); }
353
354 bool isTypeLegal(Type *Ty) {
355 EVT VT = getTLI()->getValueType(DL, Ty);
356 return getTLI()->isTypeLegal(VT);
357 }
358
359 InstructionCost getRegUsageForType(Type *Ty) {
360 InstructionCost Val = getTLI()->getTypeLegalizationCost(DL, Ty).first;
361 assert(Val >= 0 && "Negative cost!");
362 return Val;
363 }
364
365 InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr,
366 ArrayRef<const Value *> Operands) {
367 return BaseT::getGEPCost(PointeeType, Ptr, Operands);
368 }
369
370 unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
371 unsigned &JumpTableSize,
372 ProfileSummaryInfo *PSI,
373 BlockFrequencyInfo *BFI) {
374 /// Try to find the estimated number of clusters. Note that the number of
375 /// clusters identified in this function could be different from the actual
376 /// numbers found in lowering. This function ignores switches that are
377 /// lowered with a mix of jump table / bit test / BTree. This function was
378 /// initially intended to be used when estimating the cost of switch in
379 /// inline cost heuristic, but it's a generic cost model to be used in other
380 /// places (e.g., in loop unrolling).
381 unsigned N = SI.getNumCases();
382 const TargetLoweringBase *TLI = getTLI();
383 const DataLayout &DL = this->getDataLayout();
384
385 JumpTableSize = 0;
386 bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());
387
388 // Early exit if both a jump table and bit test are not allowed.
389 if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N))
390 return N;
391
392 APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
393 APInt MinCaseVal = MaxCaseVal;
394 for (auto CI : SI.cases()) {
395 const APInt &CaseVal = CI.getCaseValue()->getValue();
396 if (CaseVal.sgt(MaxCaseVal))
397 MaxCaseVal = CaseVal;
398 if (CaseVal.slt(MinCaseVal))
399 MinCaseVal = CaseVal;
400 }
401
402 // Check if suitable for a bit test
403 if (N <= DL.getIndexSizeInBits(0u)) {
404 SmallPtrSet<const BasicBlock *, 4> Dests;
405 for (auto I : SI.cases())
406 Dests.insert(I.getCaseSuccessor());
407
408 if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
409 DL))
410 return 1;
411 }
412
413 // Check if suitable for a jump table.
414 if (IsJTAllowed) {
415 if (N < 2 || N < TLI->getMinimumJumpTableEntries())
416 return N;
417 uint64_t Range =
418 (MaxCaseVal - MinCaseVal)
419 .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
420 // Check whether a range of clusters is dense enough for a jump table
421 if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) {
422 JumpTableSize = Range;
423 return 1;
424 }
425 }
426 return N;
427 }
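The jump-table branch of that heuristic boils down to computing Range = MaxCaseVal - MinCaseVal + 1 and asking the target whether the cases are dense enough. The sketch below imitates that step with an assumed 40% density cutoff standing in for TLI->isSuitableForJumpTable; the threshold is illustrative only:

#include <cstdint>
#include <cstdio>

int main() {
  unsigned N = 6;                          // number of switch cases
  int64_t MinCaseVal = 10, MaxCaseVal = 19;
  uint64_t Range = uint64_t(MaxCaseVal - MinCaseVal) + 1;  // 10 table slots
  double Density = double(N) / double(Range);              // 0.60
  bool UseJumpTable = N >= 2 && Density >= 0.4;            // assumed density cutoff
  std::printf("Range=%llu, density=%.2f, jump table=%d\n",
              (unsigned long long)Range, Density, UseJumpTable);
  // When the check succeeds, the function reports one cluster and
  // JumpTableSize = Range; otherwise it falls through and reports N clusters.
  return 0;
}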
428
429 bool shouldBuildLookupTables() {
430 const TargetLoweringBase *TLI = getTLI();
431 return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
432 TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
433 }
434
435 bool shouldBuildRelLookupTables() const {
436 const TargetMachine &TM = getTLI()->getTargetMachine();
437 // If non-PIC mode, do not generate a relative lookup table.
438 if (!TM.isPositionIndependent())
439 return false;
440
441 /// Relative lookup table entries consist of 32-bit offsets.
442 /// Do not generate relative lookup tables for large code models
443 /// in 64-bit architectures where 32-bit offsets might not be enough.
444 if (TM.getCodeModel() == CodeModel::Medium ||
445 TM.getCodeModel() == CodeModel::Large)
446 return false;
447
448 Triple TargetTriple = TM.getTargetTriple();
449 if (!TargetTriple.isArch64Bit())
450 return false;
451
452 // TODO: Triggers issues on aarch64 on darwin, so temporarily disable it
453 // there.
454 if (TargetTriple.getArch() == Triple::aarch64 && TargetTriple.isOSDarwin())
455 return false;
456
457 return true;
458 }
459
460 bool haveFastSqrt(Type *Ty) {
461 const TargetLoweringBase *TLI = getTLI();
462 EVT VT = TLI->getValueType(DL, Ty);
463 return TLI->isTypeLegal(VT) &&
464 TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
465 }
466
467 bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
468 return true;
469 }
470
471 InstructionCost getFPOpCost(Type *Ty) {
472 // Check whether FADD is available, as a proxy for floating-point in
473 // general.
474 const TargetLoweringBase *TLI = getTLI();
475 EVT VT = TLI->getValueType(DL, Ty);
476 if (TLI->isOperationLegalOrCustomOrPromote(ISD::FADD, VT))
477 return TargetTransformInfo::TCC_Basic;
478 return TargetTransformInfo::TCC_Expensive;
479 }
480
481 unsigned getInliningThresholdMultiplier() { return 1; }
482 unsigned adjustInliningThreshold(const CallBase *CB) { return 0; }
483
484 int getInlinerVectorBonusPercent() { return 150; }
485
486 void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
487 TTI::UnrollingPreferences &UP) {
488 // This unrolling functionality is target independent, but to provide some
489 // motivation for its intended use, for x86:
490
491 // According to the Intel 64 and IA-32 Architectures Optimization Reference
492 // Manual, Intel Core models and later have a loop stream detector (and
493 // associated uop queue) that can benefit from partial unrolling.
494 // The relevant requirements are:
495 // - The loop must have no more than 4 (8 for Nehalem and later) branches
496 // taken, and none of them may be calls.
497 // - The loop can have no more than 18 (28 for Nehalem and later) uops.
498
499 // According to the Software Optimization Guide for AMD Family 15h
500 // Processors, models 30h-4fh (Steamroller and later) have a loop predictor
501 // and loop buffer which can benefit from partial unrolling.
502 // The relevant requirements are:
503 // - The loop must have fewer than 16 branches
504 // - The loop must have less than 40 uops in all executed loop branches
505
506 // The number of taken branches in a loop is hard to estimate here, and
507 // benchmarking has revealed that it is better not to be conservative when
508 // estimating the branch count. As a result, we'll ignore the branch limits
509 // until someone finds a case where it matters in practice.
510
511 unsigned MaxOps;
512 const TargetSubtargetInfo *ST = getST();
513 if (PartialUnrollingThreshold.getNumOccurrences() > 0)
514 MaxOps = PartialUnrollingThreshold;
515 else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
516 MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
517 else
518 return;
519
520 // Scan the loop: don't unroll loops with calls.
521 for (BasicBlock *BB : L->blocks()) {
522 for (Instruction &I : *BB) {
523 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
524 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
525 if (!thisT()->isLoweredToCall(F))
526 continue;
527 }
528
529 return;
530 }
531 }
532 }
533
534 // Enable runtime and partial unrolling up to the specified size.
535 // Enable using trip count upper bound to unroll loops.
536 UP.Partial = UP.Runtime = UP.UpperBound = true;
537 UP.PartialThreshold = MaxOps;
538
539 // Avoid unrolling when optimizing for size.
540 UP.OptSizeThreshold = 0;
541 UP.PartialOptSizeThreshold = 0;
542
543 // Set number of instructions optimized when "back edge"
544 // becomes "fall through" to default value of 2.
545 UP.BEInsns = 2;
546 }
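The partial-unrolling decision above reduces to picking MaxOps and refusing to unroll loops that contain real calls. A compact sketch of that selection order, with an assumed LoopMicroOpBufferSize standing in for the subtarget query:

#include <cstdio>

// Returns the partial-unrolling threshold, or 0 for "don't unroll".
// cmdLineThreshold mimics -partial-unrolling-threshold; 0 means "not set".
static unsigned pickMaxOps(unsigned cmdLineThreshold, unsigned loopUopBufferSize,
                           bool loopHasCall) {
  if (loopHasCall)
    return 0;                      // calls defeat the loop stream detector
  if (cmdLineThreshold > 0)
    return cmdLineThreshold;       // explicit override wins
  if (loopUopBufferSize > 0)
    return loopUopBufferSize;      // otherwise use the scheduler's buffer size
  return 0;                        // no information: leave unrolling alone
}

int main() {
  std::printf("MaxOps = %u\n", pickMaxOps(0, 28, false)); // assumed 28-uop buffer -> 28
  std::printf("MaxOps = %u\n", pickMaxOps(0, 28, true));  // loop with a call -> 0
  return 0;
}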
547
548 void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
549 TTI::PeelingPreferences &PP) {
550 PP.PeelCount = 0;
551 PP.AllowPeeling = true;
552 PP.AllowLoopNestsPeeling = false;
553 PP.PeelProfiledIterations = true;
554 }
555
556 bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
557 AssumptionCache &AC,
558 TargetLibraryInfo *LibInfo,
559 HardwareLoopInfo &HWLoopInfo) {
560 return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
561 }
562
563 bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
564 AssumptionCache &AC, TargetLibraryInfo *TLI,
565 DominatorTree *DT,
566 const LoopAccessInfo *LAI) {
567 return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
568 }
569
570 bool emitGetActiveLaneMask() {
571 return BaseT::emitGetActiveLaneMask();
572 }
573
574 Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
575 IntrinsicInst &II) {
576 return BaseT::instCombineIntrinsic(IC, II);
577 }
578
579 Optional<Value *> simplifyDemandedUseBitsIntrinsic(InstCombiner &IC,
580 IntrinsicInst &II,
581 APInt DemandedMask,
582 KnownBits &Known,
583 bool &KnownBitsComputed) {
584 return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known,
585 KnownBitsComputed);
586 }
587
588 Optional<Value *> simplifyDemandedVectorEltsIntrinsic(
589 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
590 APInt &UndefElts2, APInt &UndefElts3,
591 std::function<void(Instruction *, unsigned, APInt, APInt &)>
592 SimplifyAndSetOp) {
593 return BaseT::simplifyDemandedVectorEltsIntrinsic(
594 IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3,
595 SimplifyAndSetOp);
596 }
597
598 InstructionCost getInstructionLatency(const Instruction *I) {
599 if (isa<LoadInst>(I))
600 return getST()->getSchedModel().DefaultLoadLatency;
601
602 return BaseT::getInstructionLatency(I);
603 }
604
605 virtual Optional<unsigned>
606 getCacheSize(TargetTransformInfo::CacheLevel Level) const {
607 return Optional<unsigned>(
608 getST()->getCacheSize(static_cast<unsigned>(Level)));
609 }
610
611 virtual Optional<unsigned>
612 getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const {
613 Optional<unsigned> TargetResult =
614 getST()->getCacheAssociativity(static_cast<unsigned>(Level));
615
616 if (TargetResult)
617 return TargetResult;
618
619 return BaseT::getCacheAssociativity(Level);
620 }
621
622 virtual unsigned getCacheLineSize() const {
623 return getST()->getCacheLineSize();
624 }
625
626 virtual unsigned getPrefetchDistance() const {
627 return getST()->getPrefetchDistance();
628 }
629
630 virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses,
631 unsigned NumStridedMemAccesses,
632 unsigned NumPrefetches,
633 bool HasCall) const {
634 return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
635 NumPrefetches, HasCall);
636 }
637
638 virtual unsigned getMaxPrefetchIterationsAhead() const {
639 return getST()->getMaxPrefetchIterationsAhead();
640 }
641
642 virtual bool enableWritePrefetching() const {
643 return getST()->enableWritePrefetching();
644 }
645
646 /// @}
647
648 /// \name Vector TTI Implementations
649 /// @{
650
651 TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
652 return TypeSize::getFixed(32);
653 }
654
655 Optional<unsigned> getMaxVScale() const { return None; }
656
657 /// Estimate the overhead of scalarizing an instruction. Insert and Extract
658 /// are set if the demanded result elements need to be inserted and/or
659 /// extracted from vectors.
660 InstructionCost getScalarizationOverhead(VectorType *InTy,
661 const APInt &DemandedElts,
662 bool Insert, bool Extract) {
663 /// FIXME: a bitfield is not a reasonable abstraction for talking about
664 /// which elements are needed from a scalable vector
665 auto *Ty = cast<FixedVectorType>(InTy);
666
667 assert(DemandedElts.getBitWidth() == Ty->getNumElements() &&
668        "Vector size mismatch");
669
670 InstructionCost Cost = 0;
671
672 for (int i = 0, e = Ty->getNumElements(); i < e; ++i) {
673 if (!DemandedElts[i])
674 continue;
675 if (Insert)
676 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty, i);
677 if (Extract)
678 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
679 }
680
681 return Cost;
682 }
683
684 /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
685 InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert,
686 bool Extract) {
687 auto *Ty = cast<FixedVectorType>(InTy);
688
689 APInt DemandedElts = APInt::getAllOnesValue(Ty->getNumElements());
690 return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
691 }
692
693 /// Estimate the overhead of scalarizing an instruction's unique
694 /// non-constant operands. The (potentially vector) types to use for each
695 /// argument are passed via Tys.
696 InstructionCost getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
697 ArrayRef<Type *> Tys) {
698 assert(Args.size() == Tys.size() && "Expected matching Args and Tys");
699
700 InstructionCost Cost = 0;
701 SmallPtrSet<const Value*, 4> UniqueOperands;
702 for (int I = 0, E = Args.size(); I != E; I++) {
703 // Disregard things like metadata arguments.
704 const Value *A = Args[I];
705 Type *Ty = Tys[I];
706 if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() &&
707 !Ty->isPtrOrPtrVectorTy())
708 continue;
709
710 if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
711 if (auto *VecTy = dyn_cast<VectorType>(Ty))
712 Cost += getScalarizationOverhead(VecTy, false, true);
713 }
714 }
715
716 return Cost;
717 }
718
719 /// Estimate the overhead of scalarizing the inputs and outputs of an
720 /// instruction, with return type RetTy and arguments Args of type Tys. If
721 /// Args are unknown (empty), then the cost associated with one argument is
722 /// added as a heuristic.
723 InstructionCost getScalarizationOverhead(VectorType *RetTy,
724 ArrayRef<const Value *> Args,
725 ArrayRef<Type *> Tys) {
726 InstructionCost Cost = getScalarizationOverhead(RetTy, true, false);
727 if (!Args.empty())
728 Cost += getOperandsScalarizationOverhead(Args, Tys);
729 else
730 // When no information on arguments is provided, we add the cost
731 // associated with one argument as a heuristic.
732 Cost += getScalarizationOverhead(RetTy, false, true);
733
734 return Cost;
735 }
736
737 unsigned getMaxInterleaveFactor(unsigned VF) { return 1; }
738
739 InstructionCost getArithmeticInstrCost(
740 unsigned Opcode, Type *Ty,
741 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
742 TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
743 TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
744 TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
745 TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
746 ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
747 const Instruction *CxtI = nullptr) {
748 // Check if any of the operands are vector operands.
749 const TargetLoweringBase *TLI = getTLI();
750 int ISD = TLI->InstructionOpcodeToISD(Opcode);
751 assert(ISD && "Invalid opcode");
752
753 // TODO: Handle more cost kinds.
754 if (CostKind != TTI::TCK_RecipThroughput)
755 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind,
756 Opd1Info, Opd2Info,
757 Opd1PropInfo, Opd2PropInfo,
758 Args, CxtI);
759
760 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
761
762 bool IsFloat = Ty->isFPOrFPVectorTy();
763 // Assume that floating point arithmetic operations cost twice as much as
764 // integer operations.
765 InstructionCost OpCost = (IsFloat ? 2 : 1);
766
767 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
768 // The operation is legal. Assume it costs 1.
769 // TODO: Once we have extract/insert subvector cost we need to use them.
770 return LT.first * OpCost;
771 }
772
773 if (!TLI->isOperationExpand(ISD, LT.second)) {
774 // If the operation is custom lowered, then assume that the code is twice
775 // as expensive.
776 return LT.first * 2 * OpCost;
777 }
778
779 // We cannot scalarize scalable vectors, so return Invalid.
780 if (isa<ScalableVectorType>(Ty))
781 return InstructionCost::getInvalid();
782
783 // Else, assume that we need to scalarize this op.
784 // TODO: If one of the types get legalized by splitting, handle this
785 // similarly to what getCastInstrCost() does.
786 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
787 InstructionCost Cost = thisT()->getArithmeticInstrCost(
788 Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
789 Opd1PropInfo, Opd2PropInfo, Args, CxtI);
790 // Return the cost of multiple scalar invocation plus the cost of
791 // inserting and extracting the values.
792 SmallVector<Type *> Tys(Args.size(), Ty);
793 return getScalarizationOverhead(VTy, Args, Tys) +
794 VTy->getNumElements() * Cost;
795 }
796
797 // We don't know anything about this scalar instruction.
798 return OpCost;
799 }
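The arithmetic cost model above has three tiers: legal operations cost LT.first * OpCost, custom-lowered ones twice that, and everything else is scalarized. A minimal sketch of those tiers, with the legalization factor, element count, and scalarization overhead all assumed rather than queried from TLI:

#include <cstdio>

enum class Action { Legal, Custom, Expand };

static unsigned arithCost(Action A, unsigned LTFirst, bool IsFloat,
                          unsigned NumElts, unsigned ScalarizeOverhead) {
  unsigned OpCost = IsFloat ? 2 : 1;        // FP assumed twice as expensive as int
  if (A == Action::Legal)
    return LTFirst * OpCost;                // legal: one op per legalized part
  if (A == Action::Custom)
    return LTFirst * 2 * OpCost;            // custom lowering: assume twice as expensive
  // Expand: scalarize -- one scalar op per element plus insert/extract overhead.
  return ScalarizeOverhead + NumElts * OpCost;
}

int main() {
  std::printf("legal  v4i32: %u\n", arithCost(Action::Legal, 1, false, 4, 0));   // 1
  std::printf("custom v4f32: %u\n", arithCost(Action::Custom, 1, true, 4, 0));   // 4
  std::printf("expand v4i32: %u\n", arithCost(Action::Expand, 1, false, 4, 8));  // 12
  return 0;
}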
800
801 TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind,
802 ArrayRef<int> Mask) const {
803 int Limit = Mask.size() * 2;
804 if (Mask.empty() ||
805 // Extra check required by isSingleSourceMaskImpl function (called by
806 // ShuffleVectorInst::isSingleSourceMask).
807 any_of(Mask, [Limit](int I) { return I >= Limit; }))
808 return Kind;
809 switch (Kind) {
810 case TTI::SK_PermuteSingleSrc:
811 if (ShuffleVectorInst::isReverseMask(Mask))
812 return TTI::SK_Reverse;
813 if (ShuffleVectorInst::isZeroEltSplatMask(Mask))
814 return TTI::SK_Broadcast;
815 break;
816 case TTI::SK_PermuteTwoSrc:
817 if (ShuffleVectorInst::isSelectMask(Mask))
818 return TTI::SK_Select;
819 if (ShuffleVectorInst::isTransposeMask(Mask))
820 return TTI::SK_Transpose;
821 break;
822 case TTI::SK_Select:
823 case TTI::SK_Reverse:
824 case TTI::SK_Broadcast:
825 case TTI::SK_Transpose:
826 case TTI::SK_InsertSubvector:
827 case TTI::SK_ExtractSubvector:
828 break;
829 }
830 return Kind;
831 }
832
833 InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
834 ArrayRef<int> Mask, int Index,
835 VectorType *SubTp) {
836
837 switch (improveShuffleKindFromMask(Kind, Mask)) {
838 case TTI::SK_Broadcast:
839 return getBroadcastShuffleOverhead(cast<FixedVectorType>(Tp));
840 case TTI::SK_Select:
841 case TTI::SK_Reverse:
842 case TTI::SK_Transpose:
843 case TTI::SK_PermuteSingleSrc:
844 case TTI::SK_PermuteTwoSrc:
845 return getPermuteShuffleOverhead(cast<FixedVectorType>(Tp));
846 case TTI::SK_ExtractSubvector:
847 return getExtractSubvectorOverhead(Tp, Index,
848 cast<FixedVectorType>(SubTp));
849 case TTI::SK_InsertSubvector:
850 return getInsertSubvectorOverhead(Tp, Index,
851 cast<FixedVectorType>(SubTp));
852 }
853 llvm_unreachable("Unknown TTI::ShuffleKind");
854 }
855
856 InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
857 TTI::CastContextHint CCH,
858 TTI::TargetCostKind CostKind,
859 const Instruction *I = nullptr) {
860 if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0)
861 return 0;
862
863 const TargetLoweringBase *TLI = getTLI();
864 int ISD = TLI->InstructionOpcodeToISD(Opcode);
865 assert(ISD && "Invalid opcode");
866 std::pair<InstructionCost, MVT> SrcLT =
867 TLI->getTypeLegalizationCost(DL, Src);
868 std::pair<InstructionCost, MVT> DstLT =
869 TLI->getTypeLegalizationCost(DL, Dst);
870
871 TypeSize SrcSize = SrcLT.second.getSizeInBits();
872 TypeSize DstSize = DstLT.second.getSizeInBits();
873 bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy();
874 bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy();
875
876 switch (Opcode) {
877 default:
878 break;
879 case Instruction::Trunc:
880 // Check for NOOP conversions.
881 if (TLI->isTruncateFree(SrcLT.second, DstLT.second))
882 return 0;
883 LLVM_FALLTHROUGH[[gnu::fallthrough]];
884 case Instruction::BitCast:
885 // Bitcasts between types that are legalized to the same type are free, and
886 // assume int to/from ptr of the same size is also free.
887 if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst &&
888 SrcSize == DstSize)
889 return 0;
890 break;
891 case Instruction::FPExt:
892 if (I && getTLI()->isExtFree(I))
893 return 0;
894 break;
895 case Instruction::ZExt:
896 if (TLI->isZExtFree(SrcLT.second, DstLT.second))
897 return 0;
898 LLVM_FALLTHROUGH[[gnu::fallthrough]];
899 case Instruction::SExt:
900 if (I && getTLI()->isExtFree(I))
901 return 0;
902
903 // If this is a zext/sext of a load, return 0 if the corresponding
904 // extending load exists on target and the result type is legal.
905 if (CCH == TTI::CastContextHint::Normal) {
906 EVT ExtVT = EVT::getEVT(Dst);
907 EVT LoadVT = EVT::getEVT(Src);
908 unsigned LType =
909 ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
910 if (DstLT.first == SrcLT.first &&
911 TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
912 return 0;
913 }
914 break;
915 case Instruction::AddrSpaceCast:
916 if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(),
917 Dst->getPointerAddressSpace()))
918 return 0;
919 break;
920 }
921
922 auto *SrcVTy = dyn_cast<VectorType>(Src);
923 auto *DstVTy = dyn_cast<VectorType>(Dst);
924
925 // If the cast is marked as legal (or promote) then assume low cost.
926 if (SrcLT.first == DstLT.first &&
927 TLI->isOperationLegalOrPromote(ISD, DstLT.second))
928 return SrcLT.first;
929
930 // Handle scalar conversions.
931 if (!SrcVTy && !DstVTy) {
932 // Just check the op cost. If the operation is legal then assume it costs
933 // 1.
934 if (!TLI->isOperationExpand(ISD, DstLT.second))
935 return 1;
936
937 // Assume that illegal scalar instructions are expensive.
938 return 4;
939 }
940
941 // Check vector-to-vector casts.
942 if (DstVTy && SrcVTy) {
943 // If the cast is between same-sized registers, then the check is simple.
944 if (SrcLT.first == DstLT.first && SrcSize == DstSize) {
945
946 // Assume that Zext is done using AND.
947 if (Opcode == Instruction::ZExt)
948 return SrcLT.first;
949
950 // Assume that sext is done using SHL and SRA.
951 if (Opcode == Instruction::SExt)
952 return SrcLT.first * 2;
953
954 // Just check the op cost. If the operation is legal then assume it
955 // costs
956 // 1 and multiply by the type-legalization overhead.
957 if (!TLI->isOperationExpand(ISD, DstLT.second))
958 return SrcLT.first * 1;
959 }
960
961 // If we are legalizing by splitting, query the concrete TTI for the cost
962 // of casting the original vector twice. We also need to factor in the
963 // cost of the split itself. Count that as 1, to be consistent with
964 // TLI->getTypeLegalizationCost().
965 bool SplitSrc =
966 TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
967 TargetLowering::TypeSplitVector;
968 bool SplitDst =
969 TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
970 TargetLowering::TypeSplitVector;
971 if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isVector() &&
972 DstVTy->getElementCount().isVector()) {
973 Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy);
974 Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy);
975 T *TTI = static_cast<T *>(this);
976 // If both types need to be split then the split is free.
977 InstructionCost SplitCost =
978 (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0;
979 return SplitCost +
980 (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH,
981 CostKind, I));
982 }
983
984 // In other cases where the source or destination are illegal, assume
985 // the operation will get scalarized.
986 unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements();
987 InstructionCost Cost = thisT()->getCastInstrCost(
988 Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I);
989
990 // Return the cost of multiple scalar invocation plus the cost of
991 // inserting and extracting the values.
992 return getScalarizationOverhead(DstVTy, true, true) + Num * Cost;
993 }
994
995 // We already handled vector-to-vector and scalar-to-scalar conversions.
996 // This
997 // is where we handle bitcast between vectors and scalars. We need to assume
998 // that the conversion is scalarized in one way or another.
999 if (Opcode == Instruction::BitCast) {
1000 // Illegal bitcasts are done by storing and loading from a stack slot.
1001 return (SrcVTy ? getScalarizationOverhead(SrcVTy, false, true) : 0) +
1002 (DstVTy ? getScalarizationOverhead(DstVTy, true, false) : 0);
1003 }
1004
1005 llvm_unreachable("Unhandled cast");
1006 }
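For the common case of same-sized legal vector registers, the cast model above charges a zext as one AND per legalized part and a sext as a SHL+SRA pair. A small sketch of just that branch, with SrcLT.first assumed to be 1:

#include <cstdio>

// Assumes both types legalize to the same number of same-sized registers,
// i.e. the "SrcLT.first == DstLT.first && SrcSize == DstSize" branch above.
static unsigned sameSizeVectorCastCost(bool IsZExt, bool IsSExt, unsigned SrcLTFirst) {
  if (IsZExt)
    return SrcLTFirst;       // zext modeled as an AND per legalized part
  if (IsSExt)
    return SrcLTFirst * 2;   // sext modeled as SHL + SRA per legalized part
  return SrcLTFirst;         // otherwise: legal op, one per part
}

int main() {
  std::printf("zext cost: %u\n", sameSizeVectorCastCost(true, false, 1));  // 1
  std::printf("sext cost: %u\n", sameSizeVectorCastCost(false, true, 1));  // 2
  return 0;
}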
1007
1008 InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst,
1009 VectorType *VecTy, unsigned Index) {
1010 return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy,
1011 Index) +
1012 thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(),
1013 TTI::CastContextHint::None,
1014 TTI::TCK_RecipThroughput);
1015 }
1016
1017 InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
1018 const Instruction *I = nullptr) {
1019 return BaseT::getCFInstrCost(Opcode, CostKind, I);
1020 }
1021
1022 InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
1023 CmpInst::Predicate VecPred,
1024 TTI::TargetCostKind CostKind,
1025 const Instruction *I = nullptr) {
1026 const TargetLoweringBase *TLI = getTLI();
1027 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1028 assert(ISD && "Invalid opcode");
1029
1030 // TODO: Handle other cost kinds.
1031 if (CostKind != TTI::TCK_RecipThroughput)
1032 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1033 I);
1034
1035 // Selects on vectors are actually vector selects.
1036 if (ISD == ISD::SELECT) {
1037 assert(CondTy && "CondTy must exist");
1038 if (CondTy->isVectorTy())
1039 ISD = ISD::VSELECT;
1040 }
1041 std::pair<InstructionCost, MVT> LT =
1042 TLI->getTypeLegalizationCost(DL, ValTy);
1043
1044 if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
1045 !TLI->isOperationExpand(ISD, LT.second)) {
1046 // The operation is legal. Assume it costs 1. Multiply
1047 // by the type-legalization overhead.
1048 return LT.first * 1;
1049 }
1050
1051 // Otherwise, assume that the cast is scalarized.
1052 // TODO: If one of the types get legalized by splitting, handle this
1053 // similarly to what getCastInstrCost() does.
1054 if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) {
1055 unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements();
1056 if (CondTy)
1057 CondTy = CondTy->getScalarType();
1058 InstructionCost Cost = thisT()->getCmpSelInstrCost(
1059 Opcode, ValVTy->getScalarType(), CondTy, VecPred, CostKind, I);
1060
1061 // Return the cost of multiple scalar invocation plus the cost of
1062 // inserting and extracting the values.
1063 return getScalarizationOverhead(ValVTy, true, false) + Num * Cost;
1064 }
1065
1066 // Unknown scalar opcode.
1067 return 1;
1068 }
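The scalarization fallback above reduces to simple arithmetic: scalarization overhead for the result plus one scalar compare/select per lane. A minimal standalone sketch, assuming a 4-lane select and unit costs for every component (none of these numbers come from a real target):

#include <cassert>

int main() {
  unsigned Num = 4;            // lanes of an assumed <4 x i32> result
  int ScalarSelectCost = 1;    // assumed cost of one scalar select
  int InsertCost = 1;          // assumed cost of inserting one result lane
  // mirrors getScalarizationOverhead(ValVTy, /*Insert=*/true, /*Extract=*/false)
  int ScalarizationOverhead = Num * InsertCost;
  int Total = ScalarizationOverhead + Num * ScalarSelectCost;
  assert(Total == 8);
  return 0;
}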
1069
1070 InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
1071 unsigned Index) {
1072 std::pair<InstructionCost, MVT> LT =
1073 getTLI()->getTypeLegalizationCost(DL, Val->getScalarType());
1074
1075 return LT.first;
1076 }
1077
1078 InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src,
1079 MaybeAlign Alignment, unsigned AddressSpace,
1080 TTI::TargetCostKind CostKind,
1081 const Instruction *I = nullptr) {
1082 assert(!Src->isVoidTy() && "Invalid type");
1083 // Assume types, such as structs, are expensive.
1084 if (getTLI()->getValueType(DL, Src, true) == MVT::Other)
1085 return 4;
1086 std::pair<InstructionCost, MVT> LT =
1087 getTLI()->getTypeLegalizationCost(DL, Src);
1088
1089 // Assuming that all loads of legal types cost 1.
1090 InstructionCost Cost = LT.first;
1091 if (CostKind != TTI::TCK_RecipThroughput)
1092 return Cost;
1093
1094 if (Src->isVectorTy() &&
1095 // In practice it's not currently possible to have a change in lane
1096 // length for extending loads or truncating stores so both types should
1097 // have the same scalable property.
1098 TypeSize::isKnownLT(Src->getPrimitiveSizeInBits(),
1099 LT.second.getSizeInBits())) {
1100 // This is a vector load that legalizes to a larger type than the vector
1101 // itself. Unless the corresponding extending load or truncating store is
1102 // legal, this will scalarize.
1103 TargetLowering::LegalizeAction LA = TargetLowering::Expand;
1104 EVT MemVT = getTLI()->getValueType(DL, Src);
1105 if (Opcode == Instruction::Store)
1106 LA = getTLI()->getTruncStoreAction(LT.second, MemVT);
1107 else
1108 LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT);
1109
1110 if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
1111 // This is a vector load/store for some illegal type that is scalarized.
1112 // We must account for the cost of building or decomposing the vector.
1113 Cost += getScalarizationOverhead(cast<VectorType>(Src),
1114 Opcode != Instruction::Store,
1115 Opcode == Instruction::Store);
1116 }
1117 }
1118
1119 return Cost;
1120 }
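The widened-memory-op branch above only adds cost when the extending load or truncating store is neither Legal nor Custom. A standalone sketch of that decision with assumed sizes and unit costs (the types and the legality flag are hypothetical):

#include <cassert>

int main() {
  unsigned MemBits = 4 * 16;          // assumed <4 x i16> as stored in memory
  unsigned LegalRegBits = 128;        // assumed width of the legalized register type
  bool ExtLoadLegalOrCustom = false;  // assume the extending load is Expand here
  int Cost = 1;                       // LT.first for the legal wide load
  if (MemBits < LegalRegBits && !ExtLoadLegalOrCustom)
    Cost += 4;                        // rebuild the vector: one insert per lane
  assert(Cost == 5);
  return 0;
}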
1121
1122 InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
1123 Align Alignment, unsigned AddressSpace,
1124 TTI::TargetCostKind CostKind) {
1125 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false,
1126 CostKind);
1127 }
1128
1129 InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
1130 const Value *Ptr, bool VariableMask,
1131 Align Alignment,
1132 TTI::TargetCostKind CostKind,
1133 const Instruction *I = nullptr) {
1134 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask,
1135 true, CostKind);
1136 }
1137
1138 InstructionCost getInterleavedMemoryOpCost(
1139 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1140 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1141 bool UseMaskForCond = false, bool UseMaskForGaps = false) {
1142 auto *VT = cast<FixedVectorType>(VecTy);
11
'VecTy' is a 'FixedVectorType'
1143
1144 unsigned NumElts = VT->getNumElements();
1145 assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
12
Assuming 'Factor' is > 1
13
Assuming the condition is true
14
'?' condition is true
1146
1147 unsigned NumSubElts = NumElts / Factor;
1148 auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts);
1149
1150 // Firstly, the cost of load/store operation.
1151 InstructionCost Cost;
1152 if (UseMaskForCond || UseMaskForGaps)
14.1
'UseMaskForCond' is false
14.2
'UseMaskForGaps' is false
15
Taking false branch
1153 Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment,
1154 AddressSpace, CostKind);
1155 else
1156 Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace,
1157 CostKind);
1158
1159 // Legalize the vector type, and get the legalized and unlegalized type
1160 // sizes.
1161 MVT VecTyLT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
1162 unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy);
1163 unsigned VecTyLTSize = VecTyLT.getStoreSize();
1164
1165 // Scale the cost of the memory operation by the fraction of legalized
1166 // instructions that will actually be used. We shouldn't account for the
1167 // cost of dead instructions since they will be removed.
1168 //
1169 // E.g., An interleaved load of factor 8:
1170 // %vec = load <16 x i64>, <16 x i64>* %ptr
1171 // %v0 = shufflevector %vec, undef, <0, 8>
1172 //
1173 // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be
1174 // used (those corresponding to elements [0:1] and [8:9] of the unlegalized
1175 // type). The other loads are unused.
1176 //
1177 // We only scale the cost of loads since interleaved store groups aren't
1178 // allowed to have gaps.
1179 if (Opcode == Instruction::Load && VecTySize > VecTyLTSize) {
16
Assuming 'Opcode' is not equal to Load
1180 // The number of loads of a legal type it will take to represent a load
1181 // of the unlegalized vector type.
1182 unsigned NumLegalInsts = divideCeil(VecTySize, VecTyLTSize);
1183
1184 // The number of elements of the unlegalized type that correspond to a
1185 // single legal instruction.
1186 unsigned NumEltsPerLegalInst = divideCeil(NumElts, NumLegalInsts);
1187
1188 // Determine which legal instructions will be used.
1189 BitVector UsedInsts(NumLegalInsts, false);
1190 for (unsigned Index : Indices)
1191 for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
1192 UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);
1193
1194 // Scale the cost of the load by the fraction of legal instructions that
1195 // will be used.
1196 Cost *= UsedInsts.count() / NumLegalInsts;
1197 }
1198
1199 // Then plus the cost of interleave operation.
1200 if (Opcode == Instruction::Load) {
16.1
'Opcode' is not equal to Load
17
Taking false branch
1201 // The interleave cost is similar to extract sub vectors' elements
1202 // from the wide vector, and insert them into sub vectors.
1203 //
1204 // E.g. An interleaved load of factor 2 (with one member of index 0):
1205 // %vec = load <8 x i32>, <8 x i32>* %ptr
1206 // %v0 = shuffle %vec, undef, <0, 2, 4, 6> ; Index 0
1207 // The cost is estimated as extract elements at 0, 2, 4, 6 from the
1208 // <8 x i32> vector and insert them into a <4 x i32> vector.
1209
1210 assert(Indices.size() <= Factor &&
1211        "Interleaved memory op has too many members");
1212
1213 for (unsigned Index : Indices) {
1214 assert(Index < Factor && "Invalid index for interleaved memory op");
1215
1216 // Extract elements from loaded vector for each sub vector.
1217 for (unsigned i = 0; i < NumSubElts; i++)
1218 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VT,
1219 Index + i * Factor);
1220 }
1221
1222 InstructionCost InsSubCost = 0;
1223 for (unsigned i = 0; i < NumSubElts; i++)
1224 InsSubCost +=
1225 thisT()->getVectorInstrCost(Instruction::InsertElement, SubVT, i);
1226
1227 Cost += Indices.size() * InsSubCost;
1228 } else {
1229 // The interleave cost is extract all elements from sub vectors, and
1230 // insert them into the wide vector.
1231 //
1232 // E.g. An interleaved store of factor 2:
1233 // %v0_v1 = shuffle %v0, %v1, <0, 4, 1, 5, 2, 6, 3, 7>
1234 // store <8 x i32> %interleaved.vec, <8 x i32>* %ptr
1235 // The cost is estimated as extract all elements from both <4 x i32>
1236 // vectors and insert into the <8 x i32> vector.
1237
1238 InstructionCost ExtSubCost = 0;
1239 for (unsigned i = 0; i < NumSubElts; i++)
18
Assuming 'i' is < 'NumSubElts'
19
Loop condition is true. Entering loop body
1240 ExtSubCost +=
1241 thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVT, i);
20
Calling 'X86TTIImpl::getVectorInstrCost'
1242 Cost += ExtSubCost * Factor;
1243
1244 for (unsigned i = 0; i < NumElts; i++)
1245 Cost += static_cast<T *>(this)
1246 ->getVectorInstrCost(Instruction::InsertElement, VT, i);
1247 }
1248
1249 if (!UseMaskForCond)
1250 return Cost;
1251
1252 Type *I8Type = Type::getInt8Ty(VT->getContext());
1253 auto *MaskVT = FixedVectorType::get(I8Type, NumElts);
1254 SubVT = FixedVectorType::get(I8Type, NumSubElts);
1255
1256 // The Mask shuffling cost is extract all the elements of the Mask
1257 // and insert each of them Factor times into the wide vector:
1258 //
1259 // E.g. an interleaved group with factor 3:
1260 // %mask = icmp ult <8 x i32> %vec1, %vec2
1261 // %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
1262 // <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
1263 // The cost is estimated as extract all mask elements from the <8xi1> mask
1264 // vector and insert them factor times into the <24xi1> shuffled mask
1265 // vector.
1266 for (unsigned i = 0; i < NumSubElts; i++)
1267 Cost +=
1268 thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVT, i);
1269
1270 for (unsigned i = 0; i < NumElts; i++)
1271 Cost +=
1272 thisT()->getVectorInstrCost(Instruction::InsertElement, MaskVT, i);
1273
1274 // The Gaps mask is invariant and created outside the loop, therefore the
1275 // cost of creating it is not accounted for here. However if we have both
1276 // a MaskForGaps and some other mask that guards the execution of the
1277 // memory access, we need to account for the cost of And-ing the two masks
1278 // inside the loop.
1279 if (UseMaskForGaps)
1280 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT,
1281 CostKind);
1282
1283 return Cost;
1284 }
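The factor-8 example in the comments above can be replayed with plain integers to see how the load-scaling factor comes out. This is a standalone sketch only; the <16 x i64> type, the v2i64 legalization, and the single index 0 are assumptions, and nothing here calls LLVM:

#include <cassert>
#include <vector>

int main() {
  unsigned NumElts = 16, Factor = 8, NumSubElts = NumElts / Factor;             // 2
  unsigned VecTySize = 16 * 8, VecTyLTSize = 16;                                // bytes
  unsigned NumLegalInsts = (VecTySize + VecTyLTSize - 1) / VecTyLTSize;         // 8
  unsigned NumEltsPerLegalInst = (NumElts + NumLegalInsts - 1) / NumLegalInsts; // 2
  std::vector<bool> UsedInsts(NumLegalInsts, false);
  for (unsigned Index : {0u})                       // one interleave member, index 0
    for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
      UsedInsts[(Index + Elt * Factor) / NumEltsPerLegalInst] = true;
  unsigned Used = 0;
  for (bool B : UsedInsts)
    Used += B;                                      // legal loads 0 and 4 are live
  assert(Used == 2);
  // The source scales Cost by UsedInsts.count() / NumLegalInsts, an unsigned
  // division; with these numbers that quotient is 2 / 8 == 0.
  assert(Used / NumLegalInsts == 0);
  return 0;
}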
1285
1286 /// Get intrinsic cost based on arguments.
1287 InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1288 TTI::TargetCostKind CostKind) {
1289 // Check for generically free intrinsics.
1290 if (BaseT::getIntrinsicInstrCost(ICA, CostKind) == 0)
1291 return 0;
1292
1293 // Assume that target intrinsics are cheap.
1294 Intrinsic::ID IID = ICA.getID();
1295 if (Function::isTargetIntrinsic(IID))
1296 return TargetTransformInfo::TCC_Basic;
1297
1298 if (ICA.isTypeBasedOnly())
1299 return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
1300
1301 Type *RetTy = ICA.getReturnType();
1302
1303 ElementCount RetVF =
1304 (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount()
1305 : ElementCount::getFixed(1));
1306 const IntrinsicInst *I = ICA.getInst();
1307 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
1308 FastMathFlags FMF = ICA.getFlags();
1309 switch (IID) {
1310 default:
1311 break;
1312
1313 case Intrinsic::cttz:
1314 // FIXME: If necessary, this should go in target-specific overrides.
1315 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz())
1316 return TargetTransformInfo::TCC_Basic;
1317 break;
1318
1319 case Intrinsic::ctlz:
1320 // FIXME: If necessary, this should go in target-specific overrides.
1321 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz())
1322 return TargetTransformInfo::TCC_Basic;
1323 break;
1324
1325 case Intrinsic::memcpy:
1326 return thisT()->getMemcpyCost(ICA.getInst());
1327
1328 case Intrinsic::masked_scatter: {
1329 const Value *Mask = Args[3];
1330 bool VarMask = !isa<Constant>(Mask);
1331 Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue();
1332 return thisT()->getGatherScatterOpCost(Instruction::Store,
1333 ICA.getArgTypes()[0], Args[1],
1334 VarMask, Alignment, CostKind, I);
1335 }
1336 case Intrinsic::masked_gather: {
1337 const Value *Mask = Args[2];
1338 bool VarMask = !isa<Constant>(Mask);
1339 Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue();
1340 return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0],
1341 VarMask, Alignment, CostKind, I);
1342 }
1343 case Intrinsic::experimental_stepvector: {
1344 if (isa<ScalableVectorType>(RetTy))
1345 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1346 // The cost of materialising a constant integer vector.
1347 return TargetTransformInfo::TCC_Basic;
1348 }
1349 case Intrinsic::experimental_vector_extract: {
1350 // FIXME: Handle case where a scalable vector is extracted from a scalable
1351 // vector
1352 if (isa<ScalableVectorType>(RetTy))
1353 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1354 unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue();
1355 return thisT()->getShuffleCost(TTI::SK_ExtractSubvector,
1356 cast<VectorType>(Args[0]->getType()), None,
1357 Index, cast<VectorType>(RetTy));
1358 }
1359 case Intrinsic::experimental_vector_insert: {
1360 // FIXME: Handle case where a scalable vector is inserted into a scalable
1361 // vector
1362 if (isa<ScalableVectorType>(Args[1]->getType()))
1363 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1364 unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
1365 return thisT()->getShuffleCost(
1366 TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()), None,
1367 Index, cast<VectorType>(Args[1]->getType()));
1368 }
1369 case Intrinsic::experimental_vector_reverse: {
1370 return thisT()->getShuffleCost(TTI::SK_Reverse,
1371 cast<VectorType>(Args[0]->getType()), None,
1372 0, cast<VectorType>(RetTy));
1373 }
1374 case Intrinsic::vector_reduce_add:
1375 case Intrinsic::vector_reduce_mul:
1376 case Intrinsic::vector_reduce_and:
1377 case Intrinsic::vector_reduce_or:
1378 case Intrinsic::vector_reduce_xor:
1379 case Intrinsic::vector_reduce_smax:
1380 case Intrinsic::vector_reduce_smin:
1381 case Intrinsic::vector_reduce_fmax:
1382 case Intrinsic::vector_reduce_fmin:
1383 case Intrinsic::vector_reduce_umax:
1384 case Intrinsic::vector_reduce_umin: {
1385 IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1);
1386 return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1387 }
1388 case Intrinsic::vector_reduce_fadd:
1389 case Intrinsic::vector_reduce_fmul: {
1390 IntrinsicCostAttributes Attrs(
1391 IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1);
1392 return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1393 }
1394 case Intrinsic::fshl:
1395 case Intrinsic::fshr: {
1396 if (isa<ScalableVectorType>(RetTy))
1397 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1398 const Value *X = Args[0];
1399 const Value *Y = Args[1];
1400 const Value *Z = Args[2];
1401 TTI::OperandValueProperties OpPropsX, OpPropsY, OpPropsZ, OpPropsBW;
1402 TTI::OperandValueKind OpKindX = TTI::getOperandInfo(X, OpPropsX);
1403 TTI::OperandValueKind OpKindY = TTI::getOperandInfo(Y, OpPropsY);
1404 TTI::OperandValueKind OpKindZ = TTI::getOperandInfo(Z, OpPropsZ);
1405 TTI::OperandValueKind OpKindBW = TTI::OK_UniformConstantValue;
1406 OpPropsBW = isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2
1407 : TTI::OP_None;
1408 // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
1409 // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
1410 InstructionCost Cost = 0;
1411 Cost +=
1412 thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
1413 Cost +=
1414 thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
1415 Cost += thisT()->getArithmeticInstrCost(
1416 BinaryOperator::Shl, RetTy, CostKind, OpKindX, OpKindZ, OpPropsX);
1417 Cost += thisT()->getArithmeticInstrCost(
1418 BinaryOperator::LShr, RetTy, CostKind, OpKindY, OpKindZ, OpPropsY);
1419 // Non-constant shift amounts requires a modulo.
1420 if (OpKindZ != TTI::OK_UniformConstantValue &&
1421 OpKindZ != TTI::OK_NonUniformConstantValue)
1422 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy,
1423 CostKind, OpKindZ, OpKindBW,
1424 OpPropsZ, OpPropsBW);
1425 // For non-rotates (X != Y) we must add shift-by-zero handling costs.
1426 if (X != Y) {
1427 Type *CondTy = RetTy->getWithNewBitWidth(1);
1428 Cost +=
1429 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1430 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1431 Cost +=
1432 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1433 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1434 }
1435 return Cost;
1436 }
1437 }
1438
1439 // Assume that we need to scalarize this intrinsic.
1440 // Compute the scalarization overhead based on Args for a vector
1441 // intrinsic.
1442 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
1443 if (RetVF.isVector() && !RetVF.isScalable()) {
1444 ScalarizationCost = 0;
1445 if (!RetTy->isVoidTy())
1446 ScalarizationCost +=
1447 getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
1448 ScalarizationCost +=
1449 getOperandsScalarizationOverhead(Args, ICA.getArgTypes());
1450 }
1451
1452 IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
1453 ScalarizationCost);
1454 return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1455 }
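The fshl/fshr case above costs the expansion named in its comments piece by piece. A rough tally with every component assumed to cost 1 (real targets differ; the two flags below are hypothetical):

#include <cassert>

int main() {
  bool VariableShiftAmount = true;  // assume Z is not a constant
  bool IsRotate = false;            // assume X != Y, i.e. a true funnel shift
  int Cost = 0;
  Cost += 1 /* or */ + 1 /* sub */ + 1 /* shl */ + 1 /* lshr */;
  if (VariableShiftAmount)
    Cost += 1;                      // urem by the bit width
  if (!IsRotate)
    Cost += 1 /* icmp: shift amount == 0 */ + 1 /* select */;
  assert(Cost == 7);
  return 0;
}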
1456
1457 /// Get intrinsic cost based on argument types.
1458 /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the
1459 /// cost of scalarizing the arguments and the return value will be computed
1460 /// based on types.
1461 InstructionCost
1462 getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1463 TTI::TargetCostKind CostKind) {
1464 Intrinsic::ID IID = ICA.getID();
1465 Type *RetTy = ICA.getReturnType();
1466 const SmallVectorImpl<Type *> &Tys = ICA.getArgTypes();
1467 FastMathFlags FMF = ICA.getFlags();
1468 InstructionCost ScalarizationCostPassed = ICA.getScalarizationCost();
1469 bool SkipScalarizationCost = ICA.skipScalarizationCost();
1470
1471 VectorType *VecOpTy = nullptr;
1472 if (!Tys.empty()) {
1473 // The vector reduction operand is operand 0 except for fadd/fmul.
1474 // Their operand 0 is a scalar start value, so the vector op is operand 1.
1475 unsigned VecTyIndex = 0;
1476 if (IID == Intrinsic::vector_reduce_fadd ||
1477 IID == Intrinsic::vector_reduce_fmul)
1478 VecTyIndex = 1;
1479 assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes");
1480 VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]);
1481 }
1482
1483 // Library call cost - other than size, make it expensive.
1484 unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10;
1485 SmallVector<unsigned, 2> ISDs;
1486 switch (IID) {
1487 default: {
1488 // Scalable vectors cannot be scalarized, so return Invalid.
1489 if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
1490 return isa<ScalableVectorType>(Ty);
1491 }))
1492 return InstructionCost::getInvalid();
1493
1494 // Assume that we need to scalarize this intrinsic.
1495 InstructionCost ScalarizationCost =
1496 SkipScalarizationCost ? ScalarizationCostPassed : 0;
1497 unsigned ScalarCalls = 1;
1498 Type *ScalarRetTy = RetTy;
1499 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
1500 if (!SkipScalarizationCost)
1501 ScalarizationCost = getScalarizationOverhead(RetVTy, true, false);
1502 ScalarCalls = std::max(ScalarCalls,
1503 cast<FixedVectorType>(RetVTy)->getNumElements());
1504 ScalarRetTy = RetTy->getScalarType();
1505 }
1506 SmallVector<Type *, 4> ScalarTys;
1507 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1508 Type *Ty = Tys[i];
1509 if (auto *VTy = dyn_cast<VectorType>(Ty)) {
1510 if (!SkipScalarizationCost)
1511 ScalarizationCost += getScalarizationOverhead(VTy, false, true);
1512 ScalarCalls = std::max(ScalarCalls,
1513 cast<FixedVectorType>(VTy)->getNumElements());
1514 Ty = Ty->getScalarType();
1515 }
1516 ScalarTys.push_back(Ty);
1517 }
1518 if (ScalarCalls == 1)
1519 return 1; // Return cost of a scalar intrinsic. Assume it to be cheap.
1520
1521 IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF);
1522 InstructionCost ScalarCost =
1523 thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind);
1524
1525 return ScalarCalls * ScalarCost + ScalarizationCost;
1526 }
1527 // Look for intrinsics that can be lowered directly or turned into a scalar
1528 // intrinsic call.
1529 case Intrinsic::sqrt:
1530 ISDs.push_back(ISD::FSQRT);
1531 break;
1532 case Intrinsic::sin:
1533 ISDs.push_back(ISD::FSIN);
1534 break;
1535 case Intrinsic::cos:
1536 ISDs.push_back(ISD::FCOS);
1537 break;
1538 case Intrinsic::exp:
1539 ISDs.push_back(ISD::FEXP);
1540 break;
1541 case Intrinsic::exp2:
1542 ISDs.push_back(ISD::FEXP2);
1543 break;
1544 case Intrinsic::log:
1545 ISDs.push_back(ISD::FLOG);
1546 break;
1547 case Intrinsic::log10:
1548 ISDs.push_back(ISD::FLOG10);
1549 break;
1550 case Intrinsic::log2:
1551 ISDs.push_back(ISD::FLOG2);
1552 break;
1553 case Intrinsic::fabs:
1554 ISDs.push_back(ISD::FABS);
1555 break;
1556 case Intrinsic::canonicalize:
1557 ISDs.push_back(ISD::FCANONICALIZE);
1558 break;
1559 case Intrinsic::minnum:
1560 ISDs.push_back(ISD::FMINNUM);
1561 break;
1562 case Intrinsic::maxnum:
1563 ISDs.push_back(ISD::FMAXNUM);
1564 break;
1565 case Intrinsic::minimum:
1566 ISDs.push_back(ISD::FMINIMUM);
1567 break;
1568 case Intrinsic::maximum:
1569 ISDs.push_back(ISD::FMAXIMUM);
1570 break;
1571 case Intrinsic::copysign:
1572 ISDs.push_back(ISD::FCOPYSIGN);
1573 break;
1574 case Intrinsic::floor:
1575 ISDs.push_back(ISD::FFLOOR);
1576 break;
1577 case Intrinsic::ceil:
1578 ISDs.push_back(ISD::FCEIL);
1579 break;
1580 case Intrinsic::trunc:
1581 ISDs.push_back(ISD::FTRUNC);
1582 break;
1583 case Intrinsic::nearbyint:
1584 ISDs.push_back(ISD::FNEARBYINT);
1585 break;
1586 case Intrinsic::rint:
1587 ISDs.push_back(ISD::FRINT);
1588 break;
1589 case Intrinsic::round:
1590 ISDs.push_back(ISD::FROUND);
1591 break;
1592 case Intrinsic::roundeven:
1593 ISDs.push_back(ISD::FROUNDEVEN);
1594 break;
1595 case Intrinsic::pow:
1596 ISDs.push_back(ISD::FPOW);
1597 break;
1598 case Intrinsic::fma:
1599 ISDs.push_back(ISD::FMA);
1600 break;
1601 case Intrinsic::fmuladd:
1602 ISDs.push_back(ISD::FMA);
1603 break;
1604 case Intrinsic::experimental_constrained_fmuladd:
1605 ISDs.push_back(ISD::STRICT_FMA);
1606 break;
1607 // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
1608 case Intrinsic::lifetime_start:
1609 case Intrinsic::lifetime_end:
1610 case Intrinsic::sideeffect:
1611 case Intrinsic::pseudoprobe:
1612 case Intrinsic::arithmetic_fence:
1613 return 0;
1614 case Intrinsic::masked_store: {
1615 Type *Ty = Tys[0];
1616 Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
1617 return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0,
1618 CostKind);
1619 }
1620 case Intrinsic::masked_load: {
1621 Type *Ty = RetTy;
1622 Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
1623 return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0,
1624 CostKind);
1625 }
1626 case Intrinsic::vector_reduce_add:
1627 return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy,
1628 /*IsPairwiseForm=*/false,
1629 CostKind);
1630 case Intrinsic::vector_reduce_mul:
1631 return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy,
1632 /*IsPairwiseForm=*/false,
1633 CostKind);
1634 case Intrinsic::vector_reduce_and:
1635 return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy,
1636 /*IsPairwiseForm=*/false,
1637 CostKind);
1638 case Intrinsic::vector_reduce_or:
1639 return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy,
1640 /*IsPairwiseForm=*/false,
1641 CostKind);
1642 case Intrinsic::vector_reduce_xor:
1643 return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy,
1644 /*IsPairwiseForm=*/false,
1645 CostKind);
1646 case Intrinsic::vector_reduce_fadd:
1647 // FIXME: Add new flag for cost of strict reductions.
1648 return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy,
1649 /*IsPairwiseForm=*/false,
1650 CostKind);
1651 case Intrinsic::vector_reduce_fmul:
1652 // FIXME: Add new flag for cost of strict reductions.
1653 return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy,
1654 /*IsPairwiseForm=*/false,
1655 CostKind);
1656 case Intrinsic::vector_reduce_smax:
1657 case Intrinsic::vector_reduce_smin:
1658 case Intrinsic::vector_reduce_fmax:
1659 case Intrinsic::vector_reduce_fmin:
1660 return thisT()->getMinMaxReductionCost(
1661 VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)),
1662 /*IsPairwiseForm=*/false,
1663 /*IsUnsigned=*/false, CostKind);
1664 case Intrinsic::vector_reduce_umax:
1665 case Intrinsic::vector_reduce_umin:
1666 return thisT()->getMinMaxReductionCost(
1667 VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)),
1668 /*IsPairwiseForm=*/false,
1669 /*IsUnsigned=*/true, CostKind);
1670 case Intrinsic::abs:
1671 case Intrinsic::smax:
1672 case Intrinsic::smin:
1673 case Intrinsic::umax:
1674 case Intrinsic::umin: {
1675 // abs(X) = select(icmp(X,0),X,sub(0,X))
1676 // minmax(X,Y) = select(icmp(X,Y),X,Y)
1677 Type *CondTy = RetTy->getWithNewBitWidth(1);
1678 InstructionCost Cost = 0;
1679 // TODO: Ideally getCmpSelInstrCost would accept an icmp condition code.
1680 Cost +=
1681 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1682 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1683 Cost +=
1684 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1685 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1686 // TODO: Should we add an OperandValueProperties::OP_Zero property?
1687 if (IID == Intrinsic::abs)
1688 Cost += thisT()->getArithmeticInstrCost(
1689 BinaryOperator::Sub, RetTy, CostKind, TTI::OK_UniformConstantValue);
1690 return Cost;
1691 }
1692 case Intrinsic::sadd_sat:
1693 case Intrinsic::ssub_sat: {
1694 Type *CondTy = RetTy->getWithNewBitWidth(1);
1695
1696 Type *OpTy = StructType::create({RetTy, CondTy});
1697 Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat
1698 ? Intrinsic::sadd_with_overflow
1699 : Intrinsic::ssub_with_overflow;
1700
1701 // SatMax -> Overflow && SumDiff < 0
1702 // SatMin -> Overflow && SumDiff >= 0
1703 InstructionCost Cost = 0;
1704 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
1705 nullptr, ScalarizationCostPassed);
1706 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
1707 Cost +=
1708 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1709 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1710 Cost += 2 * thisT()->getCmpSelInstrCost(
1711 BinaryOperator::Select, RetTy, CondTy,
1712 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1713 return Cost;
1714 }
1715 case Intrinsic::uadd_sat:
1716 case Intrinsic::usub_sat: {
1717 Type *CondTy = RetTy->getWithNewBitWidth(1);
1718
1719 Type *OpTy = StructType::create({RetTy, CondTy});
1720 Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat
1721 ? Intrinsic::uadd_with_overflow
1722 : Intrinsic::usub_with_overflow;
1723
1724 InstructionCost Cost = 0;
1725 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
1726 nullptr, ScalarizationCostPassed);
1727 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
1728 Cost +=
1729 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1730 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1731 return Cost;
1732 }
1733 case Intrinsic::smul_fix:
1734 case Intrinsic::umul_fix: {
1735 unsigned ExtSize = RetTy->getScalarSizeInBits() * 2;
1736 Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize);
1737
1738 unsigned ExtOp =
1739 IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt;
1740 TTI::CastContextHint CCH = TTI::CastContextHint::None;
1741
1742 InstructionCost Cost = 0;
1743 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind);
1744 Cost +=
1745 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
1746 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy,
1747 CCH, CostKind);
1748 Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, RetTy,
1749 CostKind, TTI::OK_AnyValue,
1750 TTI::OK_UniformConstantValue);
1751 Cost += thisT()->getArithmeticInstrCost(Instruction::Shl, RetTy, CostKind,
1752 TTI::OK_AnyValue,
1753 TTI::OK_UniformConstantValue);
1754 Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind);
1755 return Cost;
1756 }
1757 case Intrinsic::sadd_with_overflow:
1758 case Intrinsic::ssub_with_overflow: {
1759 Type *SumTy = RetTy->getContainedType(0);
1760 Type *OverflowTy = RetTy->getContainedType(1);
1761 unsigned Opcode = IID == Intrinsic::sadd_with_overflow
1762 ? BinaryOperator::Add
1763 : BinaryOperator::Sub;
1764
1765 // LHSSign -> LHS >= 0
1766 // RHSSign -> RHS >= 0
1767 // SumSign -> Sum >= 0
1768 //
1769 // Add:
1770 // Overflow -> (LHSSign == RHSSign) && (LHSSign != SumSign)
1771 // Sub:
1772 // Overflow -> (LHSSign != RHSSign) && (LHSSign != SumSign)
1773 InstructionCost Cost = 0;
1774 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
1775 Cost += 3 * thisT()->getCmpSelInstrCost(
1776 Instruction::ICmp, SumTy, OverflowTy,
1777 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1778 Cost += 2 * thisT()->getCmpSelInstrCost(
1779 Instruction::Select, OverflowTy, OverflowTy,
1780 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1781 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, OverflowTy,
1782 CostKind);
1783 return Cost;
1784 }
1785 case Intrinsic::uadd_with_overflow:
1786 case Intrinsic::usub_with_overflow: {
1787 Type *SumTy = RetTy->getContainedType(0);
1788 Type *OverflowTy = RetTy->getContainedType(1);
1789 unsigned Opcode = IID == Intrinsic::uadd_with_overflow
1790 ? BinaryOperator::Add
1791 : BinaryOperator::Sub;
1792
1793 InstructionCost Cost = 0;
1794 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
1795 Cost +=
1796 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, OverflowTy,
1797 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1798 return Cost;
1799 }
1800 case Intrinsic::smul_with_overflow:
1801 case Intrinsic::umul_with_overflow: {
1802 Type *MulTy = RetTy->getContainedType(0);
1803 Type *OverflowTy = RetTy->getContainedType(1);
1804 unsigned ExtSize = MulTy->getScalarSizeInBits() * 2;
1805 Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize);
1806
1807 unsigned ExtOp =
1808 IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt;
1809 TTI::CastContextHint CCH = TTI::CastContextHint::None;
1810
1811 InstructionCost Cost = 0;
1812 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind);
1813 Cost +=
1814 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
1815 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy,
1816 CCH, CostKind);
1817 Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, MulTy,
1818 CostKind, TTI::OK_AnyValue,
1819 TTI::OK_UniformConstantValue);
1820
1821 if (IID == Intrinsic::smul_with_overflow)
1822 Cost += thisT()->getArithmeticInstrCost(Instruction::AShr, MulTy,
1823 CostKind, TTI::OK_AnyValue,
1824 TTI::OK_UniformConstantValue);
1825
1826 Cost +=
1827 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, MulTy, OverflowTy,
1828 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1829 return Cost;
1830 }
1831 case Intrinsic::ctpop:
1832 ISDs.push_back(ISD::CTPOP);
1833 // In case of legalization use TCC_Expensive. This is cheaper than a
1834 // library call but still not a cheap instruction.
1835 SingleCallCost = TargetTransformInfo::TCC_Expensive;
1836 break;
1837 case Intrinsic::ctlz:
1838 ISDs.push_back(ISD::CTLZ);
1839 break;
1840 case Intrinsic::cttz:
1841 ISDs.push_back(ISD::CTTZ);
1842 break;
1843 case Intrinsic::bswap:
1844 ISDs.push_back(ISD::BSWAP);
1845 break;
1846 case Intrinsic::bitreverse:
1847 ISDs.push_back(ISD::BITREVERSE);
1848 break;
1849 }
1850
1851 const TargetLoweringBase *TLI = getTLI();
1852 std::pair<InstructionCost, MVT> LT =
1853 TLI->getTypeLegalizationCost(DL, RetTy);
1854
1855 SmallVector<InstructionCost, 2> LegalCost;
1856 SmallVector<InstructionCost, 2> CustomCost;
1857 for (unsigned ISD : ISDs) {
1858 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
1859 if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() &&
1860 TLI->isFAbsFree(LT.second)) {
1861 return 0;
1862 }
1863
1864 // The operation is legal. Assume it costs 1.
1865 // If the type is split to multiple registers, assume that there is some
1866 // overhead to this.
1867 // TODO: Once we have extract/insert subvector cost we need to use them.
1868 if (LT.first > 1)
1869 LegalCost.push_back(LT.first * 2);
1870 else
1871 LegalCost.push_back(LT.first * 1);
1872 } else if (!TLI->isOperationExpand(ISD, LT.second)) {
1873 // If the operation is custom lowered then assume
1874 // that the code is twice as expensive.
1875 CustomCost.push_back(LT.first * 2);
1876 }
1877 }
1878
1879 auto *MinLegalCostI = std::min_element(LegalCost.begin(), LegalCost.end());
1880 if (MinLegalCostI != LegalCost.end())
1881 return *MinLegalCostI;
1882
1883 auto MinCustomCostI =
1884 std::min_element(CustomCost.begin(), CustomCost.end());
1885 if (MinCustomCostI != CustomCost.end())
1886 return *MinCustomCostI;
1887
1888 // If we can't lower fmuladd into an FMA estimate the cost as a floating
1889 // point mul followed by an add.
1890 if (IID == Intrinsic::fmuladd)
1891 return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy,
1892 CostKind) +
1893 thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy,
1894 CostKind);
1895 if (IID == Intrinsic::experimental_constrained_fmuladd) {
1896 IntrinsicCostAttributes FMulAttrs(
1897 Intrinsic::experimental_constrained_fmul, RetTy, Tys);
1898 IntrinsicCostAttributes FAddAttrs(
1899 Intrinsic::experimental_constrained_fadd, RetTy, Tys);
1900 return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) +
1901 thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind);
1902 }
1903
1904 // Else, assume that we need to scalarize this intrinsic. For math builtins
1905 // this will emit a costly libcall, adding call overhead and spills. Make it
1906 // very expensive.
1907 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
1908 // Scalable vectors cannot be scalarized, so return Invalid.
1909 if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
1910 return isa<ScalableVectorType>(Ty);
1911 }))
1912 return InstructionCost::getInvalid();
1913
1914 InstructionCost ScalarizationCost =
1915 SkipScalarizationCost ? ScalarizationCostPassed
1916 : getScalarizationOverhead(RetVTy, true, false);
1917
1918 unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements();
1919 SmallVector<Type *, 4> ScalarTys;
1920 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1921 Type *Ty = Tys[i];
1922 if (Ty->isVectorTy())
1923 Ty = Ty->getScalarType();
1924 ScalarTys.push_back(Ty);
1925 }
1926 IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF);
1927 InstructionCost ScalarCost =
1928 thisT()->getIntrinsicInstrCost(Attrs, CostKind);
1929 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1930 if (auto *VTy = dyn_cast<VectorType>(Tys[i])) {
1931 if (!ICA.skipScalarizationCost())
1932 ScalarizationCost += getScalarizationOverhead(VTy, false, true);
1933 ScalarCalls = std::max(ScalarCalls,
1934 cast<FixedVectorType>(VTy)->getNumElements());
1935 }
1936 }
1937 return ScalarCalls * ScalarCost + ScalarizationCost;
1938 }
1939
1940 // This is going to be turned into a library call, make it expensive.
1941 return SingleCallCost;
1942 }
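The final fallback above scalarizes into per-lane libcalls: ScalarCalls * ScalarCost + ScalarizationCost. A sketch with assumed numbers for a hypothetical 4-lane math intrinsic:

#include <cassert>

int main() {
  unsigned ScalarCalls = 4;   // lanes of an assumed <4 x float> return type
  int ScalarCost = 10;        // the SingleCallCost-style "expensive" libcall
  int ScalarizationCost = 4 /* extract the argument lanes */ +
                          4 /* insert the result lanes */;
  int Total = ScalarCalls * ScalarCost + ScalarizationCost;
  assert(Total == 48);
  return 0;
}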
1943
1944 /// Compute a cost of the given call instruction.
1945 ///
1946 /// Compute the cost of calling function F with return type RetTy and
1947 /// argument types Tys. F might be nullptr, in this case the cost of an
1948 /// arbitrary call with the specified signature will be returned.
1949 /// This is used, for instance, when we estimate call of a vector
1950 /// counterpart of the given function.
1951 /// \param F Called function, might be nullptr.
1952 /// \param RetTy Return value types.
1953 /// \param Tys Argument types.
1954 /// \returns The cost of Call instruction.
1955 InstructionCost
1956 getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys,
1957 TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency) {
1958 return 10;
1959 }
1960
1961 unsigned getNumberOfParts(Type *Tp) {
1962 std::pair<InstructionCost, MVT> LT =
1963 getTLI()->getTypeLegalizationCost(DL, Tp);
1964 return *LT.first.getValue();
1965 }
1966
1967 InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *,
1968 const SCEV *) {
1969 return 0;
1970 }
1971
1972 /// Try to calculate arithmetic and shuffle op costs for reduction operations.
1973 /// We're assuming that reduction operations are performed in the following way:
1974 /// 1. Non-pairwise reduction
1975 /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
1976 /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n, i32 undef, ..., i32 undef>
1977 /// \----------------v-------------/ \----------v------------/
1978 /// n/2 elements n/2 elements
1979 /// %red1 = op <n x t> %val, <n x t> %val1
1980 /// After this operation we have a vector %red1 where only the first n/2
1981 /// elements are meaningful, the second n/2 elements are undefined and can be
1982 /// dropped. All other operations are actually working with the vector of
1983 /// length n/2, not n, though the real vector length is still n.
1984 /// %val2 = shufflevector<n x t> %red1, <n x t> %undef,
1985 /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2, i32 undef, ..., i32 undef>
1986 /// \----------------v-------------/ \----------v------------/
1987 /// n/4 elements 3*n/4 elements
1988 /// %red2 = op <n x t> %red1, <n x t> %val2 - working with the vector of
1989 /// length n/2, the resulting vector has length n/4 etc.
1990 /// 2. Pairwise reduction:
1991 /// Everything is the same except for an additional shuffle operation which
1992 /// is used to produce operands for pairwise kind of reductions.
1993 /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
1994 /// <n x i32> <i32 0, i32 2, ..., i32 n-2, i32 undef, ..., i32 undef>
1995 /// \-------------v----------/ \----------v------------/
1996 /// n/2 elements n/2 elements
1997 /// %val2 = shufflevector<n x t> %val, <n x t> %undef,
1998 /// <n x i32> <i32 1, i32 3, ..., i32 n-1, i32 undef, ..., i32 undef>
1999 /// \-------------v----------/ \----------v------------/
2000 /// n/2 elements n/2 elements
2001 /// %red1 = op <n x t> %val1, <n x t> %val2
2002 /// Again, the operation is performed on <n x t> vector, but the resulting
2003 /// vector %red1 is <n/2 x t> vector.
2004 ///
2005 /// The cost model should take into account that the actual length of the
2006 /// vector is reduced on each iteration.
2007 InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
2008 bool IsPairwise,
2009 TTI::TargetCostKind CostKind) {
2010 Type *ScalarTy = Ty->getElementType();
2011 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2012 if ((Opcode == Instruction::Or || Opcode == Instruction::And) &&
2013 ScalarTy == IntegerType::getInt1Ty(Ty->getContext()) &&
2014 NumVecElts >= 2) {
2015 // Or reduction for i1 is represented as:
2016 // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2017 // %res = cmp ne iReduxWidth %val, 0
2018 // And reduction for i1 is represented as:
2019 // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2020 // %res = cmp eq iReduxWidth %val, 11111
2021 Type *ValTy = IntegerType::get(Ty->getContext(), NumVecElts);
2022 return thisT()->getCastInstrCost(Instruction::BitCast, ValTy, Ty,
2023 TTI::CastContextHint::None, CostKind) +
2024 thisT()->getCmpSelInstrCost(Instruction::ICmp, ValTy,
2025 CmpInst::makeCmpResultType(ValTy),
2026 CmpInst::BAD_ICMP_PREDICATE, CostKind);
2027 }
2028 unsigned NumReduxLevels = Log2_32(NumVecElts);
2029 InstructionCost ArithCost = 0;
2030 InstructionCost ShuffleCost = 0;
2031 std::pair<InstructionCost, MVT> LT =
2032 thisT()->getTLI()->getTypeLegalizationCost(DL, Ty);
2033 unsigned LongVectorCount = 0;
2034 unsigned MVTLen =
2035 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2036 while (NumVecElts > MVTLen) {
2037 NumVecElts /= 2;
2038 VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2039 // Assume the pairwise shuffles add a cost.
2040 ShuffleCost += (IsPairwise + 1) *
2041 thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None,
2042 NumVecElts, SubTy);
2043 ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind);
2044 Ty = SubTy;
2045 ++LongVectorCount;
2046 }
2047
2048 NumReduxLevels -= LongVectorCount;
2049
2050 // The minimal length of the vector is limited by the real length of vector
2051 // operations performed on the current platform. That's why several final
2052 // reduction operations are performed on the vectors with the same
2053 // architecture-dependent length.
2054
2055 // Non pairwise reductions need one shuffle per reduction level. Pairwise
2056 // reductions need two shuffles on every level, but the last one. On that
2057 // level one of the shuffles is <0, u, u, ...> which is identity.
2058 unsigned NumShuffles = NumReduxLevels;
2059 if (IsPairwise && NumReduxLevels >= 1)
2060 NumShuffles += NumReduxLevels - 1;
2061 ShuffleCost += NumShuffles * thisT()->getShuffleCost(
2062 TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty);
2063 ArithCost += NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty);
2064 return ShuffleCost + ArithCost +
2065 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
2066 }
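The tree-reduction costing above can be traced with concrete numbers. The sketch below replays it for a non-pairwise add reduction of an assumed 16-element vector whose legal register holds 4 elements, with every shuffle, arithmetic op, and extract assumed to cost 1:

#include <cassert>

int main() {
  unsigned NumVecElts = 16, MVTLen = 4;
  bool IsPairwise = false;
  unsigned NumReduxLevels = 4;          // Log2_32(16)
  int ShuffleCost = 0, ArithCost = 0;
  unsigned LongVectorCount = 0;
  while (NumVecElts > MVTLen) {         // split 16 -> 8 -> 4
    NumVecElts /= 2;
    ShuffleCost += (IsPairwise + 1) * 1;
    ArithCost += 1;
    ++LongVectorCount;
  }
  NumReduxLevels -= LongVectorCount;    // 2 in-register levels remain
  unsigned NumShuffles =
      NumReduxLevels + (IsPairwise && NumReduxLevels >= 1 ? NumReduxLevels - 1 : 0);
  ShuffleCost += NumShuffles * 1;
  ArithCost += NumReduxLevels * 1;
  int Total = ShuffleCost + ArithCost + 1;  // plus the final extractelement
  assert(Total == 9);
  return 0;
}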
2067
2068 /// Try to calculate op costs for min/max reduction operations.
2069 /// \param CondTy Conditional type for the Select instruction.
2070 InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
2071 bool IsPairwise, bool IsUnsigned,
2072 TTI::TargetCostKind CostKind) {
2073 Type *ScalarTy = Ty->getElementType();
2074 Type *ScalarCondTy = CondTy->getElementType();
2075 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2076 unsigned NumReduxLevels = Log2_32(NumVecElts);
2077 unsigned CmpOpcode;
2078 if (Ty->isFPOrFPVectorTy()) {
2079 CmpOpcode = Instruction::FCmp;
2080 } else {
2081 assert(Ty->isIntOrIntVectorTy() &&
2082        "expecting floating point or integer type for min/max reduction");
2083 CmpOpcode = Instruction::ICmp;
2084 }
2085 InstructionCost MinMaxCost = 0;
2086 InstructionCost ShuffleCost = 0;
2087 std::pair<InstructionCost, MVT> LT =
2088 thisT()->getTLI()->getTypeLegalizationCost(DL, Ty);
2089 unsigned LongVectorCount = 0;
2090 unsigned MVTLen =
2091 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2092 while (NumVecElts > MVTLen) {
2093 NumVecElts /= 2;
2094 auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2095 CondTy = FixedVectorType::get(ScalarCondTy, NumVecElts);
2096
2097 // Assume the pairwise shuffles add a cost.
2098 ShuffleCost += (IsPairwise + 1) *
2099 thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None,
2100 NumVecElts, SubTy);
2101 MinMaxCost +=
2102 thisT()->getCmpSelInstrCost(CmpOpcode, SubTy, CondTy,
2103 CmpInst::BAD_ICMP_PREDICATE, CostKind) +
2104 thisT()->getCmpSelInstrCost(Instruction::Select, SubTy, CondTy,
2105 CmpInst::BAD_ICMP_PREDICATE, CostKind);
2106 Ty = SubTy;
2107 ++LongVectorCount;
2108 }
2109
2110 NumReduxLevels -= LongVectorCount;
2111
2112 // The minimal length of the vector is limited by the real length of vector
2113 // operations performed on the current platform. That's why several final
2114 // reduction operations are performed on the vectors with the same
2115 // architecture-dependent length.
2116
2117 // Non pairwise reductions need one shuffle per reduction level. Pairwise
2118 // reductions need two shuffles on every level, but the last one. On that
2119 // level one of the shuffles is <0, u, u, ...> which is identity.
2120 unsigned NumShuffles = NumReduxLevels;
2121 if (IsPairwise && NumReduxLevels >= 1)
2122 NumShuffles += NumReduxLevels - 1;
2123 ShuffleCost += NumShuffles * thisT()->getShuffleCost(
2124 TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty);
2125 MinMaxCost +=
2126 NumReduxLevels *
2127 (thisT()->getCmpSelInstrCost(CmpOpcode, Ty, CondTy,
2128 CmpInst::BAD_ICMP_PREDICATE, CostKind) +
2129 thisT()->getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
2130 CmpInst::BAD_ICMP_PREDICATE, CostKind));
2131 // The last min/max should be in vector registers and we counted it above.
2132 // So just need a single extractelement.
2133 return ShuffleCost + MinMaxCost +
2134 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
2135 }
2136
2137 InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
2138 Type *ResTy, VectorType *Ty,
2139 TTI::TargetCostKind CostKind) {
2140 // Without any native support, this is equivalent to the cost of
2141 // vecreduce.add(ext) or if IsMLA vecreduce.add(mul(ext, ext))
2142 VectorType *ExtTy = VectorType::get(ResTy, Ty);
2143 InstructionCost RedCost = thisT()->getArithmeticReductionCost(
2144 Instruction::Add, ExtTy, false, CostKind);
2145 InstructionCost MulCost = 0;
2146 InstructionCost ExtCost = thisT()->getCastInstrCost(
2147 IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
2148 TTI::CastContextHint::None, CostKind);
2149 if (IsMLA) {
2150 MulCost =
2151 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2152 ExtCost *= 2;
2153 }
2154
2155 return RedCost + MulCost + ExtCost;
2156 }
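getExtendedAddReductionCost above is just a sum of three pieces. A minimal arithmetic restatement with assumed unit costs:

#include <cassert>

int main() {
  bool IsMLA = true;   // assume vecreduce.add(mul(ext(a), ext(b)))
  int RedCost = 3;     // assumed cost of vecreduce.add on the widened type
  int ExtCost = 1;     // assumed cost of one zext/sext
  int MulCost = 0;
  if (IsMLA) {
    MulCost = 1;       // assumed widened multiply
    ExtCost *= 2;      // both operands get extended
  }
  assert(RedCost + MulCost + ExtCost == 6);
  return 0;
}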
2157
2158 InstructionCost getVectorSplitCost() { return 1; }
2159
2160 /// @}
2161};
2162
2163/// Concrete BasicTTIImpl that can be used if no further customization
2164/// is needed.
2165class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
2166 using BaseT = BasicTTIImplBase<BasicTTIImpl>;
2167
2168 friend class BasicTTIImplBase<BasicTTIImpl>;
2169
2170 const TargetSubtargetInfo *ST;
2171 const TargetLoweringBase *TLI;
2172
2173 const TargetSubtargetInfo *getST() const { return ST; }
2174 const TargetLoweringBase *getTLI() const { return TLI; }
2175
2176public:
2177 explicit BasicTTIImpl(const TargetMachine *TM, const Function &F);
2178};
2179
2180} // end namespace llvm
2181
2182#endif // LLVM_CODEGEN_BASICTTIIMPL_H

/build/llvm-toolchain-snapshot-13~++20210705111146+4aaf87875039/llvm/include/llvm/Support/MachineValueType.h

1//===- Support/MachineValueType.h - Machine-Level types ---------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the set of machine-level target independent types which
10// legal values in the code generator use.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_SUPPORT_MACHINEVALUETYPE_H
15#define LLVM_SUPPORT_MACHINEVALUETYPE_H
16
17#include "llvm/ADT/iterator_range.h"
18#include "llvm/Support/ErrorHandling.h"
19#include "llvm/Support/MathExtras.h"
20#include "llvm/Support/TypeSize.h"
21#include <cassert>
22
23namespace llvm {
24
25 class Type;
26
27 /// Machine Value Type. Every type that is supported natively by some
28 /// processor targeted by LLVM occurs here. This means that any legal value
29 /// type can be represented by an MVT.
30 class MVT {
31 public:
32 enum SimpleValueType : uint8_t {
33 // clang-format off
34
35 // Simple value types that aren't explicitly part of this enumeration
36 // are considered extended value types.
37 INVALID_SIMPLE_VALUE_TYPE = 0,
38
39 // If you change this numbering, you must change the values in
40 // ValueTypes.td as well!
41 Other = 1, // This is a non-standard value
42 i1 = 2, // This is a 1 bit integer value
43 i8 = 3, // This is an 8 bit integer value
44 i16 = 4, // This is a 16 bit integer value
45 i32 = 5, // This is a 32 bit integer value
46 i64 = 6, // This is a 64 bit integer value
47 i128 = 7, // This is a 128 bit integer value
48
49 FIRST_INTEGER_VALUETYPE = i1,
50 LAST_INTEGER_VALUETYPE = i128,
51
52 bf16 = 8, // This is a 16 bit brain floating point value
53 f16 = 9, // This is a 16 bit floating point value
54 f32 = 10, // This is a 32 bit floating point value
55 f64 = 11, // This is a 64 bit floating point value
56 f80 = 12, // This is an 80 bit floating point value
57 f128 = 13, // This is a 128 bit floating point value
58 ppcf128 = 14, // This is a PPC 128-bit floating point value
59
60 FIRST_FP_VALUETYPE = bf16,
61 LAST_FP_VALUETYPE = ppcf128,
62
63 v1i1 = 15, // 1 x i1
64 v2i1 = 16, // 2 x i1
65 v4i1 = 17, // 4 x i1
66 v8i1 = 18, // 8 x i1
67 v16i1 = 19, // 16 x i1
68 v32i1 = 20, // 32 x i1
69 v64i1 = 21, // 64 x i1
70 v128i1 = 22, // 128 x i1
71 v256i1 = 23, // 256 x i1
72 v512i1 = 24, // 512 x i1
73 v1024i1 = 25, // 1024 x i1
74
75 v1i8 = 26, // 1 x i8
76 v2i8 = 27, // 2 x i8
77 v4i8 = 28, // 4 x i8
78 v8i8 = 29, // 8 x i8
79 v16i8 = 30, // 16 x i8
80 v32i8 = 31, // 32 x i8
81 v64i8 = 32, // 64 x i8
82 v128i8 = 33, // 128 x i8
83 v256i8 = 34, // 256 x i8
84 v512i8 = 35, // 512 x i8
85 v1024i8 = 36, // 1024 x i8
86
87 v1i16 = 37, // 1 x i16
88 v2i16 = 38, // 2 x i16
89 v3i16 = 39, // 3 x i16
90 v4i16 = 40, // 4 x i16
91 v8i16 = 41, // 8 x i16
92 v16i16 = 42, // 16 x i16
93 v32i16 = 43, // 32 x i16
94 v64i16 = 44, // 64 x i16
95 v128i16 = 45, // 128 x i16
96 v256i16 = 46, // 256 x i16
97 v512i16 = 47, // 512 x i16
98
99 v1i32 = 48, // 1 x i32
100 v2i32 = 49, // 2 x i32
101 v3i32 = 50, // 3 x i32
102 v4i32 = 51, // 4 x i32
103 v5i32 = 52, // 5 x i32
104 v6i32 = 53, // 6 x i32
105 v7i32 = 54, // 7 x i32
106 v8i32 = 55, // 8 x i32
107 v16i32 = 56, // 16 x i32
108 v32i32 = 57, // 32 x i32
109 v64i32 = 58, // 64 x i32
110 v128i32 = 59, // 128 x i32
111 v256i32 = 60, // 256 x i32
112 v512i32 = 61, // 512 x i32
113 v1024i32 = 62, // 1024 x i32
114 v2048i32 = 63, // 2048 x i32
115
116 v1i64 = 64, // 1 x i64
117 v2i64 = 65, // 2 x i64
118 v3i64 = 66, // 3 x i64
119 v4i64 = 67, // 4 x i64
120 v8i64 = 68, // 8 x i64
121 v16i64 = 69, // 16 x i64
122 v32i64 = 70, // 32 x i64
123 v64i64 = 71, // 64 x i64
124 v128i64 = 72, // 128 x i64
125 v256i64 = 73, // 256 x i64
126
127 v1i128 = 74, // 1 x i128
128
129 FIRST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE = v1i1,
130 LAST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE = v1i128,
131
132 v1f16 = 75, // 1 x f16
133 v2f16 = 76, // 2 x f16
134 v3f16 = 77, // 3 x f16
135 v4f16 = 78, // 4 x f16
136 v8f16 = 79, // 8 x f16
137 v16f16 = 80, // 16 x f16
138 v32f16 = 81, // 32 x f16
139 v64f16 = 82, // 64 x f16
140 v128f16 = 83, // 128 x f16
141 v256f16 = 84, // 256 x f16
142 v512f16 = 85, // 512 x f16
143
144 v2bf16 = 86, // 2 x bf16
145 v3bf16 = 87, // 3 x bf16
146 v4bf16 = 88, // 4 x bf16
147 v8bf16 = 89, // 8 x bf16
148 v16bf16 = 90, // 16 x bf16
149 v32bf16 = 91, // 32 x bf16
150 v64bf16 = 92, // 64 x bf16
151 v128bf16 = 93, // 128 x bf16
152
153 v1f32 = 94, // 1 x f32
154 v2f32 = 95, // 2 x f32
155 v3f32 = 96, // 3 x f32
156 v4f32 = 97, // 4 x f32
157 v5f32 = 98, // 5 x f32
158 v6f32 = 99, // 6 x f32
159 v7f32 = 100, // 7 x f32
160 v8f32 = 101, // 8 x f32
161 v16f32 = 102, // 16 x f32
162 v32f32 = 103, // 32 x f32
163 v64f32 = 104, // 64 x f32
164 v128f32 = 105, // 128 x f32
165 v256f32 = 106, // 256 x f32
166 v512f32 = 107, // 512 x f32
167 v1024f32 = 108, // 1024 x f32
168 v2048f32 = 109, // 2048 x f32
169
170 v1f64 = 110, // 1 x f64
171 v2f64 = 111, // 2 x f64
172 v3f64 = 112, // 3 x f64
173 v4f64 = 113, // 4 x f64
174 v8f64 = 114, // 8 x f64
175 v16f64 = 115, // 16 x f64
176 v32f64 = 116, // 32 x f64
177 v64f64 = 117, // 64 x f64
178 v128f64 = 118, // 128 x f64
179 v256f64 = 119, // 256 x f64
180
181 FIRST_FP_FIXEDLEN_VECTOR_VALUETYPE = v1f16,
182 LAST_FP_FIXEDLEN_VECTOR_VALUETYPE = v256f64,
183
184 FIRST_FIXEDLEN_VECTOR_VALUETYPE = v1i1,
185 LAST_FIXEDLEN_VECTOR_VALUETYPE = v256f64,
186
187 nxv1i1 = 120, // n x 1 x i1
188 nxv2i1 = 121, // n x 2 x i1
189 nxv4i1 = 122, // n x 4 x i1
190 nxv8i1 = 123, // n x 8 x i1
191 nxv16i1 = 124, // n x 16 x i1
192 nxv32i1 = 125, // n x 32 x i1
193 nxv64i1 = 126, // n x 64 x i1
194
195 nxv1i8 = 127, // n x 1 x i8
196 nxv2i8 = 128, // n x 2 x i8
197 nxv4i8 = 129, // n x 4 x i8
198 nxv8i8 = 130, // n x 8 x i8
199 nxv16i8 = 131, // n x 16 x i8
200 nxv32i8 = 132, // n x 32 x i8
201 nxv64i8 = 133, // n x 64 x i8
202
203 nxv1i16 = 134, // n x 1 x i16
204 nxv2i16 = 135, // n x 2 x i16
205 nxv4i16 = 136, // n x 4 x i16
206 nxv8i16 = 137, // n x 8 x i16
207 nxv16i16 = 138, // n x 16 x i16
208 nxv32i16 = 139, // n x 32 x i16
209
210 nxv1i32 = 140, // n x 1 x i32
211 nxv2i32 = 141, // n x 2 x i32
212 nxv4i32 = 142, // n x 4 x i32
213 nxv8i32 = 143, // n x 8 x i32
214 nxv16i32 = 144, // n x 16 x i32
215 nxv32i32 = 145, // n x 32 x i32
216
217 nxv1i64 = 146, // n x 1 x i64
218 nxv2i64 = 147, // n x 2 x i64
219 nxv4i64 = 148, // n x 4 x i64
220 nxv8i64 = 149, // n x 8 x i64
221 nxv16i64 = 150, // n x 16 x i64
222 nxv32i64 = 151, // n x 32 x i64
223
224 FIRST_INTEGER_SCALABLE_VECTOR_VALUETYPE = nxv1i1,
225 LAST_INTEGER_SCALABLE_VECTOR_VALUETYPE = nxv32i64,
226
227 nxv1f16 = 152, // n x 1 x f16
228 nxv2f16 = 153, // n x 2 x f16
229 nxv4f16 = 154, // n x 4 x f16
230 nxv8f16 = 155, // n x 8 x f16
231 nxv16f16 = 156, // n x 16 x f16
232 nxv32f16 = 157, // n x 32 x f16
233
234 nxv1bf16 = 158, // n x 1 x bf16
235 nxv2bf16 = 159, // n x 2 x bf16
236 nxv4bf16 = 160, // n x 4 x bf16
237 nxv8bf16 = 161, // n x 8 x bf16
238
239 nxv1f32 = 162, // n x 1 x f32
240 nxv2f32 = 163, // n x 2 x f32
241 nxv4f32 = 164, // n x 4 x f32
242 nxv8f32 = 165, // n x 8 x f32
243 nxv16f32 = 166, // n x 16 x f32
244
245 nxv1f64 = 167, // n x 1 x f64
246 nxv2f64 = 168, // n x 2 x f64
247 nxv4f64 = 169, // n x 4 x f64
248 nxv8f64 = 170, // n x 8 x f64
249
250 FIRST_FP_SCALABLE_VECTOR_VALUETYPE = nxv1f16,
251 LAST_FP_SCALABLE_VECTOR_VALUETYPE = nxv8f64,
252
253 FIRST_SCALABLE_VECTOR_VALUETYPE = nxv1i1,
254 LAST_SCALABLE_VECTOR_VALUETYPE = nxv8f64,
255
256 FIRST_VECTOR_VALUETYPE = v1i1,
257 LAST_VECTOR_VALUETYPE = nxv8f64,
258
259 x86mmx = 171, // This is an X86 MMX value
260
261 Glue = 172, // This glues nodes together during pre-RA sched
262
263 isVoid = 173, // This has no value
264
265 Untyped = 174, // This value takes a register, but has
266 // unspecified type. The register class
267 // will be determined by the opcode.
268
269 funcref = 175, // WebAssembly's funcref type
270 externref = 176, // WebAssembly's externref type
271 x86amx = 177, // This is an X86 AMX value
272
273 FIRST_VALUETYPE = 1, // This is always the beginning of the list.
274 LAST_VALUETYPE = x86amx, // This always remains at the end of the list.
275 VALUETYPE_SIZE = LAST_VALUETYPE + 1,
276
277 // This is the current maximum for LAST_VALUETYPE.
278 // MVT::MAX_ALLOWED_VALUETYPE is used for asserts and to size bit vectors
279 // This value must be a multiple of 32.
280 MAX_ALLOWED_VALUETYPE = 192,
281
282 // A value of type llvm::TokenTy
283 token = 248,
284
285 // This is MDNode or MDString.
286 Metadata = 249,
287
288 // An int value the size of the pointer of the current
289 // target to any address space. This must only be used internal to
290 // tblgen. Other than for overloading, we treat iPTRAny the same as iPTR.
291 iPTRAny = 250,
292
293 // A vector with any length and element size. This is used
294 // for intrinsics that have overloadings based on vector types.
295 // This is only for tblgen's consumption!
296 vAny = 251,
297
298 // Any floating-point or vector floating-point value. This is used
299 // for intrinsics that have overloadings based on floating-point types.
300 // This is only for tblgen's consumption!
301 fAny = 252,
302
303 // An integer or vector integer value of any bit width. This is
304 // used for intrinsics that have overloadings based on integer bit widths.
305 // This is only for tblgen's consumption!
306 iAny = 253,
307
308 // An int value the size of the pointer of the current
309 // target. This should only be used internal to tblgen!
310 iPTR = 254,
311
312 // Any type. This is used for intrinsics that have overloadings.
313 // This is only for tblgen's consumption!
314 Any = 255
315
316 // clang-format on
317 };
318
319 SimpleValueType SimpleTy = INVALID_SIMPLE_VALUE_TYPE;
320
321 constexpr MVT() = default;
322 constexpr MVT(SimpleValueType SVT) : SimpleTy(SVT) {}
323
324 bool operator>(const MVT& S) const { return SimpleTy > S.SimpleTy; }
325 bool operator<(const MVT& S) const { return SimpleTy < S.SimpleTy; }
326 bool operator==(const MVT& S) const { return SimpleTy == S.SimpleTy; }
327 bool operator!=(const MVT& S) const { return SimpleTy != S.SimpleTy; }
328 bool operator>=(const MVT& S) const { return SimpleTy >= S.SimpleTy; }
329 bool operator<=(const MVT& S) const { return SimpleTy <= S.SimpleTy; }
330
331 /// Return true if this is a valid simple valuetype.
332 bool isValid() const {
333 return (SimpleTy >= MVT::FIRST_VALUETYPE &&
334 SimpleTy <= MVT::LAST_VALUETYPE);
335 }
336
337 /// Return true if this is a FP or a vector FP type.
338 bool isFloatingPoint() const {
339 return ((SimpleTy >= MVT::FIRST_FP_VALUETYPE &&
340 SimpleTy <= MVT::LAST_FP_VALUETYPE) ||
341 (SimpleTy >= MVT::FIRST_FP_FIXEDLEN_VECTOR_VALUETYPE &&
342 SimpleTy <= MVT::LAST_FP_FIXEDLEN_VECTOR_VALUETYPE) ||
343 (SimpleTy >= MVT::FIRST_FP_SCALABLE_VECTOR_VALUETYPE &&
344 SimpleTy <= MVT::LAST_FP_SCALABLE_VECTOR_VALUETYPE));
345 }
346
347 /// Return true if this is an integer or a vector integer type.
348 bool isInteger() const {
349 return ((SimpleTy >= MVT::FIRST_INTEGER_VALUETYPE &&
350 SimpleTy <= MVT::LAST_INTEGER_VALUETYPE) ||
351 (SimpleTy >= MVT::FIRST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE &&
352 SimpleTy <= MVT::LAST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE) ||
353 (SimpleTy >= MVT::FIRST_INTEGER_SCALABLE_VECTOR_VALUETYPE &&
354 SimpleTy <= MVT::LAST_INTEGER_SCALABLE_VECTOR_VALUETYPE));
355 }
356
357 /// Return true if this is an integer, not including vectors.
358 bool isScalarInteger() const {
359 return (SimpleTy >= MVT::FIRST_INTEGER_VALUETYPE &&
360 SimpleTy <= MVT::LAST_INTEGER_VALUETYPE);
361   }
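The three predicates above are plain range checks over SimpleValueType. A minimal illustrative sketch of how they classify a few types, assuming the LLVM headers are on the include path; it is not part of the analyzed translation unit:

// Illustrative sketch (assumes LLVM headers); exercises the classification
// predicates defined above.
#include "llvm/Support/MachineValueType.h"
#include <cassert>

int main() {
  using llvm::MVT;
  assert(MVT(MVT::f64).isFloatingPoint());      // scalar FP
  assert(MVT(MVT::v8f16).isFloatingPoint());    // FP vectors count as FP
  assert(MVT(MVT::i32).isInteger());            // scalar integer
  assert(MVT(MVT::nxv4i32).isInteger());        // integer vectors count too
  assert(MVT(MVT::i32).isScalarInteger());      // scalars only
  assert(!MVT(MVT::v4i32).isScalarInteger());   // vectors are excluded here
  return 0;
}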
362
363 /// Return true if this is a vector value type.
364 bool isVector() const {
365 return (SimpleTy >= MVT::FIRST_VECTOR_VALUETYPE &&
                23: Assuming field 'SimpleTy' is >= FIRST_VECTOR_VALUETYPE
366             SimpleTy <= MVT::LAST_VECTOR_VALUETYPE);
                24: Assuming field 'SimpleTy' is <= LAST_VECTOR_VALUETYPE
                25: Returning the value 1, which participates in a condition later
367 }
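isVector() is the range check that steps 23-25 of the analyzer path above evaluate. A small illustrative sketch, assuming the LLVM headers are available; it is not part of the analyzed source:

// Illustrative sketch (assumes LLVM headers); shows the range check that
// steps 23-25 of the analyzer path evaluate.
#include "llvm/Support/MachineValueType.h"
#include <cstdio>

int main() {
  using llvm::MVT;
  MVT Scalar(MVT::i32);        // outside [FIRST_, LAST_]VECTOR_VALUETYPE
  MVT Fixed(MVT::v4f32);       // fixed-length vector
  MVT Scalable(MVT::nxv2i64);  // scalable vector
  std::printf("i32     isVector: %d\n", (int)Scalar.isVector());
  std::printf("v4f32   isVector: %d\n", (int)Fixed.isVector());
  std::printf("nxv2i64 isVector: %d\n", (int)Scalable.isVector());
  return 0;
}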
368
369 /// Return true if this is a vector value type where the
370   /// runtime length is machine dependent.
371 bool isScalableVector() const {
372 return (SimpleTy >= MVT::FIRST_SCALABLE_VECTOR_VALUETYPE &&
373 SimpleTy <= MVT::LAST_SCALABLE_VECTOR_VALUETYPE);
374 }
375
376 bool isFixedLengthVector() const {
377 return (SimpleTy >= MVT::FIRST_FIXEDLEN_VECTOR_VALUETYPE &&
378 SimpleTy <= MVT::LAST_FIXEDLEN_VECTOR_VALUETYPE);
379 }
380
381 /// Return true if this is a 16-bit vector type.
382 bool is16BitVector() const {
383 return (SimpleTy == MVT::v2i8 || SimpleTy == MVT::v1i16 ||
384 SimpleTy == MVT::v16i1 || SimpleTy == MVT::v1f16);
385 }
386
387 /// Return true if this is a 32-bit vector type.
388 bool is32BitVector() const {
389 return (SimpleTy == MVT::v32i1 || SimpleTy == MVT::v4i8 ||
390 SimpleTy == MVT::v2i16 || SimpleTy == MVT::v1i32 ||
391 SimpleTy == MVT::v2f16 || SimpleTy == MVT::v2bf16 ||
392 SimpleTy == MVT::v1f32);
393 }
394
395 /// Return true if this is a 64-bit vector type.
396 bool is64BitVector() const {
397 return (SimpleTy == MVT::v64i1 || SimpleTy == MVT::v8i8 ||
398 SimpleTy == MVT::v4i16 || SimpleTy == MVT::v2i32 ||
399 SimpleTy == MVT::v1i64 || SimpleTy == MVT::v4f16 ||
400             SimpleTy == MVT::v4bf16 || SimpleTy == MVT::v2f32 ||
401 SimpleTy == MVT::v1f64);
402 }
403
404 /// Return true if this is a 128-bit vector type.
405 bool is128BitVector() const {
406 return (SimpleTy == MVT::v128i1 || SimpleTy == MVT::v16i8 ||
407 SimpleTy == MVT::v8i16 || SimpleTy == MVT::v4i32 ||
408 SimpleTy == MVT::v2i64 || SimpleTy == MVT::v1i128 ||
409 SimpleTy == MVT::v8f16 || SimpleTy == MVT::v8bf16 ||
410 SimpleTy == MVT::v4f32 || SimpleTy == MVT::v2f64);
411 }
412
413 /// Return true if this is a 256-bit vector type.
414 bool is256BitVector() const {
415 return (SimpleTy == MVT::v16f16 || SimpleTy == MVT::v16bf16 ||
416 SimpleTy == MVT::v8f32 || SimpleTy == MVT::v4f64 ||
417 SimpleTy == MVT::v32i8 || SimpleTy == MVT::v16i16 ||
418 SimpleTy == MVT::v8i32 || SimpleTy == MVT::v4i64 ||
419 SimpleTy == MVT::v256i1);
420 }
421
422 /// Return true if this is a 512-bit vector type.
423 bool is512BitVector() const {
424 return (SimpleTy == MVT::v32f16 || SimpleTy == MVT::v32bf16 ||
425 SimpleTy == MVT::v16f32 || SimpleTy == MVT::v8f64 ||
426 SimpleTy == MVT::v512i1 || SimpleTy == MVT::v64i8 ||
427 SimpleTy == MVT::v32i16 || SimpleTy == MVT::v16i32 ||
428 SimpleTy == MVT::v8i64);
429 }
430
431 /// Return true if this is a 1024-bit vector type.
432 bool is1024BitVector() const {
433 return (SimpleTy == MVT::v1024i1 || SimpleTy == MVT::v128i8 ||
434 SimpleTy == MVT::v64i16 || SimpleTy == MVT::v32i32 ||
435 SimpleTy == MVT::v16i64 || SimpleTy == MVT::v64f16 ||
436 SimpleTy == MVT::v32f32 || SimpleTy == MVT::v16f64 ||
437 SimpleTy == MVT::v64bf16);
438 }
439
440 /// Return true if this is a 2048-bit vector type.
441 bool is2048BitVector() const {
442 return (SimpleTy == MVT::v256i8 || SimpleTy == MVT::v128i16 ||
443 SimpleTy == MVT::v64i32 || SimpleTy == MVT::v32i64 ||
444 SimpleTy == MVT::v128f16 || SimpleTy == MVT::v64f32 ||
445 SimpleTy == MVT::v32f64 || SimpleTy == MVT::v128bf16);
446 }
447
448 /// Return true if this is an overloaded type for TableGen.
449 bool isOverloaded() const {
450 return (SimpleTy == MVT::Any || SimpleTy == MVT::iAny ||
451 SimpleTy == MVT::fAny || SimpleTy == MVT::vAny ||
452 SimpleTy == MVT::iPTRAny);
453 }
454
455 /// Return a vector with the same number of elements as this vector, but
456 /// with the element type converted to an integer type with the same
457 /// bitwidth.
458 MVT changeVectorElementTypeToInteger() const {
459 MVT EltTy = getVectorElementType();
460 MVT IntTy = MVT::getIntegerVT(EltTy.getSizeInBits());
461 MVT VecTy = MVT::getVectorVT(IntTy, getVectorElementCount());
462     assert(VecTy.SimpleTy != MVT::INVALID_SIMPLE_VALUE_TYPE &&
463            "Simple vector VT not representable by simple integer vector VT!");
464 return VecTy;
465 }
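A minimal usage sketch for changeVectorElementTypeToInteger(), assuming the LLVM headers are available; illustrative only, not part of the analyzed source:

// Illustrative sketch (assumes LLVM headers): converting a vector's element
// type to a same-width integer type.
#include "llvm/Support/MachineValueType.h"
#include <cassert>

int main() {
  using llvm::MVT;
  MVT V(MVT::v4f32);
  MVT I = V.changeVectorElementTypeToInteger();
  assert(I == MVT(MVT::v4i32));                 // f32 maps to the 32-bit i32
  assert(I.getScalarSizeInBits() == V.getScalarSizeInBits());
  return 0;
}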
466
467   /// Return a VT for a vector type whose attributes match this vector's,
468   /// except for the element type, which is chosen by the caller.
469 MVT changeVectorElementType(MVT EltVT) const {
470 MVT VecTy = MVT::getVectorVT(EltVT, getVectorElementCount());
471     assert(VecTy.SimpleTy != MVT::INVALID_SIMPLE_VALUE_TYPE &&
472            "Simple vector VT not representable by simple integer vector VT!");
473 return VecTy;
474 }
475
476 /// Return the type converted to an equivalently sized integer or vector
477 /// with integer element type. Similar to changeVectorElementTypeToInteger,
478 /// but also handles scalars.
479 MVT changeTypeToInteger() {
480 if (isVector())
481 return changeVectorElementTypeToInteger();
482 return MVT::getIntegerVT(getSizeInBits());
483 }
484
485 /// Return a VT for a vector type with the same element type but
486 /// half the number of elements.
487 MVT getHalfNumVectorElementsVT() const {
488 MVT EltVT = getVectorElementType();
489 auto EltCnt = getVectorElementCount();
490     assert(EltCnt.isKnownEven() && "Splitting vector, but not in half!");
491 return getVectorVT(EltVT, EltCnt.divideCoefficientBy(2));
492 }
493
494   /// Returns true if the vector's minimum number of elements is a power of 2.
495 bool isPow2VectorType() const {
496 unsigned NElts = getVectorMinNumElements();
497 return !(NElts & (NElts - 1));
498 }
499
500 /// Widens the length of the given vector MVT up to the nearest power of 2
501 /// and returns that type.
502 MVT getPow2VectorType() const {
503 if (isPow2VectorType())
504 return *this;
505
506 ElementCount NElts = getVectorElementCount();
507 unsigned NewMinCount = 1 << Log2_32_Ceil(NElts.getKnownMinValue());
508 NElts = ElementCount::get(NewMinCount, NElts.isScalable());
509 return MVT::getVectorVT(getVectorElementType(), NElts);
510 }
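A minimal sketch of isPow2VectorType() and getPow2VectorType(), assuming the LLVM headers are available; illustrative only, not part of the analyzed source:

// Illustrative sketch (assumes LLVM headers): widening a non-power-of-two
// element count up to the next power of two.
#include "llvm/Support/MachineValueType.h"
#include <cassert>

int main() {
  using llvm::MVT;
  MVT V3(MVT::v3i32);
  assert(!V3.isPow2VectorType());                     // 3 is not a power of two
  assert(V3.getPow2VectorType() == MVT(MVT::v4i32));  // rounded up to 4 lanes
  MVT V4(MVT::v4i32);
  assert(V4.getPow2VectorType() == V4);               // already a power of two
  return 0;
}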
511
512 /// If this is a vector, return the element type, otherwise return this.
513 MVT getScalarType() const {
514 return isVector() ? getVectorElementType() : *this;
515 }
516
517 MVT getVectorElementType() const {
518 switch (SimpleTy) {
519 default:
520       llvm_unreachable("Not a vector MVT!");
521 case v1i1:
522 case v2i1:
523 case v4i1:
524 case v8i1:
525 case v16i1:
526 case v32i1:
527 case v64i1:
528 case v128i1:
529 case v256i1:
530 case v512i1:
531 case v1024i1:
532 case nxv1i1:
533 case nxv2i1:
534 case nxv4i1:
535 case nxv8i1:
536 case nxv16i1:
537 case nxv32i1:
538 case nxv64i1: return i1;
539 case v1i8:
540 case v2i8:
541 case v4i8:
542 case v8i8:
543 case v16i8:
544 case v32i8:
545 case v64i8:
546 case v128i8:
547 case v256i8:
548 case v512i8:
549 case v1024i8:
550 case nxv1i8:
551 case nxv2i8:
552 case nxv4i8:
553 case nxv8i8:
554 case nxv16i8:
555 case nxv32i8:
556 case nxv64i8: return i8;
557 case v1i16:
558 case v2i16:
559 case v3i16:
560 case v4i16:
561 case v8i16:
562 case v16i16:
563 case v32i16:
564 case v64i16:
565 case v128i16:
566 case v256i16:
567 case v512i16:
568 case nxv1i16:
569 case nxv2i16:
570 case nxv4i16:
571 case nxv8i16:
572 case nxv16i16:
573 case nxv32i16: return i16;
574 case v1i32:
575 case v2i32:
576 case v3i32:
577 case v4i32:
578 case v5i32:
579 case v6i32:
580 case v7i32:
581 case v8i32:
582 case v16i32:
583 case v32i32:
584 case v64i32:
585 case v128i32:
586 case v256i32:
587 case v512i32:
588 case v1024i32:
589 case v2048i32:
590 case nxv1i32:
591 case nxv2i32:
592 case nxv4i32:
593 case nxv8i32:
594 case nxv16i32:
595 case nxv32i32: return i32;
596 case v1i64:
597 case v2i64:
598 case v3i64:
599 case v4i64:
600 case v8i64:
601 case v16i64:
602 case v32i64:
603 case v64i64:
604 case v128i64:
605 case v256i64:
606 case nxv1i64:
607 case nxv2i64:
608 case nxv4i64:
609 case nxv8i64:
610 case nxv16i64:
611 case nxv32i64: return i64;
612 case v1i128: return i128;
613 case v1f16:
614 case v2f16:
615 case v3f16:
616 case v4f16:
617 case v8f16:
618 case v16f16:
619 case v32f16:
620 case v64f16:
621 case v128f16:
622 case v256f16:
623 case v512f16:
624 case nxv1f16:
625 case nxv2f16:
626 case nxv4f16:
627 case nxv8f16:
628 case nxv16f16:
629 case nxv32f16: return f16;
630 case v2bf16:
631 case v3bf16:
632 case v4bf16:
633 case v8bf16:
634 case v16bf16:
635 case v32bf16:
636 case v64bf16:
637 case v128bf16:
638 case nxv1bf16:
639 case nxv2bf16:
640 case nxv4bf16:
641 case nxv8bf16: return bf16;
642 case v1f32:
643 case v2f32:
644 case v3f32:
645 case v4f32:
646 case v5f32:
647 case v6f32:
648 case v7f32:
649 case v8f32:
650 case v16f32:
651 case v32f32:
652 case v64f32:
653 case v128f32:
654 case v256f32:
655 case v512f32:
656 case v1024f32:
657 case v2048f32:
658 case nxv1f32:
659 case nxv2f32:
660 case nxv4f32:
661 case nxv8f32:
662 case nxv16f32: return f32;
663 case v1f64:
664 case v2f64:
665 case v3f64:
666 case v4f64:
667 case v8f64:
668 case v16f64:
669 case v32f64:
670 case v64f64:
671 case v128f64:
672 case v256f64:
673 case nxv1f64:
674 case nxv2f64:
675 case nxv4f64:
676 case nxv8f64: return f64;
677 }
678 }
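A minimal sketch of the element-type queries above, assuming the LLVM headers are available; illustrative only, not part of the analyzed source:

// Illustrative sketch (assumes LLVM headers): element-type queries.
#include "llvm/Support/MachineValueType.h"
#include <cassert>

int main() {
  using llvm::MVT;
  assert(MVT(MVT::v8i16).getVectorElementType() == MVT(MVT::i16));
  assert(MVT(MVT::nxv4f32).getVectorElementType() == MVT(MVT::f32));
  // getScalarType() is also safe on scalars: it returns the type itself.
  assert(MVT(MVT::i64).getScalarType() == MVT(MVT::i64));
  return 0;
}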
679
680 /// Given a vector type, return the minimum number of elements it contains.
681 unsigned getVectorMinNumElements() const {
682 switch (SimpleTy) {
683 default:
684       llvm_unreachable("Not a vector MVT!");
685 case v2048i32:
686 case v2048f32: return 2048;
687 case v1024i1:
688 case v1024i8:
689 case v1024i32:
690 case v1024f32: return 1024;
691 case v512i1:
692 case v512i8:
693 case v512i16:
694 case v512i32:
695 case v512f16:
696 case v512f32: return 512;
697 case v256i1:
698 case v256i8:
699 case v256i16:
700 case v256f16:
701 case v256i32:
702 case v256i64:
703 case v256f32:
704 case v256f64: return 256;
705 case v128i1:
706 case v128i8:
707 case v128i16:
708 case v128i32:
709 case v128i64:
710 case v128f16:
711 case v128bf16:
712 case v128f32:
713 case v128f64: return 128;
714 case v64i1:
715 case v64i8:
716 case v64i16:
717 case v64i32:
718 case v64i64:
719 case v64f16:
720 case v64bf16:
721 case v64f32:
722 case v64f64:
723 case nxv64i1:
724 case nxv64i8: return 64;
725 case v32i1:
726 case v32i8:
727 case v32i16:
728 case v32i32:
729 case v32i64:
730 case v32f16:
731 case v32bf16:
732 case v32f32:
733 case v32f64:
734 case nxv32i1:
735 case nxv32i8:
736 case nxv32i16:
737 case nxv32i32:
738 case nxv32i64:
739 case nxv32f16: return 32;
740 case v16i1:
741 case v16i8:
742 case v16i16:
743 case v16i32:
744 case v16i64:
745 case v16f16:
746 case v16bf16:
747 case v16f32:
748 case v16f64:
749 case nxv16i1:
750 case nxv16i8:
751 case nxv16i16:
752 case nxv16i32:
753 case nxv16i64:
754 case nxv16f16:
755 case nxv16f32: return 16;
756 case v8i1:
757 case v8i8:
758 case v8i16:
759 case v8i32:
760 case v8i64:
761 case v8f16:
762 case v8bf16:
763 case v8f32:
764 case v8f64:
765 case nxv8i1:
766 case nxv8i8:
767 case nxv8i16:
768 case nxv8i32:
769 case nxv8i64:
770 case nxv8f16:
771 case nxv8bf16:
772 case nxv8f32:
773 case nxv8f64: return 8;
774 case v7i32:
775 case v7f32: return 7;
776 case v6i32:
777 case v6f32: return 6;
778 case v5i32:
779 case v5f32: return 5;
780 case v4i1:
781 case v4i8:
782 case v4i16:
783 case v4i32:
784 case v4i64:
785 case v4f16:
786 case v4bf16:
787 case v4f32:
788 case v4f64:
789 case nxv4i1:
790 case nxv4i8:
791 case nxv4i16:
792 case nxv4i32:
793 case nxv4i64:
794 case nxv4f16:
795 case nxv4bf16:
796 case nxv4f32:
797 case nxv4f64: return 4;
798 case v3i16:
799 case v3i32:
800 case v3i64:
801 case v3f16:
802 case v3bf16:
803 case v3f32:
804 case v3f64: return 3;
805 case v2i1:
806 case v2i8:
807 case v2i16:
808 case v2i32:
809 case v2i64:
810 case v2f16:
811 case v2bf16:
812 case v2f32:
813 case v2f64:
814 case nxv2i1:
815 case nxv2i8:
816 case nxv2i16:
817 case nxv2i32:
818 case nxv2i64:
819 case nxv2f16:
820 case nxv2bf16:
821 case nxv2f32:
822 case nxv2f64: return 2;
823 case v1i1:
824 case v1i8:
825 case v1i16:
826 case v1i32:
827 case v1i64:
828 case v1i128:
829 case v1f16:
830 case v1f32:
831 case v1f64:
832 case nxv1i1:
833 case nxv1i8:
834 case nxv1i16:
835 case nxv1i32:
836 case nxv1i64:
837 case nxv1f16:
838 case nxv1bf16:
839 case nxv1f32:
840 case nxv1f64: return 1;
841 }
842 }
843
844 ElementCount getVectorElementCount() const {
845 return ElementCount::get(getVectorMinNumElements(), isScalableVector());
846 }
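A minimal sketch of getVectorElementCount() for fixed versus scalable vectors, assuming the LLVM headers are available; illustrative only, not part of the analyzed source:

// Illustrative sketch (assumes LLVM headers): ElementCount for fixed vs.
// scalable vectors.
#include "llvm/Support/MachineValueType.h"
#include <cstdio>

int main() {
  using llvm::MVT;
  llvm::ElementCount Fixed = MVT(MVT::v8i32).getVectorElementCount();
  llvm::ElementCount Scal = MVT(MVT::nxv8i32).getVectorElementCount();
  // Both report a known minimum of 8; only the scalable one can grow at run time.
  std::printf("v8i32:   min=%u scalable=%d\n",
              Fixed.getKnownMinValue(), (int)Fixed.isScalable());
  std::printf("nxv8i32: min=%u scalable=%d\n",
              Scal.getKnownMinValue(), (int)Scal.isScalable());
  return 0;
}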
847
848 unsigned getVectorNumElements() const {
849 // TODO: Check that this isn't a scalable vector.
850 return getVectorMinNumElements();
851 }
852
853 /// Returns the size of the specified MVT in bits.
854 ///
855 /// If the value type is a scalable vector type, the scalable property will
856 /// be set and the runtime size will be a positive integer multiple of the
857 /// base size.
858 TypeSize getSizeInBits() const {
859 switch (SimpleTy) {
860 default:
861       llvm_unreachable("getSizeInBits called on extended MVT.");
862 case Other:
863       llvm_unreachable("Value type is non-standard value, Other.");
864 case iPTR:
865       llvm_unreachable("Value type size is target-dependent. Ask TLI.");
866 case iPTRAny:
867 case iAny:
868 case fAny:
869 case vAny:
870 case Any:
871       llvm_unreachable("Value type is overloaded.");
872 case token:
873       llvm_unreachable("Token type is a sentinel that cannot be used "
874                        "in codegen and has no size");
875 case Metadata:
876       llvm_unreachable("Value type is metadata.");
877 case i1:
878 case v1i1: return TypeSize::Fixed(1);
879 case nxv1i1: return TypeSize::Scalable(1);
880 case v2i1: return TypeSize::Fixed(2);
881 case nxv2i1: return TypeSize::Scalable(2);
882 case v4i1: return TypeSize::Fixed(4);
883 case nxv4i1: return TypeSize::Scalable(4);
884 case i8 :
885 case v1i8:
886 case v8i1: return TypeSize::Fixed(8);
887 case nxv1i8:
888 case nxv8i1: return TypeSize::Scalable(8);
889 case i16 :
890 case f16:
891 case bf16:
892 case v16i1:
893 case v2i8:
894 case v1i16:
895 case v1f16: return TypeSize::Fixed(16);
896 case nxv16i1:
897 case nxv2i8:
898 case nxv1i16:
899 case nxv1bf16:
900 case nxv1f16: return TypeSize::Scalable(16);
901 case f32 :
902 case i32 :
903 case v32i1:
904 case v4i8:
905 case v2i16:
906 case v2f16:
907 case v2bf16:
908 case v1f32:
909 case v1i32: return TypeSize::Fixed(32);
910 case nxv32i1:
911 case nxv4i8:
912 case nxv2i16:
913 case nxv1i32:
914 case nxv2f16:
915 case nxv2bf16:
916 case nxv1f32: return TypeSize::Scalable(32);
917 case v3i16:
918 case v3f16:
919 case v3bf16: return TypeSize::Fixed(48);
920 case x86mmx:
921 case f64 :
922 case i64 :
923 case v64i1:
924 case v8i8:
925 case v4i16:
926 case v2i32:
927 case v1i64:
928 case v4f16:
929 case v4bf16:
930 case v2f32:
931 case v1f64: return TypeSize::Fixed(64);
932 case nxv64i1:
933 case nxv8i8:
934 case nxv4i16:
935 case nxv2i32:
936 case nxv1i64:
937 case nxv4f16:
938 case nxv4bf16:
939 case nxv2f32:
940 case nxv1f64: return TypeSize::Scalable(64);
941 case f80 : return TypeSize::Fixed(80);
942 case v3i32:
943 case v3f32: return TypeSize::Fixed(96);
944 case f128:
945 case ppcf128:
946 case i128:
947 case v128i1:
948 case v16i8:
949 case v8i16:
950 case v4i32:
951 case v2i64:
952 case v1i128:
953 case v8f16:
954 case v8bf16:
955 case v4f32:
956 case v2f64: return TypeSize::Fixed(128);
957 case nxv16i8:
958 case nxv8i16:
959 case nxv4i32:
960 case nxv2i64:
961 case nxv8f16:
962 case nxv8bf16:
963 case nxv4f32:
964 case nxv2f64: return TypeSize::Scalable(128);
965 case v5i32:
966 case v5f32: return TypeSize::Fixed(160);
967 case v6i32:
968 case v3i64:
969 case v6f32:
970 case v3f64: return TypeSize::Fixed(192);
971 case v7i32:
972 case v7f32: return TypeSize::Fixed(224);
973 case v256i1:
974 case v32i8:
975 case v16i16:
976 case v8i32:
977 case v4i64:
978 case v16f16:
979 case v16bf16:
980 case v8f32:
981 case v4f64: return TypeSize::Fixed(256);
982 case nxv32i8:
983 case nxv16i16:
984 case nxv8i32:
985 case nxv4i64:
986 case nxv16f16:
987 case nxv8f32:
988 case nxv4f64: return TypeSize::Scalable(256);
989 case v512i1:
990 case v64i8:
991 case v32i16:
992 case v16i32:
993 case v8i64:
994 case v32f16:
995 case v32bf16:
996 case v16f32:
997 case v8f64: return TypeSize::Fixed(512);
998 case nxv64i8:
999 case nxv32i16:
1000 case nxv16i32:
1001 case nxv8i64:
1002 case nxv32f16:
1003 case nxv16f32:
1004 case nxv8f64: return TypeSize::Scalable(512);
1005 case v1024i1:
1006 case v128i8:
1007 case v64i16:
1008 case v32i32:
1009 case v16i64:
1010 case v64f16:
1011 case v64bf16:
1012 case v32f32:
1013 case v16f64: return TypeSize::Fixed(1024);
1014 case nxv32i32:
1015 case nxv16i64: return TypeSize::Scalable(1024);
1016 case v256i8:
1017 case v128i16:
1018 case v64i32:
1019 case v32i64:
1020 case v128f16:
1021 case v128bf16:
1022 case v64f32:
1023 case v32f64: return TypeSize::Fixed(2048);
1024 case nxv32i64: return TypeSize::Scalable(2048);
1025 case v512i8:
1026 case v256i16:
1027 case v128i32:
1028 case v64i64:
1029 case v256f16:
1030 case v128f32:
1031 case v64f64: return TypeSize::Fixed(4096);
1032 case v1024i8:
1033 case v512i16:
1034 case v256i32:
1035 case v128i64:
1036 case v512f16:
1037 case v256f32:
1038 case x86amx:
1039 case v128f64: return TypeSize::Fixed(8192);
1040 case v512i32:
1041 case v256i64:
1042 case v512f32:
1043 case v256f64: return TypeSize::Fixed(16384);
1044 case v1024i32:
1045 case v1024f32: return TypeSize::Fixed(32768);
1046 case v2048i32:
1047 case v2048f32: return TypeSize::Fixed(65536);
1048 case funcref:
1049 case externref: return TypeSize::Fixed(0); // opaque type
1050 }
1051 }
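A minimal sketch of the TypeSize values getSizeInBits() returns for fixed versus scalable types, assuming the LLVM headers are available; illustrative only, not part of the analyzed source:

// Illustrative sketch (assumes LLVM headers): TypeSize results for fixed vs.
// scalable types.
#include "llvm/Support/MachineValueType.h"
#include <cstdio>

int main() {
  using llvm::MVT;
  llvm::TypeSize F = MVT(MVT::v4i32).getSizeInBits();   // exactly 128 bits
  llvm::TypeSize S = MVT(MVT::nxv4i32).getSizeInBits(); // n x 128 bits
  std::printf("v4i32:   %llu bits, scalable=%d\n",
              (unsigned long long)F.getFixedSize(), (int)F.isScalable());
  std::printf("nxv4i32: min %llu bits, scalable=%d\n",
              (unsigned long long)S.getKnownMinSize(), (int)S.isScalable());
  return 0;
}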
1052
1053 /// Return the size of the specified fixed width value type in bits. The
1054 /// function will assert if the type is scalable.
1055 uint64_t getFixedSizeInBits() const {
1056 return getSizeInBits().getFixedSize();
1057 }
1058
1059 uint64_t getScalarSizeInBits() const {
1060 return getScalarType().getSizeInBits().getFixedSize();
1061 }
1062
1063 /// Return the number of bytes overwritten by a store of the specified value
1064 /// type.
1065 ///
1066 /// If the value type is a scalable vector type, the scalable property will
1067 /// be set and the runtime size will be a positive integer multiple of the
1068 /// base size.
1069 TypeSize getStoreSize() const {
1070 TypeSize BaseSize = getSizeInBits();
1071 return {(BaseSize.getKnownMinSize() + 7) / 8, BaseSize.isScalable()};
1072 }
1073
1074 /// Return the number of bits overwritten by a store of the specified value
1075 /// type.
1076 ///
1077 /// If the value type is a scalable vector type, the scalable property will
1078 /// be set and the runtime size will be a positive integer multiple of the
1079 /// base size.
1080 TypeSize getStoreSizeInBits() const {
1081 return getStoreSize() * 8;
1082 }
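A minimal sketch of the byte-rounding performed by getStoreSize() and getStoreSizeInBits(), assuming the LLVM headers are available; illustrative only, not part of the analyzed source:

// Illustrative sketch (assumes LLVM headers): store sizes round bit widths up
// to whole bytes.
#include "llvm/Support/MachineValueType.h"
#include <cassert>

int main() {
  using llvm::MVT;
  // i1 is 1 bit wide but still occupies a full byte when stored.
  assert(MVT(MVT::i1).getStoreSize().getFixedSize() == 1);
  assert(MVT(MVT::i1).getStoreSizeInBits().getFixedSize() == 8);
  // v3i16 is 48 bits, i.e. exactly 6 bytes.
  assert(MVT(MVT::v3i16).getStoreSize().getFixedSize() == 6);
  return 0;
}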
1083
1084 /// Returns true if the number of bits for the type is a multiple of an
1085 /// 8-bit byte.
1086 bool isByteSized() const { return getSizeInBits().isKnownMultipleOf(8); }
1087
1088 /// Return true if we know at compile time this has more bits than VT.
1089 bool knownBitsGT(MVT VT) const {
1090 return TypeSize::isKnownGT(getSizeInBits(), VT.getSizeInBits());
1091 }
1092
1093 /// Return true if we know at compile time this has more than or the same
1094 /// bits as VT.
1095 bool knownBitsGE(MVT VT) const {
1096 return TypeSize::isKnownGE(getSizeInBits(), VT.getSizeInBits());
1097 }
1098
1099 /// Return true if we know at compile time this has fewer bits than VT.
1100 bool knownBitsLT(MVT VT) const {
1101 return TypeSize::isKnownLT(getSizeInBits(), VT.getSizeInBits());
1102 }
1103
1104 /// Return true if we know at compile time this has fewer than or the same
1105 /// bits as VT.
1106 bool knownBitsLE(MVT VT) const {
1107 return TypeSize::isKnownLE(getSizeInBits(), VT.getSizeInBits());
1108 }
1109
1110 /// Return true if this has more bits than VT.
1111 bool bitsGT(MVT VT) const {
1112    assert(isScalableVector() == VT.isScalableVector() &&
1113           "Comparison between scalable and fixed types");
1114 return knownBitsGT(VT);
1115 }
1116
1117   /// Return true if this has no fewer bits than VT.
1118 bool bitsGE(MVT VT) const {
1119    assert(isScalableVector() == VT.isScalableVector() &&
1120           "Comparison between scalable and fixed types");
1121 return knownBitsGE(VT);
1122 }
1123
1124   /// Return true if this has fewer bits than VT.
1125 bool bitsLT(MVT VT) const {
1126    assert(isScalableVector() == VT.isScalableVector() &&
1127           "Comparison between scalable and fixed types");
1128 return knownBitsLT(VT);
1129 }
1130
1131 /// Return true if this has no more bits than VT.
1132 bool bitsLE(MVT VT) const {
1133    assert(isScalableVector() == VT.isScalableVector() &&
1134           "Comparison between scalable and fixed types");
1135 return knownBitsLE(VT);
1136 }
1137
1138 static MVT getFloatingPointVT(unsigned BitWidth) {
1139 switch (BitWidth) {
1140 default:
1141      llvm_unreachable("Bad bit width!");
1142 case 16:
1143 return MVT::f16;
1144 case 32:
1145 return MVT::f32;
1146 case 64:
1147 return MVT::f64;
1148 case 80:
1149 return MVT::f80;
1150 case 128:
1151 return MVT::f128;
1152 }
1153 }
1154
1155 static MVT getIntegerVT(unsigned BitWidth) {
1156 switch (BitWidth) {
1157 default:
1158 return (MVT::SimpleValueType)(MVT::INVALID_SIMPLE_VALUE_TYPE);
1159 case 1:
1160 return MVT::i1;
1161 case 8:
1162 return MVT::i8;
1163 case 16:
1164 return MVT::i16;
1165 case 32:
1166 return MVT::i32;
1167 case 64:
1168 return MVT::i64;
1169 case 128:
1170 return MVT::i128;
1171 }
1172 }
1173
1174 static MVT getVectorVT(MVT VT, unsigned NumElements) {
1175 switch (VT.SimpleTy) {
1176 default:
1177 break;
1178 case MVT::i1:
1179 if (NumElements == 1) return MVT::v1i1;
1180 if (NumElements == 2) return MVT::v2i1;
1181 if (NumElements == 4) return MVT::v4i1;
1182 if (NumElements == 8) return MVT::v8i1;
1183 if (NumElements == 16) return MVT::v16i1;
1184 if (NumElements == 32) return MVT::v32i1;
1185 if (NumElements == 64) return MVT::v64i1;
1186 if (NumElements == 128) return MVT::v128i1;
1187 if (NumElements == 256) return MVT::v256i1;
1188 if (NumElements == 512) return MVT::v512i1;
1189 if (NumElements == 1024) return MVT::v1024i1;
1190 break;
1191 case MVT::i8:
1192 if (NumElements == 1) return MVT::v1i8;
1193 if (NumElements == 2) return MVT::v2i8;
1194 if (NumElements == 4) return MVT::v4i8;
1195 if (NumElements == 8) return MVT::v8i8;
1196 if (NumElements == 16) return MVT::v16i8;
1197 if (NumElements == 32) return MVT::v32i8;
1198 if (NumElements == 64) return MVT::v64i8;
1199 if (NumElements == 128) return MVT::v128i8;
1200 if (NumElements == 256) return MVT::v256i8;
1201 if (NumElements == 512) return MVT::v512i8;
1202 if (NumElements == 1024) return MVT::v1024i8;
1203 break;
1204 case MVT::i16:
1205 if (NumElements == 1) return MVT::v1i16;
1206 if (NumElements == 2) return MVT::v2i16;
1207 if (NumElements == 3) return MVT::v3i16;
1208 if (NumElements == 4) return MVT::v4i16;
1209 if (NumElements == 8) return MVT::v8i16;
1210 if (NumElements == 16) return MVT::v16i16;
1211 if (NumElements == 32) return MVT::v32i16;
1212 if (NumElements == 64) return MVT::v64i16;
1213 if (NumElements == 128) return MVT::v128i16;
1214 if (NumElements == 256) return MVT::v256i16;
1215 if (NumElements == 512) return MVT::v512i16;
1216 break;
1217 case MVT::i32:
1218 if (NumElements == 1) return MVT::v1i32;
1219 if (NumElements == 2) return MVT::v2i32;
1220 if (NumElements == 3) return MVT::v3i32;
1221 if (NumElements == 4) return MVT::v4i32;
1222 if (NumElements == 5) return MVT::v5i32;
1223 if (NumElements == 6) return MVT::v6i32;
1224 if (NumElements == 7) return MVT::v7i32;
1225 if (NumElements == 8) return MVT::v8i32;
1226 if (NumElements == 16) return MVT::v16i32;
1227 if (NumElements == 32) return MVT::v32i32;
1228 if (NumElements == 64) return MVT::v64i32;
1229 if (NumElements == 128) return MVT::v128i32;
1230 if (NumElements == 256) return MVT::v256i32;
1231 if (NumElements == 512) return MVT::v512i32;
1232 if (NumElements == 1024) return MVT::v1024i32;
1233 if (NumElements == 2048) return MVT::v2048i32;
1234 break;
1235 case MVT::i64:
1236 if (NumElements == 1) return MVT::v1i64;
1237 if (NumElements == 2) return MVT::v2i64;
1238 if (NumElements == 3) return MVT::v3i64;
1239 if (NumElements == 4) return MVT::v4i64;
1240 if (NumElements == 8) return MVT::v8i64;
1241 if (NumElements == 16) return MVT::v16i64;
1242 if (NumElements == 32) return MVT::v32i64;
1243 if (NumElements == 64) return MVT::v64i64;
1244 if (NumElements == 128) return MVT::v128i64;
1245 if (NumElements == 256) return MVT::v256i64;
1246 break;
1247 case MVT::i128:
1248 if (NumElements == 1) return MVT::v1i128;
1249 break;
1250 case MVT::f16:
1251 if (NumElements == 1) return MVT::v1f16;
1252 if (NumElements == 2) return MVT::v2f16;
1253 if (NumElements == 3) return MVT::v3f16;
1254 if (NumElements == 4) return MVT::v4f16;
1255 if (NumElements == 8) return MVT::v8f16;
1256 if (NumElements == 16) return MVT::v16f16;
1257 if (NumElements == 32) return MVT::v32f16;
1258 if (NumElements == 64) return MVT::v64f16;
1259 if (NumElements == 128) return MVT::v128f16;
1260 if (NumElements == 256) return MVT::v256f16;
1261 if (NumElements == 512) return MVT::v512f16;
1262 break;
1263 case MVT::bf16:
1264 if (NumElements == 2) return MVT::v2bf16;
1265 if (NumElements == 3) return MVT::v3bf16;
1266 if (NumElements == 4) return MVT::v4bf16;
1267 if (NumElements == 8) return MVT::v8bf16;
1268 if (NumElements == 16) return MVT::v16bf16;
1269 if (NumElements == 32) return MVT::v32bf16;
1270 if (NumElements == 64) return MVT::v64bf16;
1271 if (NumElements == 128) return MVT::v128bf16;
1272 break;
1273 case MVT::f32:
1274 if (NumElements == 1) return MVT::v1f32;
1275 if (NumElements == 2) return MVT::v2f32;
1276 if (NumElements == 3) return MVT::v3f32;
1277 if (NumElements == 4) return MVT::v4f32;
1278 if (NumElements == 5) return MVT::v5f32;
1279 if (NumElements == 6) return MVT::v6f32;
1280 if (NumElements == 7) return MVT::v7f32;
1281 if (NumElements == 8) return MVT::v8f32;
1282 if (NumElements == 16) return MVT::v16f32;
1283 if (NumElements == 32) return MVT::v32f32;
1284 if (NumElements == 64) return MVT::v64f32;
1285 if (NumElements == 128) return MVT::v128f32;
1286 if (NumElements == 256) return MVT::v256f32;
1287 if (NumElements == 512) return MVT::v512f32;
1288 if (NumElements == 1024) return MVT::v1024f32;
1289 if (NumElements == 2048) return MVT::v2048f32;
1290 break;
1291 case MVT::f64:
1292 if (NumElements == 1) return MVT::v1f64;
1293 if (NumElements == 2) return MVT::v2f64;
1294 if (NumElements == 3) return MVT::v3f64;
1295 if (NumElements == 4) return MVT::v4f64;
1296 if (NumElements == 8) return MVT::v8f64;
1297 if (NumElements == 16) return MVT::v16f64;
1298 if (NumElements == 32) return MVT::v32f64;
1299 if (NumElements == 64) return MVT::v64f64;
1300 if (NumElements == 128) return MVT::v128f64;
1301 if (NumElements == 256) return MVT::v256f64;
1302 break;
1303 }
1304 return (MVT::SimpleValueType)(MVT::INVALID_SIMPLE_VALUE_TYPE);
1305 }
1306
1307 static MVT getScalableVectorVT(MVT VT, unsigned NumElements) {
1308 switch(VT.SimpleTy) {
1309 default:
1310 break;
1311 case MVT::i1:
1312 if (NumElements == 1) return MVT::nxv1i1;
1313 if (NumElements == 2) return MVT::nxv2i1;
1314 if (NumElements == 4) return MVT::nxv4i1;
1315 if (NumElements == 8) return MVT::nxv8i1;
1316 if (NumElements == 16) return MVT::nxv16i1;
1317 if (NumElements == 32) return MVT::nxv32i1;
1318 if (NumElements == 64) return MVT::nxv64i1;
1319 break;
1320 case MVT::i8:
1321 if (NumElements == 1) return MVT::nxv1i8;
1322 if (NumElements == 2) return MVT::nxv2i8;
1323 if (NumElements == 4) return MVT::nxv4i8;
1324 if (NumElements == 8) return MVT::nxv8i8;
1325 if (NumElements == 16) return MVT::nxv16i8;
1326 if (NumElements == 32) return MVT::nxv32i8;
1327 if (NumElements == 64) return MVT::nxv64i8;
1328 break;
1329 case MVT::i16:
1330 if (NumElements == 1) return MVT::nxv1i16;
1331 if (NumElements == 2) return MVT::nxv2i16;
1332 if (NumElements == 4) return MVT::nxv4i16;
1333 if (NumElements == 8) return MVT::nxv8i16;
1334 if (NumElements == 16) return MVT::nxv16i16;
1335 if (NumElements == 32) return MVT::nxv32i16;
1336 break;
1337 case MVT::i32:
1338 if (NumElements == 1) return MVT::nxv1i32;
1339 if (NumElements == 2) return MVT::nxv2i32;
1340 if (NumElements == 4) return MVT::nxv4i32;
1341 if (NumElements == 8) return MVT::nxv8i32;
1342 if (NumElements == 16) return MVT::nxv16i32;
1343 if (NumElements == 32) return MVT::nxv32i32;
1344 break;
1345 case MVT::i64:
1346 if (NumElements == 1) return MVT::nxv1i64;
1347 if (NumElements == 2) return MVT::nxv2i64;
1348 if (NumElements == 4) return MVT::nxv4i64;
1349 if (NumElements == 8) return MVT::nxv8i64;
1350 if (NumElements == 16) return MVT::nxv16i64;
1351 if (NumElements == 32) return MVT::nxv32i64;
1352 break;
1353 case MVT::f16:
1354 if (NumElements == 1) return MVT::nxv1f16;
1355 if (NumElements == 2) return MVT::nxv2f16;
1356 if (NumElements == 4) return MVT::nxv4f16;
1357 if (NumElements == 8) return MVT::nxv8f16;
1358 if (NumElements == 16) return MVT::nxv16f16;
1359 if (NumElements == 32) return MVT::nxv32f16;
1360 break;
1361 case MVT::bf16:
1362 if (NumElements == 1) return MVT::nxv1bf16;
1363 if (NumElements == 2) return MVT::nxv2bf16;
1364 if (NumElements == 4) return MVT::nxv4bf16;
1365 if (NumElements == 8) return MVT::nxv8bf16;
1366 break;
1367 case MVT::f32:
1368 if (NumElements == 1) return MVT::nxv1f32;
1369 if (NumElements == 2) return MVT::nxv2f32;
1370 if (NumElements == 4) return MVT::nxv4f32;
1371 if (NumElements == 8) return MVT::nxv8f32;
1372 if (NumElements == 16) return MVT::nxv16f32;
1373 break;
1374 case MVT::f64:
1375 if (NumElements == 1) return MVT::nxv1f64;
1376 if (NumElements == 2) return MVT::nxv2f64;
1377 if (NumElements == 4) return MVT::nxv4f64;
1378 if (NumElements == 8) return MVT::nxv8f64;
1379 break;
1380 }
1381 return (MVT::SimpleValueType)(MVT::INVALID_SIMPLE_VALUE_TYPE);
1382 }
1383
1384 static MVT getVectorVT(MVT VT, unsigned NumElements, bool IsScalable) {
1385 if (IsScalable)
1386 return getScalableVectorVT(VT, NumElements);
1387 return getVectorVT(VT, NumElements);
1388 }
1389
1390 static MVT getVectorVT(MVT VT, ElementCount EC) {
1391 if (EC.isScalable())
1392 return getScalableVectorVT(VT, EC.getKnownMinValue());
1393 return getVectorVT(VT, EC.getKnownMinValue());
1394 }
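A minimal sketch of the getVectorVT() factory overloads above, assuming the LLVM headers are available; ElementCount::getScalable is assumed to exist in this LLVM version. Illustrative only, not part of the analyzed source:

// Illustrative sketch (assumes LLVM headers; ElementCount::getScalable is
// assumed available): building vector MVTs from element type and count.
#include "llvm/Support/MachineValueType.h"
#include <cassert>

int main() {
  using llvm::ElementCount;
  using llvm::MVT;
  assert(MVT::getVectorVT(MVT::f32, 4) == MVT(MVT::v4f32));
  assert(MVT::getVectorVT(MVT::i64, ElementCount::getScalable(2)) ==
         MVT(MVT::nxv2i64));
  // Unsupported combinations fall through to INVALID_SIMPLE_VALUE_TYPE.
  assert(MVT::getVectorVT(MVT::i8, 3) ==
         MVT(MVT::INVALID_SIMPLE_VALUE_TYPE));
  return 0;
}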
1395
1396 /// Return the value type corresponding to the specified type. This returns
1397 /// all pointers as iPTR. If HandleUnknown is true, unknown types are
1398 /// returned as Other, otherwise they are invalid.
1399 static MVT getVT(Type *Ty, bool HandleUnknown = false);
1400
1401 private:
1402 /// A simple iterator over the MVT::SimpleValueType enum.
1403 struct mvt_iterator {
1404 SimpleValueType VT;
1405
1406 mvt_iterator(SimpleValueType VT) : VT(VT) {}
1407
1408 MVT operator*() const { return VT; }
1409 bool operator!=(const mvt_iterator &LHS) const { return VT != LHS.VT; }
1410
1411 mvt_iterator& operator++() {
1412 VT = (MVT::SimpleValueType)((int)VT + 1);
1413      assert((int)VT <= MVT::MAX_ALLOWED_VALUETYPE &&
1414             "MVT iterator overflowed.");
1415 return *this;
1416 }
1417 };
1418
1419 /// A range of the MVT::SimpleValueType enum.
1420 using mvt_range = iterator_range<mvt_iterator>;
1421
1422 public:
1423 /// SimpleValueType Iteration
1424 /// @{
1425 static mvt_range all_valuetypes() {
1426 return mvt_range(MVT::FIRST_VALUETYPE,
1427 (MVT::SimpleValueType)(MVT::LAST_VALUETYPE + 1));
1428 }
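all_valuetypes() yields every SimpleValueType from FIRST_VALUETYPE through LAST_VALUETYPE. A small illustrative sketch, assuming the LLVM headers are available; not part of the analyzed source:

// Illustrative sketch (assumes LLVM headers): iterating every simple value
// type and counting vector vs. non-vector entries.
#include "llvm/Support/MachineValueType.h"
#include <cstdio>

int main() {
  unsigned Vectors = 0, Others = 0;
  for (llvm::MVT VT : llvm::MVT::all_valuetypes()) {
    if (VT.isVector())
      ++Vectors;
    else
      ++Others;
  }
  std::printf("vector MVTs: %u, non-vector MVTs: %u\n", Vectors, Others);
  return 0;
}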
1429
1430 static mvt_range integer_valuetypes() {
1431 return mvt_range(MVT::FIRST_INTEGER_VALUETYPE,
1432 (MVT::SimpleValueType)(MVT::LAST_INTEGER_VALUETYPE + 1));
1433 }
1434
1435 static mvt_range fp_valuetypes() {
1436 return mvt_range(MVT::FIRST_FP_VALUETYPE,
1437 (MVT::SimpleValueType)(MVT::LAST_FP_VALUETYPE + 1));
1438 }
1439
1440 static mvt_range vector_valuetypes() {
1441 return mvt_range(MVT::FIRST_VECTOR_VALUETYPE,
1442 (MVT::SimpleValueType)(MVT::LAST_VECTOR_VALUETYPE + 1));
1443 }
1444
1445 static mvt_range fixedlen_vector_valuetypes() {
1446 return mvt_range(
1447 MVT::FIRST_FIXEDLEN_VECTOR_VALUETYPE,
1448 (MVT::SimpleValueType)(MVT::LAST_FIXEDLEN_VECTOR_VALUETYPE + 1));
1449 }
1450
1451 static mvt_range scalable_vector_valuetypes() {
1452 return mvt_range(
1453 MVT::FIRST_SCALABLE_VECTOR_VALUETYPE,
1454 (MVT::SimpleValueType)(MVT::LAST_SCALABLE_VECTOR_VALUETYPE + 1));
1455 }
1456
1457 static mvt_range integer_fixedlen_vector_valuetypes() {
1458 return mvt_range(
1459 MVT::FIRST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE,
1460 (MVT::SimpleValueType)(MVT::LAST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE + 1));
1461 }
1462
1463 static mvt_range fp_fixedlen_vector_valuetypes() {
1464 return mvt_range(
1465 MVT::FIRST_FP_FIXEDLEN_VECTOR_VALUETYPE,
1466 (MVT::SimpleValueType)(MVT::LAST_FP_FIXEDLEN_VECTOR_VALUETYPE + 1));
1467 }
1468
1469 static mvt_range integer_scalable_vector_valuetypes() {
1470 return mvt_range(
1471 MVT::FIRST_INTEGER_SCALABLE_VECTOR_VALUETYPE,
1472 (MVT::SimpleValueType)(MVT::LAST_INTEGER_SCALABLE_VECTOR_VALUETYPE + 1));
1473 }
1474
1475 static mvt_range fp_scalable_vector_valuetypes() {
1476 return mvt_range(
1477 MVT::FIRST_FP_SCALABLE_VECTOR_VALUETYPE,
1478 (MVT::SimpleValueType)(MVT::LAST_FP_SCALABLE_VECTOR_VALUETYPE + 1));
1479 }
1480 /// @}
1481 };
1482
1483} // end namespace llvm
1484
1485#endif // LLVM_SUPPORT_MACHINEVALUETYPE_H