File: | llvm/lib/Target/X86/X86TargetTransformInfo.cpp |
Warning: | line 3612, column 15 Division by zero |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
1 | //===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===// | ||||||
2 | // | ||||||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||||||
4 | // See https://llvm.org/LICENSE.txt for license information. | ||||||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||||||
6 | // | ||||||
7 | //===----------------------------------------------------------------------===// | ||||||
8 | /// \file | ||||||
9 | /// This file implements a TargetTransformInfo analysis pass specific to the | ||||||
10 | /// X86 target machine. It uses the target's detailed information to provide | ||||||
11 | /// more precise answers to certain TTI queries, while letting the target | ||||||
12 | /// independent and default TTI implementations handle the rest. | ||||||
13 | /// | ||||||
14 | //===----------------------------------------------------------------------===// | ||||||
15 | /// About Cost Model numbers used below it's necessary to say the following: | ||||||
16 | /// the numbers correspond to some "generic" X86 CPU instead of usage of | ||||||
17 | /// concrete CPU model. Usually the numbers correspond to CPU where the feature | ||||||
18 | /// appeared for the first time. For example, if we do Subtarget.hasSSE42() in | ||||||
19 | /// the lookups below the cost is based on Nehalem as that was the first CPU | ||||||
20 | /// to support that feature level and thus has most likely the worst case cost. | ||||||
21 | /// Some examples of other technologies/CPUs: | ||||||
22 | /// SSE 3 - Pentium4 / Athlon64 | ||||||
23 | /// SSE 4.1 - Penryn | ||||||
24 | /// SSE 4.2 - Nehalem | ||||||
25 | /// AVX - Sandy Bridge | ||||||
26 | /// AVX2 - Haswell | ||||||
27 | /// AVX-512 - Xeon Phi / Skylake | ||||||
28 | /// And some examples of instruction target dependent costs (latency) | ||||||
29 | /// divss sqrtss rsqrtss | ||||||
30 | /// AMD K7 11-16 19 3 | ||||||
31 | /// Piledriver 9-24 13-15 5 | ||||||
32 | /// Jaguar 14 16 2 | ||||||
33 | /// Pentium II,III 18 30 2 | ||||||
34 | /// Nehalem 7-14 7-18 3 | ||||||
35 | /// Haswell 10-13 11 5 | ||||||
36 | /// TODO: Develop and implement the target dependent cost model and | ||||||
37 | /// specialize cost numbers for different Cost Model Targets such as throughput, | ||||||
38 | /// code size, latency and uop count. | ||||||
39 | //===----------------------------------------------------------------------===// | ||||||
40 | |||||||
41 | #include "X86TargetTransformInfo.h" | ||||||
42 | #include "llvm/Analysis/TargetTransformInfo.h" | ||||||
43 | #include "llvm/CodeGen/BasicTTIImpl.h" | ||||||
44 | #include "llvm/CodeGen/CostTable.h" | ||||||
45 | #include "llvm/CodeGen/TargetLowering.h" | ||||||
46 | #include "llvm/IR/InstIterator.h" | ||||||
47 | #include "llvm/IR/IntrinsicInst.h" | ||||||
48 | #include "llvm/Support/Debug.h" | ||||||
49 | |||||||
50 | using namespace llvm; | ||||||
51 | |||||||
// Debug-type string for this file. It must be exactly one string literal:
// adjacent literals would be concatenated into "x86ttix86tti".
#define DEBUG_TYPE "x86tti"
53 | |||||||
54 | //===----------------------------------------------------------------------===// | ||||||
55 | // | ||||||
56 | // X86 cost model. | ||||||
57 | // | ||||||
58 | //===----------------------------------------------------------------------===// | ||||||
59 | |||||||
60 | TargetTransformInfo::PopcntSupportKind | ||||||
61 | X86TTIImpl::getPopcntSupport(unsigned TyWidth) { | ||||||
62 | assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2")(static_cast <bool> (isPowerOf2_32(TyWidth) && "Ty width must be power of 2" ) ? void (0) : __assert_fail ("isPowerOf2_32(TyWidth) && \"Ty width must be power of 2\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 62, __extension__ __PRETTY_FUNCTION__)); | ||||||
63 | // TODO: Currently the __builtin_popcount() implementation using SSE3 | ||||||
64 | // instructions is inefficient. Once the problem is fixed, we should | ||||||
65 | // call ST->hasSSE3() instead of ST->hasPOPCNT(). | ||||||
66 | return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software; | ||||||
67 | } | ||||||
68 | |||||||
69 | llvm::Optional<unsigned> X86TTIImpl::getCacheSize( | ||||||
70 | TargetTransformInfo::CacheLevel Level) const { | ||||||
71 | switch (Level) { | ||||||
72 | case TargetTransformInfo::CacheLevel::L1D: | ||||||
73 | // - Penryn | ||||||
74 | // - Nehalem | ||||||
75 | // - Westmere | ||||||
76 | // - Sandy Bridge | ||||||
77 | // - Ivy Bridge | ||||||
78 | // - Haswell | ||||||
79 | // - Broadwell | ||||||
80 | // - Skylake | ||||||
81 | // - Kabylake | ||||||
82 | return 32 * 1024; // 32 KByte | ||||||
83 | case TargetTransformInfo::CacheLevel::L2D: | ||||||
84 | // - Penryn | ||||||
85 | // - Nehalem | ||||||
86 | // - Westmere | ||||||
87 | // - Sandy Bridge | ||||||
88 | // - Ivy Bridge | ||||||
89 | // - Haswell | ||||||
90 | // - Broadwell | ||||||
91 | // - Skylake | ||||||
92 | // - Kabylake | ||||||
93 | return 256 * 1024; // 256 KByte | ||||||
94 | } | ||||||
95 | |||||||
96 | llvm_unreachable("Unknown TargetTransformInfo::CacheLevel")::llvm::llvm_unreachable_internal("Unknown TargetTransformInfo::CacheLevel" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 96); | ||||||
97 | } | ||||||
98 | |||||||
99 | llvm::Optional<unsigned> X86TTIImpl::getCacheAssociativity( | ||||||
100 | TargetTransformInfo::CacheLevel Level) const { | ||||||
101 | // - Penryn | ||||||
102 | // - Nehalem | ||||||
103 | // - Westmere | ||||||
104 | // - Sandy Bridge | ||||||
105 | // - Ivy Bridge | ||||||
106 | // - Haswell | ||||||
107 | // - Broadwell | ||||||
108 | // - Skylake | ||||||
109 | // - Kabylake | ||||||
110 | switch (Level) { | ||||||
111 | case TargetTransformInfo::CacheLevel::L1D: | ||||||
112 | LLVM_FALLTHROUGH[[gnu::fallthrough]]; | ||||||
113 | case TargetTransformInfo::CacheLevel::L2D: | ||||||
114 | return 8; | ||||||
115 | } | ||||||
116 | |||||||
117 | llvm_unreachable("Unknown TargetTransformInfo::CacheLevel")::llvm::llvm_unreachable_internal("Unknown TargetTransformInfo::CacheLevel" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 117); | ||||||
118 | } | ||||||
119 | |||||||
120 | unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const { | ||||||
121 | bool Vector = (ClassID == 1); | ||||||
122 | if (Vector && !ST->hasSSE1()) | ||||||
123 | return 0; | ||||||
124 | |||||||
125 | if (ST->is64Bit()) { | ||||||
126 | if (Vector && ST->hasAVX512()) | ||||||
127 | return 32; | ||||||
128 | return 16; | ||||||
129 | } | ||||||
130 | return 8; | ||||||
131 | } | ||||||
132 | |||||||
133 | TypeSize | ||||||
134 | X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { | ||||||
135 | unsigned PreferVectorWidth = ST->getPreferVectorWidth(); | ||||||
136 | switch (K) { | ||||||
137 | case TargetTransformInfo::RGK_Scalar: | ||||||
138 | return TypeSize::getFixed(ST->is64Bit() ? 64 : 32); | ||||||
139 | case TargetTransformInfo::RGK_FixedWidthVector: | ||||||
140 | if (ST->hasAVX512() && PreferVectorWidth >= 512) | ||||||
141 | return TypeSize::getFixed(512); | ||||||
142 | if (ST->hasAVX() && PreferVectorWidth >= 256) | ||||||
143 | return TypeSize::getFixed(256); | ||||||
144 | if (ST->hasSSE1() && PreferVectorWidth >= 128) | ||||||
145 | return TypeSize::getFixed(128); | ||||||
146 | return TypeSize::getFixed(0); | ||||||
147 | case TargetTransformInfo::RGK_ScalableVector: | ||||||
148 | return TypeSize::getScalable(0); | ||||||
149 | } | ||||||
150 | |||||||
151 | llvm_unreachable("Unsupported register kind")::llvm::llvm_unreachable_internal("Unsupported register kind" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 151); | ||||||
152 | } | ||||||
153 | |||||||
154 | unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const { | ||||||
155 | return getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) | ||||||
156 | .getFixedSize(); | ||||||
157 | } | ||||||
158 | |||||||
159 | unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) { | ||||||
160 | // If the loop will not be vectorized, don't interleave the loop. | ||||||
161 | // Let regular unroll to unroll the loop, which saves the overflow | ||||||
162 | // check and memory check cost. | ||||||
163 | if (VF == 1) | ||||||
164 | return 1; | ||||||
165 | |||||||
166 | if (ST->isAtom()) | ||||||
167 | return 1; | ||||||
168 | |||||||
169 | // Sandybridge and Haswell have multiple execution ports and pipelined | ||||||
170 | // vector units. | ||||||
171 | if (ST->hasAVX()) | ||||||
172 | return 4; | ||||||
173 | |||||||
174 | return 2; | ||||||
175 | } | ||||||
176 | |||||||
177 | InstructionCost X86TTIImpl::getArithmeticInstrCost( | ||||||
178 | unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, | ||||||
179 | TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, | ||||||
180 | TTI::OperandValueProperties Opd1PropInfo, | ||||||
181 | TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args, | ||||||
182 | const Instruction *CxtI) { | ||||||
183 | // TODO: Handle more cost kinds. | ||||||
184 | if (CostKind != TTI::TCK_RecipThroughput) | ||||||
185 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, | ||||||
186 | Op2Info, Opd1PropInfo, | ||||||
187 | Opd2PropInfo, Args, CxtI); | ||||||
188 | |||||||
189 | // vXi8 multiplications are always promoted to vXi16. | ||||||
190 | if (Opcode == Instruction::Mul && Ty->isVectorTy() && | ||||||
191 | Ty->getScalarSizeInBits() == 8) { | ||||||
192 | Type *WideVecTy = | ||||||
193 | VectorType::getExtendedElementVectorType(cast<VectorType>(Ty)); | ||||||
194 | return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty, | ||||||
195 | TargetTransformInfo::CastContextHint::None, | ||||||
196 | CostKind) + | ||||||
197 | getCastInstrCost(Instruction::Trunc, Ty, WideVecTy, | ||||||
198 | TargetTransformInfo::CastContextHint::None, | ||||||
199 | CostKind) + | ||||||
200 | getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info, | ||||||
201 | Opd1PropInfo, Opd2PropInfo); | ||||||
202 | } | ||||||
203 | |||||||
204 | // Legalize the type. | ||||||
205 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); | ||||||
206 | |||||||
207 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | ||||||
208 | assert(ISD && "Invalid opcode")(static_cast <bool> (ISD && "Invalid opcode") ? void (0) : __assert_fail ("ISD && \"Invalid opcode\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 208, __extension__ __PRETTY_FUNCTION__)); | ||||||
209 | |||||||
210 | if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() && | ||||||
211 | LT.second.getScalarType() == MVT::i32) { | ||||||
212 | // Check if the operands can be represented as a smaller datatype. | ||||||
213 | bool Op1Signed = false, Op2Signed = false; | ||||||
214 | unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed); | ||||||
215 | unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed); | ||||||
216 | unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize); | ||||||
217 | |||||||
218 | // If both are representable as i15 and at least one is constant, | ||||||
219 | // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we | ||||||
220 | // can treat this as PMADDWD which has the same costs as a vXi16 multiply. | ||||||
221 | if (OpMinSize <= 15 && !ST->isPMADDWDSlow()) { | ||||||
222 | bool Op1Constant = | ||||||
223 | isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]); | ||||||
224 | bool Op2Constant = | ||||||
225 | isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]); | ||||||
226 | bool Op1Sext = isa<SExtInst>(Args[0]) && | ||||||
227 | (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41())); | ||||||
228 | bool Op2Sext = isa<SExtInst>(Args[1]) && | ||||||
229 | (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41())); | ||||||
230 | |||||||
231 | bool IsZeroExtended = !Op1Signed || !Op2Signed; | ||||||
232 | bool IsConstant = Op1Constant || Op2Constant; | ||||||
233 | bool IsSext = Op1Sext || Op2Sext; | ||||||
234 | if (IsConstant || IsZeroExtended || IsSext) | ||||||
235 | LT.second = | ||||||
236 | MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements()); | ||||||
237 | } | ||||||
238 | } | ||||||
239 | |||||||
240 | // Vector multiply by pow2 will be simplified to shifts. | ||||||
241 | if (ISD == ISD::MUL && | ||||||
242 | (Op2Info == TargetTransformInfo::OK_UniformConstantValue || | ||||||
243 | Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) && | ||||||
244 | Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) | ||||||
245 | return getArithmeticInstrCost(Instruction::Shl, Ty, CostKind, Op1Info, | ||||||
246 | Op2Info, TargetTransformInfo::OP_None, | ||||||
247 | TargetTransformInfo::OP_None); | ||||||
248 | |||||||
249 | // On X86, vector signed division by constants power-of-two are | ||||||
250 | // normally expanded to the sequence SRA + SRL + ADD + SRA. | ||||||
251 | // The OperandValue properties may not be the same as that of the previous | ||||||
252 | // operation; conservatively assume OP_None. | ||||||
253 | if ((ISD == ISD::SDIV || ISD == ISD::SREM) && | ||||||
254 | (Op2Info == TargetTransformInfo::OK_UniformConstantValue || | ||||||
255 | Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) && | ||||||
256 | Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) { | ||||||
257 | InstructionCost Cost = | ||||||
258 | 2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, Op1Info, | ||||||
259 | Op2Info, TargetTransformInfo::OP_None, | ||||||
260 | TargetTransformInfo::OP_None); | ||||||
261 | Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info, | ||||||
262 | Op2Info, TargetTransformInfo::OP_None, | ||||||
263 | TargetTransformInfo::OP_None); | ||||||
264 | Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind, Op1Info, | ||||||
265 | Op2Info, TargetTransformInfo::OP_None, | ||||||
266 | TargetTransformInfo::OP_None); | ||||||
267 | |||||||
268 | if (ISD == ISD::SREM) { | ||||||
269 | // For SREM: (X % C) is the equivalent of (X - (X/C)*C) | ||||||
270 | Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info, | ||||||
271 | Op2Info); | ||||||
272 | Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info, | ||||||
273 | Op2Info); | ||||||
274 | } | ||||||
275 | |||||||
276 | return Cost; | ||||||
277 | } | ||||||
278 | |||||||
279 | // Vector unsigned division/remainder will be simplified to shifts/masks. | ||||||
280 | if ((ISD == ISD::UDIV || ISD == ISD::UREM) && | ||||||
281 | (Op2Info == TargetTransformInfo::OK_UniformConstantValue || | ||||||
282 | Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) && | ||||||
283 | Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) { | ||||||
284 | if (ISD == ISD::UDIV) | ||||||
285 | return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info, | ||||||
286 | Op2Info, TargetTransformInfo::OP_None, | ||||||
287 | TargetTransformInfo::OP_None); | ||||||
288 | // UREM | ||||||
289 | return getArithmeticInstrCost(Instruction::And, Ty, CostKind, Op1Info, | ||||||
290 | Op2Info, TargetTransformInfo::OP_None, | ||||||
291 | TargetTransformInfo::OP_None); | ||||||
292 | } | ||||||
293 | |||||||
294 | static const CostTblEntry GLMCostTable[] = { | ||||||
295 | { ISD::FDIV, MVT::f32, 18 }, // divss | ||||||
296 | { ISD::FDIV, MVT::v4f32, 35 }, // divps | ||||||
297 | { ISD::FDIV, MVT::f64, 33 }, // divsd | ||||||
298 | { ISD::FDIV, MVT::v2f64, 65 }, // divpd | ||||||
299 | }; | ||||||
300 | |||||||
301 | if (ST->useGLMDivSqrtCosts()) | ||||||
302 | if (const auto *Entry = CostTableLookup(GLMCostTable, ISD, | ||||||
303 | LT.second)) | ||||||
304 | return LT.first * Entry->Cost; | ||||||
305 | |||||||
306 | static const CostTblEntry SLMCostTable[] = { | ||||||
307 | { ISD::MUL, MVT::v4i32, 11 }, // pmulld | ||||||
308 | { ISD::MUL, MVT::v8i16, 2 }, // pmullw | ||||||
309 | { ISD::FMUL, MVT::f64, 2 }, // mulsd | ||||||
310 | { ISD::FMUL, MVT::v2f64, 4 }, // mulpd | ||||||
311 | { ISD::FMUL, MVT::v4f32, 2 }, // mulps | ||||||
312 | { ISD::FDIV, MVT::f32, 17 }, // divss | ||||||
313 | { ISD::FDIV, MVT::v4f32, 39 }, // divps | ||||||
314 | { ISD::FDIV, MVT::f64, 32 }, // divsd | ||||||
315 | { ISD::FDIV, MVT::v2f64, 69 }, // divpd | ||||||
316 | { ISD::FADD, MVT::v2f64, 2 }, // addpd | ||||||
317 | { ISD::FSUB, MVT::v2f64, 2 }, // subpd | ||||||
318 | // v2i64/v4i64 mul is custom lowered as a series of long: | ||||||
319 | // multiplies(3), shifts(3) and adds(2) | ||||||
320 | // slm muldq version throughput is 2 and addq throughput 4 | ||||||
321 | // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) + | ||||||
322 | // 3X4 (addq throughput) = 17 | ||||||
323 | { ISD::MUL, MVT::v2i64, 17 }, | ||||||
324 | // slm addq\subq throughput is 4 | ||||||
325 | { ISD::ADD, MVT::v2i64, 4 }, | ||||||
326 | { ISD::SUB, MVT::v2i64, 4 }, | ||||||
327 | }; | ||||||
328 | |||||||
329 | if (ST->useSLMArithCosts()) { | ||||||
330 | if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) { | ||||||
331 | // Check if the operands can be shrinked into a smaller datatype. | ||||||
332 | // TODO: Merge this into generiic vXi32 MUL patterns above. | ||||||
333 | bool Op1Signed = false; | ||||||
334 | unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed); | ||||||
335 | bool Op2Signed = false; | ||||||
336 | unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed); | ||||||
337 | |||||||
338 | bool SignedMode = Op1Signed || Op2Signed; | ||||||
339 | unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize); | ||||||
340 | |||||||
341 | if (OpMinSize <= 7) | ||||||
342 | return LT.first * 3; // pmullw/sext | ||||||
343 | if (!SignedMode && OpMinSize <= 8) | ||||||
344 | return LT.first * 3; // pmullw/zext | ||||||
345 | if (OpMinSize <= 15) | ||||||
346 | return LT.first * 5; // pmullw/pmulhw/pshuf | ||||||
347 | if (!SignedMode && OpMinSize <= 16) | ||||||
348 | return LT.first * 5; // pmullw/pmulhw/pshuf | ||||||
349 | } | ||||||
350 | |||||||
351 | if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, | ||||||
352 | LT.second)) { | ||||||
353 | return LT.first * Entry->Cost; | ||||||
354 | } | ||||||
355 | } | ||||||
356 | |||||||
357 | static const CostTblEntry AVX512BWUniformConstCostTable[] = { | ||||||
358 | { ISD::SHL, MVT::v64i8, 2 }, // psllw + pand. | ||||||
359 | { ISD::SRL, MVT::v64i8, 2 }, // psrlw + pand. | ||||||
360 | { ISD::SRA, MVT::v64i8, 4 }, // psrlw, pand, pxor, psubb. | ||||||
361 | }; | ||||||
362 | |||||||
363 | if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && | ||||||
364 | ST->hasBWI()) { | ||||||
365 | if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD, | ||||||
366 | LT.second)) | ||||||
367 | return LT.first * Entry->Cost; | ||||||
368 | } | ||||||
369 | |||||||
370 | static const CostTblEntry AVX512UniformConstCostTable[] = { | ||||||
371 | { ISD::SRA, MVT::v2i64, 1 }, | ||||||
372 | { ISD::SRA, MVT::v4i64, 1 }, | ||||||
373 | { ISD::SRA, MVT::v8i64, 1 }, | ||||||
374 | |||||||
375 | { ISD::SHL, MVT::v64i8, 4 }, // psllw + pand. | ||||||
376 | { ISD::SRL, MVT::v64i8, 4 }, // psrlw + pand. | ||||||
377 | { ISD::SRA, MVT::v64i8, 8 }, // psrlw, pand, pxor, psubb. | ||||||
378 | |||||||
379 | { ISD::SDIV, MVT::v16i32, 6 }, // pmuludq sequence | ||||||
380 | { ISD::SREM, MVT::v16i32, 8 }, // pmuludq+mul+sub sequence | ||||||
381 | { ISD::UDIV, MVT::v16i32, 5 }, // pmuludq sequence | ||||||
382 | { ISD::UREM, MVT::v16i32, 7 }, // pmuludq+mul+sub sequence | ||||||
383 | }; | ||||||
384 | |||||||
385 | if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && | ||||||
386 | ST->hasAVX512()) { | ||||||
387 | if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD, | ||||||
388 | LT.second)) | ||||||
389 | return LT.first * Entry->Cost; | ||||||
390 | } | ||||||
391 | |||||||
392 | static const CostTblEntry AVX2UniformConstCostTable[] = { | ||||||
393 | { ISD::SHL, MVT::v32i8, 2 }, // psllw + pand. | ||||||
394 | { ISD::SRL, MVT::v32i8, 2 }, // psrlw + pand. | ||||||
395 | { ISD::SRA, MVT::v32i8, 4 }, // psrlw, pand, pxor, psubb. | ||||||
396 | |||||||
397 | { ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle. | ||||||
398 | |||||||
399 | { ISD::SDIV, MVT::v8i32, 6 }, // pmuludq sequence | ||||||
400 | { ISD::SREM, MVT::v8i32, 8 }, // pmuludq+mul+sub sequence | ||||||
401 | { ISD::UDIV, MVT::v8i32, 5 }, // pmuludq sequence | ||||||
402 | { ISD::UREM, MVT::v8i32, 7 }, // pmuludq+mul+sub sequence | ||||||
403 | }; | ||||||
404 | |||||||
405 | if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && | ||||||
406 | ST->hasAVX2()) { | ||||||
407 | if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD, | ||||||
408 | LT.second)) | ||||||
409 | return LT.first * Entry->Cost; | ||||||
410 | } | ||||||
411 | |||||||
412 | static const CostTblEntry SSE2UniformConstCostTable[] = { | ||||||
413 | { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand. | ||||||
414 | { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand. | ||||||
415 | { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb. | ||||||
416 | |||||||
417 | { ISD::SHL, MVT::v32i8, 4+2 }, // 2*(psllw + pand) + split. | ||||||
418 | { ISD::SRL, MVT::v32i8, 4+2 }, // 2*(psrlw + pand) + split. | ||||||
419 | { ISD::SRA, MVT::v32i8, 8+2 }, // 2*(psrlw, pand, pxor, psubb) + split. | ||||||
420 | |||||||
421 | { ISD::SDIV, MVT::v8i32, 12+2 }, // 2*pmuludq sequence + split. | ||||||
422 | { ISD::SREM, MVT::v8i32, 16+2 }, // 2*pmuludq+mul+sub sequence + split. | ||||||
423 | { ISD::SDIV, MVT::v4i32, 6 }, // pmuludq sequence | ||||||
424 | { ISD::SREM, MVT::v4i32, 8 }, // pmuludq+mul+sub sequence | ||||||
425 | { ISD::UDIV, MVT::v8i32, 10+2 }, // 2*pmuludq sequence + split. | ||||||
426 | { ISD::UREM, MVT::v8i32, 14+2 }, // 2*pmuludq+mul+sub sequence + split. | ||||||
427 | { ISD::UDIV, MVT::v4i32, 5 }, // pmuludq sequence | ||||||
428 | { ISD::UREM, MVT::v4i32, 7 }, // pmuludq+mul+sub sequence | ||||||
429 | }; | ||||||
430 | |||||||
431 | // XOP has faster vXi8 shifts. | ||||||
432 | if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && | ||||||
433 | ST->hasSSE2() && !ST->hasXOP()) { | ||||||
434 | if (const auto *Entry = | ||||||
435 | CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second)) | ||||||
436 | return LT.first * Entry->Cost; | ||||||
437 | } | ||||||
438 | |||||||
439 | static const CostTblEntry AVX512BWConstCostTable[] = { | ||||||
440 | { ISD::SDIV, MVT::v64i8, 14 }, // 2*ext+2*pmulhw sequence | ||||||
441 | { ISD::SREM, MVT::v64i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence | ||||||
442 | { ISD::UDIV, MVT::v64i8, 14 }, // 2*ext+2*pmulhw sequence | ||||||
443 | { ISD::UREM, MVT::v64i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence | ||||||
444 | { ISD::SDIV, MVT::v32i16, 6 }, // vpmulhw sequence | ||||||
445 | { ISD::SREM, MVT::v32i16, 8 }, // vpmulhw+mul+sub sequence | ||||||
446 | { ISD::UDIV, MVT::v32i16, 6 }, // vpmulhuw sequence | ||||||
447 | { ISD::UREM, MVT::v32i16, 8 }, // vpmulhuw+mul+sub sequence | ||||||
448 | }; | ||||||
449 | |||||||
450 | if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue || | ||||||
451 | Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) && | ||||||
452 | ST->hasBWI()) { | ||||||
453 | if (const auto *Entry = | ||||||
454 | CostTableLookup(AVX512BWConstCostTable, ISD, LT.second)) | ||||||
455 | return LT.first * Entry->Cost; | ||||||
456 | } | ||||||
457 | |||||||
458 | static const CostTblEntry AVX512ConstCostTable[] = { | ||||||
459 | { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence | ||||||
460 | { ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence | ||||||
461 | { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence | ||||||
462 | { ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence | ||||||
463 | { ISD::SDIV, MVT::v64i8, 28 }, // 4*ext+4*pmulhw sequence | ||||||
464 | { ISD::SREM, MVT::v64i8, 32 }, // 4*ext+4*pmulhw+mul+sub sequence | ||||||
465 | { ISD::UDIV, MVT::v64i8, 28 }, // 4*ext+4*pmulhw sequence | ||||||
466 | { ISD::UREM, MVT::v64i8, 32 }, // 4*ext+4*pmulhw+mul+sub sequence | ||||||
467 | { ISD::SDIV, MVT::v32i16, 12 }, // 2*vpmulhw sequence | ||||||
468 | { ISD::SREM, MVT::v32i16, 16 }, // 2*vpmulhw+mul+sub sequence | ||||||
469 | { ISD::UDIV, MVT::v32i16, 12 }, // 2*vpmulhuw sequence | ||||||
470 | { ISD::UREM, MVT::v32i16, 16 }, // 2*vpmulhuw+mul+sub sequence | ||||||
471 | }; | ||||||
472 | |||||||
473 | if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue || | ||||||
474 | Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) && | ||||||
475 | ST->hasAVX512()) { | ||||||
476 | if (const auto *Entry = | ||||||
477 | CostTableLookup(AVX512ConstCostTable, ISD, LT.second)) | ||||||
478 | return LT.first * Entry->Cost; | ||||||
479 | } | ||||||
480 | |||||||
481 | static const CostTblEntry AVX2ConstCostTable[] = { | ||||||
482 | { ISD::SDIV, MVT::v32i8, 14 }, // 2*ext+2*pmulhw sequence | ||||||
483 | { ISD::SREM, MVT::v32i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence | ||||||
484 | { ISD::UDIV, MVT::v32i8, 14 }, // 2*ext+2*pmulhw sequence | ||||||
485 | { ISD::UREM, MVT::v32i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence | ||||||
486 | { ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence | ||||||
487 | { ISD::SREM, MVT::v16i16, 8 }, // vpmulhw+mul+sub sequence | ||||||
488 | { ISD::UDIV, MVT::v16i16, 6 }, // vpmulhuw sequence | ||||||
489 | { ISD::UREM, MVT::v16i16, 8 }, // vpmulhuw+mul+sub sequence | ||||||
490 | { ISD::SDIV, MVT::v8i32, 15 }, // vpmuldq sequence | ||||||
491 | { ISD::SREM, MVT::v8i32, 19 }, // vpmuldq+mul+sub sequence | ||||||
492 | { ISD::UDIV, MVT::v8i32, 15 }, // vpmuludq sequence | ||||||
493 | { ISD::UREM, MVT::v8i32, 19 }, // vpmuludq+mul+sub sequence | ||||||
494 | }; | ||||||
495 | |||||||
496 | if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue || | ||||||
497 | Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) && | ||||||
498 | ST->hasAVX2()) { | ||||||
499 | if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second)) | ||||||
500 | return LT.first * Entry->Cost; | ||||||
501 | } | ||||||
502 | |||||||
503 | static const CostTblEntry SSE2ConstCostTable[] = { | ||||||
504 | { ISD::SDIV, MVT::v32i8, 28+2 }, // 4*ext+4*pmulhw sequence + split. | ||||||
505 | { ISD::SREM, MVT::v32i8, 32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split. | ||||||
506 | { ISD::SDIV, MVT::v16i8, 14 }, // 2*ext+2*pmulhw sequence | ||||||
507 | { ISD::SREM, MVT::v16i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence | ||||||
508 | { ISD::UDIV, MVT::v32i8, 28+2 }, // 4*ext+4*pmulhw sequence + split. | ||||||
509 | { ISD::UREM, MVT::v32i8, 32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split. | ||||||
510 | { ISD::UDIV, MVT::v16i8, 14 }, // 2*ext+2*pmulhw sequence | ||||||
511 | { ISD::UREM, MVT::v16i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence | ||||||
512 | { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split. | ||||||
513 | { ISD::SREM, MVT::v16i16, 16+2 }, // 2*pmulhw+mul+sub sequence + split. | ||||||
514 | { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence | ||||||
515 | { ISD::SREM, MVT::v8i16, 8 }, // pmulhw+mul+sub sequence | ||||||
516 | { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split. | ||||||
517 | { ISD::UREM, MVT::v16i16, 16+2 }, // 2*pmulhuw+mul+sub sequence + split. | ||||||
518 | { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence | ||||||
519 | { ISD::UREM, MVT::v8i16, 8 }, // pmulhuw+mul+sub sequence | ||||||
520 | { ISD::SDIV, MVT::v8i32, 38+2 }, // 2*pmuludq sequence + split. | ||||||
521 | { ISD::SREM, MVT::v8i32, 48+2 }, // 2*pmuludq+mul+sub sequence + split. | ||||||
522 | { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence | ||||||
523 | { ISD::SREM, MVT::v4i32, 24 }, // pmuludq+mul+sub sequence | ||||||
524 | { ISD::UDIV, MVT::v8i32, 30+2 }, // 2*pmuludq sequence + split. | ||||||
525 | { ISD::UREM, MVT::v8i32, 40+2 }, // 2*pmuludq+mul+sub sequence + split. | ||||||
526 | { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence | ||||||
527 | { ISD::UREM, MVT::v4i32, 20 }, // pmuludq+mul+sub sequence | ||||||
528 | }; | ||||||
529 | |||||||
530 | if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue || | ||||||
531 | Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) && | ||||||
532 | ST->hasSSE2()) { | ||||||
533 | // pmuldq sequence. | ||||||
534 | if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX()) | ||||||
535 | return LT.first * 32; | ||||||
536 | if (ISD == ISD::SREM && LT.second == MVT::v8i32 && ST->hasAVX()) | ||||||
537 | return LT.first * 38; | ||||||
538 | if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41()) | ||||||
539 | return LT.first * 15; | ||||||
540 | if (ISD == ISD::SREM && LT.second == MVT::v4i32 && ST->hasSSE41()) | ||||||
541 | return LT.first * 20; | ||||||
542 | |||||||
543 | if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second)) | ||||||
544 | return LT.first * Entry->Cost; | ||||||
545 | } | ||||||
546 | |||||||
547 | static const CostTblEntry AVX512BWShiftCostTable[] = { | ||||||
548 | { ISD::SHL, MVT::v16i8, 4 }, // extend/vpsllvw/pack sequence. | ||||||
549 | { ISD::SRL, MVT::v16i8, 4 }, // extend/vpsrlvw/pack sequence. | ||||||
550 | { ISD::SRA, MVT::v16i8, 4 }, // extend/vpsravw/pack sequence. | ||||||
551 | { ISD::SHL, MVT::v32i8, 4 }, // extend/vpsllvw/pack sequence. | ||||||
552 | { ISD::SRL, MVT::v32i8, 4 }, // extend/vpsrlvw/pack sequence. | ||||||
553 | { ISD::SRA, MVT::v32i8, 6 }, // extend/vpsravw/pack sequence. | ||||||
554 | { ISD::SHL, MVT::v64i8, 6 }, // extend/vpsllvw/pack sequence. | ||||||
555 | { ISD::SRL, MVT::v64i8, 7 }, // extend/vpsrlvw/pack sequence. | ||||||
556 | { ISD::SRA, MVT::v64i8, 15 }, // extend/vpsravw/pack sequence. | ||||||
557 | |||||||
558 | { ISD::SHL, MVT::v8i16, 1 }, // vpsllvw | ||||||
559 | { ISD::SRL, MVT::v8i16, 1 }, // vpsrlvw | ||||||
560 | { ISD::SRA, MVT::v8i16, 1 }, // vpsravw | ||||||
561 | { ISD::SHL, MVT::v16i16, 1 }, // vpsllvw | ||||||
562 | { ISD::SRL, MVT::v16i16, 1 }, // vpsrlvw | ||||||
563 | { ISD::SRA, MVT::v16i16, 1 }, // vpsravw | ||||||
564 | { ISD::SHL, MVT::v32i16, 1 }, // vpsllvw | ||||||
565 | { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw | ||||||
566 | { ISD::SRA, MVT::v32i16, 1 }, // vpsravw | ||||||
567 | }; | ||||||
568 | |||||||
569 | if (ST->hasBWI()) | ||||||
570 | if (const auto *Entry = CostTableLookup(AVX512BWShiftCostTable, ISD, LT.second)) | ||||||
571 | return LT.first * Entry->Cost; | ||||||
572 | |||||||
573 | static const CostTblEntry AVX2UniformCostTable[] = { | ||||||
574 | // Uniform splats are cheaper for the following instructions. | ||||||
575 | { ISD::SHL, MVT::v16i16, 1 }, // psllw. | ||||||
576 | { ISD::SRL, MVT::v16i16, 1 }, // psrlw. | ||||||
577 | { ISD::SRA, MVT::v16i16, 1 }, // psraw. | ||||||
578 | { ISD::SHL, MVT::v32i16, 2 }, // 2*psllw. | ||||||
579 | { ISD::SRL, MVT::v32i16, 2 }, // 2*psrlw. | ||||||
580 | { ISD::SRA, MVT::v32i16, 2 }, // 2*psraw. | ||||||
581 | |||||||
582 | { ISD::SHL, MVT::v8i32, 1 }, // pslld | ||||||
583 | { ISD::SRL, MVT::v8i32, 1 }, // psrld | ||||||
584 | { ISD::SRA, MVT::v8i32, 1 }, // psrad | ||||||
585 | { ISD::SHL, MVT::v4i64, 1 }, // psllq | ||||||
586 | { ISD::SRL, MVT::v4i64, 1 }, // psrlq | ||||||
587 | }; | ||||||
588 | |||||||
589 | if (ST->hasAVX2() && | ||||||
590 | ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) || | ||||||
591 | (Op2Info == TargetTransformInfo::OK_UniformValue))) { | ||||||
592 | if (const auto *Entry = | ||||||
593 | CostTableLookup(AVX2UniformCostTable, ISD, LT.second)) | ||||||
594 | return LT.first * Entry->Cost; | ||||||
595 | } | ||||||
596 | |||||||
597 | static const CostTblEntry SSE2UniformCostTable[] = { | ||||||
598 | // Uniform splats are cheaper for the following instructions. | ||||||
599 | { ISD::SHL, MVT::v8i16, 1 }, // psllw. | ||||||
600 | { ISD::SHL, MVT::v4i32, 1 }, // pslld | ||||||
601 | { ISD::SHL, MVT::v2i64, 1 }, // psllq. | ||||||
602 | |||||||
603 | { ISD::SRL, MVT::v8i16, 1 }, // psrlw. | ||||||
604 | { ISD::SRL, MVT::v4i32, 1 }, // psrld. | ||||||
605 | { ISD::SRL, MVT::v2i64, 1 }, // psrlq. | ||||||
606 | |||||||
607 | { ISD::SRA, MVT::v8i16, 1 }, // psraw. | ||||||
608 | { ISD::SRA, MVT::v4i32, 1 }, // psrad. | ||||||
609 | }; | ||||||
610 | |||||||
611 | if (ST->hasSSE2() && | ||||||
612 | ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) || | ||||||
613 | (Op2Info == TargetTransformInfo::OK_UniformValue))) { | ||||||
614 | if (const auto *Entry = | ||||||
615 | CostTableLookup(SSE2UniformCostTable, ISD, LT.second)) | ||||||
616 | return LT.first * Entry->Cost; | ||||||
617 | } | ||||||
618 | |||||||
619 | static const CostTblEntry AVX512DQCostTable[] = { | ||||||
620 | { ISD::MUL, MVT::v2i64, 2 }, // pmullq | ||||||
621 | { ISD::MUL, MVT::v4i64, 2 }, // pmullq | ||||||
622 | { ISD::MUL, MVT::v8i64, 2 } // pmullq | ||||||
623 | }; | ||||||
624 | |||||||
625 | // Look for AVX512DQ lowering tricks for custom cases. | ||||||
626 | if (ST->hasDQI()) | ||||||
627 | if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second)) | ||||||
628 | return LT.first * Entry->Cost; | ||||||
629 | |||||||
630 | static const CostTblEntry AVX512BWCostTable[] = { | ||||||
631 | { ISD::SHL, MVT::v64i8, 11 }, // vpblendvb sequence. | ||||||
632 | { ISD::SRL, MVT::v64i8, 11 }, // vpblendvb sequence. | ||||||
633 | { ISD::SRA, MVT::v64i8, 24 }, // vpblendvb sequence. | ||||||
634 | }; | ||||||
635 | |||||||
636 | // Look for AVX512BW lowering tricks for custom cases. | ||||||
637 | if (ST->hasBWI()) | ||||||
638 | if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second)) | ||||||
639 | return LT.first * Entry->Cost; | ||||||
640 | |||||||
641 | static const CostTblEntry AVX512CostTable[] = { | ||||||
642 | { ISD::SHL, MVT::v4i32, 1 }, | ||||||
643 | { ISD::SRL, MVT::v4i32, 1 }, | ||||||
644 | { ISD::SRA, MVT::v4i32, 1 }, | ||||||
645 | { ISD::SHL, MVT::v8i32, 1 }, | ||||||
646 | { ISD::SRL, MVT::v8i32, 1 }, | ||||||
647 | { ISD::SRA, MVT::v8i32, 1 }, | ||||||
648 | { ISD::SHL, MVT::v16i32, 1 }, | ||||||
649 | { ISD::SRL, MVT::v16i32, 1 }, | ||||||
650 | { ISD::SRA, MVT::v16i32, 1 }, | ||||||
651 | |||||||
652 | { ISD::SHL, MVT::v2i64, 1 }, | ||||||
653 | { ISD::SRL, MVT::v2i64, 1 }, | ||||||
654 | { ISD::SHL, MVT::v4i64, 1 }, | ||||||
655 | { ISD::SRL, MVT::v4i64, 1 }, | ||||||
656 | { ISD::SHL, MVT::v8i64, 1 }, | ||||||
657 | { ISD::SRL, MVT::v8i64, 1 }, | ||||||
658 | |||||||
659 | { ISD::SRA, MVT::v2i64, 1 }, | ||||||
660 | { ISD::SRA, MVT::v4i64, 1 }, | ||||||
661 | { ISD::SRA, MVT::v8i64, 1 }, | ||||||
662 | |||||||
663 | { ISD::MUL, MVT::v16i32, 1 }, // pmulld (Skylake from agner.org) | ||||||
664 | { ISD::MUL, MVT::v8i32, 1 }, // pmulld (Skylake from agner.org) | ||||||
665 | { ISD::MUL, MVT::v4i32, 1 }, // pmulld (Skylake from agner.org) | ||||||
666 | { ISD::MUL, MVT::v8i64, 6 }, // 3*pmuludq/3*shift/2*add | ||||||
667 | { ISD::MUL, MVT::i64, 1 }, // Skylake from http://www.agner.org/ | ||||||
668 | |||||||
669 | { ISD::FNEG, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/ | ||||||
670 | { ISD::FADD, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/ | ||||||
671 | { ISD::FSUB, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/ | ||||||
672 | { ISD::FMUL, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/ | ||||||
673 | { ISD::FDIV, MVT::f64, 4 }, // Skylake from http://www.agner.org/ | ||||||
674 | { ISD::FDIV, MVT::v2f64, 4 }, // Skylake from http://www.agner.org/ | ||||||
675 | { ISD::FDIV, MVT::v4f64, 8 }, // Skylake from http://www.agner.org/ | ||||||
676 | { ISD::FDIV, MVT::v8f64, 16 }, // Skylake from http://www.agner.org/ | ||||||
677 | |||||||
678 | { ISD::FNEG, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/ | ||||||
679 | { ISD::FADD, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/ | ||||||
680 | { ISD::FSUB, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/ | ||||||
681 | { ISD::FMUL, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/ | ||||||
682 | { ISD::FDIV, MVT::f32, 3 }, // Skylake from http://www.agner.org/ | ||||||
683 | { ISD::FDIV, MVT::v4f32, 3 }, // Skylake from http://www.agner.org/ | ||||||
684 | { ISD::FDIV, MVT::v8f32, 5 }, // Skylake from http://www.agner.org/ | ||||||
685 | { ISD::FDIV, MVT::v16f32, 10 }, // Skylake from http://www.agner.org/ | ||||||
686 | }; | ||||||
687 | |||||||
688 | if (ST->hasAVX512()) | ||||||
689 | if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second)) | ||||||
690 | return LT.first * Entry->Cost; | ||||||
691 | |||||||
692 | static const CostTblEntry AVX2ShiftCostTable[] = { | ||||||
693 | // Shifts on vXi64/vXi32 on AVX2 are legal even though we declare to | ||||||
694 | // customize them to detect the cases where the shift amount is a scalar one. | ||||||
695 | { ISD::SHL, MVT::v4i32, 2 }, // vpsllvd (Haswell from agner.org) | ||||||
696 | { ISD::SRL, MVT::v4i32, 2 }, // vpsrlvd (Haswell from agner.org) | ||||||
697 | { ISD::SRA, MVT::v4i32, 2 }, // vpsravd (Haswell from agner.org) | ||||||
698 | { ISD::SHL, MVT::v8i32, 2 }, // vpsllvd (Haswell from agner.org) | ||||||
699 | { ISD::SRL, MVT::v8i32, 2 }, // vpsrlvd (Haswell from agner.org) | ||||||
700 | { ISD::SRA, MVT::v8i32, 2 }, // vpsravd (Haswell from agner.org) | ||||||
701 | { ISD::SHL, MVT::v2i64, 1 }, // vpsllvq (Haswell from agner.org) | ||||||
702 | { ISD::SRL, MVT::v2i64, 1 }, // vpsrlvq (Haswell from agner.org) | ||||||
703 | { ISD::SHL, MVT::v4i64, 1 }, // vpsllvq (Haswell from agner.org) | ||||||
704 | { ISD::SRL, MVT::v4i64, 1 }, // vpsrlvq (Haswell from agner.org) | ||||||
705 | }; | ||||||
706 | |||||||
707 | if (ST->hasAVX512()) { | ||||||
708 | if (ISD == ISD::SHL && LT.second == MVT::v32i16 && | ||||||
709 | (Op2Info == TargetTransformInfo::OK_UniformConstantValue || | ||||||
710 | Op2Info == TargetTransformInfo::OK_NonUniformConstantValue)) | ||||||
711 | // On AVX512, a packed v32i16 shift left by a constant build_vector | ||||||
712 | // is lowered into a vector multiply (vpmullw). | ||||||
713 | return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, | ||||||
714 | Op1Info, Op2Info, | ||||||
715 | TargetTransformInfo::OP_None, | ||||||
716 | TargetTransformInfo::OP_None); | ||||||
717 | } | ||||||
718 | |||||||
719 | // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts). | ||||||
720 | if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) { | ||||||
721 | if (ISD == ISD::SHL && LT.second == MVT::v16i16 && | ||||||
722 | (Op2Info == TargetTransformInfo::OK_UniformConstantValue || | ||||||
723 | Op2Info == TargetTransformInfo::OK_NonUniformConstantValue)) | ||||||
724 | // On AVX2, a packed v16i16 shift left by a constant build_vector | ||||||
725 | // is lowered into a vector multiply (vpmullw). | ||||||
726 | return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, | ||||||
727 | Op1Info, Op2Info, | ||||||
728 | TargetTransformInfo::OP_None, | ||||||
729 | TargetTransformInfo::OP_None); | ||||||
730 | |||||||
731 | if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second)) | ||||||
732 | return LT.first * Entry->Cost; | ||||||
733 | } | ||||||
734 | |||||||
735 | static const CostTblEntry XOPShiftCostTable[] = { | ||||||
736 | // 128bit shifts take 1cy, but right shifts require negation beforehand. | ||||||
737 | { ISD::SHL, MVT::v16i8, 1 }, | ||||||
738 | { ISD::SRL, MVT::v16i8, 2 }, | ||||||
739 | { ISD::SRA, MVT::v16i8, 2 }, | ||||||
740 | { ISD::SHL, MVT::v8i16, 1 }, | ||||||
741 | { ISD::SRL, MVT::v8i16, 2 }, | ||||||
742 | { ISD::SRA, MVT::v8i16, 2 }, | ||||||
743 | { ISD::SHL, MVT::v4i32, 1 }, | ||||||
744 | { ISD::SRL, MVT::v4i32, 2 }, | ||||||
745 | { ISD::SRA, MVT::v4i32, 2 }, | ||||||
746 | { ISD::SHL, MVT::v2i64, 1 }, | ||||||
747 | { ISD::SRL, MVT::v2i64, 2 }, | ||||||
748 | { ISD::SRA, MVT::v2i64, 2 }, | ||||||
749 | // 256bit shifts require splitting if AVX2 didn't catch them above. | ||||||
750 | { ISD::SHL, MVT::v32i8, 2+2 }, | ||||||
751 | { ISD::SRL, MVT::v32i8, 4+2 }, | ||||||
752 | { ISD::SRA, MVT::v32i8, 4+2 }, | ||||||
753 | { ISD::SHL, MVT::v16i16, 2+2 }, | ||||||
754 | { ISD::SRL, MVT::v16i16, 4+2 }, | ||||||
755 | { ISD::SRA, MVT::v16i16, 4+2 }, | ||||||
756 | { ISD::SHL, MVT::v8i32, 2+2 }, | ||||||
757 | { ISD::SRL, MVT::v8i32, 4+2 }, | ||||||
758 | { ISD::SRA, MVT::v8i32, 4+2 }, | ||||||
759 | { ISD::SHL, MVT::v4i64, 2+2 }, | ||||||
760 | { ISD::SRL, MVT::v4i64, 4+2 }, | ||||||
761 | { ISD::SRA, MVT::v4i64, 4+2 }, | ||||||
762 | }; | ||||||
763 | |||||||
764 | // Look for XOP lowering tricks. | ||||||
765 | if (ST->hasXOP()) { | ||||||
766 | // If the right shift is constant then we'll fold the negation so | ||||||
767 | // it's as cheap as a left shift. | ||||||
768 | int ShiftISD = ISD; | ||||||
769 | if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) && | ||||||
770 | (Op2Info == TargetTransformInfo::OK_UniformConstantValue || | ||||||
771 | Op2Info == TargetTransformInfo::OK_NonUniformConstantValue)) | ||||||
772 | ShiftISD = ISD::SHL; | ||||||
773 | if (const auto *Entry = | ||||||
774 | CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second)) | ||||||
775 | return LT.first * Entry->Cost; | ||||||
776 | } | ||||||
777 | |||||||
778 | static const CostTblEntry SSE2UniformShiftCostTable[] = { | ||||||
779 | // Uniform splats are cheaper for the following instructions. | ||||||
780 | { ISD::SHL, MVT::v16i16, 2+2 }, // 2*psllw + split. | ||||||
781 | { ISD::SHL, MVT::v8i32, 2+2 }, // 2*pslld + split. | ||||||
782 | { ISD::SHL, MVT::v4i64, 2+2 }, // 2*psllq + split. | ||||||
783 | |||||||
784 | { ISD::SRL, MVT::v16i16, 2+2 }, // 2*psrlw + split. | ||||||
785 | { ISD::SRL, MVT::v8i32, 2+2 }, // 2*psrld + split. | ||||||
786 | { ISD::SRL, MVT::v4i64, 2+2 }, // 2*psrlq + split. | ||||||
787 | |||||||
788 | { ISD::SRA, MVT::v16i16, 2+2 }, // 2*psraw + split. | ||||||
789 | { ISD::SRA, MVT::v8i32, 2+2 }, // 2*psrad + split. | ||||||
790 | { ISD::SRA, MVT::v2i64, 4 }, // 2*psrad + shuffle. | ||||||
791 | { ISD::SRA, MVT::v4i64, 8+2 }, // 2*(2*psrad + shuffle) + split. | ||||||
792 | }; | ||||||
793 | |||||||
794 | if (ST->hasSSE2() && | ||||||
795 | ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) || | ||||||
796 | (Op2Info == TargetTransformInfo::OK_UniformValue))) { | ||||||
797 | |||||||
798 | // Handle AVX2 uniform v4i64 ISD::SRA, it's not worth a table. | ||||||
799 | if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2()) | ||||||
800 | return LT.first * 4; // 2*psrad + shuffle. | ||||||
801 | |||||||
802 | if (const auto *Entry = | ||||||
803 | CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second)) | ||||||
804 | return LT.first * Entry->Cost; | ||||||
805 | } | ||||||
806 | |||||||
807 | if (ISD == ISD::SHL && | ||||||
808 | Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) { | ||||||
809 | MVT VT = LT.second; | ||||||
810 | // Vector shift left by non uniform constant can be lowered | ||||||
811 | // into vector multiply. | ||||||
812 | if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) || | ||||||
813 | ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX())) | ||||||
814 | ISD = ISD::MUL; | ||||||
815 | } | ||||||
816 | |||||||
817 | static const CostTblEntry AVX2CostTable[] = { | ||||||
818 | { ISD::SHL, MVT::v16i8, 6 }, // vpblendvb sequence. | ||||||
819 | { ISD::SHL, MVT::v32i8, 6 }, // vpblendvb sequence. | ||||||
820 | { ISD::SHL, MVT::v64i8, 12 }, // 2*vpblendvb sequence. | ||||||
821 | { ISD::SHL, MVT::v8i16, 5 }, // extend/vpsrlvd/pack sequence. | ||||||
822 | { ISD::SHL, MVT::v16i16, 7 }, // extend/vpsrlvd/pack sequence. | ||||||
823 | { ISD::SHL, MVT::v32i16, 14 }, // 2*extend/vpsrlvd/pack sequence. | ||||||
824 | |||||||
825 | { ISD::SRL, MVT::v16i8, 6 }, // vpblendvb sequence. | ||||||
826 | { ISD::SRL, MVT::v32i8, 6 }, // vpblendvb sequence. | ||||||
827 | { ISD::SRL, MVT::v64i8, 12 }, // 2*vpblendvb sequence. | ||||||
828 | { ISD::SRL, MVT::v8i16, 5 }, // extend/vpsrlvd/pack sequence. | ||||||
829 | { ISD::SRL, MVT::v16i16, 7 }, // extend/vpsrlvd/pack sequence. | ||||||
830 | { ISD::SRL, MVT::v32i16, 14 }, // 2*extend/vpsrlvd/pack sequence. | ||||||
831 | |||||||
832 | { ISD::SRA, MVT::v16i8, 17 }, // vpblendvb sequence. | ||||||
833 | { ISD::SRA, MVT::v32i8, 17 }, // vpblendvb sequence. | ||||||
834 | { ISD::SRA, MVT::v64i8, 34 }, // 2*vpblendvb sequence. | ||||||
835 | { ISD::SRA, MVT::v8i16, 5 }, // extend/vpsravd/pack sequence. | ||||||
836 | { ISD::SRA, MVT::v16i16, 7 }, // extend/vpsravd/pack sequence. | ||||||
837 | { ISD::SRA, MVT::v32i16, 14 }, // 2*extend/vpsravd/pack sequence. | ||||||
838 | { ISD::SRA, MVT::v2i64, 2 }, // srl/xor/sub sequence. | ||||||
839 | { ISD::SRA, MVT::v4i64, 2 }, // srl/xor/sub sequence. | ||||||
840 | |||||||
841 | { ISD::SUB, MVT::v32i8, 1 }, // psubb | ||||||
842 | { ISD::ADD, MVT::v32i8, 1 }, // paddb | ||||||
843 | { ISD::SUB, MVT::v16i16, 1 }, // psubw | ||||||
844 | { ISD::ADD, MVT::v16i16, 1 }, // paddw | ||||||
845 | { ISD::SUB, MVT::v8i32, 1 }, // psubd | ||||||
846 | { ISD::ADD, MVT::v8i32, 1 }, // paddd | ||||||
847 | { ISD::SUB, MVT::v4i64, 1 }, // psubq | ||||||
848 | { ISD::ADD, MVT::v4i64, 1 }, // paddq | ||||||
849 | |||||||
850 | { ISD::MUL, MVT::v16i16, 1 }, // pmullw | ||||||
851 | { ISD::MUL, MVT::v8i32, 2 }, // pmulld (Haswell from agner.org) | ||||||
852 | { ISD::MUL, MVT::v4i64, 6 }, // 3*pmuludq/3*shift/2*add | ||||||
853 | |||||||
854 | { ISD::FNEG, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/ | ||||||
855 | { ISD::FNEG, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/ | ||||||
856 | { ISD::FADD, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/ | ||||||
857 | { ISD::FADD, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/ | ||||||
858 | { ISD::FSUB, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/ | ||||||
859 | { ISD::FSUB, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/ | ||||||
860 | { ISD::FMUL, MVT::f64, 1 }, // Haswell from http://www.agner.org/ | ||||||
861 | { ISD::FMUL, MVT::v2f64, 1 }, // Haswell from http://www.agner.org/ | ||||||
862 | { ISD::FMUL, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/ | ||||||
863 | { ISD::FMUL, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/ | ||||||
864 | |||||||
865 | { ISD::FDIV, MVT::f32, 7 }, // Haswell from http://www.agner.org/ | ||||||
866 | { ISD::FDIV, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/ | ||||||
867 | { ISD::FDIV, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/ | ||||||
868 | { ISD::FDIV, MVT::f64, 14 }, // Haswell from http://www.agner.org/ | ||||||
869 | { ISD::FDIV, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/ | ||||||
870 | { ISD::FDIV, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/ | ||||||
871 | }; | ||||||
872 | |||||||
873 | // Look for AVX2 lowering tricks for custom cases. | ||||||
874 | if (ST->hasAVX2()) | ||||||
875 | if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second)) | ||||||
876 | return LT.first * Entry->Cost; | ||||||
877 | |||||||
878 | static const CostTblEntry AVX1CostTable[] = { | ||||||
879 | // We don't have to scalarize unsupported ops. We can issue two half-sized | ||||||
880 | // operations and we only need to extract the upper YMM half. | ||||||
881 | // Two ops + 1 extract + 1 insert = 4. | ||||||
882 | { ISD::MUL, MVT::v16i16, 4 }, | ||||||
883 | { ISD::MUL, MVT::v8i32, 5 }, // BTVER2 from http://www.agner.org/ | ||||||
884 | { ISD::MUL, MVT::v4i64, 12 }, | ||||||
885 | |||||||
886 | { ISD::SUB, MVT::v32i8, 4 }, | ||||||
887 | { ISD::ADD, MVT::v32i8, 4 }, | ||||||
888 | { ISD::SUB, MVT::v16i16, 4 }, | ||||||
889 | { ISD::ADD, MVT::v16i16, 4 }, | ||||||
890 | { ISD::SUB, MVT::v8i32, 4 }, | ||||||
891 | { ISD::ADD, MVT::v8i32, 4 }, | ||||||
892 | { ISD::SUB, MVT::v4i64, 4 }, | ||||||
893 | { ISD::ADD, MVT::v4i64, 4 }, | ||||||
894 | |||||||
895 | { ISD::SHL, MVT::v32i8, 22 }, // pblendvb sequence + split. | ||||||
896 | { ISD::SHL, MVT::v8i16, 6 }, // pblendvb sequence. | ||||||
897 | { ISD::SHL, MVT::v16i16, 13 }, // pblendvb sequence + split. | ||||||
898 | { ISD::SHL, MVT::v4i32, 3 }, // pslld/paddd/cvttps2dq/pmulld | ||||||
899 | { ISD::SHL, MVT::v8i32, 9 }, // pslld/paddd/cvttps2dq/pmulld + split | ||||||
900 | { ISD::SHL, MVT::v2i64, 2 }, // Shift each lane + blend. | ||||||
901 | { ISD::SHL, MVT::v4i64, 6 }, // Shift each lane + blend + split. | ||||||
902 | |||||||
903 | { ISD::SRL, MVT::v32i8, 23 }, // pblendvb sequence + split. | ||||||
904 | { ISD::SRL, MVT::v16i16, 28 }, // pblendvb sequence + split. | ||||||
905 | { ISD::SRL, MVT::v4i32, 6 }, // Shift each lane + blend. | ||||||
906 | { ISD::SRL, MVT::v8i32, 14 }, // Shift each lane + blend + split. | ||||||
907 | { ISD::SRL, MVT::v2i64, 2 }, // Shift each lane + blend. | ||||||
908 | { ISD::SRL, MVT::v4i64, 6 }, // Shift each lane + blend + split. | ||||||
909 | |||||||
910 | { ISD::SRA, MVT::v32i8, 44 }, // pblendvb sequence + split. | ||||||
911 | { ISD::SRA, MVT::v16i16, 28 }, // pblendvb sequence + split. | ||||||
912 | { ISD::SRA, MVT::v4i32, 6 }, // Shift each lane + blend. | ||||||
913 | { ISD::SRA, MVT::v8i32, 14 }, // Shift each lane + blend + split. | ||||||
914 | { ISD::SRA, MVT::v2i64, 5 }, // Shift each lane + blend. | ||||||
915 | { ISD::SRA, MVT::v4i64, 12 }, // Shift each lane + blend + split. | ||||||
916 | |||||||
917 | { ISD::FNEG, MVT::v4f64, 2 }, // BTVER2 from http://www.agner.org/ | ||||||
918 | { ISD::FNEG, MVT::v8f32, 2 }, // BTVER2 from http://www.agner.org/ | ||||||
919 | |||||||
920 | { ISD::FMUL, MVT::f64, 2 }, // BTVER2 from http://www.agner.org/ | ||||||
921 | { ISD::FMUL, MVT::v2f64, 2 }, // BTVER2 from http://www.agner.org/ | ||||||
922 | { ISD::FMUL, MVT::v4f64, 4 }, // BTVER2 from http://www.agner.org/ | ||||||
923 | |||||||
924 | { ISD::FDIV, MVT::f32, 14 }, // SNB from http://www.agner.org/ | ||||||
925 | { ISD::FDIV, MVT::v4f32, 14 }, // SNB from http://www.agner.org/ | ||||||
926 | { ISD::FDIV, MVT::v8f32, 28 }, // SNB from http://www.agner.org/ | ||||||
927 | { ISD::FDIV, MVT::f64, 22 }, // SNB from http://www.agner.org/ | ||||||
928 | { ISD::FDIV, MVT::v2f64, 22 }, // SNB from http://www.agner.org/ | ||||||
929 | { ISD::FDIV, MVT::v4f64, 44 }, // SNB from http://www.agner.org/ | ||||||
930 | }; | ||||||
931 | |||||||
932 | if (ST->hasAVX()) | ||||||
933 | if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second)) | ||||||
934 | return LT.first * Entry->Cost; | ||||||
935 | |||||||
936 | static const CostTblEntry SSE42CostTable[] = { | ||||||
937 | { ISD::FADD, MVT::f64, 1 }, // Nehalem from http://www.agner.org/ | ||||||
938 | { ISD::FADD, MVT::f32, 1 }, // Nehalem from http://www.agner.org/ | ||||||
939 | { ISD::FADD, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/ | ||||||
940 | { ISD::FADD, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/ | ||||||
941 | |||||||
942 | { ISD::FSUB, MVT::f64, 1 }, // Nehalem from http://www.agner.org/ | ||||||
943 | { ISD::FSUB, MVT::f32 , 1 }, // Nehalem from http://www.agner.org/ | ||||||
944 | { ISD::FSUB, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/ | ||||||
945 | { ISD::FSUB, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/ | ||||||
946 | |||||||
947 | { ISD::FMUL, MVT::f64, 1 }, // Nehalem from http://www.agner.org/ | ||||||
948 | { ISD::FMUL, MVT::f32, 1 }, // Nehalem from http://www.agner.org/ | ||||||
949 | { ISD::FMUL, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/ | ||||||
950 | { ISD::FMUL, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/ | ||||||
951 | |||||||
952 | { ISD::FDIV, MVT::f32, 14 }, // Nehalem from http://www.agner.org/ | ||||||
953 | { ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/ | ||||||
954 | { ISD::FDIV, MVT::f64, 22 }, // Nehalem from http://www.agner.org/ | ||||||
955 | { ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/ | ||||||
956 | |||||||
957 | { ISD::MUL, MVT::v2i64, 6 } // 3*pmuludq/3*shift/2*add | ||||||
958 | }; | ||||||
959 | |||||||
960 | if (ST->hasSSE42()) | ||||||
961 | if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second)) | ||||||
962 | return LT.first * Entry->Cost; | ||||||
963 | |||||||
964 | static const CostTblEntry SSE41CostTable[] = { | ||||||
965 | { ISD::SHL, MVT::v16i8, 10 }, // pblendvb sequence. | ||||||
966 | { ISD::SHL, MVT::v8i16, 11 }, // pblendvb sequence. | ||||||
967 | { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld | ||||||
968 | |||||||
969 | { ISD::SRL, MVT::v16i8, 11 }, // pblendvb sequence. | ||||||
970 | { ISD::SRL, MVT::v8i16, 13 }, // pblendvb sequence. | ||||||
971 | { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend. | ||||||
972 | |||||||
973 | { ISD::SRA, MVT::v16i8, 21 }, // pblendvb sequence. | ||||||
974 | { ISD::SRA, MVT::v8i16, 13 }, // pblendvb sequence. | ||||||
975 | |||||||
976 | { ISD::MUL, MVT::v4i32, 2 } // pmulld (Nehalem from agner.org) | ||||||
977 | }; | ||||||
978 | |||||||
979 | if (ST->hasSSE41()) | ||||||
980 | if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second)) | ||||||
981 | return LT.first * Entry->Cost; | ||||||
982 | |||||||
983 | static const CostTblEntry SSE2CostTable[] = { | ||||||
984 | // We don't correctly identify costs of casts because they are marked as | ||||||
985 | // custom. | ||||||
986 | { ISD::SHL, MVT::v16i8, 13 }, // cmpgtb sequence. | ||||||
987 | { ISD::SHL, MVT::v8i16, 25 }, // cmpgtw sequence. | ||||||
988 | { ISD::SHL, MVT::v4i32, 16 }, // pslld/paddd/cvttps2dq/pmuludq. | ||||||
989 | { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence. | ||||||
990 | |||||||
991 | { ISD::SRL, MVT::v16i8, 14 }, // cmpgtb sequence. | ||||||
992 | { ISD::SRL, MVT::v8i16, 16 }, // cmpgtw sequence. | ||||||
993 | { ISD::SRL, MVT::v4i32, 12 }, // Shift each lane + blend. | ||||||
994 | { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence. | ||||||
995 | |||||||
996 | { ISD::SRA, MVT::v16i8, 27 }, // unpacked cmpgtb sequence. | ||||||
997 | { ISD::SRA, MVT::v8i16, 16 }, // cmpgtw sequence. | ||||||
998 | { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend. | ||||||
999 | { ISD::SRA, MVT::v2i64, 8 }, // srl/xor/sub splat+shuffle sequence. | ||||||
1000 | |||||||
1001 | { ISD::MUL, MVT::v8i16, 1 }, // pmullw | ||||||
1002 | { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle | ||||||
1003 | { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add | ||||||
1004 | |||||||
1005 | { ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/ | ||||||
1006 | { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/ | ||||||
1007 | { ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/ | ||||||
1008 | { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/ | ||||||
1009 | |||||||
1010 | { ISD::FNEG, MVT::f32, 1 }, // Pentium IV from http://www.agner.org/ | ||||||
1011 | { ISD::FNEG, MVT::f64, 1 }, // Pentium IV from http://www.agner.org/ | ||||||
1012 | { ISD::FNEG, MVT::v4f32, 1 }, // Pentium IV from http://www.agner.org/ | ||||||
1013 | { ISD::FNEG, MVT::v2f64, 1 }, // Pentium IV from http://www.agner.org/ | ||||||
1014 | |||||||
1015 | { ISD::FADD, MVT::f32, 2 }, // Pentium IV from http://www.agner.org/ | ||||||
1016 | { ISD::FADD, MVT::f64, 2 }, // Pentium IV from http://www.agner.org/ | ||||||
1017 | |||||||
1018 | { ISD::FSUB, MVT::f32, 2 }, // Pentium IV from http://www.agner.org/ | ||||||
1019 | { ISD::FSUB, MVT::f64, 2 }, // Pentium IV from http://www.agner.org/ | ||||||
1020 | }; | ||||||
1021 | |||||||
1022 | if (ST->hasSSE2()) | ||||||
1023 | if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second)) | ||||||
1024 | return LT.first * Entry->Cost; | ||||||
1025 | |||||||
1026 | static const CostTblEntry SSE1CostTable[] = { | ||||||
1027 | { ISD::FDIV, MVT::f32, 17 }, // Pentium III from http://www.agner.org/ | ||||||
1028 | { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/ | ||||||
1029 | |||||||
1030 | { ISD::FNEG, MVT::f32, 2 }, // Pentium III from http://www.agner.org/ | ||||||
1031 | { ISD::FNEG, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/ | ||||||
1032 | |||||||
1033 | { ISD::FADD, MVT::f32, 1 }, // Pentium III from http://www.agner.org/ | ||||||
1034 | { ISD::FADD, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/ | ||||||
1035 | |||||||
1036 | { ISD::FSUB, MVT::f32, 1 }, // Pentium III from http://www.agner.org/ | ||||||
1037 | { ISD::FSUB, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/ | ||||||
1038 | }; | ||||||
1039 | |||||||
1040 | if (ST->hasSSE1()) | ||||||
1041 | if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second)) | ||||||
1042 | return LT.first * Entry->Cost; | ||||||
1043 | |||||||
1044 | static const CostTblEntry X64CostTbl[] = { // 64-bit targets | ||||||
1045 | { ISD::ADD, MVT::i64, 1 }, // Core (Merom) from http://www.agner.org/ | ||||||
1046 | { ISD::SUB, MVT::i64, 1 }, // Core (Merom) from http://www.agner.org/ | ||||||
1047 | { ISD::MUL, MVT::i64, 2 }, // Nehalem from http://www.agner.org/ | ||||||
1048 | }; | ||||||
1049 | |||||||
1050 | if (ST->is64Bit()) | ||||||
1051 | if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second)) | ||||||
1052 | return LT.first * Entry->Cost; | ||||||
1053 | |||||||
1054 | static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets | ||||||
1055 | { ISD::ADD, MVT::i8, 1 }, // Pentium III from http://www.agner.org/ | ||||||
1056 | { ISD::ADD, MVT::i16, 1 }, // Pentium III from http://www.agner.org/ | ||||||
1057 | { ISD::ADD, MVT::i32, 1 }, // Pentium III from http://www.agner.org/ | ||||||
1058 | |||||||
1059 | { ISD::SUB, MVT::i8, 1 }, // Pentium III from http://www.agner.org/ | ||||||
1060 | { ISD::SUB, MVT::i16, 1 }, // Pentium III from http://www.agner.org/ | ||||||
1061 | { ISD::SUB, MVT::i32, 1 }, // Pentium III from http://www.agner.org/ | ||||||
1062 | }; | ||||||
1063 | |||||||
1064 | if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second)) | ||||||
1065 | return LT.first * Entry->Cost; | ||||||
1066 | |||||||
1067 | // It is not a good idea to vectorize division. We have to scalarize it and | ||||||
1068 | // in the process we will often end up having to spill regular | ||||||
1069 | // registers. The overhead of division is going to dominate most kernels | ||||||
1070 | // anyways so try hard to prevent vectorization of division - it is | ||||||
1071 | // generally a bad idea. Assume somewhat arbitrarily that we have to be able | ||||||
1072 | // to hide "20 cycles" for each lane. | ||||||
1073 | if (LT.second.isVector() && (ISD == ISD::SDIV || ISD == ISD::SREM || | ||||||
1074 | ISD == ISD::UDIV || ISD == ISD::UREM)) { | ||||||
1075 | InstructionCost ScalarCost = getArithmeticInstrCost( | ||||||
1076 | Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info, | ||||||
1077 | TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); | ||||||
1078 | return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost; | ||||||
1079 | } | ||||||
1080 | |||||||
1081 | // Fallback to the default implementation. | ||||||
1082 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info); | ||||||
1083 | } | ||||||
1084 | |||||||
1085 | InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, | ||||||
1086 | VectorType *BaseTp, | ||||||
1087 | ArrayRef<int> Mask, int Index, | ||||||
1088 | VectorType *SubTp) { | ||||||
1089 | // 64-bit packed float vectors (v2f32) are widened to type v4f32. | ||||||
1090 | // 64-bit packed integer vectors (v2i32) are widened to type v4i32. | ||||||
1091 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, BaseTp); | ||||||
1092 | |||||||
1093 | Kind = improveShuffleKindFromMask(Kind, Mask); | ||||||
1094 | // Treat Transpose as 2-op shuffles - there's no difference in lowering. | ||||||
1095 | if (Kind == TTI::SK_Transpose) | ||||||
1096 | Kind = TTI::SK_PermuteTwoSrc; | ||||||
1097 | |||||||
1098 | // For Broadcasts we are splatting the first element from the first input | ||||||
1099 | // register, so only need to reference that input and all the output | ||||||
1100 | // registers are the same. | ||||||
1101 | if (Kind == TTI::SK_Broadcast) | ||||||
1102 | LT.first = 1; | ||||||
1103 | |||||||
1104 | // Subvector extractions are free if they start at the beginning of a | ||||||
1105 | // vector and cheap if the subvectors are aligned. | ||||||
1106 | if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) { | ||||||
1107 | int NumElts = LT.second.getVectorNumElements(); | ||||||
1108 | if ((Index % NumElts) == 0) | ||||||
1109 | return 0; | ||||||
1110 | std::pair<InstructionCost, MVT> SubLT = | ||||||
1111 | TLI->getTypeLegalizationCost(DL, SubTp); | ||||||
1112 | if (SubLT.second.isVector()) { | ||||||
1113 | int NumSubElts = SubLT.second.getVectorNumElements(); | ||||||
1114 | if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0) | ||||||
1115 | return SubLT.first; | ||||||
1116 | // Handle some cases for widening legalization. For now we only handle | ||||||
1117 | // cases where the original subvector was naturally aligned and evenly | ||||||
1118 | // fit in its legalized subvector type. | ||||||
1119 | // FIXME: Remove some of the alignment restrictions. | ||||||
1120 | // FIXME: We can use permq for 64-bit or larger extracts from 256-bit | ||||||
1121 | // vectors. | ||||||
1122 | int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements(); | ||||||
1123 | if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 && | ||||||
1124 | (NumSubElts % OrigSubElts) == 0 && | ||||||
1125 | LT.second.getVectorElementType() == | ||||||
1126 | SubLT.second.getVectorElementType() && | ||||||
1127 | LT.second.getVectorElementType().getSizeInBits() == | ||||||
1128 | BaseTp->getElementType()->getPrimitiveSizeInBits()) { | ||||||
1129 | assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&(static_cast <bool> (NumElts >= NumSubElts && NumElts > OrigSubElts && "Unexpected number of elements!" ) ? void (0) : __assert_fail ("NumElts >= NumSubElts && NumElts > OrigSubElts && \"Unexpected number of elements!\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 1130, __extension__ __PRETTY_FUNCTION__)) | ||||||
1130 | "Unexpected number of elements!")(static_cast <bool> (NumElts >= NumSubElts && NumElts > OrigSubElts && "Unexpected number of elements!" ) ? void (0) : __assert_fail ("NumElts >= NumSubElts && NumElts > OrigSubElts && \"Unexpected number of elements!\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 1130, __extension__ __PRETTY_FUNCTION__)); | ||||||
1131 | auto *VecTy = FixedVectorType::get(BaseTp->getElementType(), | ||||||
1132 | LT.second.getVectorNumElements()); | ||||||
1133 | auto *SubTy = FixedVectorType::get(BaseTp->getElementType(), | ||||||
1134 | SubLT.second.getVectorNumElements()); | ||||||
1135 | int ExtractIndex = alignDown((Index % NumElts), NumSubElts); | ||||||
1136 | InstructionCost ExtractCost = getShuffleCost( | ||||||
1137 | TTI::SK_ExtractSubvector, VecTy, None, ExtractIndex, SubTy); | ||||||
1138 | |||||||
1139 | // If the original size is 32-bits or more, we can use pshufd. Otherwise | ||||||
1140 | // if we have SSSE3 we can use pshufb. | ||||||
1141 | if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3()) | ||||||
1142 | return ExtractCost + 1; // pshufd or pshufb | ||||||
1143 | |||||||
1144 | assert(SubTp->getPrimitiveSizeInBits() == 16 &&(static_cast <bool> (SubTp->getPrimitiveSizeInBits() == 16 && "Unexpected vector size") ? void (0) : __assert_fail ("SubTp->getPrimitiveSizeInBits() == 16 && \"Unexpected vector size\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 1145, __extension__ __PRETTY_FUNCTION__)) | ||||||
1145 | "Unexpected vector size")(static_cast <bool> (SubTp->getPrimitiveSizeInBits() == 16 && "Unexpected vector size") ? void (0) : __assert_fail ("SubTp->getPrimitiveSizeInBits() == 16 && \"Unexpected vector size\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 1145, __extension__ __PRETTY_FUNCTION__)); | ||||||
1146 | |||||||
1147 | return ExtractCost + 2; // worst case pshufhw + pshufd | ||||||
1148 | } | ||||||
1149 | } | ||||||
1150 | } | ||||||
1151 | |||||||
1152 | // Subvector insertions are cheap if the subvectors are aligned. | ||||||
1153 | // Note that in general, the insertion starting at the beginning of a vector | ||||||
1154 | // isn't free, because we need to preserve the rest of the wide vector. | ||||||
1155 | if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) { | ||||||
1156 | int NumElts = LT.second.getVectorNumElements(); | ||||||
1157 | std::pair<InstructionCost, MVT> SubLT = | ||||||
1158 | TLI->getTypeLegalizationCost(DL, SubTp); | ||||||
1159 | if (SubLT.second.isVector()) { | ||||||
1160 | int NumSubElts = SubLT.second.getVectorNumElements(); | ||||||
1161 | if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0) | ||||||
1162 | return SubLT.first; | ||||||
1163 | } | ||||||
1164 | |||||||
1165 | // If the insertion isn't aligned, treat it like a 2-op shuffle. | ||||||
1166 | Kind = TTI::SK_PermuteTwoSrc; | ||||||
1167 | } | ||||||
1168 | |||||||
1169 | // Handle some common (illegal) sub-vector types as they are often very cheap | ||||||
1170 | // to shuffle even on targets without PSHUFB. | ||||||
1171 | EVT VT = TLI->getValueType(DL, BaseTp); | ||||||
1172 | if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 && | ||||||
1173 | !ST->hasSSSE3()) { | ||||||
1174 | static const CostTblEntry SSE2SubVectorShuffleTbl[] = { | ||||||
1175 | {TTI::SK_Broadcast, MVT::v4i16, 1}, // pshuflw | ||||||
1176 | {TTI::SK_Broadcast, MVT::v2i16, 1}, // pshuflw | ||||||
1177 | {TTI::SK_Broadcast, MVT::v8i8, 2}, // punpck/pshuflw | ||||||
1178 | {TTI::SK_Broadcast, MVT::v4i8, 2}, // punpck/pshuflw | ||||||
1179 | {TTI::SK_Broadcast, MVT::v2i8, 1}, // punpck | ||||||
1180 | |||||||
1181 | {TTI::SK_Reverse, MVT::v4i16, 1}, // pshuflw | ||||||
1182 | {TTI::SK_Reverse, MVT::v2i16, 1}, // pshuflw | ||||||
1183 | {TTI::SK_Reverse, MVT::v4i8, 3}, // punpck/pshuflw/packus | ||||||
1184 | {TTI::SK_Reverse, MVT::v2i8, 1}, // punpck | ||||||
1185 | |||||||
1186 | {TTI::SK_PermuteTwoSrc, MVT::v4i16, 2}, // punpck/pshuflw | ||||||
1187 | {TTI::SK_PermuteTwoSrc, MVT::v2i16, 2}, // punpck/pshuflw | ||||||
1188 | {TTI::SK_PermuteTwoSrc, MVT::v8i8, 7}, // punpck/pshuflw | ||||||
1189 | {TTI::SK_PermuteTwoSrc, MVT::v4i8, 4}, // punpck/pshuflw | ||||||
1190 | {TTI::SK_PermuteTwoSrc, MVT::v2i8, 2}, // punpck | ||||||
1191 | |||||||
1192 | {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw | ||||||
1193 | {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw | ||||||
1194 | {TTI::SK_PermuteSingleSrc, MVT::v8i8, 5}, // punpck/pshuflw | ||||||
1195 | {TTI::SK_PermuteSingleSrc, MVT::v4i8, 3}, // punpck/pshuflw | ||||||
1196 | {TTI::SK_PermuteSingleSrc, MVT::v2i8, 1}, // punpck | ||||||
1197 | }; | ||||||
1198 | |||||||
1199 | if (ST->hasSSE2()) | ||||||
1200 | if (const auto *Entry = | ||||||
1201 | CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT())) | ||||||
1202 | return Entry->Cost; | ||||||
1203 | } | ||||||
1204 | |||||||
1205 | // We are going to permute multiple sources and the result will be in multiple | ||||||
1206 | // destinations. Providing an accurate cost only for splits where the element | ||||||
1207 | // type remains the same. | ||||||
1208 | if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) { | ||||||
1209 | MVT LegalVT = LT.second; | ||||||
1210 | if (LegalVT.isVector() && | ||||||
1211 | LegalVT.getVectorElementType().getSizeInBits() == | ||||||
1212 | BaseTp->getElementType()->getPrimitiveSizeInBits() && | ||||||
1213 | LegalVT.getVectorNumElements() < | ||||||
1214 | cast<FixedVectorType>(BaseTp)->getNumElements()) { | ||||||
1215 | |||||||
1216 | unsigned VecTySize = DL.getTypeStoreSize(BaseTp); | ||||||
1217 | unsigned LegalVTSize = LegalVT.getStoreSize(); | ||||||
1218 | // Number of source vectors after legalization: | ||||||
1219 | unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize; | ||||||
1220 | // Number of destination vectors after legalization: | ||||||
1221 | InstructionCost NumOfDests = LT.first; | ||||||
1222 | |||||||
1223 | auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(), | ||||||
1224 | LegalVT.getVectorNumElements()); | ||||||
1225 | |||||||
1226 | InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests; | ||||||
1227 | return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, | ||||||
1228 | None, 0, nullptr); | ||||||
1229 | } | ||||||
1230 | |||||||
1231 | return BaseT::getShuffleCost(Kind, BaseTp, Mask, Index, SubTp); | ||||||
1232 | } | ||||||
1233 | |||||||
1234 | // For 2-input shuffles, we must account for splitting the 2 inputs into many. | ||||||
1235 | if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) { | ||||||
1236 | // We assume that source and destination have the same vector type. | ||||||
1237 | InstructionCost NumOfDests = LT.first; | ||||||
1238 | InstructionCost NumOfShufflesPerDest = LT.first * 2 - 1; | ||||||
1239 | LT.first = NumOfDests * NumOfShufflesPerDest; | ||||||
1240 | } | ||||||
1241 | |||||||
1242 | static const CostTblEntry AVX512FP16ShuffleTbl[] = { | ||||||
1243 | {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw | ||||||
1244 | {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw | ||||||
1245 | {TTI::SK_Broadcast, MVT::v8f16, 1}, // vpbroadcastw | ||||||
1246 | |||||||
1247 | {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw | ||||||
1248 | {TTI::SK_Reverse, MVT::v16f16, 2}, // vpermw | ||||||
1249 | {TTI::SK_Reverse, MVT::v8f16, 1}, // vpshufb | ||||||
1250 | |||||||
1251 | {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw | ||||||
1252 | {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw | ||||||
1253 | {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // vpshufb | ||||||
1254 | |||||||
1255 | {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w | ||||||
1256 | {TTI::SK_PermuteTwoSrc, MVT::v16f16, 2}, // vpermt2w | ||||||
1257 | {TTI::SK_PermuteTwoSrc, MVT::v8f16, 2} // vpermt2w | ||||||
1258 | }; | ||||||
1259 | |||||||
1260 | if (!ST->useSoftFloat() && ST->hasFP16()) | ||||||
1261 | if (const auto *Entry = | ||||||
1262 | CostTableLookup(AVX512FP16ShuffleTbl, Kind, LT.second)) | ||||||
1263 | return LT.first * Entry->Cost; | ||||||
1264 | |||||||
1265 | static const CostTblEntry AVX512VBMIShuffleTbl[] = { | ||||||
1266 | {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb | ||||||
1267 | {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb | ||||||
1268 | |||||||
1269 | {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb | ||||||
1270 | {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb | ||||||
1271 | |||||||
1272 | {TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b | ||||||
1273 | {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b | ||||||
1274 | {TTI::SK_PermuteTwoSrc, MVT::v16i8, 2} // vpermt2b | ||||||
1275 | }; | ||||||
1276 | |||||||
1277 | if (ST->hasVBMI()) | ||||||
1278 | if (const auto *Entry = | ||||||
1279 | CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second)) | ||||||
1280 | return LT.first * Entry->Cost; | ||||||
1281 | |||||||
1282 | static const CostTblEntry AVX512BWShuffleTbl[] = { | ||||||
1283 | {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw | ||||||
1284 | {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb | ||||||
1285 | |||||||
1286 | {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw | ||||||
1287 | {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw | ||||||
1288 | {TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2 | ||||||
1289 | |||||||
1290 | {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw | ||||||
1291 | {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw | ||||||
1292 | {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16 | ||||||
1293 | |||||||
1294 | {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w | ||||||
1295 | {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w | ||||||
1296 | {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w | ||||||
1297 | {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1 | ||||||
1298 | |||||||
1299 | {TTI::SK_Select, MVT::v32i16, 1}, // vblendmw | ||||||
1300 | {TTI::SK_Select, MVT::v64i8, 1}, // vblendmb | ||||||
1301 | }; | ||||||
1302 | |||||||
1303 | if (ST->hasBWI()) | ||||||
1304 | if (const auto *Entry = | ||||||
1305 | CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second)) | ||||||
1306 | return LT.first * Entry->Cost; | ||||||
1307 | |||||||
1308 | static const CostTblEntry AVX512ShuffleTbl[] = { | ||||||
1309 | {TTI::SK_Broadcast, MVT::v8f64, 1}, // vbroadcastpd | ||||||
1310 | {TTI::SK_Broadcast, MVT::v16f32, 1}, // vbroadcastps | ||||||
1311 | {TTI::SK_Broadcast, MVT::v8i64, 1}, // vpbroadcastq | ||||||
1312 | {TTI::SK_Broadcast, MVT::v16i32, 1}, // vpbroadcastd | ||||||
1313 | {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw | ||||||
1314 | {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb | ||||||
1315 | |||||||
1316 | {TTI::SK_Reverse, MVT::v8f64, 1}, // vpermpd | ||||||
1317 | {TTI::SK_Reverse, MVT::v16f32, 1}, // vpermps | ||||||
1318 | {TTI::SK_Reverse, MVT::v8i64, 1}, // vpermq | ||||||
1319 | {TTI::SK_Reverse, MVT::v16i32, 1}, // vpermd | ||||||
1320 | {TTI::SK_Reverse, MVT::v32i16, 7}, // per mca | ||||||
1321 | {TTI::SK_Reverse, MVT::v64i8, 7}, // per mca | ||||||
1322 | |||||||
1323 | {TTI::SK_PermuteSingleSrc, MVT::v8f64, 1}, // vpermpd | ||||||
1324 | {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd | ||||||
1325 | {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // vpermpd | ||||||
1326 | {TTI::SK_PermuteSingleSrc, MVT::v16f32, 1}, // vpermps | ||||||
1327 | {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps | ||||||
1328 | {TTI::SK_PermuteSingleSrc, MVT::v4f32, 1}, // vpermps | ||||||
1329 | {TTI::SK_PermuteSingleSrc, MVT::v8i64, 1}, // vpermq | ||||||
1330 | {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq | ||||||
1331 | {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // vpermq | ||||||
1332 | {TTI::SK_PermuteSingleSrc, MVT::v16i32, 1}, // vpermd | ||||||
1333 | {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd | ||||||
1334 | {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // vpermd | ||||||
1335 | {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb | ||||||
1336 | |||||||
1337 | {TTI::SK_PermuteTwoSrc, MVT::v8f64, 1}, // vpermt2pd | ||||||
1338 | {TTI::SK_PermuteTwoSrc, MVT::v16f32, 1}, // vpermt2ps | ||||||
1339 | {TTI::SK_PermuteTwoSrc, MVT::v8i64, 1}, // vpermt2q | ||||||
1340 | {TTI::SK_PermuteTwoSrc, MVT::v16i32, 1}, // vpermt2d | ||||||
1341 | {TTI::SK_PermuteTwoSrc, MVT::v4f64, 1}, // vpermt2pd | ||||||
1342 | {TTI::SK_PermuteTwoSrc, MVT::v8f32, 1}, // vpermt2ps | ||||||
1343 | {TTI::SK_PermuteTwoSrc, MVT::v4i64, 1}, // vpermt2q | ||||||
1344 | {TTI::SK_PermuteTwoSrc, MVT::v8i32, 1}, // vpermt2d | ||||||
1345 | {TTI::SK_PermuteTwoSrc, MVT::v2f64, 1}, // vpermt2pd | ||||||
1346 | {TTI::SK_PermuteTwoSrc, MVT::v4f32, 1}, // vpermt2ps | ||||||
1347 | {TTI::SK_PermuteTwoSrc, MVT::v2i64, 1}, // vpermt2q | ||||||
1348 | {TTI::SK_PermuteTwoSrc, MVT::v4i32, 1}, // vpermt2d | ||||||
1349 | |||||||
1350 | // FIXME: This just applies the type legalization cost rules above | ||||||
1351 | // assuming these completely split. | ||||||
1352 | {TTI::SK_PermuteSingleSrc, MVT::v32i16, 14}, | ||||||
1353 | {TTI::SK_PermuteSingleSrc, MVT::v64i8, 14}, | ||||||
1354 | {TTI::SK_PermuteTwoSrc, MVT::v32i16, 42}, | ||||||
1355 | {TTI::SK_PermuteTwoSrc, MVT::v64i8, 42}, | ||||||
1356 | |||||||
1357 | {TTI::SK_Select, MVT::v32i16, 1}, // vpternlogq | ||||||
1358 | {TTI::SK_Select, MVT::v64i8, 1}, // vpternlogq | ||||||
1359 | {TTI::SK_Select, MVT::v8f64, 1}, // vblendmpd | ||||||
1360 | {TTI::SK_Select, MVT::v16f32, 1}, // vblendmps | ||||||
1361 | {TTI::SK_Select, MVT::v8i64, 1}, // vblendmq | ||||||
1362 | {TTI::SK_Select, MVT::v16i32, 1}, // vblendmd | ||||||
1363 | }; | ||||||
1364 | |||||||
1365 | if (ST->hasAVX512()) | ||||||
1366 | if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second)) | ||||||
1367 | return LT.first * Entry->Cost; | ||||||
1368 | |||||||
1369 | static const CostTblEntry AVX2ShuffleTbl[] = { | ||||||
1370 | {TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastpd | ||||||
1371 | {TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastps | ||||||
1372 | {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq | ||||||
1373 | {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd | ||||||
1374 | {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw | ||||||
1375 | {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb | ||||||
1376 | |||||||
1377 | {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd | ||||||
1378 | {TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps | ||||||
1379 | {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq | ||||||
1380 | {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd | ||||||
1381 | {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb | ||||||
1382 | {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb | ||||||
1383 | |||||||
1384 | {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb | ||||||
1385 | {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb | ||||||
1386 | |||||||
1387 | {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd | ||||||
1388 | {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps | ||||||
1389 | {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq | ||||||
1390 | {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd | ||||||
1391 | {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb | ||||||
1392 | // + vpblendvb | ||||||
1393 | {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb | ||||||
1394 | // + vpblendvb | ||||||
1395 | |||||||
1396 | {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd | ||||||
1397 | {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3}, // 2*vpermps + vblendps | ||||||
1398 | {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd | ||||||
1399 | {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd | ||||||
1400 | {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb | ||||||
1401 | // + vpblendvb | ||||||
1402 | {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb | ||||||
1403 | // + vpblendvb | ||||||
1404 | }; | ||||||
1405 | |||||||
1406 | if (ST->hasAVX2()) | ||||||
1407 | if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second)) | ||||||
1408 | return LT.first * Entry->Cost; | ||||||
1409 | |||||||
1410 | static const CostTblEntry XOPShuffleTbl[] = { | ||||||
1411 | {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd | ||||||
1412 | {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps | ||||||
1413 | {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd | ||||||
1414 | {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps | ||||||
1415 | {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm | ||||||
1416 | // + vinsertf128 | ||||||
1417 | {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm | ||||||
1418 | // + vinsertf128 | ||||||
1419 | |||||||
1420 | {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm | ||||||
1421 | // + vinsertf128 | ||||||
1422 | {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm | ||||||
1423 | {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm | ||||||
1424 | // + vinsertf128 | ||||||
1425 | {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm | ||||||
1426 | }; | ||||||
1427 | |||||||
1428 | if (ST->hasXOP()) | ||||||
1429 | if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second)) | ||||||
1430 | return LT.first * Entry->Cost; | ||||||
1431 | |||||||
1432 | static const CostTblEntry AVX1ShuffleTbl[] = { | ||||||
1433 | {TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd | ||||||
1434 | {TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps | ||||||
1435 | {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd | ||||||
1436 | {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps | ||||||
1437 | {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128 | ||||||
1438 | {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128 | ||||||
1439 | |||||||
1440 | {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd | ||||||
1441 | {TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps | ||||||
1442 | {TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd | ||||||
1443 | {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps | ||||||
1444 | {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb | ||||||
1445 | // + vinsertf128 | ||||||
1446 | {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb | ||||||
1447 | // + vinsertf128 | ||||||
1448 | |||||||
1449 | {TTI::SK_Select, MVT::v4i64, 1}, // vblendpd | ||||||
1450 | {TTI::SK_Select, MVT::v4f64, 1}, // vblendpd | ||||||
1451 | {TTI::SK_Select, MVT::v8i32, 1}, // vblendps | ||||||
1452 | {TTI::SK_Select, MVT::v8f32, 1}, // vblendps | ||||||
1453 | {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor | ||||||
1454 | {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor | ||||||
1455 | |||||||
1456 | {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd | ||||||
1457 | {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd | ||||||
1458 | {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps | ||||||
1459 | {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps | ||||||
1460 | {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb | ||||||
1461 | // + 2*por + vinsertf128 | ||||||
1462 | {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb | ||||||
1463 | // + 2*por + vinsertf128 | ||||||
1464 | |||||||
1465 | {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd | ||||||
1466 | {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd | ||||||
1467 | {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps | ||||||
1468 | {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps | ||||||
1469 | {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb | ||||||
1470 | // + 4*por + vinsertf128 | ||||||
1471 | {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb | ||||||
1472 | // + 4*por + vinsertf128 | ||||||
1473 | }; | ||||||
1474 | |||||||
1475 | if (ST->hasAVX()) | ||||||
1476 | if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second)) | ||||||
1477 | return LT.first * Entry->Cost; | ||||||
1478 | |||||||
1479 | static const CostTblEntry SSE41ShuffleTbl[] = { | ||||||
1480 | {TTI::SK_Select, MVT::v2i64, 1}, // pblendw | ||||||
1481 | {TTI::SK_Select, MVT::v2f64, 1}, // movsd | ||||||
1482 | {TTI::SK_Select, MVT::v4i32, 1}, // pblendw | ||||||
1483 | {TTI::SK_Select, MVT::v4f32, 1}, // blendps | ||||||
1484 | {TTI::SK_Select, MVT::v8i16, 1}, // pblendw | ||||||
1485 | {TTI::SK_Select, MVT::v16i8, 1} // pblendvb | ||||||
1486 | }; | ||||||
1487 | |||||||
1488 | if (ST->hasSSE41()) | ||||||
1489 | if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second)) | ||||||
1490 | return LT.first * Entry->Cost; | ||||||
1491 | |||||||
1492 | static const CostTblEntry SSSE3ShuffleTbl[] = { | ||||||
1493 | {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb | ||||||
1494 | {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb | ||||||
1495 | |||||||
1496 | {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb | ||||||
1497 | {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb | ||||||
1498 | |||||||
1499 | {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por | ||||||
1500 | {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por | ||||||
1501 | |||||||
1502 | {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb | ||||||
1503 | {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb | ||||||
1504 | |||||||
1505 | {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por | ||||||
1506 | {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por | ||||||
1507 | }; | ||||||
1508 | |||||||
1509 | if (ST->hasSSSE3()) | ||||||
1510 | if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second)) | ||||||
1511 | return LT.first * Entry->Cost; | ||||||
1512 | |||||||
1513 | static const CostTblEntry SSE2ShuffleTbl[] = { | ||||||
1514 | {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd | ||||||
1515 | {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd | ||||||
1516 | {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd | ||||||
1517 | {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd | ||||||
1518 | {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd | ||||||
1519 | |||||||
1520 | {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd | ||||||
1521 | {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd | ||||||
1522 | {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd | ||||||
1523 | {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd | ||||||
1524 | {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw | ||||||
1525 | // + 2*pshufd + 2*unpck + packus | ||||||
1526 | |||||||
1527 | {TTI::SK_Select, MVT::v2i64, 1}, // movsd | ||||||
1528 | {TTI::SK_Select, MVT::v2f64, 1}, // movsd | ||||||
1529 | {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps | ||||||
1530 | {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por | ||||||
1531 | {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por | ||||||
1532 | |||||||
1533 | {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd | ||||||
1534 | {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd | ||||||
1535 | {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd | ||||||
1536 | {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw | ||||||
1537 | // + pshufd/unpck | ||||||
1538 | { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw | ||||||
1539 | // + 2*pshufd + 2*unpck + 2*packus | ||||||
1540 | |||||||
1541 | { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd | ||||||
1542 | { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd | ||||||
1543 | { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd} | ||||||
1544 | { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute | ||||||
1545 | { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute | ||||||
1546 | }; | ||||||
1547 | |||||||
1548 | if (ST->hasSSE2()) | ||||||
1549 | if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second)) | ||||||
1550 | return LT.first * Entry->Cost; | ||||||
1551 | |||||||
1552 | static const CostTblEntry SSE1ShuffleTbl[] = { | ||||||
1553 | { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps | ||||||
1554 | { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps | ||||||
1555 | { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps | ||||||
1556 | { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps | ||||||
1557 | { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps | ||||||
1558 | }; | ||||||
1559 | |||||||
1560 | if (ST->hasSSE1()) | ||||||
1561 | if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second)) | ||||||
1562 | return LT.first * Entry->Cost; | ||||||
1563 | |||||||
1564 | return BaseT::getShuffleCost(Kind, BaseTp, Mask, Index, SubTp); | ||||||
1565 | } | ||||||
1566 | |||||||
1567 | InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, | ||||||
1568 | Type *Src, | ||||||
1569 | TTI::CastContextHint CCH, | ||||||
1570 | TTI::TargetCostKind CostKind, | ||||||
1571 | const Instruction *I) { | ||||||
1572 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | ||||||
1573 | assert(ISD && "Invalid opcode")(static_cast <bool> (ISD && "Invalid opcode") ? void (0) : __assert_fail ("ISD && \"Invalid opcode\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 1573, __extension__ __PRETTY_FUNCTION__)); | ||||||
1574 | |||||||
1575 | // TODO: Allow non-throughput costs that aren't binary. | ||||||
1576 | auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost { | ||||||
1577 | if (CostKind != TTI::TCK_RecipThroughput) | ||||||
1578 | return Cost == 0 ? 0 : 1; | ||||||
1579 | return Cost; | ||||||
1580 | }; | ||||||
1581 | |||||||
1582 | // The cost tables include both specific, custom (non-legal) src/dst type | ||||||
1583 | // conversions and generic, legalized types. We test for customs first, before | ||||||
1584 | // falling back to legalization. | ||||||
1585 | // FIXME: Need a better design of the cost table to handle non-simple types of | ||||||
1586 | // potential massive combinations (elem_num x src_type x dst_type). | ||||||
1587 | static const TypeConversionCostTblEntry AVX512BWConversionTbl[] { | ||||||
1588 | { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 1 }, | ||||||
1589 | { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 1 }, | ||||||
1590 | |||||||
1591 | // Mask sign extend has an instruction. | ||||||
1592 | { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 }, | ||||||
1593 | { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, 1 }, | ||||||
1594 | { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 }, | ||||||
1595 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, 1 }, | ||||||
1596 | { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 }, | ||||||
1597 | { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, 1 }, | ||||||
1598 | { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 }, | ||||||
1599 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, 1 }, | ||||||
1600 | { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 }, | ||||||
1601 | { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, 1 }, | ||||||
1602 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 }, | ||||||
1603 | { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 }, | ||||||
1604 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 }, | ||||||
1605 | { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 }, | ||||||
1606 | { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, 1 }, | ||||||
1607 | { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, 1 }, | ||||||
1608 | { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, 1 }, | ||||||
1609 | |||||||
1610 | // Mask zero extend is a sext + shift. | ||||||
1611 | { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 }, | ||||||
1612 | { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, 2 }, | ||||||
1613 | { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 }, | ||||||
1614 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, 2 }, | ||||||
1615 | { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 }, | ||||||
1616 | { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, 2 }, | ||||||
1617 | { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 }, | ||||||
1618 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, 2 }, | ||||||
1619 | { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 }, | ||||||
1620 | { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, 2 }, | ||||||
1621 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 }, | ||||||
1622 | { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 }, | ||||||
1623 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 }, | ||||||
1624 | { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 }, | ||||||
1625 | { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, 2 }, | ||||||
1626 | { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, 2 }, | ||||||
1627 | { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, 2 }, | ||||||
1628 | |||||||
1629 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, | ||||||
1630 | { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, 2 }, | ||||||
1631 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, | ||||||
1632 | { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, 2 }, | ||||||
1633 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, | ||||||
1634 | { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, 2 }, | ||||||
1635 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, | ||||||
1636 | { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, 2 }, | ||||||
1637 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, | ||||||
1638 | { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 }, | ||||||
1639 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, | ||||||
1640 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, | ||||||
1641 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, | ||||||
1642 | { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, | ||||||
1643 | { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, 2 }, | ||||||
1644 | { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, 2 }, | ||||||
1645 | { ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, 2 }, | ||||||
1646 | |||||||
1647 | { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 2 }, | ||||||
1648 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // widen to zmm | ||||||
1649 | { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // vpmovwb | ||||||
1650 | { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // vpmovwb | ||||||
1651 | { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // vpmovwb | ||||||
1652 | }; | ||||||
1653 | |||||||
1654 | static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = { | ||||||
1655 | // Mask sign extend has an instruction. | ||||||
1656 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, | ||||||
1657 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, 1 }, | ||||||
1658 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, | ||||||
1659 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, | ||||||
1660 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, | ||||||
1661 | { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i1, 1 }, | ||||||
1662 | { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 }, | ||||||
1663 | { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, | ||||||
1664 | |||||||
1665 | // Mask zero extend is a sext + shift. | ||||||
1666 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, | ||||||
1667 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, 2 }, | ||||||
1668 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, | ||||||
1669 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, | ||||||
1670 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, | ||||||
1671 | { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i1, 2 }, | ||||||
1672 | { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 }, | ||||||
1673 | { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, | ||||||
1674 | |||||||
1675 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, | ||||||
1676 | { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, 2 }, | ||||||
1677 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, | ||||||
1678 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, | ||||||
1679 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, | ||||||
1680 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, | ||||||
1681 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 }, | ||||||
1682 | { ISD::TRUNCATE, MVT::v16i1, MVT::v8i64, 2 }, | ||||||
1683 | |||||||
1684 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 }, | ||||||
1685 | { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 }, | ||||||
1686 | |||||||
1687 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 }, | ||||||
1688 | { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 }, | ||||||
1689 | |||||||
1690 | { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, 1 }, | ||||||
1691 | { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, 1 }, | ||||||
1692 | |||||||
1693 | { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 }, | ||||||
1694 | { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 }, | ||||||
1695 | }; | ||||||
1696 | |||||||
1697 | // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and | ||||||
1698 | // 256-bit wide vectors. | ||||||
1699 | |||||||
// Conversion cost table named for the AVX512F feature level.
// Each row is { ISD opcode, destination MVT, source MVT, cost }: the TRUNCATE
// rows go from a wider source element to a narrower destination, and the
// *_EXTEND rows go the other way.
// Trailing row comments name the instruction sequence the row stands for; rows
// tagged "zmm" presumably perform a narrower operation in a 512-bit register.
// NOTE(review): the lookup call and the subtarget check that selects this
// table are outside this chunk - confirm against the enclosing function.
1700 | static const TypeConversionCostTblEntry AVX512FConversionTbl[] = { | ||||||
// Floating-point extend / round between f32 and f64 vectors.
1701 | { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 }, | ||||||
1702 | { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 }, | ||||||
1703 | { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 }, | ||||||
1704 | |||||||
// Truncates from integer vectors to mask (vXi1) vectors.
1705 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd | ||||||
1706 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd | ||||||
1707 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd | ||||||
1708 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 3 }, // sext+vpslld+vptestmd | ||||||
1709 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq | ||||||
1710 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq | ||||||
1711 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq | ||||||
1712 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 3 }, // sext+vpslld+vptestmd | ||||||
1713 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // zmm vpslld+vptestmd | ||||||
1714 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // zmm vpslld+vptestmd | ||||||
1715 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // zmm vpslld+vptestmd | ||||||
1716 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 }, // vpslld+vptestmd | ||||||
1717 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // zmm vpsllq+vptestmq | ||||||
1718 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // zmm vpsllq+vptestmq | ||||||
1719 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, // vpsllq+vptestmq | ||||||
// Integer element truncates from i32 / i64 sources.
1720 | { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 2 }, // vpmovdb | ||||||
1721 | { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 2 }, // vpmovdb | ||||||
1722 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 2 }, // vpmovdb | ||||||
1723 | { ISD::TRUNCATE, MVT::v32i8, MVT::v16i32, 2 }, // vpmovdb | ||||||
1724 | { ISD::TRUNCATE, MVT::v64i8, MVT::v16i32, 2 }, // vpmovdb | ||||||
1725 | { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2 }, // vpmovdw | ||||||
1726 | { ISD::TRUNCATE, MVT::v32i16, MVT::v16i32, 2 }, // vpmovdw | ||||||
1727 | { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 2 }, // vpmovqb | ||||||
1728 | { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1 }, // vpshufb | ||||||
1729 | { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 2 }, // vpmovqb | ||||||
1730 | { ISD::TRUNCATE, MVT::v16i8, MVT::v8i64, 2 }, // vpmovqb | ||||||
1731 | { ISD::TRUNCATE, MVT::v32i8, MVT::v8i64, 2 }, // vpmovqb | ||||||
1732 | { ISD::TRUNCATE, MVT::v64i8, MVT::v8i64, 2 }, // vpmovqb | ||||||
1733 | { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 2 }, // vpmovqw | ||||||
1734 | { ISD::TRUNCATE, MVT::v16i16, MVT::v8i64, 2 }, // vpmovqw | ||||||
1735 | { ISD::TRUNCATE, MVT::v32i16, MVT::v8i64, 2 }, // vpmovqw | ||||||
1736 | { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 }, // vpmovqd | ||||||
1737 | { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // zmm vpmovqd | ||||||
1738 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 5 },// 2*vpmovqd+concat+vpmovdb | ||||||
1739 | |||||||
// Truncates from i16 sources to i8 elements.
1740 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, // extend to v16i32 | ||||||
1741 | { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 8 }, | ||||||
1742 | { ISD::TRUNCATE, MVT::v64i8, MVT::v32i16, 8 }, | ||||||
1743 | |||||||
1744 | // Sign extend is zmm vpternlogd+vptruncdb. | ||||||
1745 | // Zero extend is zmm broadcast load+vptruncdw. | ||||||
1746 | { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 3 }, | ||||||
1747 | { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 4 }, | ||||||
1748 | { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 3 }, | ||||||
1749 | { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 4 }, | ||||||
1750 | { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 3 }, | ||||||
1751 | { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 4 }, | ||||||
1752 | { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 3 }, | ||||||
1753 | { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 4 }, | ||||||
1754 | |||||||
1755 | // Sign extend is zmm vpternlogd+vptruncdw. | ||||||
1756 | // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw. | ||||||
1757 | { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 3 }, | ||||||
1758 | { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 4 }, | ||||||
1759 | { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 3 }, | ||||||
1760 | { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 4 }, | ||||||
1761 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 3 }, | ||||||
1762 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 4 }, | ||||||
1763 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 3 }, | ||||||
1764 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 }, | ||||||
1765 | |||||||
// Mask extends to i32 / i64 elements narrower than 512 bits.
1766 | { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // zmm vpternlogd | ||||||
1767 | { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // zmm vpternlogd+psrld | ||||||
1768 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // zmm vpternlogd | ||||||
1769 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // zmm vpternlogd+psrld | ||||||
1770 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // zmm vpternlogd | ||||||
1771 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // zmm vpternlogd+psrld | ||||||
1772 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // zmm vpternlogq | ||||||
1773 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // zmm vpternlogq+psrlq | ||||||
1774 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // zmm vpternlogq | ||||||
1775 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // zmm vpternlogq+psrlq | ||||||
1776 | |||||||
// Mask extends to full 512-bit i32 / i64 vectors.
1777 | { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, // vpternlogd | ||||||
1778 | { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, // vpternlogd+psrld | ||||||
1779 | { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 }, // vpternlogq | ||||||
1780 | { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 }, // vpternlogq+psrlq | ||||||
1781 | |||||||
// Integer-to-wider-integer extends into 512-bit destinations.
1782 | { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 1 }, | ||||||
1783 | { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 }, | ||||||
1784 | { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, | ||||||
1785 | { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, | ||||||
1786 | { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 1 }, | ||||||
1787 | { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 1 }, | ||||||
1788 | { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, | ||||||
1789 | { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, | ||||||
1790 | { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, | ||||||
1791 | { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, | ||||||
1792 | |||||||
1793 | { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right | ||||||
1794 | { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right | ||||||
1795 | |||||||
// Signed integer to floating point.
1796 | { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 }, | ||||||
1797 | { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 }, | ||||||
1798 | { ISD::SINT_TO_FP, MVT::v8f64, MVT::v16i8, 2 }, | ||||||
1799 | { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 1 }, | ||||||
1800 | { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 }, | ||||||
1801 | { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 1 }, | ||||||
1802 | { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 }, | ||||||
1803 | { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 }, | ||||||
1804 | |||||||
// Unsigned integer to floating point; the v8i64 source rows carry the
// highest costs in this group (26 and 5).
1805 | { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 }, | ||||||
1806 | { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 }, | ||||||
1807 | { ISD::UINT_TO_FP, MVT::v8f64, MVT::v16i8, 2 }, | ||||||
1808 | { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 1 }, | ||||||
1809 | { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 }, | ||||||
1810 | { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 1 }, | ||||||
1811 | { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 }, | ||||||
1812 | { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 }, | ||||||
1813 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 }, | ||||||
1814 | { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 5 }, | ||||||
1815 | |||||||
// Floating point to signed integer.
1816 | { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 2 }, | ||||||
1817 | { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f64, 7 }, | ||||||
1818 | { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f64,15 }, | ||||||
1819 | { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f32,11 }, | ||||||
1820 | { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f64,31 }, | ||||||
1821 | { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, 3 }, | ||||||
1822 | { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f64, 7 }, | ||||||
1823 | { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f32, 5 }, | ||||||
1824 | { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f64,15 }, | ||||||
1825 | { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 1 }, | ||||||
1826 | { ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f64, 3 }, | ||||||
1827 | |||||||
// Floating point to unsigned integer.
1828 | { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 }, | ||||||
1829 | { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, 3 }, | ||||||
1830 | { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, 3 }, | ||||||
1831 | { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 }, | ||||||
1832 | { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 3 }, | ||||||
1833 | { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, 3 }, | ||||||
1834 | }; | ||||||
1835 | |||||||
// Conversion cost table named for the AVX512BW + AVX512VL feature pair.
// Each row is { ISD opcode, destination MVT, source MVT, cost }. It covers
// conversions between mask (vXi1) vectors and i8 / i16 element vectors of up
// to 256 bits, plus one v16i16 -> v16i8 truncate.
// Several rows pair a mask type with a vector that has more lanes or fewer
// lanes than the mask (e.g. v16i8 from v2i1, v64i1 from v32i8).
// NOTE(review): those presumably describe the type-legalized form of the
// operands; the lookup site is outside this chunk - confirm.
// The declaration uses "= {" so that it matches every sibling table.
1836 | static const TypeConversionCostTblEntry AVX512BWVLConversionTbl[] = { | ||||||
1837 | // Mask sign extend has an instruction. | ||||||
1838 | { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 }, | ||||||
1839 | { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, 1 }, | ||||||
1840 | { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 }, | ||||||
1841 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, 1 }, | ||||||
1842 | { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 }, | ||||||
1843 | { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, 1 }, | ||||||
1844 | { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 }, | ||||||
1845 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, 1 }, | ||||||
1846 | { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 }, | ||||||
1847 | { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, 1 }, | ||||||
1848 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 }, | ||||||
1849 | { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 }, | ||||||
1850 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 }, | ||||||
1851 | { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 }, | ||||||
1852 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1, 1 }, | ||||||
1853 | { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v64i1, 1 }, | ||||||
1854 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1, 1 }, | ||||||
1855 | |||||||
1856 | // Mask zero extend is a sext + shift. | ||||||
1857 | { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 }, | ||||||
1858 | { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, 2 }, | ||||||
1859 | { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 }, | ||||||
1860 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, 2 }, | ||||||
1861 | { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 }, | ||||||
1862 | { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, 2 }, | ||||||
1863 | { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 }, | ||||||
1864 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, 2 }, | ||||||
1865 | { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 }, | ||||||
1866 | { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, 2 }, | ||||||
1867 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 }, | ||||||
1868 | { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 }, | ||||||
1869 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 }, | ||||||
1870 | { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 }, | ||||||
1871 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1, 2 }, | ||||||
1872 | { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v64i1, 2 }, | ||||||
1873 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1, 2 }, | ||||||
1874 | |||||||
// Truncates from i8 / i16 vectors to mask vectors, all costed at 2.
1875 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, | ||||||
1876 | { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, 2 }, | ||||||
1877 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, | ||||||
1878 | { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, 2 }, | ||||||
1879 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, | ||||||
1880 | { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, 2 }, | ||||||
1881 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, | ||||||
1882 | { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, 2 }, | ||||||
1883 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, | ||||||
1884 | { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 }, | ||||||
1885 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, | ||||||
1886 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, | ||||||
1887 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, | ||||||
1888 | { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, | ||||||
1889 | { ISD::TRUNCATE, MVT::v32i1, MVT::v16i16, 2 }, | ||||||
1890 | { ISD::TRUNCATE, MVT::v64i1, MVT::v32i8, 2 }, | ||||||
1891 | { ISD::TRUNCATE, MVT::v64i1, MVT::v16i16, 2 }, | ||||||
1892 | |||||||
// The only non-mask row: i16 -> i8 element truncate.
1893 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, | ||||||
1894 | }; | ||||||
1895 | |||||||
// Conversion cost table named for the AVX512DQ + AVX512VL feature pair.
// Each row is { ISD opcode, destination MVT, source MVT, cost }. It covers
// conversions between mask (vXi1) vectors and 128/256-bit i32 / i64 vectors,
// and cost-1 conversions between i64 and f32 / f64 vectors in both directions.
// NOTE(review): rows whose mask lane count differs from the vector lane count
// (e.g. v4i64 from v16i1) presumably describe type-legalized operands; the
// lookup site is outside this chunk - confirm.
1896 | static const TypeConversionCostTblEntry AVX512DQVLConversionTbl[] = { | ||||||
1897 | // Mask sign extend has an instruction. | ||||||
1898 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, | ||||||
1899 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, 1 }, | ||||||
1900 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, | ||||||
1901 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i1, 1 }, | ||||||
1902 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, | ||||||
1903 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i1, 1 }, | ||||||
1904 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, 1 }, | ||||||
1905 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, | ||||||
1906 | |||||||
1907 | // Mask zero extend is a sext + shift. | ||||||
1908 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, | ||||||
1909 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, 2 }, | ||||||
1910 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, | ||||||
1911 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i1, 2 }, | ||||||
1912 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, | ||||||
1913 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i1, 2 }, | ||||||
1914 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, 2 }, | ||||||
1915 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, | ||||||
1916 | |||||||
// Truncates from i32 / i64 vectors to mask vectors, all costed at 2.
1917 | { ISD::TRUNCATE, MVT::v16i1, MVT::v4i64, 2 }, | ||||||
1918 | { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, 2 }, | ||||||
1919 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, | ||||||
1920 | { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, 2 }, | ||||||
1921 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, | ||||||
1922 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, | ||||||
1923 | { ISD::TRUNCATE, MVT::v8i1, MVT::v4i64, 2 }, | ||||||
1924 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, | ||||||
1925 | |||||||
// Signed i64 to floating point.
1926 | { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 }, | ||||||
1927 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, | ||||||
1928 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 }, | ||||||
1929 | { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 }, | ||||||
1930 | |||||||
// Unsigned i64 to floating point.
1931 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 }, | ||||||
1932 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, | ||||||
1933 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 }, | ||||||
1934 | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 }, | ||||||
1935 | |||||||
// Floating point to signed i64.
1936 | { ISD::FP_TO_SINT, MVT::v2i64, MVT::v4f32, 1 }, | ||||||
1937 | { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, 1 }, | ||||||
1938 | { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 }, | ||||||
1939 | { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, 1 }, | ||||||
1940 | |||||||
// Floating point to unsigned i64.
1941 | { ISD::FP_TO_UINT, MVT::v2i64, MVT::v4f32, 1 }, | ||||||
1942 | { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 }, | ||||||
1943 | { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 }, | ||||||
1944 | { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 }, | ||||||
1945 | }; | ||||||
1946 | |||||||
// Conversion cost table named for the AVX512VL feature level.
// Each row is { ISD opcode, destination MVT, source MVT, cost }; trailing
// row comments name the instruction sequence the row stands for.
// The 16-lane i8 / i16 mask rows are costed as a split into two 8-lane
// halves, per their "split+2*..." comments.
// NOTE(review): the lookup site and the subtarget check that selects this
// table are outside this chunk - confirm against the enclosing function.
1947 | static const TypeConversionCostTblEntry AVX512VLConversionTbl[] = { | ||||||
// Truncates from integer vectors to mask (vXi1) vectors.
1948 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd | ||||||
1949 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd | ||||||
1950 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd | ||||||
1951 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 8 }, // split+2*v8i8 | ||||||
1952 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq | ||||||
1953 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq | ||||||
1954 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq | ||||||
1955 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 8 }, // split+2*v8i16 | ||||||
1956 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // vpslld+vptestmd | ||||||
1957 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // vpslld+vptestmd | ||||||
1958 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // vpslld+vptestmd | ||||||
1959 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // vpsllq+vptestmq | ||||||
1960 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // vpsllq+vptestmq | ||||||
// Integer element truncates.
1961 | { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // vpmovqd | ||||||
1962 | { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 }, // vpmovqb | ||||||
1963 | { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 }, // vpmovqw | ||||||
1964 | { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 }, // vpmovdb | ||||||
1965 | |||||||
1966 | // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb | ||||||
1967 | // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb | ||||||
1968 | { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 5 }, | ||||||
1969 | { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 6 }, | ||||||
1970 | { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 5 }, | ||||||
1971 | { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 6 }, | ||||||
1972 | { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 5 }, | ||||||
1973 | { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 6 }, | ||||||
1974 | { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 10 }, | ||||||
1975 | { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 12 }, | ||||||
1976 | |||||||
1977 | // sign extend is vpcmpeq+maskedmove+vpmovdw | ||||||
1978 | // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw | ||||||
1979 | { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 4 }, | ||||||
1980 | { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 5 }, | ||||||
1981 | { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 4 }, | ||||||
1982 | { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 5 }, | ||||||
1983 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 4 }, | ||||||
1984 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 5 }, | ||||||
1985 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 10 }, | ||||||
1986 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 12 }, | ||||||
1987 | |||||||
// Mask extends to i32 / i64 elements.
1988 | { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // vpternlogd | ||||||
1989 | { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // vpternlogd+psrld | ||||||
1990 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // vpternlogd | ||||||
1991 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // vpternlogd+psrld | ||||||
1992 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // vpternlogd | ||||||
1993 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // vpternlogd+psrld | ||||||
1994 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // vpternlogq | ||||||
1995 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // vpternlogq+psrlq | ||||||
1996 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // vpternlogq | ||||||
1997 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // vpternlogq+psrlq | ||||||
1998 | |||||||
// Integer-to-wider-integer extends into 256-bit destinations, all cost 1.
1999 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 1 }, | ||||||
2000 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 1 }, | ||||||
2001 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 1 }, | ||||||
2002 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 1 }, | ||||||
2003 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 }, | ||||||
2004 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 }, | ||||||
2005 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 1 }, | ||||||
2006 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 1 }, | ||||||
2007 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, | ||||||
2008 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, | ||||||
2009 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, | ||||||
2010 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, | ||||||
2011 | |||||||
// Signed integer to floating point.
2012 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 }, | ||||||
2013 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 1 }, | ||||||
2014 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 }, | ||||||
2015 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 1 }, | ||||||
2016 | |||||||
// Unsigned integer to floating point; includes two scalar i64 rows.
2017 | { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 1 }, | ||||||
2018 | { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 1 }, | ||||||
2019 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 }, | ||||||
2020 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 1 }, | ||||||
2021 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 }, | ||||||
2022 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 1 }, | ||||||
2023 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, | ||||||
2024 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, | ||||||
2025 | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 }, | ||||||
2026 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 }, | ||||||
2027 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 5 }, | ||||||
2028 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 }, | ||||||
2029 | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 5 }, | ||||||
2030 | |||||||
// Floating point to signed integer.
2031 | { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, 2 }, | ||||||
2032 | { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 2 }, | ||||||
2033 | { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f32, 5 }, | ||||||
2034 | |||||||
// Floating point to unsigned integer; includes two scalar i64 rows.
2035 | { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 1 }, | ||||||
2036 | { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 1 }, | ||||||
2037 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, | ||||||
2038 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 1 }, | ||||||
2039 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 1 }, | ||||||
2040 | { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 }, | ||||||
2041 | { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 }, | ||||||
2042 | }; | ||||||
2043 | |||||||
// Conversion cost table named for the AVX2 feature level.
// Each row is { ISD opcode, destination MVT, source MVT, cost }. The rows
// cover mask and integer extends, integer truncates, f32 <-> f64 conversion
// of 8-lane vectors, and int <-> fp conversions (two FP_TO_UINT rows are
// scalar).
// NOTE(review): the lookup site and the subtarget check that selects this
// table are outside this chunk - confirm against the enclosing function.
2044 | static const TypeConversionCostTblEntry AVX2ConversionTbl[] = { | ||||||
// Extends from mask (vXi1) vectors.
2045 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 3 }, | ||||||
2046 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 }, | ||||||
2047 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 }, | ||||||
2048 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 }, | ||||||
2049 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 }, | ||||||
2050 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 1 }, | ||||||
2051 | |||||||
// Integer-to-wider-integer extends.
2052 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 2 }, | ||||||
2053 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 2 }, | ||||||
2054 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 2 }, | ||||||
2055 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 2 }, | ||||||
2056 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, | ||||||
2057 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, | ||||||
2058 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 2 }, | ||||||
2059 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 2 }, | ||||||
2060 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, | ||||||
2061 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, | ||||||
2062 | { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 3 }, | ||||||
2063 | { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 3 }, | ||||||
2064 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, | ||||||
2065 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, | ||||||
2066 | |||||||
// Truncate to a mask vector.
2067 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, | ||||||
2068 | |||||||
// Integer element truncates.
2069 | { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 4 }, | ||||||
2070 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 4 }, | ||||||
2071 | { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 1 }, | ||||||
2072 | { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 1 }, | ||||||
2073 | { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 1 }, | ||||||
2074 | { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, 4 }, | ||||||
2075 | { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, 4 }, | ||||||
2076 | { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 1 }, | ||||||
2077 | { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 1 }, | ||||||
2078 | { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 5 }, | ||||||
2079 | { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, | ||||||
2080 | { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2 }, | ||||||
2081 | |||||||
// Floating-point extend / round for 8-lane vectors.
2082 | { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 3 }, | ||||||
2083 | { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 3 }, | ||||||
2084 | |||||||
// Floating point to signed integer.
2085 | { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, 1 }, | ||||||
2086 | { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, 1 }, | ||||||
2087 | { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, 1 }, | ||||||
2088 | { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 3 }, | ||||||
2089 | |||||||
// Floating point to unsigned integer; the first two rows are scalar.
2090 | { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 3 }, | ||||||
2091 | { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 3 }, | ||||||
2092 | { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, 1 }, | ||||||
2093 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 3 }, | ||||||
2094 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 }, | ||||||
2095 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4 }, | ||||||
2096 | { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 3 }, | ||||||
2097 | { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, 4 }, | ||||||
2098 | |||||||
// Signed integer to floating point.
2099 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 2 }, | ||||||
2100 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 2 }, | ||||||
2101 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 2 }, | ||||||
2102 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 }, | ||||||
2103 | { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 }, | ||||||
2104 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 }, | ||||||
2105 | { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 3 }, | ||||||
2106 | |||||||
// Unsigned integer to floating point.
2107 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 2 }, | ||||||
2108 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 2 }, | ||||||
2109 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 2 }, | ||||||
2110 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 }, | ||||||
2111 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 }, | ||||||
2112 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 }, | ||||||
2113 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 2 }, | ||||||
2114 | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 }, | ||||||
2115 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 }, | ||||||
2116 | { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 4 }, | ||||||
2117 | }; | ||||||
2118 | |||||||
// Conversion cost table named for the AVX feature level.
// Each row is { ISD opcode, destination MVT, source MVT, cost }. The rows
// cover mask and integer extends, truncates, int <-> fp conversions and the
// 4-lane f32 <-> f64 conversions.
// For rows that also appear in AVX2ConversionTbl in this file, the cost here
// is equal or higher (e.g. v4i64 <- v4i32 extend: 3 here vs 2 there).
// NOTE(review): the lookup site and the subtarget check that selects this
// table are outside this chunk - confirm against the enclosing function.
2119 | static const TypeConversionCostTblEntry AVXConversionTbl[] = { | ||||||
// Extends from mask (vXi1) vectors.
2120 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 6 }, | ||||||
2121 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 }, | ||||||
2122 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 }, | ||||||
2123 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 }, | ||||||
2124 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 4 }, | ||||||
2125 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 }, | ||||||
2126 | |||||||
// Integer-to-wider-integer extends into 256-bit destinations, all cost 3.
2127 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 3 }, | ||||||
2128 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 3 }, | ||||||
2129 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 3 }, | ||||||
2130 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 3 }, | ||||||
2131 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 3 }, | ||||||
2132 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 }, | ||||||
2133 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 3 }, | ||||||
2134 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 3 }, | ||||||
2135 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 3 }, | ||||||
2136 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 }, | ||||||
2137 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 3 }, | ||||||
2138 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 3 }, | ||||||
2139 | |||||||
// Truncates to mask vectors.
2140 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 4 }, | ||||||
2141 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 5 }, | ||||||
2142 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 4 }, | ||||||
2143 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 9 }, | ||||||
2144 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, 11 }, | ||||||
2145 | |||||||
// Integer element truncates.
2146 | { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 }, | ||||||
2147 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 }, | ||||||
2148 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // and+extract+packuswb | ||||||
2149 | { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, 5 }, | ||||||
2150 | { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 }, | ||||||
2151 | { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, 5 }, | ||||||
2152 | { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 3 }, // and+extract+2*packusdw | ||||||
2153 | { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 }, | ||||||
2154 | |||||||
// Signed integer to floating point.
2155 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 }, | ||||||
2156 | { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 }, | ||||||
2157 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 }, | ||||||
2158 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 4 }, | ||||||
2159 | { ISD::SINT_TO_FP, MVT::v4f64, MVT::v16i8, 2 }, | ||||||
2160 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, | ||||||
2161 | { ISD::SINT_TO_FP, MVT::v4f64, MVT::v8i16, 2 }, | ||||||
2162 | { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 }, | ||||||
2163 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 }, | ||||||
2164 | { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 4 }, | ||||||
2165 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 5 }, | ||||||
2166 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 8 }, | ||||||
2167 | |||||||
// Unsigned integer to floating point.
2168 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 7 }, | ||||||
2169 | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, 7 }, | ||||||
2170 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, 6 }, | ||||||
2171 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 4 }, | ||||||
2172 | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v16i8, 2 }, | ||||||
2173 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, | ||||||
2174 | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v8i16, 2 }, | ||||||
2175 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 4 }, | ||||||
2176 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 4 }, | ||||||
2177 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 }, | ||||||
2178 | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 }, | ||||||
2179 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 }, | ||||||
2180 | { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 10 }, | ||||||
2181 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 10 }, | ||||||
2182 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 18 }, | ||||||
2183 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 }, | ||||||
2184 | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 10 }, | ||||||
2185 | |||||||
// Floating point to signed integer.
2186 | { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, 2 }, | ||||||
2187 | { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f64, 2 }, | ||||||
2188 | { ISD::FP_TO_SINT, MVT::v32i8, MVT::v8f32, 2 }, | ||||||
2189 | { ISD::FP_TO_SINT, MVT::v32i8, MVT::v4f64, 2 }, | ||||||
2190 | { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 2 }, | ||||||
2191 | { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f64, 2 }, | ||||||
2192 | { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, 2 }, | ||||||
2193 | { ISD::FP_TO_SINT, MVT::v16i16, MVT::v4f64, 2 }, | ||||||
2194 | { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, 2 }, | ||||||
2195 | { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, 2 }, | ||||||
2196 | { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 5 }, | ||||||
2197 | |||||||
// Floating point to unsigned integer.
2198 | { ISD::FP_TO_UINT, MVT::v16i8, MVT::v8f32, 2 }, | ||||||
2199 | { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f64, 2 }, | ||||||
2200 | { ISD::FP_TO_UINT, MVT::v32i8, MVT::v8f32, 2 }, | ||||||
2201 | { ISD::FP_TO_UINT, MVT::v32i8, MVT::v4f64, 2 }, | ||||||
2202 | { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 2 }, | ||||||
2203 | { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f64, 2 }, | ||||||
2204 | { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, 2 }, | ||||||
2205 | { ISD::FP_TO_UINT, MVT::v16i16, MVT::v4f64, 2 }, | ||||||
2206 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 3 }, | ||||||
2207 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 }, | ||||||
2208 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 6 }, | ||||||
2209 | { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 7 }, | ||||||
2210 | { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, 7 }, | ||||||
2211 | |||||||
// Floating-point extend / round for 4-lane vectors.
2212 | { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 1 }, | ||||||
2213 | { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 1 }, | ||||||
2214 | }; | ||||||
2215 | |||||||
2216 | static const TypeConversionCostTblEntry SSE41ConversionTbl[] = { | ||||||
2217 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, 1 }, | ||||||
2218 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, 1 }, | ||||||
2219 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, 1 }, | ||||||
2220 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, 1 }, | ||||||
2221 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, 1 }, | ||||||
2222 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, 1 }, | ||||||
2223 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, 1 }, | ||||||
2224 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, 1 }, | ||||||
2225 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, 1 }, | ||||||
2226 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, 1 }, | ||||||
2227 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, 1 }, | ||||||
2228 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, 1 }, | ||||||
2229 | |||||||
2230 | // These truncates end up widening elements. | ||||||
2231 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 1 }, // PMOVXZBQ | ||||||
2232 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 1 }, // PMOVXZWQ | ||||||
2233 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 1 }, // PMOVXZBD | ||||||
2234 | |||||||
2235 | { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 2 }, | ||||||
2236 | { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 2 }, | ||||||
2237 | { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 2 }, | ||||||
2238 | |||||||
2239 | { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 1 }, | ||||||
2240 | { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 1 }, | ||||||
2241 | { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 1 }, | ||||||
2242 | { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 1 }, | ||||||
2243 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 1 }, | ||||||
2244 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 }, | ||||||
2245 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 1 }, | ||||||
2246 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 }, | ||||||
2247 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, | ||||||
2248 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 1 }, | ||||||
2249 | { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 }, | ||||||
2250 | |||||||
2251 | { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 1 }, | ||||||
2252 | { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 1 }, | ||||||
2253 | { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 4 }, | ||||||
2254 | { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 4 }, | ||||||
2255 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 1 }, | ||||||
2256 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 }, | ||||||
2257 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 1 }, | ||||||
2258 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 }, | ||||||
2259 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 3 }, | ||||||
2260 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 3 }, | ||||||
2261 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 2 }, | ||||||
2262 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 12 }, | ||||||
2263 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 22 }, | ||||||
2264 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 4 }, | ||||||
2265 | |||||||
2266 | { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 1 }, | ||||||
2267 | { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 1 }, | ||||||
2268 | { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 1 }, | ||||||
2269 | { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 1 }, | ||||||
2270 | { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, 2 }, | ||||||
2271 | { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, 2 }, | ||||||
2272 | { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, 1 }, | ||||||
2273 | { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, 1 }, | ||||||
2274 | { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 }, | ||||||
2275 | { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, 1 }, | ||||||
2276 | |||||||
2277 | { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 1 }, | ||||||
2278 | { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 }, | ||||||
2279 | { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 1 }, | ||||||
2280 | { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 4 }, | ||||||
2281 | { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, 2 }, | ||||||
2282 | { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, 2 }, | ||||||
2283 | { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, 1 }, | ||||||
2284 | { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, 1 }, | ||||||
2285 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 4 }, | ||||||
2286 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 }, | ||||||
2287 | }; | ||||||
2288 | |||||||
2289 | static const TypeConversionCostTblEntry SSE2ConversionTbl[] = { | ||||||
2290 | // These are somewhat magic numbers justified by comparing the | ||||||
2291 | // output of llvm-mca for our various supported scheduler models | ||||||
2292 | // and basing it off the worst case scenario. | ||||||
2293 | { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 3 }, | ||||||
2294 | { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 3 }, | ||||||
2295 | { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 3 }, | ||||||
2296 | { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 3 }, | ||||||
2297 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 3 }, | ||||||
2298 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 4 }, | ||||||
2299 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 3 }, | ||||||
2300 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 4 }, | ||||||
2301 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 3 }, | ||||||
2302 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4 }, | ||||||
2303 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 8 }, | ||||||
2304 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 8 }, | ||||||
2305 | |||||||
2306 | { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 3 }, | ||||||
2307 | { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 3 }, | ||||||
2308 | { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 8 }, | ||||||
2309 | { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 9 }, | ||||||
2310 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 4 }, | ||||||
2311 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 4 }, | ||||||
2312 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 4 }, | ||||||
2313 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 4 }, | ||||||
2314 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 7 }, | ||||||
2315 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 7 }, | ||||||
2316 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 }, | ||||||
2317 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 15 }, | ||||||
2318 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 18 }, | ||||||
2319 | |||||||
2320 | { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 4 }, | ||||||
2321 | { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 4 }, | ||||||
2322 | { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 4 }, | ||||||
2323 | { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 4 }, | ||||||
2324 | { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, 6 }, | ||||||
2325 | { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, 6 }, | ||||||
2326 | { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, 5 }, | ||||||
2327 | { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, 5 }, | ||||||
2328 | { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 4 }, | ||||||
2329 | { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, 4 }, | ||||||
2330 | |||||||
2331 | { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 4 }, | ||||||
2332 | { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 }, | ||||||
2333 | { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 4 }, | ||||||
2334 | { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 15 }, | ||||||
2335 | { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, 6 }, | ||||||
2336 | { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, 6 }, | ||||||
2337 | { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, 5 }, | ||||||
2338 | { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, 5 }, | ||||||
2339 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 8 }, | ||||||
2340 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 8 }, | ||||||
2341 | |||||||
2342 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, 4 }, | ||||||
2343 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, 4 }, | ||||||
2344 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, 2 }, | ||||||
2345 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, 3 }, | ||||||
2346 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, 1 }, | ||||||
2347 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, 2 }, | ||||||
2348 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, 2 }, | ||||||
2349 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, 3 }, | ||||||
2350 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, 1 }, | ||||||
2351 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, 2 }, | ||||||
2352 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, 1 }, | ||||||
2353 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, 2 }, | ||||||
2354 | |||||||
2355 | // These truncates are really widening elements. | ||||||
2356 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 1 }, // PSHUFD | ||||||
2357 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // PUNPCKLWD+DQ | ||||||
2358 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // PUNPCKLBW+WD+PSHUFD | ||||||
2359 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 1 }, // PUNPCKLWD | ||||||
2360 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // PUNPCKLBW+WD | ||||||
2361 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 1 }, // PUNPCKLBW | ||||||
2362 | |||||||
2363 | { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 2 }, // PAND+PACKUSWB | ||||||
2364 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, | ||||||
2365 | { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 3 }, // PAND+2*PACKUSWB | ||||||
2366 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 }, | ||||||
2367 | { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 1 }, | ||||||
2368 | { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 3 }, | ||||||
2369 | { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 }, | ||||||
2370 | { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32,10 }, | ||||||
2371 | { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 4 }, // PAND+3*PACKUSWB | ||||||
2372 | { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 2 }, // PSHUFD+PSHUFLW | ||||||
2373 | { ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, 1 }, // PSHUFD | ||||||
2374 | }; | ||||||
2375 | |||||||
2376 | // Attempt to map directly to (simple) MVT types to let us match custom entries. | ||||||
2377 | EVT SrcTy = TLI->getValueType(DL, Src); | ||||||
2378 | EVT DstTy = TLI->getValueType(DL, Dst); | ||||||
2379 | |||||||
2380 | // The function getSimpleVT only handles simple value types. | ||||||
2381 | if (SrcTy.isSimple() && DstTy.isSimple()) { | ||||||
2382 | MVT SimpleSrcTy = SrcTy.getSimpleVT(); | ||||||
2383 | MVT SimpleDstTy = DstTy.getSimpleVT(); | ||||||
2384 | |||||||
2385 | if (ST->useAVX512Regs()) { | ||||||
2386 | if (ST->hasBWI()) | ||||||
2387 | if (const auto *Entry = ConvertCostTableLookup( | ||||||
2388 | AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) | ||||||
2389 | return AdjustCost(Entry->Cost); | ||||||
2390 | |||||||
2391 | if (ST->hasDQI()) | ||||||
2392 | if (const auto *Entry = ConvertCostTableLookup( | ||||||
2393 | AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) | ||||||
2394 | return AdjustCost(Entry->Cost); | ||||||
2395 | |||||||
2396 | if (ST->hasAVX512()) | ||||||
2397 | if (const auto *Entry = ConvertCostTableLookup( | ||||||
2398 | AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) | ||||||
2399 | return AdjustCost(Entry->Cost); | ||||||
2400 | } | ||||||
2401 | |||||||
2402 | if (ST->hasBWI()) | ||||||
2403 | if (const auto *Entry = ConvertCostTableLookup( | ||||||
2404 | AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) | ||||||
2405 | return AdjustCost(Entry->Cost); | ||||||
2406 | |||||||
2407 | if (ST->hasDQI()) | ||||||
2408 | if (const auto *Entry = ConvertCostTableLookup( | ||||||
2409 | AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) | ||||||
2410 | return AdjustCost(Entry->Cost); | ||||||
2411 | |||||||
2412 | if (ST->hasAVX512()) | ||||||
2413 | if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD, | ||||||
2414 | SimpleDstTy, SimpleSrcTy)) | ||||||
2415 | return AdjustCost(Entry->Cost); | ||||||
2416 | |||||||
2417 | if (ST->hasAVX2()) { | ||||||
2418 | if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD, | ||||||
2419 | SimpleDstTy, SimpleSrcTy)) | ||||||
2420 | return AdjustCost(Entry->Cost); | ||||||
2421 | } | ||||||
2422 | |||||||
2423 | if (ST->hasAVX()) { | ||||||
2424 | if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD, | ||||||
2425 | SimpleDstTy, SimpleSrcTy)) | ||||||
2426 | return AdjustCost(Entry->Cost); | ||||||
2427 | } | ||||||
2428 | |||||||
2429 | if (ST->hasSSE41()) { | ||||||
2430 | if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD, | ||||||
2431 | SimpleDstTy, SimpleSrcTy)) | ||||||
2432 | return AdjustCost(Entry->Cost); | ||||||
2433 | } | ||||||
2434 | |||||||
2435 | if (ST->hasSSE2()) { | ||||||
2436 | if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, | ||||||
2437 | SimpleDstTy, SimpleSrcTy)) | ||||||
2438 | return AdjustCost(Entry->Cost); | ||||||
2439 | } | ||||||
2440 | } | ||||||
2441 | |||||||
2442 | // Fall back to legalized types. | ||||||
2443 | std::pair<InstructionCost, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src); | ||||||
2444 | std::pair<InstructionCost, MVT> LTDest = | ||||||
2445 | TLI->getTypeLegalizationCost(DL, Dst); | ||||||
2446 | |||||||
2447 | if (ST->useAVX512Regs()) { | ||||||
2448 | if (ST->hasBWI()) | ||||||
2449 | if (const auto *Entry = ConvertCostTableLookup( | ||||||
2450 | AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second)) | ||||||
2451 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | ||||||
2452 | |||||||
2453 | if (ST->hasDQI()) | ||||||
2454 | if (const auto *Entry = ConvertCostTableLookup( | ||||||
2455 | AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second)) | ||||||
2456 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | ||||||
2457 | |||||||
2458 | if (ST->hasAVX512()) | ||||||
2459 | if (const auto *Entry = ConvertCostTableLookup( | ||||||
2460 | AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second)) | ||||||
2461 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | ||||||
2462 | } | ||||||
2463 | |||||||
2464 | if (ST->hasBWI()) | ||||||
2465 | if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD, | ||||||
2466 | LTDest.second, LTSrc.second)) | ||||||
2467 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | ||||||
2468 | |||||||
2469 | if (ST->hasDQI()) | ||||||
2470 | if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD, | ||||||
2471 | LTDest.second, LTSrc.second)) | ||||||
2472 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | ||||||
2473 | |||||||
2474 | if (ST->hasAVX512()) | ||||||
2475 | if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD, | ||||||
2476 | LTDest.second, LTSrc.second)) | ||||||
2477 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | ||||||
2478 | |||||||
2479 | if (ST->hasAVX2()) | ||||||
2480 | if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD, | ||||||
2481 | LTDest.second, LTSrc.second)) | ||||||
2482 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | ||||||
2483 | |||||||
2484 | if (ST->hasAVX()) | ||||||
2485 | if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD, | ||||||
2486 | LTDest.second, LTSrc.second)) | ||||||
2487 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | ||||||
2488 | |||||||
2489 | if (ST->hasSSE41()) | ||||||
2490 | if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD, | ||||||
2491 | LTDest.second, LTSrc.second)) | ||||||
2492 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | ||||||
2493 | |||||||
2494 | if (ST->hasSSE2()) | ||||||
2495 | if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, | ||||||
2496 | LTDest.second, LTSrc.second)) | ||||||
2497 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | ||||||
2498 | |||||||
2499 | // Fallback, for i8/i16 sitofp/uitofp cases we need to extend to i32 for | ||||||
2500 | // sitofp. | ||||||
2501 | if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) && | ||||||
2502 | 1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) { | ||||||
2503 | Type *ExtSrc = Src->getWithNewBitWidth(32); | ||||||
2504 | unsigned ExtOpc = | ||||||
2505 | (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt; | ||||||
2506 | |||||||
2507 | // For scalar loads the extend would be free. | ||||||
2508 | InstructionCost ExtCost = 0; | ||||||
2509 | if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0)))) | ||||||
2510 | ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind); | ||||||
2511 | |||||||
2512 | return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc, | ||||||
2513 | TTI::CastContextHint::None, CostKind); | ||||||
2514 | } | ||||||
2515 | |||||||
2516 | // Fallback for fptosi/fptoui i8/i16 cases we need to truncate from fptosi | ||||||
2517 | // i32. | ||||||
2518 | if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) && | ||||||
2519 | 1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) { | ||||||
2520 | Type *TruncDst = Dst->getWithNewBitWidth(32); | ||||||
2521 | return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) + | ||||||
2522 | getCastInstrCost(Instruction::Trunc, Dst, TruncDst, | ||||||
2523 | TTI::CastContextHint::None, CostKind); | ||||||
2524 | } | ||||||
2525 | |||||||
2526 | return AdjustCost( | ||||||
2527 | BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); | ||||||
2528 | } | ||||||
2529 | |||||||
2530 | InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, | ||||||
2531 | Type *CondTy, | ||||||
2532 | CmpInst::Predicate VecPred, | ||||||
2533 | TTI::TargetCostKind CostKind, | ||||||
2534 | const Instruction *I) { | ||||||
2535 | // TODO: Handle other cost kinds. | ||||||
2536 | if (CostKind != TTI::TCK_RecipThroughput) | ||||||
2537 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, | ||||||
2538 | I); | ||||||
2539 | |||||||
2540 | // Legalize the type. | ||||||
2541 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); | ||||||
2542 | |||||||
2543 | MVT MTy = LT.second; | ||||||
2544 | |||||||
2545 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | ||||||
2546 | assert(ISD && "Invalid opcode")(static_cast <bool> (ISD && "Invalid opcode") ? void (0) : __assert_fail ("ISD && \"Invalid opcode\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 2546, __extension__ __PRETTY_FUNCTION__)); | ||||||
2547 | |||||||
2548 | unsigned ExtraCost = 0; | ||||||
2549 | if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) { | ||||||
2550 | // Some vector comparison predicates cost extra instructions. | ||||||
2551 | // TODO: Should we invert this and assume worst case cmp costs | ||||||
2552 | // and reduce for particular predicates? | ||||||
2553 | if (MTy.isVector() && | ||||||
2554 | !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) || | ||||||
2555 | (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) || | ||||||
2556 | ST->hasBWI())) { | ||||||
2557 | // Fallback to I if a specific predicate wasn't specified. | ||||||
2558 | CmpInst::Predicate Pred = VecPred; | ||||||
2559 | if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE || | ||||||
2560 | Pred == CmpInst::BAD_FCMP_PREDICATE)) | ||||||
2561 | Pred = cast<CmpInst>(I)->getPredicate(); | ||||||
2562 | |||||||
2563 | switch (Pred) { | ||||||
2564 | case CmpInst::Predicate::ICMP_NE: | ||||||
2565 | // xor(cmpeq(x,y),-1) | ||||||
2566 | ExtraCost = 1; | ||||||
2567 | break; | ||||||
2568 | case CmpInst::Predicate::ICMP_SGE: | ||||||
2569 | case CmpInst::Predicate::ICMP_SLE: | ||||||
2570 | // xor(cmpgt(x,y),-1) | ||||||
2571 | ExtraCost = 1; | ||||||
2572 | break; | ||||||
2573 | case CmpInst::Predicate::ICMP_ULT: | ||||||
2574 | case CmpInst::Predicate::ICMP_UGT: | ||||||
2575 | // cmpgt(xor(x,signbit),xor(y,signbit)) | ||||||
2576 | // xor(cmpeq(pmaxu(x,y),x),-1) | ||||||
2577 | ExtraCost = 2; | ||||||
2578 | break; | ||||||
2579 | case CmpInst::Predicate::ICMP_ULE: | ||||||
2580 | case CmpInst::Predicate::ICMP_UGE: | ||||||
2581 | if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) || | ||||||
2582 | (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) { | ||||||
2583 | // cmpeq(psubus(x,y),0) | ||||||
2584 | // cmpeq(pminu(x,y),x) | ||||||
2585 | ExtraCost = 1; | ||||||
2586 | } else { | ||||||
2587 | // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1) | ||||||
2588 | ExtraCost = 3; | ||||||
2589 | } | ||||||
2590 | break; | ||||||
2591 | case CmpInst::Predicate::BAD_ICMP_PREDICATE: | ||||||
2592 | case CmpInst::Predicate::BAD_FCMP_PREDICATE: | ||||||
2593 | // Assume worst case scenario and add the maximum extra cost. | ||||||
2594 | ExtraCost = 3; | ||||||
2595 | break; | ||||||
2596 | default: | ||||||
2597 | break; | ||||||
2598 | } | ||||||
2599 | } | ||||||
2600 | } | ||||||
2601 | |||||||
2602 | static const CostTblEntry SLMCostTbl[] = { | ||||||
2603 | // slm pcmpeq/pcmpgt throughput is 2 | ||||||
2604 | { ISD::SETCC, MVT::v2i64, 2 }, | ||||||
2605 | }; | ||||||
2606 | |||||||
2607 | static const CostTblEntry AVX512BWCostTbl[] = { | ||||||
2608 | { ISD::SETCC, MVT::v32i16, 1 }, | ||||||
2609 | { ISD::SETCC, MVT::v64i8, 1 }, | ||||||
2610 | |||||||
2611 | { ISD::SELECT, MVT::v32i16, 1 }, | ||||||
2612 | { ISD::SELECT, MVT::v64i8, 1 }, | ||||||
2613 | }; | ||||||
2614 | |||||||
2615 | static const CostTblEntry AVX512CostTbl[] = { | ||||||
2616 | { ISD::SETCC, MVT::v8i64, 1 }, | ||||||
2617 | { ISD::SETCC, MVT::v16i32, 1 }, | ||||||
2618 | { ISD::SETCC, MVT::v8f64, 1 }, | ||||||
2619 | { ISD::SETCC, MVT::v16f32, 1 }, | ||||||
2620 | |||||||
2621 | { ISD::SELECT, MVT::v8i64, 1 }, | ||||||
2622 | { ISD::SELECT, MVT::v16i32, 1 }, | ||||||
2623 | { ISD::SELECT, MVT::v8f64, 1 }, | ||||||
2624 | { ISD::SELECT, MVT::v16f32, 1 }, | ||||||
2625 | |||||||
2626 | { ISD::SETCC, MVT::v32i16, 2 }, // FIXME: should probably be 4 | ||||||
2627 | { ISD::SETCC, MVT::v64i8, 2 }, // FIXME: should probably be 4 | ||||||
2628 | |||||||
2629 | { ISD::SELECT, MVT::v32i16, 2 }, // FIXME: should be 3 | ||||||
2630 | { ISD::SELECT, MVT::v64i8, 2 }, // FIXME: should be 3 | ||||||
2631 | }; | ||||||
2632 | |||||||
2633 | static const CostTblEntry AVX2CostTbl[] = { | ||||||
2634 | { ISD::SETCC, MVT::v4i64, 1 }, | ||||||
2635 | { ISD::SETCC, MVT::v8i32, 1 }, | ||||||
2636 | { ISD::SETCC, MVT::v16i16, 1 }, | ||||||
2637 | { ISD::SETCC, MVT::v32i8, 1 }, | ||||||
2638 | |||||||
2639 | { ISD::SELECT, MVT::v4i64, 1 }, // pblendvb | ||||||
2640 | { ISD::SELECT, MVT::v8i32, 1 }, // pblendvb | ||||||
2641 | { ISD::SELECT, MVT::v16i16, 1 }, // pblendvb | ||||||
2642 | { ISD::SELECT, MVT::v32i8, 1 }, // pblendvb | ||||||
2643 | }; | ||||||
2644 | |||||||
2645 | static const CostTblEntry AVX1CostTbl[] = { | ||||||
2646 | { ISD::SETCC, MVT::v4f64, 1 }, | ||||||
2647 | { ISD::SETCC, MVT::v8f32, 1 }, | ||||||
2648 | // AVX1 does not support 8-wide integer compare. | ||||||
2649 | { ISD::SETCC, MVT::v4i64, 4 }, | ||||||
2650 | { ISD::SETCC, MVT::v8i32, 4 }, | ||||||
2651 | { ISD::SETCC, MVT::v16i16, 4 }, | ||||||
2652 | { ISD::SETCC, MVT::v32i8, 4 }, | ||||||
2653 | |||||||
2654 | { ISD::SELECT, MVT::v4f64, 1 }, // vblendvpd | ||||||
2655 | { ISD::SELECT, MVT::v8f32, 1 }, // vblendvps | ||||||
2656 | { ISD::SELECT, MVT::v4i64, 1 }, // vblendvpd | ||||||
2657 | { ISD::SELECT, MVT::v8i32, 1 }, // vblendvps | ||||||
2658 | { ISD::SELECT, MVT::v16i16, 3 }, // vandps + vandnps + vorps | ||||||
2659 | { ISD::SELECT, MVT::v32i8, 3 }, // vandps + vandnps + vorps | ||||||
2660 | }; | ||||||
2661 | |||||||
2662 | static const CostTblEntry SSE42CostTbl[] = { | ||||||
2663 | { ISD::SETCC, MVT::v2f64, 1 }, | ||||||
2664 | { ISD::SETCC, MVT::v4f32, 1 }, | ||||||
2665 | { ISD::SETCC, MVT::v2i64, 1 }, | ||||||
2666 | }; | ||||||
2667 | |||||||
2668 | static const CostTblEntry SSE41CostTbl[] = { | ||||||
2669 | { ISD::SELECT, MVT::v2f64, 1 }, // blendvpd | ||||||
2670 | { ISD::SELECT, MVT::v4f32, 1 }, // blendvps | ||||||
2671 | { ISD::SELECT, MVT::v2i64, 1 }, // pblendvb | ||||||
2672 | { ISD::SELECT, MVT::v4i32, 1 }, // pblendvb | ||||||
2673 | { ISD::SELECT, MVT::v8i16, 1 }, // pblendvb | ||||||
2674 | { ISD::SELECT, MVT::v16i8, 1 }, // pblendvb | ||||||
2675 | }; | ||||||
2676 | |||||||
2677 | static const CostTblEntry SSE2CostTbl[] = { | ||||||
2678 | { ISD::SETCC, MVT::v2f64, 2 }, | ||||||
2679 | { ISD::SETCC, MVT::f64, 1 }, | ||||||
2680 | { ISD::SETCC, MVT::v2i64, 8 }, | ||||||
2681 | { ISD::SETCC, MVT::v4i32, 1 }, | ||||||
2682 | { ISD::SETCC, MVT::v8i16, 1 }, | ||||||
2683 | { ISD::SETCC, MVT::v16i8, 1 }, | ||||||
2684 | |||||||
2685 | { ISD::SELECT, MVT::v2f64, 3 }, // andpd + andnpd + orpd | ||||||
2686 | { ISD::SELECT, MVT::v2i64, 3 }, // pand + pandn + por | ||||||
2687 | { ISD::SELECT, MVT::v4i32, 3 }, // pand + pandn + por | ||||||
2688 | { ISD::SELECT, MVT::v8i16, 3 }, // pand + pandn + por | ||||||
2689 | { ISD::SELECT, MVT::v16i8, 3 }, // pand + pandn + por | ||||||
2690 | }; | ||||||
2691 | |||||||
2692 | static const CostTblEntry SSE1CostTbl[] = { | ||||||
2693 | { ISD::SETCC, MVT::v4f32, 2 }, | ||||||
2694 | { ISD::SETCC, MVT::f32, 1 }, | ||||||
2695 | |||||||
2696 | { ISD::SELECT, MVT::v4f32, 3 }, // andps + andnps + orps | ||||||
2697 | }; | ||||||
2698 | |||||||
2699 | if (ST->useSLMArithCosts()) | ||||||
2700 | if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy)) | ||||||
2701 | return LT.first * (ExtraCost + Entry->Cost); | ||||||
2702 | |||||||
2703 | if (ST->hasBWI()) | ||||||
2704 | if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy)) | ||||||
2705 | return LT.first * (ExtraCost + Entry->Cost); | ||||||
2706 | |||||||
2707 | if (ST->hasAVX512()) | ||||||
2708 | if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) | ||||||
2709 | return LT.first * (ExtraCost + Entry->Cost); | ||||||
2710 | |||||||
2711 | if (ST->hasAVX2()) | ||||||
2712 | if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy)) | ||||||
2713 | return LT.first * (ExtraCost + Entry->Cost); | ||||||
2714 | |||||||
2715 | if (ST->hasAVX()) | ||||||
2716 | if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) | ||||||
2717 | return LT.first * (ExtraCost + Entry->Cost); | ||||||
2718 | |||||||
2719 | if (ST->hasSSE42()) | ||||||
2720 | if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy)) | ||||||
2721 | return LT.first * (ExtraCost + Entry->Cost); | ||||||
2722 | |||||||
2723 | if (ST->hasSSE41()) | ||||||
2724 | if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy)) | ||||||
2725 | return LT.first * (ExtraCost + Entry->Cost); | ||||||
2726 | |||||||
2727 | if (ST->hasSSE2()) | ||||||
2728 | if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy)) | ||||||
2729 | return LT.first * (ExtraCost + Entry->Cost); | ||||||
2730 | |||||||
2731 | if (ST->hasSSE1()) | ||||||
2732 | if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy)) | ||||||
2733 | return LT.first * (ExtraCost + Entry->Cost); | ||||||
2734 | |||||||
2735 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I); | ||||||
2736 | } | ||||||
2737 | |||||||
2738 | unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; } | ||||||
2739 | |||||||
2740 | InstructionCost | ||||||
2741 | X86TTIImpl::getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, | ||||||
2742 | TTI::TargetCostKind CostKind) { | ||||||
2743 | |||||||
2744 | // Costs should match the codegen from: | ||||||
2745 | // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll | ||||||
2746 | // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll | ||||||
2747 | // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll | ||||||
2748 | // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll | ||||||
2749 | // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll | ||||||
2750 | |||||||
2751 | // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not | ||||||
2752 | // specialized in these tables yet. | ||||||
2753 | static const CostTblEntry AVX512BITALGCostTbl[] = { | ||||||
2754 | { ISD::CTPOP, MVT::v32i16, 1 }, | ||||||
2755 | { ISD::CTPOP, MVT::v64i8, 1 }, | ||||||
2756 | { ISD::CTPOP, MVT::v16i16, 1 }, | ||||||
2757 | { ISD::CTPOP, MVT::v32i8, 1 }, | ||||||
2758 | { ISD::CTPOP, MVT::v8i16, 1 }, | ||||||
2759 | { ISD::CTPOP, MVT::v16i8, 1 }, | ||||||
2760 | }; | ||||||
2761 | static const CostTblEntry AVX512VPOPCNTDQCostTbl[] = { | ||||||
2762 | { ISD::CTPOP, MVT::v8i64, 1 }, | ||||||
2763 | { ISD::CTPOP, MVT::v16i32, 1 }, | ||||||
2764 | { ISD::CTPOP, MVT::v4i64, 1 }, | ||||||
2765 | { ISD::CTPOP, MVT::v8i32, 1 }, | ||||||
2766 | { ISD::CTPOP, MVT::v2i64, 1 }, | ||||||
2767 | { ISD::CTPOP, MVT::v4i32, 1 }, | ||||||
2768 | }; | ||||||
2769 | static const CostTblEntry AVX512CDCostTbl[] = { | ||||||
2770 | { ISD::CTLZ, MVT::v8i64, 1 }, | ||||||
2771 | { ISD::CTLZ, MVT::v16i32, 1 }, | ||||||
2772 | { ISD::CTLZ, MVT::v32i16, 8 }, | ||||||
2773 | { ISD::CTLZ, MVT::v64i8, 20 }, | ||||||
2774 | { ISD::CTLZ, MVT::v4i64, 1 }, | ||||||
2775 | { ISD::CTLZ, MVT::v8i32, 1 }, | ||||||
2776 | { ISD::CTLZ, MVT::v16i16, 4 }, | ||||||
2777 | { ISD::CTLZ, MVT::v32i8, 10 }, | ||||||
2778 | { ISD::CTLZ, MVT::v2i64, 1 }, | ||||||
2779 | { ISD::CTLZ, MVT::v4i32, 1 }, | ||||||
2780 | { ISD::CTLZ, MVT::v8i16, 4 }, | ||||||
2781 | { ISD::CTLZ, MVT::v16i8, 4 }, | ||||||
2782 | }; | ||||||
2783 | static const CostTblEntry AVX512BWCostTbl[] = { | ||||||
2784 | { ISD::ABS, MVT::v32i16, 1 }, | ||||||
2785 | { ISD::ABS, MVT::v64i8, 1 }, | ||||||
2786 | { ISD::BITREVERSE, MVT::v8i64, 3 }, | ||||||
2787 | { ISD::BITREVERSE, MVT::v16i32, 3 }, | ||||||
2788 | { ISD::BITREVERSE, MVT::v32i16, 3 }, | ||||||
2789 | { ISD::BITREVERSE, MVT::v64i8, 2 }, | ||||||
2790 | { ISD::BSWAP, MVT::v8i64, 1 }, | ||||||
2791 | { ISD::BSWAP, MVT::v16i32, 1 }, | ||||||
2792 | { ISD::BSWAP, MVT::v32i16, 1 }, | ||||||
2793 | { ISD::CTLZ, MVT::v8i64, 23 }, | ||||||
2794 | { ISD::CTLZ, MVT::v16i32, 22 }, | ||||||
2795 | { ISD::CTLZ, MVT::v32i16, 18 }, | ||||||
2796 | { ISD::CTLZ, MVT::v64i8, 17 }, | ||||||
2797 | { ISD::CTPOP, MVT::v8i64, 7 }, | ||||||
2798 | { ISD::CTPOP, MVT::v16i32, 11 }, | ||||||
2799 | { ISD::CTPOP, MVT::v32i16, 9 }, | ||||||
2800 | { ISD::CTPOP, MVT::v64i8, 6 }, | ||||||
2801 | { ISD::CTTZ, MVT::v8i64, 10 }, | ||||||
2802 | { ISD::CTTZ, MVT::v16i32, 14 }, | ||||||
2803 | { ISD::CTTZ, MVT::v32i16, 12 }, | ||||||
2804 | { ISD::CTTZ, MVT::v64i8, 9 }, | ||||||
2805 | { ISD::SADDSAT, MVT::v32i16, 1 }, | ||||||
2806 | { ISD::SADDSAT, MVT::v64i8, 1 }, | ||||||
2807 | { ISD::SMAX, MVT::v32i16, 1 }, | ||||||
2808 | { ISD::SMAX, MVT::v64i8, 1 }, | ||||||
2809 | { ISD::SMIN, MVT::v32i16, 1 }, | ||||||
2810 | { ISD::SMIN, MVT::v64i8, 1 }, | ||||||
2811 | { ISD::SSUBSAT, MVT::v32i16, 1 }, | ||||||
2812 | { ISD::SSUBSAT, MVT::v64i8, 1 }, | ||||||
2813 | { ISD::UADDSAT, MVT::v32i16, 1 }, | ||||||
2814 | { ISD::UADDSAT, MVT::v64i8, 1 }, | ||||||
2815 | { ISD::UMAX, MVT::v32i16, 1 }, | ||||||
2816 | { ISD::UMAX, MVT::v64i8, 1 }, | ||||||
2817 | { ISD::UMIN, MVT::v32i16, 1 }, | ||||||
2818 | { ISD::UMIN, MVT::v64i8, 1 }, | ||||||
2819 | { ISD::USUBSAT, MVT::v32i16, 1 }, | ||||||
2820 | { ISD::USUBSAT, MVT::v64i8, 1 }, | ||||||
2821 | }; | ||||||
2822 | static const CostTblEntry AVX512CostTbl[] = { | ||||||
2823 | { ISD::ABS, MVT::v8i64, 1 }, | ||||||
2824 | { ISD::ABS, MVT::v16i32, 1 }, | ||||||
2825 | { ISD::ABS, MVT::v32i16, 2 }, | ||||||
2826 | { ISD::ABS, MVT::v64i8, 2 }, | ||||||
2827 | { ISD::ABS, MVT::v4i64, 1 }, | ||||||
2828 | { ISD::ABS, MVT::v2i64, 1 }, | ||||||
2829 | { ISD::BITREVERSE, MVT::v8i64, 36 }, | ||||||
2830 | { ISD::BITREVERSE, MVT::v16i32, 24 }, | ||||||
2831 | { ISD::BITREVERSE, MVT::v32i16, 10 }, | ||||||
2832 | { ISD::BITREVERSE, MVT::v64i8, 10 }, | ||||||
2833 | { ISD::BSWAP, MVT::v8i64, 4 }, | ||||||
2834 | { ISD::BSWAP, MVT::v16i32, 4 }, | ||||||
2835 | { ISD::BSWAP, MVT::v32i16, 4 }, | ||||||
2836 | { ISD::CTLZ, MVT::v8i64, 29 }, | ||||||
2837 | { ISD::CTLZ, MVT::v16i32, 35 }, | ||||||
2838 | { ISD::CTLZ, MVT::v32i16, 28 }, | ||||||
2839 | { ISD::CTLZ, MVT::v64i8, 18 }, | ||||||
2840 | { ISD::CTPOP, MVT::v8i64, 16 }, | ||||||
2841 | { ISD::CTPOP, MVT::v16i32, 24 }, | ||||||
2842 | { ISD::CTPOP, MVT::v32i16, 18 }, | ||||||
2843 | { ISD::CTPOP, MVT::v64i8, 12 }, | ||||||
2844 | { ISD::CTTZ, MVT::v8i64, 20 }, | ||||||
2845 | { ISD::CTTZ, MVT::v16i32, 28 }, | ||||||
2846 | { ISD::CTTZ, MVT::v32i16, 24 }, | ||||||
2847 | { ISD::CTTZ, MVT::v64i8, 18 }, | ||||||
2848 | { ISD::SMAX, MVT::v8i64, 1 }, | ||||||
2849 | { ISD::SMAX, MVT::v16i32, 1 }, | ||||||
2850 | { ISD::SMAX, MVT::v32i16, 2 }, | ||||||
2851 | { ISD::SMAX, MVT::v64i8, 2 }, | ||||||
2852 | { ISD::SMAX, MVT::v4i64, 1 }, | ||||||
2853 | { ISD::SMAX, MVT::v2i64, 1 }, | ||||||
2854 | { ISD::SMIN, MVT::v8i64, 1 }, | ||||||
2855 | { ISD::SMIN, MVT::v16i32, 1 }, | ||||||
2856 | { ISD::SMIN, MVT::v32i16, 2 }, | ||||||
2857 | { ISD::SMIN, MVT::v64i8, 2 }, | ||||||
2858 | { ISD::SMIN, MVT::v4i64, 1 }, | ||||||
2859 | { ISD::SMIN, MVT::v2i64, 1 }, | ||||||
2860 | { ISD::UMAX, MVT::v8i64, 1 }, | ||||||
2861 | { ISD::UMAX, MVT::v16i32, 1 }, | ||||||
2862 | { ISD::UMAX, MVT::v32i16, 2 }, | ||||||
2863 | { ISD::UMAX, MVT::v64i8, 2 }, | ||||||
2864 | { ISD::UMAX, MVT::v4i64, 1 }, | ||||||
2865 | { ISD::UMAX, MVT::v2i64, 1 }, | ||||||
2866 | { ISD::UMIN, MVT::v8i64, 1 }, | ||||||
2867 | { ISD::UMIN, MVT::v16i32, 1 }, | ||||||
2868 | { ISD::UMIN, MVT::v32i16, 2 }, | ||||||
2869 | { ISD::UMIN, MVT::v64i8, 2 }, | ||||||
2870 | { ISD::UMIN, MVT::v4i64, 1 }, | ||||||
2871 | { ISD::UMIN, MVT::v2i64, 1 }, | ||||||
2872 | { ISD::USUBSAT, MVT::v16i32, 2 }, // pmaxud + psubd | ||||||
2873 | { ISD::USUBSAT, MVT::v2i64, 2 }, // pmaxuq + psubq | ||||||
2874 | { ISD::USUBSAT, MVT::v4i64, 2 }, // pmaxuq + psubq | ||||||
2875 | { ISD::USUBSAT, MVT::v8i64, 2 }, // pmaxuq + psubq | ||||||
2876 | { ISD::UADDSAT, MVT::v16i32, 3 }, // not + pminud + paddd | ||||||
2877 | { ISD::UADDSAT, MVT::v2i64, 3 }, // not + pminuq + paddq | ||||||
2878 | { ISD::UADDSAT, MVT::v4i64, 3 }, // not + pminuq + paddq | ||||||
2879 | { ISD::UADDSAT, MVT::v8i64, 3 }, // not + pminuq + paddq | ||||||
2880 | { ISD::SADDSAT, MVT::v32i16, 2 }, | ||||||
2881 | { ISD::SADDSAT, MVT::v64i8, 2 }, | ||||||
2882 | { ISD::SSUBSAT, MVT::v32i16, 2 }, | ||||||
2883 | { ISD::SSUBSAT, MVT::v64i8, 2 }, | ||||||
2884 | { ISD::UADDSAT, MVT::v32i16, 2 }, | ||||||
2885 | { ISD::UADDSAT, MVT::v64i8, 2 }, | ||||||
2886 | { ISD::USUBSAT, MVT::v32i16, 2 }, | ||||||
2887 | { ISD::USUBSAT, MVT::v64i8, 2 }, | ||||||
2888 | { ISD::FMAXNUM, MVT::f32, 2 }, | ||||||
2889 | { ISD::FMAXNUM, MVT::v4f32, 2 }, | ||||||
2890 | { ISD::FMAXNUM, MVT::v8f32, 2 }, | ||||||
2891 | { ISD::FMAXNUM, MVT::v16f32, 2 }, | ||||||
2892 | { ISD::FMAXNUM, MVT::f64, 2 }, | ||||||
2893 | { ISD::FMAXNUM, MVT::v2f64, 2 }, | ||||||
2894 | { ISD::FMAXNUM, MVT::v4f64, 2 }, | ||||||
2895 | { ISD::FMAXNUM, MVT::v8f64, 2 }, | ||||||
2896 | }; | ||||||
2897 | static const CostTblEntry XOPCostTbl[] = { | ||||||
2898 | { ISD::BITREVERSE, MVT::v4i64, 4 }, | ||||||
2899 | { ISD::BITREVERSE, MVT::v8i32, 4 }, | ||||||
2900 | { ISD::BITREVERSE, MVT::v16i16, 4 }, | ||||||
2901 | { ISD::BITREVERSE, MVT::v32i8, 4 }, | ||||||
2902 | { ISD::BITREVERSE, MVT::v2i64, 1 }, | ||||||
2903 | { ISD::BITREVERSE, MVT::v4i32, 1 }, | ||||||
2904 | { ISD::BITREVERSE, MVT::v8i16, 1 }, | ||||||
2905 | { ISD::BITREVERSE, MVT::v16i8, 1 }, | ||||||
2906 | { ISD::BITREVERSE, MVT::i64, 3 }, | ||||||
2907 | { ISD::BITREVERSE, MVT::i32, 3 }, | ||||||
2908 | { ISD::BITREVERSE, MVT::i16, 3 }, | ||||||
2909 | { ISD::BITREVERSE, MVT::i8, 3 } | ||||||
2910 | }; | ||||||
2911 | static const CostTblEntry AVX2CostTbl[] = { | ||||||
2912 | { ISD::ABS, MVT::v4i64, 2 }, // VBLENDVPD(X,VPSUBQ(0,X),X) | ||||||
2913 | { ISD::ABS, MVT::v8i32, 1 }, | ||||||
2914 | { ISD::ABS, MVT::v16i16, 1 }, | ||||||
2915 | { ISD::ABS, MVT::v32i8, 1 }, | ||||||
2916 | { ISD::BITREVERSE, MVT::v2i64, 3 }, | ||||||
2917 | { ISD::BITREVERSE, MVT::v4i64, 3 }, | ||||||
2918 | { ISD::BITREVERSE, MVT::v4i32, 3 }, | ||||||
2919 | { ISD::BITREVERSE, MVT::v8i32, 3 }, | ||||||
2920 | { ISD::BITREVERSE, MVT::v8i16, 3 }, | ||||||
2921 | { ISD::BITREVERSE, MVT::v16i16, 3 }, | ||||||
2922 | { ISD::BITREVERSE, MVT::v16i8, 3 }, | ||||||
2923 | { ISD::BITREVERSE, MVT::v32i8, 3 }, | ||||||
2924 | { ISD::BSWAP, MVT::v4i64, 1 }, | ||||||
2925 | { ISD::BSWAP, MVT::v8i32, 1 }, | ||||||
2926 | { ISD::BSWAP, MVT::v16i16, 1 }, | ||||||
2927 | { ISD::CTLZ, MVT::v2i64, 7 }, | ||||||
2928 | { ISD::CTLZ, MVT::v4i64, 7 }, | ||||||
2929 | { ISD::CTLZ, MVT::v4i32, 5 }, | ||||||
2930 | { ISD::CTLZ, MVT::v8i32, 5 }, | ||||||
2931 | { ISD::CTLZ, MVT::v8i16, 4 }, | ||||||
2932 | { ISD::CTLZ, MVT::v16i16, 4 }, | ||||||
2933 | { ISD::CTLZ, MVT::v16i8, 3 }, | ||||||
2934 | { ISD::CTLZ, MVT::v32i8, 3 }, | ||||||
2935 | { ISD::CTPOP, MVT::v2i64, 3 }, | ||||||
2936 | { ISD::CTPOP, MVT::v4i64, 3 }, | ||||||
2937 | { ISD::CTPOP, MVT::v4i32, 7 }, | ||||||
2938 | { ISD::CTPOP, MVT::v8i32, 7 }, | ||||||
2939 | { ISD::CTPOP, MVT::v8i16, 3 }, | ||||||
2940 | { ISD::CTPOP, MVT::v16i16, 3 }, | ||||||
2941 | { ISD::CTPOP, MVT::v16i8, 2 }, | ||||||
2942 | { ISD::CTPOP, MVT::v32i8, 2 }, | ||||||
2943 | { ISD::CTTZ, MVT::v2i64, 4 }, | ||||||
2944 | { ISD::CTTZ, MVT::v4i64, 4 }, | ||||||
2945 | { ISD::CTTZ, MVT::v4i32, 7 }, | ||||||
2946 | { ISD::CTTZ, MVT::v8i32, 7 }, | ||||||
2947 | { ISD::CTTZ, MVT::v8i16, 4 }, | ||||||
2948 | { ISD::CTTZ, MVT::v16i16, 4 }, | ||||||
2949 | { ISD::CTTZ, MVT::v16i8, 3 }, | ||||||
2950 | { ISD::CTTZ, MVT::v32i8, 3 }, | ||||||
2951 | { ISD::SADDSAT, MVT::v16i16, 1 }, | ||||||
2952 | { ISD::SADDSAT, MVT::v32i8, 1 }, | ||||||
2953 | { ISD::SMAX, MVT::v8i32, 1 }, | ||||||
2954 | { ISD::SMAX, MVT::v16i16, 1 }, | ||||||
2955 | { ISD::SMAX, MVT::v32i8, 1 }, | ||||||
2956 | { ISD::SMIN, MVT::v8i32, 1 }, | ||||||
2957 | { ISD::SMIN, MVT::v16i16, 1 }, | ||||||
2958 | { ISD::SMIN, MVT::v32i8, 1 }, | ||||||
2959 | { ISD::SSUBSAT, MVT::v16i16, 1 }, | ||||||
2960 | { ISD::SSUBSAT, MVT::v32i8, 1 }, | ||||||
2961 | { ISD::UADDSAT, MVT::v16i16, 1 }, | ||||||
2962 | { ISD::UADDSAT, MVT::v32i8, 1 }, | ||||||
2963 | { ISD::UADDSAT, MVT::v8i32, 3 }, // not + pminud + paddd | ||||||
2964 | { ISD::UMAX, MVT::v8i32, 1 }, | ||||||
2965 | { ISD::UMAX, MVT::v16i16, 1 }, | ||||||
2966 | { ISD::UMAX, MVT::v32i8, 1 }, | ||||||
2967 | { ISD::UMIN, MVT::v8i32, 1 }, | ||||||
2968 | { ISD::UMIN, MVT::v16i16, 1 }, | ||||||
2969 | { ISD::UMIN, MVT::v32i8, 1 }, | ||||||
2970 | { ISD::USUBSAT, MVT::v16i16, 1 }, | ||||||
2971 | { ISD::USUBSAT, MVT::v32i8, 1 }, | ||||||
2972 | { ISD::USUBSAT, MVT::v8i32, 2 }, // pmaxud + psubd | ||||||
2973 | { ISD::FMAXNUM, MVT::v8f32, 3 }, // MAXPS + CMPUNORDPS + BLENDVPS | ||||||
2974 | { ISD::FMAXNUM, MVT::v4f64, 3 }, // MAXPD + CMPUNORDPD + BLENDVPD | ||||||
2975 | { ISD::FSQRT, MVT::f32, 7 }, // Haswell from http://www.agner.org/ | ||||||
2976 | { ISD::FSQRT, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/ | ||||||
2977 | { ISD::FSQRT, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/ | ||||||
2978 | { ISD::FSQRT, MVT::f64, 14 }, // Haswell from http://www.agner.org/ | ||||||
2979 | { ISD::FSQRT, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/ | ||||||
2980 | { ISD::FSQRT, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/ | ||||||
2981 | }; | ||||||
2982 | static const CostTblEntry AVX1CostTbl[] = { | ||||||
2983 | { ISD::ABS, MVT::v4i64, 5 }, // VBLENDVPD(X,VPSUBQ(0,X),X) | ||||||
2984 | { ISD::ABS, MVT::v8i32, 3 }, | ||||||
2985 | { ISD::ABS, MVT::v16i16, 3 }, | ||||||
2986 | { ISD::ABS, MVT::v32i8, 3 }, | ||||||
2987 | { ISD::BITREVERSE, MVT::v4i64, 12 }, // 2 x 128-bit Op + extract/insert | ||||||
2988 | { ISD::BITREVERSE, MVT::v8i32, 12 }, // 2 x 128-bit Op + extract/insert | ||||||
2989 | { ISD::BITREVERSE, MVT::v16i16, 12 }, // 2 x 128-bit Op + extract/insert | ||||||
2990 | { ISD::BITREVERSE, MVT::v32i8, 12 }, // 2 x 128-bit Op + extract/insert | ||||||
2991 | { ISD::BSWAP, MVT::v4i64, 4 }, | ||||||
2992 | { ISD::BSWAP, MVT::v8i32, 4 }, | ||||||
2993 | { ISD::BSWAP, MVT::v16i16, 4 }, | ||||||
2994 | { ISD::CTLZ, MVT::v4i64, 48 }, // 2 x 128-bit Op + extract/insert | ||||||
2995 | { ISD::CTLZ, MVT::v8i32, 38 }, // 2 x 128-bit Op + extract/insert | ||||||
2996 | { ISD::CTLZ, MVT::v16i16, 30 }, // 2 x 128-bit Op + extract/insert | ||||||
2997 | { ISD::CTLZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert | ||||||
2998 | { ISD::CTPOP, MVT::v4i64, 16 }, // 2 x 128-bit Op + extract/insert | ||||||
2999 | { ISD::CTPOP, MVT::v8i32, 24 }, // 2 x 128-bit Op + extract/insert | ||||||
3000 | { ISD::CTPOP, MVT::v16i16, 20 }, // 2 x 128-bit Op + extract/insert | ||||||
3001 | { ISD::CTPOP, MVT::v32i8, 14 }, // 2 x 128-bit Op + extract/insert | ||||||
3002 | { ISD::CTTZ, MVT::v4i64, 22 }, // 2 x 128-bit Op + extract/insert | ||||||
3003 | { ISD::CTTZ, MVT::v8i32, 30 }, // 2 x 128-bit Op + extract/insert | ||||||
3004 | { ISD::CTTZ, MVT::v16i16, 26 }, // 2 x 128-bit Op + extract/insert | ||||||
3005 | { ISD::CTTZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert | ||||||
3006 | { ISD::SADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert | ||||||
3007 | { ISD::SADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert | ||||||
3008 | { ISD::SMAX, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert | ||||||
3009 | { ISD::SMAX, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert | ||||||
3010 | { ISD::SMAX, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert | ||||||
3011 | { ISD::SMIN, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert | ||||||
3012 | { ISD::SMIN, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert | ||||||
3013 | { ISD::SMIN, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert | ||||||
3014 | { ISD::SSUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert | ||||||
3015 | { ISD::SSUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert | ||||||
3016 | { ISD::UADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert | ||||||
3017 | { ISD::UADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert | ||||||
3018 | { ISD::UADDSAT, MVT::v8i32, 8 }, // 2 x 128-bit Op + extract/insert | ||||||
3019 | { ISD::UMAX, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert | ||||||
3020 | { ISD::UMAX, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert | ||||||
3021 | { ISD::UMAX, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert | ||||||
3022 | { ISD::UMIN, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert | ||||||
3023 | { ISD::UMIN, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert | ||||||
3024 | { ISD::UMIN, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert | ||||||
3025 | { ISD::USUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert | ||||||
3026 | { ISD::USUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert | ||||||
3027 | { ISD::USUBSAT, MVT::v8i32, 6 }, // 2 x 128-bit Op + extract/insert | ||||||
3028 | { ISD::FMAXNUM, MVT::f32, 3 }, // MAXSS + CMPUNORDSS + BLENDVPS | ||||||
3029 | { ISD::FMAXNUM, MVT::v4f32, 3 }, // MAXPS + CMPUNORDPS + BLENDVPS | ||||||
3030 | { ISD::FMAXNUM, MVT::v8f32, 5 }, // MAXPS + CMPUNORDPS + BLENDVPS + ? | ||||||
3031 | { ISD::FMAXNUM, MVT::f64, 3 }, // MAXSD + CMPUNORDSD + BLENDVPD | ||||||
3032 | { ISD::FMAXNUM, MVT::v2f64, 3 }, // MAXPD + CMPUNORDPD + BLENDVPD | ||||||
3033 | { ISD::FMAXNUM, MVT::v4f64, 5 }, // MAXPD + CMPUNORDPD + BLENDVPD + ? | ||||||
3034 | { ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/ | ||||||
3035 | { ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/ | ||||||
3036 | { ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/ | ||||||
3037 | { ISD::FSQRT, MVT::f64, 21 }, // SNB from http://www.agner.org/ | ||||||
3038 | { ISD::FSQRT, MVT::v2f64, 21 }, // SNB from http://www.agner.org/ | ||||||
3039 | { ISD::FSQRT, MVT::v4f64, 43 }, // SNB from http://www.agner.org/ | ||||||
3040 | }; | ||||||
3041 | static const CostTblEntry GLMCostTbl[] = { | ||||||
3042 | { ISD::FSQRT, MVT::f32, 19 }, // sqrtss | ||||||
3043 | { ISD::FSQRT, MVT::v4f32, 37 }, // sqrtps | ||||||
3044 | { ISD::FSQRT, MVT::f64, 34 }, // sqrtsd | ||||||
3045 | { ISD::FSQRT, MVT::v2f64, 67 }, // sqrtpd | ||||||
3046 | }; | ||||||
3047 | static const CostTblEntry SLMCostTbl[] = { | ||||||
3048 | { ISD::FSQRT, MVT::f32, 20 }, // sqrtss | ||||||
3049 | { ISD::FSQRT, MVT::v4f32, 40 }, // sqrtps | ||||||
3050 | { ISD::FSQRT, MVT::f64, 35 }, // sqrtsd | ||||||
3051 | { ISD::FSQRT, MVT::v2f64, 70 }, // sqrtpd | ||||||
3052 | }; | ||||||
3053 | static const CostTblEntry SSE42CostTbl[] = { | ||||||
3054 | { ISD::USUBSAT, MVT::v4i32, 2 }, // pmaxud + psubd | ||||||
3055 | { ISD::UADDSAT, MVT::v4i32, 3 }, // not + pminud + paddd | ||||||
3056 | { ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/ | ||||||
3057 | { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/ | ||||||
3058 | }; | ||||||
3059 | static const CostTblEntry SSE41CostTbl[] = { | ||||||
3060 | { ISD::ABS, MVT::v2i64, 2 }, // BLENDVPD(X,PSUBQ(0,X),X) | ||||||
3061 | { ISD::SMAX, MVT::v4i32, 1 }, | ||||||
3062 | { ISD::SMAX, MVT::v16i8, 1 }, | ||||||
3063 | { ISD::SMIN, MVT::v4i32, 1 }, | ||||||
3064 | { ISD::SMIN, MVT::v16i8, 1 }, | ||||||
3065 | { ISD::UMAX, MVT::v4i32, 1 }, | ||||||
3066 | { ISD::UMAX, MVT::v8i16, 1 }, | ||||||
3067 | { ISD::UMIN, MVT::v4i32, 1 }, | ||||||
3068 | { ISD::UMIN, MVT::v8i16, 1 }, | ||||||
3069 | }; | ||||||
3070 | static const CostTblEntry SSSE3CostTbl[] = { | ||||||
3071 | { ISD::ABS, MVT::v4i32, 1 }, | ||||||
3072 | { ISD::ABS, MVT::v8i16, 1 }, | ||||||
3073 | { ISD::ABS, MVT::v16i8, 1 }, | ||||||
3074 | { ISD::BITREVERSE, MVT::v2i64, 5 }, | ||||||
3075 | { ISD::BITREVERSE, MVT::v4i32, 5 }, | ||||||
3076 | { ISD::BITREVERSE, MVT::v8i16, 5 }, | ||||||
3077 | { ISD::BITREVERSE, MVT::v16i8, 5 }, | ||||||
3078 | { ISD::BSWAP, MVT::v2i64, 1 }, | ||||||
3079 | { ISD::BSWAP, MVT::v4i32, 1 }, | ||||||
3080 | { ISD::BSWAP, MVT::v8i16, 1 }, | ||||||
3081 | { ISD::CTLZ, MVT::v2i64, 23 }, | ||||||
3082 | { ISD::CTLZ, MVT::v4i32, 18 }, | ||||||
3083 | { ISD::CTLZ, MVT::v8i16, 14 }, | ||||||
3084 | { ISD::CTLZ, MVT::v16i8, 9 }, | ||||||
3085 | { ISD::CTPOP, MVT::v2i64, 7 }, | ||||||
3086 | { ISD::CTPOP, MVT::v4i32, 11 }, | ||||||
3087 | { ISD::CTPOP, MVT::v8i16, 9 }, | ||||||
3088 | { ISD::CTPOP, MVT::v16i8, 6 }, | ||||||
3089 | { ISD::CTTZ, MVT::v2i64, 10 }, | ||||||
3090 | { ISD::CTTZ, MVT::v4i32, 14 }, | ||||||
3091 | { ISD::CTTZ, MVT::v8i16, 12 }, | ||||||
3092 | { ISD::CTTZ, MVT::v16i8, 9 } | ||||||
3093 | }; | ||||||
3094 | static const CostTblEntry SSE2CostTbl[] = { | ||||||
3095 | { ISD::ABS, MVT::v2i64, 4 }, | ||||||
3096 | { ISD::ABS, MVT::v4i32, 3 }, | ||||||
3097 | { ISD::ABS, MVT::v8i16, 2 }, | ||||||
3098 | { ISD::ABS, MVT::v16i8, 2 }, | ||||||
3099 | { ISD::BITREVERSE, MVT::v2i64, 29 }, | ||||||
3100 | { ISD::BITREVERSE, MVT::v4i32, 27 }, | ||||||
3101 | { ISD::BITREVERSE, MVT::v8i16, 27 }, | ||||||
3102 | { ISD::BITREVERSE, MVT::v16i8, 20 }, | ||||||
3103 | { ISD::BSWAP, MVT::v2i64, 7 }, | ||||||
3104 | { ISD::BSWAP, MVT::v4i32, 7 }, | ||||||
3105 | { ISD::BSWAP, MVT::v8i16, 7 }, | ||||||
3106 | { ISD::CTLZ, MVT::v2i64, 25 }, | ||||||
3107 | { ISD::CTLZ, MVT::v4i32, 26 }, | ||||||
3108 | { ISD::CTLZ, MVT::v8i16, 20 }, | ||||||
3109 | { ISD::CTLZ, MVT::v16i8, 17 }, | ||||||
3110 | { ISD::CTPOP, MVT::v2i64, 12 }, | ||||||
3111 | { ISD::CTPOP, MVT::v4i32, 15 }, | ||||||
3112 | { ISD::CTPOP, MVT::v8i16, 13 }, | ||||||
3113 | { ISD::CTPOP, MVT::v16i8, 10 }, | ||||||
3114 | { ISD::CTTZ, MVT::v2i64, 14 }, | ||||||
3115 | { ISD::CTTZ, MVT::v4i32, 18 }, | ||||||
3116 | { ISD::CTTZ, MVT::v8i16, 16 }, | ||||||
3117 | { ISD::CTTZ, MVT::v16i8, 13 }, | ||||||
3118 | { ISD::SADDSAT, MVT::v8i16, 1 }, | ||||||
3119 | { ISD::SADDSAT, MVT::v16i8, 1 }, | ||||||
3120 | { ISD::SMAX, MVT::v8i16, 1 }, | ||||||
3121 | { ISD::SMIN, MVT::v8i16, 1 }, | ||||||
3122 | { ISD::SSUBSAT, MVT::v8i16, 1 }, | ||||||
3123 | { ISD::SSUBSAT, MVT::v16i8, 1 }, | ||||||
3124 | { ISD::UADDSAT, MVT::v8i16, 1 }, | ||||||
3125 | { ISD::UADDSAT, MVT::v16i8, 1 }, | ||||||
3126 | { ISD::UMAX, MVT::v8i16, 2 }, | ||||||
3127 | { ISD::UMAX, MVT::v16i8, 1 }, | ||||||
3128 | { ISD::UMIN, MVT::v8i16, 2 }, | ||||||
3129 | { ISD::UMIN, MVT::v16i8, 1 }, | ||||||
3130 | { ISD::USUBSAT, MVT::v8i16, 1 }, | ||||||
3131 | { ISD::USUBSAT, MVT::v16i8, 1 }, | ||||||
3132 | { ISD::FMAXNUM, MVT::f64, 4 }, | ||||||
3133 | { ISD::FMAXNUM, MVT::v2f64, 4 }, | ||||||
3134 | { ISD::FSQRT, MVT::f64, 32 }, // Nehalem from http://www.agner.org/ | ||||||
3135 | { ISD::FSQRT, MVT::v2f64, 32 }, // Nehalem from http://www.agner.org/ | ||||||
3136 | }; | ||||||
3137 | static const CostTblEntry SSE1CostTbl[] = { | ||||||
3138 | { ISD::FMAXNUM, MVT::f32, 4 }, | ||||||
3139 | { ISD::FMAXNUM, MVT::v4f32, 4 }, | ||||||
3140 | { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/ | ||||||
3141 | { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/ | ||||||
3142 | }; | ||||||
3143 | static const CostTblEntry BMI64CostTbl[] = { // 64-bit targets | ||||||
3144 | { ISD::CTTZ, MVT::i64, 1 }, | ||||||
3145 | }; | ||||||
3146 | static const CostTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets | ||||||
3147 | { ISD::CTTZ, MVT::i32, 1 }, | ||||||
3148 | { ISD::CTTZ, MVT::i16, 1 }, | ||||||
3149 | { ISD::CTTZ, MVT::i8, 1 }, | ||||||
3150 | }; | ||||||
3151 | static const CostTblEntry LZCNT64CostTbl[] = { // 64-bit targets | ||||||
3152 | { ISD::CTLZ, MVT::i64, 1 }, | ||||||
3153 | }; | ||||||
3154 | static const CostTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets | ||||||
3155 | { ISD::CTLZ, MVT::i32, 1 }, | ||||||
3156 | { ISD::CTLZ, MVT::i16, 1 }, | ||||||
3157 | { ISD::CTLZ, MVT::i8, 1 }, | ||||||
3158 | }; | ||||||
3159 | static const CostTblEntry POPCNT64CostTbl[] = { // 64-bit targets | ||||||
3160 | { ISD::CTPOP, MVT::i64, 1 }, | ||||||
3161 | }; | ||||||
3162 | static const CostTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets | ||||||
3163 | { ISD::CTPOP, MVT::i32, 1 }, | ||||||
3164 | { ISD::CTPOP, MVT::i16, 1 }, | ||||||
3165 | { ISD::CTPOP, MVT::i8, 1 }, | ||||||
3166 | }; | ||||||
3167 | static const CostTblEntry X64CostTbl[] = { // 64-bit targets | ||||||
3168 | { ISD::ABS, MVT::i64, 2 }, // SUB+CMOV | ||||||
3169 | { ISD::BITREVERSE, MVT::i64, 14 }, | ||||||
3170 | { ISD::BSWAP, MVT::i64, 1 }, | ||||||
3171 | { ISD::CTLZ, MVT::i64, 4 }, // BSR+XOR or BSR+XOR+CMOV | ||||||
3172 | { ISD::CTTZ, MVT::i64, 3 }, // TEST+BSF+CMOV/BRANCH | ||||||
3173 | { ISD::CTPOP, MVT::i64, 10 }, | ||||||
3174 | { ISD::SADDO, MVT::i64, 1 }, | ||||||
3175 | { ISD::UADDO, MVT::i64, 1 }, | ||||||
3176 | { ISD::UMULO, MVT::i64, 2 }, // mulq + seto | ||||||
3177 | }; | ||||||
3178 | static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets | ||||||
3179 | { ISD::ABS, MVT::i32, 2 }, // SUB+CMOV | ||||||
3180 | { ISD::ABS, MVT::i16, 2 }, // SUB+CMOV | ||||||
3181 | { ISD::BITREVERSE, MVT::i32, 14 }, | ||||||
3182 | { ISD::BITREVERSE, MVT::i16, 14 }, | ||||||
3183 | { ISD::BITREVERSE, MVT::i8, 11 }, | ||||||
3184 | { ISD::BSWAP, MVT::i32, 1 }, | ||||||
3185 | { ISD::BSWAP, MVT::i16, 1 }, // ROL | ||||||
3186 | { ISD::CTLZ, MVT::i32, 4 }, // BSR+XOR or BSR+XOR+CMOV | ||||||
3187 | { ISD::CTLZ, MVT::i16, 4 }, // BSR+XOR or BSR+XOR+CMOV | ||||||
3188 | { ISD::CTLZ, MVT::i8, 4 }, // BSR+XOR or BSR+XOR+CMOV | ||||||
3189 | { ISD::CTTZ, MVT::i32, 3 }, // TEST+BSF+CMOV/BRANCH | ||||||
3190 | { ISD::CTTZ, MVT::i16, 3 }, // TEST+BSF+CMOV/BRANCH | ||||||
3191 | { ISD::CTTZ, MVT::i8, 3 }, // TEST+BSF+CMOV/BRANCH | ||||||
3192 | { ISD::CTPOP, MVT::i32, 8 }, | ||||||
3193 | { ISD::CTPOP, MVT::i16, 9 }, | ||||||
3194 | { ISD::CTPOP, MVT::i8, 7 }, | ||||||
3195 | { ISD::SADDO, MVT::i32, 1 }, | ||||||
3196 | { ISD::SADDO, MVT::i16, 1 }, | ||||||
3197 | { ISD::SADDO, MVT::i8, 1 }, | ||||||
3198 | { ISD::UADDO, MVT::i32, 1 }, | ||||||
3199 | { ISD::UADDO, MVT::i16, 1 }, | ||||||
3200 | { ISD::UADDO, MVT::i8, 1 }, | ||||||
3201 | { ISD::UMULO, MVT::i32, 2 }, // mul + seto | ||||||
3202 | { ISD::UMULO, MVT::i16, 2 }, | ||||||
3203 | { ISD::UMULO, MVT::i8, 2 }, | ||||||
3204 | }; | ||||||
3205 | |||||||
3206 | Type *RetTy = ICA.getReturnType(); | ||||||
3207 | Type *OpTy = RetTy; | ||||||
3208 | Intrinsic::ID IID = ICA.getID(); | ||||||
3209 | unsigned ISD = ISD::DELETED_NODE; | ||||||
3210 | switch (IID) { | ||||||
3211 | default: | ||||||
3212 | break; | ||||||
3213 | case Intrinsic::abs: | ||||||
3214 | ISD = ISD::ABS; | ||||||
3215 | break; | ||||||
3216 | case Intrinsic::bitreverse: | ||||||
3217 | ISD = ISD::BITREVERSE; | ||||||
3218 | break; | ||||||
3219 | case Intrinsic::bswap: | ||||||
3220 | ISD = ISD::BSWAP; | ||||||
3221 | break; | ||||||
3222 | case Intrinsic::ctlz: | ||||||
3223 | ISD = ISD::CTLZ; | ||||||
3224 | break; | ||||||
3225 | case Intrinsic::ctpop: | ||||||
3226 | ISD = ISD::CTPOP; | ||||||
3227 | break; | ||||||
3228 | case Intrinsic::cttz: | ||||||
3229 | ISD = ISD::CTTZ; | ||||||
3230 | break; | ||||||
3231 | case Intrinsic::maxnum: | ||||||
3232 | case Intrinsic::minnum: | ||||||
3233 | // FMINNUM has same costs so don't duplicate. | ||||||
3234 | ISD = ISD::FMAXNUM; | ||||||
3235 | break; | ||||||
3236 | case Intrinsic::sadd_sat: | ||||||
3237 | ISD = ISD::SADDSAT; | ||||||
3238 | break; | ||||||
3239 | case Intrinsic::smax: | ||||||
3240 | ISD = ISD::SMAX; | ||||||
3241 | break; | ||||||
3242 | case Intrinsic::smin: | ||||||
3243 | ISD = ISD::SMIN; | ||||||
3244 | break; | ||||||
3245 | case Intrinsic::ssub_sat: | ||||||
3246 | ISD = ISD::SSUBSAT; | ||||||
3247 | break; | ||||||
3248 | case Intrinsic::uadd_sat: | ||||||
3249 | ISD = ISD::UADDSAT; | ||||||
3250 | break; | ||||||
3251 | case Intrinsic::umax: | ||||||
3252 | ISD = ISD::UMAX; | ||||||
3253 | break; | ||||||
3254 | case Intrinsic::umin: | ||||||
3255 | ISD = ISD::UMIN; | ||||||
3256 | break; | ||||||
3257 | case Intrinsic::usub_sat: | ||||||
3258 | ISD = ISD::USUBSAT; | ||||||
3259 | break; | ||||||
3260 | case Intrinsic::sqrt: | ||||||
3261 | ISD = ISD::FSQRT; | ||||||
3262 | break; | ||||||
3263 | case Intrinsic::sadd_with_overflow: | ||||||
3264 | case Intrinsic::ssub_with_overflow: | ||||||
3265 | // SSUBO has same costs so don't duplicate. | ||||||
3266 | ISD = ISD::SADDO; | ||||||
3267 | OpTy = RetTy->getContainedType(0); | ||||||
3268 | break; | ||||||
3269 | case Intrinsic::uadd_with_overflow: | ||||||
3270 | case Intrinsic::usub_with_overflow: | ||||||
3271 | // USUBO has same costs so don't duplicate. | ||||||
3272 | ISD = ISD::UADDO; | ||||||
3273 | OpTy = RetTy->getContainedType(0); | ||||||
3274 | break; | ||||||
3275 | case Intrinsic::umul_with_overflow: | ||||||
3276 | case Intrinsic::smul_with_overflow: | ||||||
3277 | // SMULO has same costs so don't duplicate. | ||||||
3278 | ISD = ISD::UMULO; | ||||||
3279 | OpTy = RetTy->getContainedType(0); | ||||||
3280 | break; | ||||||
3281 | } | ||||||
3282 | |||||||
3283 | if (ISD != ISD::DELETED_NODE) { | ||||||
3284 | // Legalize the type. | ||||||
3285 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, OpTy); | ||||||
3286 | MVT MTy = LT.second; | ||||||
3287 | |||||||
3288 | // Attempt to lookup cost. | ||||||
3289 | if (ISD == ISD::BITREVERSE && ST->hasGFNI() && ST->hasSSSE3() && | ||||||
3290 | MTy.isVector()) { | ||||||
3291 | // With PSHUFB the code is very similar for all types. If we have integer | ||||||
3292 | // byte operations, we just need a GF2P8AFFINEQB for vXi8. For other types | ||||||
3293 | // we also need a PSHUFB. | ||||||
3294 | unsigned Cost = MTy.getVectorElementType() == MVT::i8 ? 1 : 2; | ||||||
3295 | |||||||
3296 | // Without byte operations, we need twice as many GF2P8AFFINEQB and PSHUFB | ||||||
3297 | // instructions. We also need an extract and an insert. | ||||||
3298 | if (!(MTy.is128BitVector() || (ST->hasAVX2() && MTy.is256BitVector()) || | ||||||
3299 | (ST->hasBWI() && MTy.is512BitVector()))) | ||||||
3300 | Cost = Cost * 2 + 2; | ||||||
3301 | |||||||
3302 | return LT.first * Cost; | ||||||
3303 | } | ||||||
3304 | |||||||
3305 | auto adjustTableCost = [](const CostTblEntry &Entry, | ||||||
3306 | InstructionCost LegalizationCost, | ||||||
3307 | FastMathFlags FMF) { | ||||||
3308 | // If there are no NANs to deal with, then these are reduced to a | ||||||
3309 | // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we | ||||||
3310 | // assume is used in the non-fast case. | ||||||
3311 | if (Entry.ISD == ISD::FMAXNUM || Entry.ISD == ISD::FMINNUM) { | ||||||
3312 | if (FMF.noNaNs()) | ||||||
3313 | return LegalizationCost * 1; | ||||||
3314 | } | ||||||
3315 | return LegalizationCost * (int)Entry.Cost; | ||||||
3316 | }; | ||||||
3317 | |||||||
3318 | if (ST->useGLMDivSqrtCosts()) | ||||||
3319 | if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy)) | ||||||
3320 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | ||||||
3321 | |||||||
3322 | if (ST->useSLMArithCosts()) | ||||||
3323 | if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy)) | ||||||
3324 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | ||||||
3325 | |||||||
3326 | if (ST->hasBITALG()) | ||||||
3327 | if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy)) | ||||||
3328 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | ||||||
3329 | |||||||
3330 | if (ST->hasVPOPCNTDQ()) | ||||||
3331 | if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy)) | ||||||
3332 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | ||||||
3333 | |||||||
3334 | if (ST->hasCDI()) | ||||||
3335 | if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy)) | ||||||
3336 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | ||||||
3337 | |||||||
3338 | if (ST->hasBWI()) | ||||||
3339 | if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy)) | ||||||
3340 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | ||||||
3341 | |||||||
3342 | if (ST->hasAVX512()) | ||||||
3343 | if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) | ||||||
3344 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | ||||||
3345 | |||||||
3346 | if (ST->hasXOP()) | ||||||
3347 | if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy)) | ||||||
3348 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | ||||||
3349 | |||||||
3350 | if (ST->hasAVX2()) | ||||||
3351 | if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy)) | ||||||
3352 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | ||||||
3353 | |||||||
3354 | if (ST->hasAVX()) | ||||||
3355 | if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) | ||||||
3356 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | ||||||
3357 | |||||||
3358 | if (ST->hasSSE42()) | ||||||
3359 | if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy)) | ||||||
3360 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | ||||||
3361 | |||||||
3362 | if (ST->hasSSE41()) | ||||||
3363 | if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy)) | ||||||
3364 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | ||||||
3365 | |||||||
3366 | if (ST->hasSSSE3()) | ||||||
3367 | if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy)) | ||||||
3368 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | ||||||
3369 | |||||||
3370 | if (ST->hasSSE2()) | ||||||
3371 | if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy)) | ||||||
3372 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | ||||||
3373 | |||||||
3374 | if (ST->hasSSE1()) | ||||||
3375 | if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy)) | ||||||
3376 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | ||||||
3377 | |||||||
3378 | if (ST->hasBMI()) { | ||||||
3379 | if (ST->is64Bit()) | ||||||
3380 | if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy)) | ||||||
3381 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | ||||||
3382 | |||||||
3383 | if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy)) | ||||||
3384 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | ||||||
3385 | } | ||||||
3386 | |||||||
3387 | if (ST->hasLZCNT()) { | ||||||
3388 | if (ST->is64Bit()) | ||||||
3389 | if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy)) | ||||||
3390 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | ||||||
3391 | |||||||
3392 | if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy)) | ||||||
3393 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | ||||||
3394 | } | ||||||
3395 | |||||||
3396 | if (ST->hasPOPCNT()) { | ||||||
3397 | if (ST->is64Bit()) | ||||||
3398 | if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy)) | ||||||
3399 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | ||||||
3400 | |||||||
3401 | if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy)) | ||||||
3402 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | ||||||
3403 | } | ||||||
3404 | |||||||
3405 | if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) { | ||||||
3406 | if (const Instruction *II = ICA.getInst()) { | ||||||
3407 | if (II->hasOneUse() && isa<StoreInst>(II->user_back())) | ||||||
3408 | return TTI::TCC_Free; | ||||||
3409 | if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) { | ||||||
3410 | if (LI->hasOneUse()) | ||||||
3411 | return TTI::TCC_Free; | ||||||
3412 | } | ||||||
3413 | } | ||||||
3414 | } | ||||||
3415 | |||||||
3416 | if (ST->is64Bit()) | ||||||
3417 | if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy)) | ||||||
3418 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | ||||||
3419 | |||||||
3420 | if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy)) | ||||||
3421 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | ||||||
3422 | } | ||||||
3423 | |||||||
3424 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); | ||||||
3425 | } | ||||||
3426 | |||||||
3427 | InstructionCost | ||||||
3428 | X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, | ||||||
3429 | TTI::TargetCostKind CostKind) { | ||||||
3430 | if (ICA.isTypeBasedOnly()) | ||||||
3431 | return getTypeBasedIntrinsicInstrCost(ICA, CostKind); | ||||||
3432 | |||||||
3433 | static const CostTblEntry AVX512BWCostTbl[] = { | ||||||
3434 | { ISD::ROTL, MVT::v32i16, 2 }, | ||||||
3435 | { ISD::ROTL, MVT::v16i16, 2 }, | ||||||
3436 | { ISD::ROTL, MVT::v8i16, 2 }, | ||||||
3437 | { ISD::ROTL, MVT::v64i8, 5 }, | ||||||
3438 | { ISD::ROTL, MVT::v32i8, 5 }, | ||||||
3439 | { ISD::ROTL, MVT::v16i8, 5 }, | ||||||
3440 | { ISD::ROTR, MVT::v32i16, 2 }, | ||||||
3441 | { ISD::ROTR, MVT::v16i16, 2 }, | ||||||
3442 | { ISD::ROTR, MVT::v8i16, 2 }, | ||||||
3443 | { ISD::ROTR, MVT::v64i8, 5 }, | ||||||
3444 | { ISD::ROTR, MVT::v32i8, 5 }, | ||||||
3445 | { ISD::ROTR, MVT::v16i8, 5 } | ||||||
3446 | }; | ||||||
3447 | static const CostTblEntry AVX512CostTbl[] = { | ||||||
3448 | { ISD::ROTL, MVT::v8i64, 1 }, | ||||||
3449 | { ISD::ROTL, MVT::v4i64, 1 }, | ||||||
3450 | { ISD::ROTL, MVT::v2i64, 1 }, | ||||||
3451 | { ISD::ROTL, MVT::v16i32, 1 }, | ||||||
3452 | { ISD::ROTL, MVT::v8i32, 1 }, | ||||||
3453 | { ISD::ROTL, MVT::v4i32, 1 }, | ||||||
3454 | { ISD::ROTR, MVT::v8i64, 1 }, | ||||||
3455 | { ISD::ROTR, MVT::v4i64, 1 }, | ||||||
3456 | { ISD::ROTR, MVT::v2i64, 1 }, | ||||||
3457 | { ISD::ROTR, MVT::v16i32, 1 }, | ||||||
3458 | { ISD::ROTR, MVT::v8i32, 1 }, | ||||||
3459 | { ISD::ROTR, MVT::v4i32, 1 } | ||||||
3460 | }; | ||||||
3461 | // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y)) | ||||||
3462 | static const CostTblEntry XOPCostTbl[] = { | ||||||
3463 | { ISD::ROTL, MVT::v4i64, 4 }, | ||||||
3464 | { ISD::ROTL, MVT::v8i32, 4 }, | ||||||
3465 | { ISD::ROTL, MVT::v16i16, 4 }, | ||||||
3466 | { ISD::ROTL, MVT::v32i8, 4 }, | ||||||
3467 | { ISD::ROTL, MVT::v2i64, 1 }, | ||||||
3468 | { ISD::ROTL, MVT::v4i32, 1 }, | ||||||
3469 | { ISD::ROTL, MVT::v8i16, 1 }, | ||||||
3470 | { ISD::ROTL, MVT::v16i8, 1 }, | ||||||
3471 | { ISD::ROTR, MVT::v4i64, 6 }, | ||||||
3472 | { ISD::ROTR, MVT::v8i32, 6 }, | ||||||
3473 | { ISD::ROTR, MVT::v16i16, 6 }, | ||||||
3474 | { ISD::ROTR, MVT::v32i8, 6 }, | ||||||
3475 | { ISD::ROTR, MVT::v2i64, 2 }, | ||||||
3476 | { ISD::ROTR, MVT::v4i32, 2 }, | ||||||
3477 | { ISD::ROTR, MVT::v8i16, 2 }, | ||||||
3478 | { ISD::ROTR, MVT::v16i8, 2 } | ||||||
3479 | }; | ||||||
3480 | static const CostTblEntry X64CostTbl[] = { // 64-bit targets | ||||||
3481 | { ISD::ROTL, MVT::i64, 1 }, | ||||||
3482 | { ISD::ROTR, MVT::i64, 1 }, | ||||||
3483 | { ISD::FSHL, MVT::i64, 4 } | ||||||
3484 | }; | ||||||
3485 | static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets | ||||||
3486 | { ISD::ROTL, MVT::i32, 1 }, | ||||||
3487 | { ISD::ROTL, MVT::i16, 1 }, | ||||||
3488 | { ISD::ROTL, MVT::i8, 1 }, | ||||||
3489 | { ISD::ROTR, MVT::i32, 1 }, | ||||||
3490 | { ISD::ROTR, MVT::i16, 1 }, | ||||||
3491 | { ISD::ROTR, MVT::i8, 1 }, | ||||||
3492 | { ISD::FSHL, MVT::i32, 4 }, | ||||||
3493 | { ISD::FSHL, MVT::i16, 4 }, | ||||||
3494 | { ISD::FSHL, MVT::i8, 4 } | ||||||
3495 | }; | ||||||
3496 | |||||||
3497 | Intrinsic::ID IID = ICA.getID(); | ||||||
3498 | Type *RetTy = ICA.getReturnType(); | ||||||
3499 | const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); | ||||||
3500 | unsigned ISD = ISD::DELETED_NODE; | ||||||
3501 | switch (IID) { | ||||||
3502 | default: | ||||||
3503 | break; | ||||||
3504 | case Intrinsic::fshl: | ||||||
3505 | ISD = ISD::FSHL; | ||||||
3506 | if (Args[0] == Args[1]) | ||||||
3507 | ISD = ISD::ROTL; | ||||||
3508 | break; | ||||||
3509 | case Intrinsic::fshr: | ||||||
3510 | // FSHR has same costs so don't duplicate. | ||||||
3511 | ISD = ISD::FSHL; | ||||||
3512 | if (Args[0] == Args[1]) | ||||||
3513 | ISD = ISD::ROTR; | ||||||
3514 | break; | ||||||
3515 | } | ||||||
3516 | |||||||
3517 | if (ISD != ISD::DELETED_NODE) { | ||||||
3518 | // Legalize the type. | ||||||
3519 | std::pair<InstructionCost, MVT> LT = | ||||||
3520 | TLI->getTypeLegalizationCost(DL, RetTy); | ||||||
3521 | MVT MTy = LT.second; | ||||||
3522 | |||||||
3523 | // Attempt to lookup cost. | ||||||
3524 | if (ST->hasBWI()) | ||||||
3525 | if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy)) | ||||||
3526 | return LT.first * Entry->Cost; | ||||||
3527 | |||||||
3528 | if (ST->hasAVX512()) | ||||||
3529 | if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) | ||||||
3530 | return LT.first * Entry->Cost; | ||||||
3531 | |||||||
3532 | if (ST->hasXOP()) | ||||||
3533 | if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy)) | ||||||
3534 | return LT.first * Entry->Cost; | ||||||
3535 | |||||||
3536 | if (ST->is64Bit()) | ||||||
3537 | if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy)) | ||||||
3538 | return LT.first * Entry->Cost; | ||||||
3539 | |||||||
3540 | if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy)) | ||||||
3541 | return LT.first * Entry->Cost; | ||||||
3542 | } | ||||||
3543 | |||||||
3544 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); | ||||||
3545 | } | ||||||
3546 | |||||||
3547 | InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, | ||||||
3548 | unsigned Index) { | ||||||
3549 | static const CostTblEntry SLMCostTbl[] = { | ||||||
3550 | { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 }, | ||||||
3551 | { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 }, | ||||||
3552 | { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 }, | ||||||
3553 | { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 7 } | ||||||
3554 | }; | ||||||
3555 | |||||||
3556 | assert(Val->isVectorTy() && "This must be a vector type")(static_cast <bool> (Val->isVectorTy() && "This must be a vector type" ) ? void (0) : __assert_fail ("Val->isVectorTy() && \"This must be a vector type\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 3556, __extension__ __PRETTY_FUNCTION__)); | ||||||
3557 | Type *ScalarType = Val->getScalarType(); | ||||||
3558 | int RegisterFileMoveCost = 0; | ||||||
3559 | |||||||
3560 | // Non-immediate extraction/insertion can be handled as a sequence of | ||||||
3561 | // aliased loads+stores via the stack. | ||||||
3562 | if (Index == -1U && (Opcode == Instruction::ExtractElement || | ||||||
3563 | Opcode == Instruction::InsertElement)) { | ||||||
3564 | // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns: | ||||||
3565 | // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0. | ||||||
3566 | |||||||
3567 | // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling. | ||||||
3568 | assert(isa<FixedVectorType>(Val) && "Fixed vector type expected")(static_cast <bool> (isa<FixedVectorType>(Val) && "Fixed vector type expected") ? void (0) : __assert_fail ("isa<FixedVectorType>(Val) && \"Fixed vector type expected\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 3568, __extension__ __PRETTY_FUNCTION__)); | ||||||
3569 | Align VecAlign = DL.getPrefTypeAlign(Val); | ||||||
3570 | Align SclAlign = DL.getPrefTypeAlign(ScalarType); | ||||||
3571 | |||||||
3572 | // Extract - store vector to stack, load scalar. | ||||||
3573 | if (Opcode == Instruction::ExtractElement) { | ||||||
3574 | return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, | ||||||
3575 | TTI::TargetCostKind::TCK_RecipThroughput) + | ||||||
3576 | getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0, | ||||||
3577 | TTI::TargetCostKind::TCK_RecipThroughput); | ||||||
3578 | } | ||||||
3579 | // Insert - store vector to stack, store scalar, load vector. | ||||||
3580 | if (Opcode == Instruction::InsertElement) { | ||||||
3581 | return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, | ||||||
3582 | TTI::TargetCostKind::TCK_RecipThroughput) + | ||||||
3583 | getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0, | ||||||
3584 | TTI::TargetCostKind::TCK_RecipThroughput) + | ||||||
3585 | getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, | ||||||
3586 | TTI::TargetCostKind::TCK_RecipThroughput); | ||||||
3587 | } | ||||||
3588 | } | ||||||
3589 | |||||||
3590 | if (Index != -1U && (Opcode
| ||||||
3591 | Opcode == Instruction::InsertElement)) { | ||||||
3592 | // Legalize the type. | ||||||
3593 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Val); | ||||||
3594 | |||||||
3595 | // This type is legalized to a scalar type. | ||||||
3596 | if (!LT.second.isVector()) | ||||||
3597 | return 0; | ||||||
3598 | |||||||
3599 | // The type may be split. Normalize the index to the new type. | ||||||
3600 | unsigned NumElts = LT.second.getVectorNumElements(); | ||||||
3601 | unsigned SubNumElts = NumElts; | ||||||
3602 | Index = Index % NumElts; | ||||||
3603 | |||||||
3604 | // For >128-bit vectors, we need to extract higher 128-bit subvectors. | ||||||
3605 | // For inserts, we also need to insert the subvector back. | ||||||
3606 | if (LT.second.getSizeInBits() > 128) { | ||||||
3607 | assert((LT.second.getSizeInBits() % 128) == 0 && "Illegal vector")(static_cast <bool> ((LT.second.getSizeInBits() % 128) == 0 && "Illegal vector") ? void (0) : __assert_fail ("(LT.second.getSizeInBits() % 128) == 0 && \"Illegal vector\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 3607, __extension__ __PRETTY_FUNCTION__)); | ||||||
3608 | unsigned NumSubVecs = LT.second.getSizeInBits() / 128; | ||||||
3609 | SubNumElts = NumElts / NumSubVecs; | ||||||
3610 | if (SubNumElts <= Index) { | ||||||
3611 | RegisterFileMoveCost += (Opcode
| ||||||
3612 | Index %= SubNumElts; | ||||||
| |||||||
3613 | } | ||||||
3614 | } | ||||||
3615 | |||||||
3616 | if (Index == 0) { | ||||||
3617 | // Floating point scalars are already located in index #0. | ||||||
3618 | // Many insertions to #0 can fold away for scalar fp-ops, so let's assume | ||||||
3619 | // true for all. | ||||||
3620 | if (ScalarType->isFloatingPointTy()) | ||||||
3621 | return RegisterFileMoveCost; | ||||||
3622 | |||||||
3623 | // Assume movd/movq XMM -> GPR is relatively cheap on all targets. | ||||||
3624 | if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement) | ||||||
3625 | return 1 + RegisterFileMoveCost; | ||||||
3626 | } | ||||||
3627 | |||||||
3628 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | ||||||
3629 | assert(ISD && "Unexpected vector opcode")(static_cast <bool> (ISD && "Unexpected vector opcode" ) ? void (0) : __assert_fail ("ISD && \"Unexpected vector opcode\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 3629, __extension__ __PRETTY_FUNCTION__)); | ||||||
3630 | MVT MScalarTy = LT.second.getScalarType(); | ||||||
3631 | if (ST->useSLMArithCosts()) | ||||||
3632 | if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy)) | ||||||
3633 | return Entry->Cost + RegisterFileMoveCost; | ||||||
3634 | |||||||
3635 | // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets. | ||||||
3636 | if ((MScalarTy == MVT::i16 && ST->hasSSE2()) || | ||||||
3637 | (MScalarTy.isInteger() && ST->hasSSE41())) | ||||||
3638 | return 1 + RegisterFileMoveCost; | ||||||
3639 | |||||||
3640 | // Assume insertps is relatively cheap on all targets. | ||||||
3641 | if (MScalarTy == MVT::f32 && ST->hasSSE41() && | ||||||
3642 | Opcode == Instruction::InsertElement) | ||||||
3643 | return 1 + RegisterFileMoveCost; | ||||||
3644 | |||||||
3645 | // For extractions we just need to shuffle the element to index 0, which | ||||||
3646 | // should be very cheap (assume cost = 1). For insertions we need to shuffle | ||||||
3647 | // the elements to its destination. In both cases we must handle the | ||||||
3648 | // subvector move(s). | ||||||
3649 | // If the vector type is already less than 128-bits then don't reduce it. | ||||||
3650 | // TODO: Under what circumstances should we shuffle using the full width? | ||||||
3651 | InstructionCost ShuffleCost = 1; | ||||||
3652 | if (Opcode == Instruction::InsertElement) { | ||||||
3653 | auto *SubTy = cast<VectorType>(Val); | ||||||
3654 | EVT VT = TLI->getValueType(DL, Val); | ||||||
3655 | if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128) | ||||||
3656 | SubTy = FixedVectorType::get(ScalarType, SubNumElts); | ||||||
3657 | ShuffleCost = | ||||||
3658 | getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, None, 0, SubTy); | ||||||
3659 | } | ||||||
3660 | int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1; | ||||||
3661 | return ShuffleCost + IntOrFpCost + RegisterFileMoveCost; | ||||||
3662 | } | ||||||
3663 | |||||||
3664 | // Add to the base cost if we know that the extracted element of a vector is | ||||||
3665 | // destined to be moved to and used in the integer register file. | ||||||
3666 | if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy()) | ||||||
3667 | RegisterFileMoveCost += 1; | ||||||
3668 | |||||||
3669 | return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost; | ||||||
3670 | } | ||||||
3671 | |||||||
3672 | InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty, | ||||||
3673 | const APInt &DemandedElts, | ||||||
3674 | bool Insert, | ||||||
3675 | bool Extract) { | ||||||
3676 | InstructionCost Cost = 0; | ||||||
3677 | |||||||
3678 | // For insertions, a ISD::BUILD_VECTOR style vector initialization can be much | ||||||
3679 | // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT. | ||||||
3680 | if (Insert) { | ||||||
3681 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); | ||||||
3682 | MVT MScalarTy = LT.second.getScalarType(); | ||||||
3683 | |||||||
3684 | if ((MScalarTy == MVT::i16 && ST->hasSSE2()) || | ||||||
3685 | (MScalarTy.isInteger() && ST->hasSSE41()) || | ||||||
3686 | (MScalarTy == MVT::f32 && ST->hasSSE41())) { | ||||||
3687 | // For types we can insert directly, insertion into 128-bit sub vectors is | ||||||
3688 | // cheap, followed by a cheap chain of concatenations. | ||||||
3689 | if (LT.second.getSizeInBits() <= 128) { | ||||||
3690 | Cost += | ||||||
3691 | BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, false); | ||||||
3692 | } else { | ||||||
3693 | // In each 128-lane, if at least one index is demanded but not all | ||||||
3694 | // indices are demanded and this 128-lane is not the first 128-lane of | ||||||
3695 | // the legalized-vector, then this 128-lane needs a extracti128; If in | ||||||
3696 | // each 128-lane, there is at least one demanded index, this 128-lane | ||||||
3697 | // needs a inserti128. | ||||||
3698 | |||||||
3699 | // The following cases will help you build a better understanding: | ||||||
3700 | // Assume we insert several elements into a v8i32 vector in avx2, | ||||||
3701 | // Case#1: inserting into 1th index needs vpinsrd + inserti128. | ||||||
3702 | // Case#2: inserting into 5th index needs extracti128 + vpinsrd + | ||||||
3703 | // inserti128. | ||||||
3704 | // Case#3: inserting into 4,5,6,7 index needs 4*vpinsrd + inserti128. | ||||||
3705 | const int CostValue = *LT.first.getValue(); | ||||||
3706 | assert(CostValue >= 0 && "Negative cost!")(static_cast <bool> (CostValue >= 0 && "Negative cost!" ) ? void (0) : __assert_fail ("CostValue >= 0 && \"Negative cost!\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 3706, __extension__ __PRETTY_FUNCTION__)); | ||||||
3707 | unsigned Num128Lanes = LT.second.getSizeInBits() / 128 * CostValue; | ||||||
3708 | unsigned NumElts = LT.second.getVectorNumElements() * CostValue; | ||||||
3709 | APInt WidenedDemandedElts = DemandedElts.zextOrSelf(NumElts); | ||||||
3710 | unsigned Scale = NumElts / Num128Lanes; | ||||||
3711 | // We iterate each 128-lane, and check if we need a | ||||||
3712 | // extracti128/inserti128 for this 128-lane. | ||||||
3713 | for (unsigned I = 0; I < NumElts; I += Scale) { | ||||||
3714 | APInt Mask = WidenedDemandedElts.getBitsSet(NumElts, I, I + Scale); | ||||||
3715 | APInt MaskedDE = Mask & WidenedDemandedElts; | ||||||
3716 | unsigned Population = MaskedDE.countPopulation(); | ||||||
3717 | Cost += (Population > 0 && Population != Scale && | ||||||
3718 | I % LT.second.getVectorNumElements() != 0); | ||||||
3719 | Cost += Population > 0; | ||||||
3720 | } | ||||||
3721 | Cost += DemandedElts.countPopulation(); | ||||||
3722 | |||||||
3723 | // For vXf32 cases, insertion into the 0'th index in each v4f32 | ||||||
3724 | // 128-bit vector is free. | ||||||
3725 | // NOTE: This assumes legalization widens vXf32 vectors. | ||||||
3726 | if (MScalarTy == MVT::f32) | ||||||
3727 | for (unsigned i = 0, e = cast<FixedVectorType>(Ty)->getNumElements(); | ||||||
3728 | i < e; i += 4) | ||||||
3729 | if (DemandedElts[i]) | ||||||
3730 | Cost--; | ||||||
3731 | } | ||||||
3732 | } else if (LT.second.isVector()) { | ||||||
3733 | // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded | ||||||
3734 | // integer element as a SCALAR_TO_VECTOR, then we build the vector as a | ||||||
3735 | // series of UNPCK followed by CONCAT_VECTORS - all of these can be | ||||||
3736 | // considered cheap. | ||||||
3737 | if (Ty->isIntOrIntVectorTy()) | ||||||
3738 | Cost += DemandedElts.countPopulation(); | ||||||
3739 | |||||||
3740 | // Get the smaller of the legalized or original pow2-extended number of | ||||||
3741 | // vector elements, which represents the number of unpacks we'll end up | ||||||
3742 | // performing. | ||||||
3743 | unsigned NumElts = LT.second.getVectorNumElements(); | ||||||
3744 | unsigned Pow2Elts = | ||||||
3745 | PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements()); | ||||||
3746 | Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first; | ||||||
3747 | } | ||||||
3748 | } | ||||||
3749 | |||||||
3750 | // TODO: Use default extraction for now, but we should investigate extending this | ||||||
3751 | // to handle repeated subvector extraction. | ||||||
3752 | if (Extract) | ||||||
3753 | Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, false, Extract); | ||||||
3754 | |||||||
3755 | return Cost; | ||||||
3756 | } | ||||||
3757 | |||||||
3758 | InstructionCost | ||||||
3759 | X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, | ||||||
3760 | int VF, const APInt &DemandedDstElts, | ||||||
3761 | TTI::TargetCostKind CostKind) { | ||||||
3762 | const unsigned EltTyBits = DL.getTypeSizeInBits(EltTy); | ||||||
3763 | // We don't differentiate element types here, only element bit width. | ||||||
3764 | EltTy = IntegerType::getIntNTy(EltTy->getContext(), EltTyBits); | ||||||
3765 | |||||||
3766 | auto bailout = [&]() { | ||||||
3767 | return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF, | ||||||
3768 | DemandedDstElts, CostKind); | ||||||
3769 | }; | ||||||
3770 | |||||||
3771 | // For now, only deal with AVX512 cases. | ||||||
3772 | if (!ST->hasAVX512()) | ||||||
3773 | return bailout(); | ||||||
3774 | |||||||
3775 | // Do we have a native shuffle for this element type, or should we promote? | ||||||
3776 | unsigned PromEltTyBits = EltTyBits; | ||||||
3777 | switch (EltTyBits) { | ||||||
3778 | case 32: | ||||||
3779 | case 64: | ||||||
3780 | break; // AVX512F. | ||||||
3781 | case 16: | ||||||
3782 | if (!ST->hasBWI()) | ||||||
3783 | PromEltTyBits = 32; // promote to i32, AVX512F. | ||||||
3784 | break; // AVX512BW | ||||||
3785 | case 8: | ||||||
3786 | if (!ST->hasVBMI()) | ||||||
3787 | PromEltTyBits = 32; // promote to i32, AVX512F. | ||||||
3788 | break; // AVX512VBMI | ||||||
3789 | case 1: | ||||||
3790 | // There is no support for shuffling i1 elements. We *must* promote. | ||||||
3791 | if (ST->hasBWI()) { | ||||||
3792 | if (ST->hasVBMI()) | ||||||
3793 | PromEltTyBits = 8; // promote to i8, AVX512VBMI. | ||||||
3794 | else | ||||||
3795 | PromEltTyBits = 16; // promote to i16, AVX512BW. | ||||||
3796 | break; | ||||||
3797 | } | ||||||
3798 | if (ST->hasDQI()) { | ||||||
3799 | PromEltTyBits = 32; // promote to i32, AVX512F. | ||||||
3800 | break; | ||||||
3801 | } | ||||||
3802 | return bailout(); | ||||||
3803 | default: | ||||||
3804 | return bailout(); | ||||||
3805 | } | ||||||
3806 | auto *PromEltTy = IntegerType::getIntNTy(EltTy->getContext(), PromEltTyBits); | ||||||
3807 | |||||||
3808 | auto *SrcVecTy = FixedVectorType::get(EltTy, VF); | ||||||
3809 | auto *PromSrcVecTy = FixedVectorType::get(PromEltTy, VF); | ||||||
3810 | |||||||
3811 | int NumDstElements = VF * ReplicationFactor; | ||||||
3812 | auto *PromDstVecTy = FixedVectorType::get(PromEltTy, NumDstElements); | ||||||
3813 | auto *DstVecTy = FixedVectorType::get(EltTy, NumDstElements); | ||||||
3814 | |||||||
3815 | // Legalize the types. | ||||||
3816 | MVT LegalSrcVecTy = TLI->getTypeLegalizationCost(DL, SrcVecTy).second; | ||||||
3817 | MVT LegalPromSrcVecTy = TLI->getTypeLegalizationCost(DL, PromSrcVecTy).second; | ||||||
3818 | MVT LegalPromDstVecTy = TLI->getTypeLegalizationCost(DL, PromDstVecTy).second; | ||||||
3819 | MVT LegalDstVecTy = TLI->getTypeLegalizationCost(DL, DstVecTy).second; | ||||||
3820 | // They should have legalized into vector types. | ||||||
3821 | if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() || | ||||||
3822 | !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector()) | ||||||
3823 | return bailout(); | ||||||
3824 | |||||||
3825 | if (PromEltTyBits != EltTyBits) { | ||||||
3826 | // If we have to perform the shuffle with wider elt type than our data type, | ||||||
3827 | // then we will first need to anyext (we don't care about the new bits) | ||||||
3828 | // the source elements, and then truncate Dst elements. | ||||||
3829 | InstructionCost PromotionCost; | ||||||
3830 | PromotionCost += getCastInstrCost( | ||||||
3831 | Instruction::SExt, /*Dst=*/PromSrcVecTy, /*Src=*/SrcVecTy, | ||||||
3832 | TargetTransformInfo::CastContextHint::None, CostKind); | ||||||
3833 | PromotionCost += | ||||||
3834 | getCastInstrCost(Instruction::Trunc, /*Dst=*/DstVecTy, | ||||||
3835 | /*Src=*/PromDstVecTy, | ||||||
3836 | TargetTransformInfo::CastContextHint::None, CostKind); | ||||||
3837 | return PromotionCost + getReplicationShuffleCost(PromEltTy, | ||||||
3838 | ReplicationFactor, VF, | ||||||
3839 | DemandedDstElts, CostKind); | ||||||
3840 | } | ||||||
3841 | |||||||
3842 | assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits &&(static_cast <bool> (LegalSrcVecTy.getScalarSizeInBits( ) == EltTyBits && LegalSrcVecTy.getScalarType() == LegalDstVecTy .getScalarType() && "We expect that the legalization doesn't affect the element width, " "doesn't coalesce/split elements.") ? void (0) : __assert_fail ("LegalSrcVecTy.getScalarSizeInBits() == EltTyBits && LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() && \"We expect that the legalization doesn't affect the element width, \" \"doesn't coalesce/split elements.\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 3845, __extension__ __PRETTY_FUNCTION__)) | ||||||
3843 | LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() &&(static_cast <bool> (LegalSrcVecTy.getScalarSizeInBits( ) == EltTyBits && LegalSrcVecTy.getScalarType() == LegalDstVecTy .getScalarType() && "We expect that the legalization doesn't affect the element width, " "doesn't coalesce/split elements.") ? void (0) : __assert_fail ("LegalSrcVecTy.getScalarSizeInBits() == EltTyBits && LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() && \"We expect that the legalization doesn't affect the element width, \" \"doesn't coalesce/split elements.\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 3845, __extension__ __PRETTY_FUNCTION__)) | ||||||
3844 | "We expect that the legalization doesn't affect the element width, "(static_cast <bool> (LegalSrcVecTy.getScalarSizeInBits( ) == EltTyBits && LegalSrcVecTy.getScalarType() == LegalDstVecTy .getScalarType() && "We expect that the legalization doesn't affect the element width, " "doesn't coalesce/split elements.") ? void (0) : __assert_fail ("LegalSrcVecTy.getScalarSizeInBits() == EltTyBits && LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() && \"We expect that the legalization doesn't affect the element width, \" \"doesn't coalesce/split elements.\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 3845, __extension__ __PRETTY_FUNCTION__)) | ||||||
3845 | "doesn't coalesce/split elements.")(static_cast <bool> (LegalSrcVecTy.getScalarSizeInBits( ) == EltTyBits && LegalSrcVecTy.getScalarType() == LegalDstVecTy .getScalarType() && "We expect that the legalization doesn't affect the element width, " "doesn't coalesce/split elements.") ? void (0) : __assert_fail ("LegalSrcVecTy.getScalarSizeInBits() == EltTyBits && LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() && \"We expect that the legalization doesn't affect the element width, \" \"doesn't coalesce/split elements.\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 3845, __extension__ __PRETTY_FUNCTION__)); | ||||||
3846 | |||||||
3847 | unsigned NumEltsPerDstVec = LegalDstVecTy.getVectorNumElements(); | ||||||
3848 | unsigned NumDstVectors = | ||||||
3849 | divideCeil(DstVecTy->getNumElements(), NumEltsPerDstVec); | ||||||
3850 | |||||||
3851 | auto *SingleDstVecTy = FixedVectorType::get(EltTy, NumEltsPerDstVec); | ||||||
3852 | |||||||
3853 | // Not all the produced Dst elements may be demanded. In our case, | ||||||
3854 | // given that a single Dst vector is formed by a single shuffle, | ||||||
3855 | // if all elements that will form a single Dst vector aren't demanded, | ||||||
3856 | // then we won't need to do that shuffle, so adjust the cost accordingly. | ||||||
3857 | APInt DemandedDstVectors = APIntOps::ScaleBitMask( | ||||||
3858 | DemandedDstElts.zextOrSelf(NumDstVectors * NumEltsPerDstVec), | ||||||
3859 | NumDstVectors); | ||||||
3860 | unsigned NumDstVectorsDemanded = DemandedDstVectors.countPopulation(); | ||||||
3861 | |||||||
3862 | InstructionCost SingleShuffleCost = | ||||||
3863 | getShuffleCost(TTI::SK_PermuteSingleSrc, SingleDstVecTy, | ||||||
3864 | /*Mask=*/None, /*Index=*/0, /*SubTp=*/nullptr); | ||||||
3865 | return NumDstVectorsDemanded * SingleShuffleCost; | ||||||
3866 | } | ||||||
3867 | |||||||
3868 | InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, | ||||||
3869 | MaybeAlign Alignment, | ||||||
3870 | unsigned AddressSpace, | ||||||
3871 | TTI::TargetCostKind CostKind, | ||||||
3872 | const Instruction *I) { | ||||||
3873 | // TODO: Handle other cost kinds. | ||||||
3874 | if (CostKind != TTI::TCK_RecipThroughput) { | ||||||
3875 | if (auto *SI = dyn_cast_or_null<StoreInst>(I)) { | ||||||
3876 | // Store instruction with index and scale costs 2 Uops. | ||||||
3877 | // Check the preceding GEP to identify non-const indices. | ||||||
3878 | if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) { | ||||||
3879 | if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); })) | ||||||
3880 | return TTI::TCC_Basic * 2; | ||||||
3881 | } | ||||||
3882 | } | ||||||
3883 | return TTI::TCC_Basic; | ||||||
3884 | } | ||||||
3885 | |||||||
3886 | assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&(static_cast <bool> ((Opcode == Instruction::Load || Opcode == Instruction::Store) && "Invalid Opcode") ? void ( 0) : __assert_fail ("(Opcode == Instruction::Load || Opcode == Instruction::Store) && \"Invalid Opcode\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 3887, __extension__ __PRETTY_FUNCTION__)) | ||||||
3887 | "Invalid Opcode")(static_cast <bool> ((Opcode == Instruction::Load || Opcode == Instruction::Store) && "Invalid Opcode") ? void ( 0) : __assert_fail ("(Opcode == Instruction::Load || Opcode == Instruction::Store) && \"Invalid Opcode\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 3887, __extension__ __PRETTY_FUNCTION__)); | ||||||
3888 | // Type legalization can't handle structs | ||||||
3889 | if (TLI->getValueType(DL, Src, true) == MVT::Other) | ||||||
3890 | return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, | ||||||
3891 | CostKind); | ||||||
3892 | |||||||
3893 | // Legalize the type. | ||||||
3894 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); | ||||||
3895 | |||||||
3896 | auto *VTy = dyn_cast<FixedVectorType>(Src); | ||||||
3897 | |||||||
3898 | // Handle the simple case of non-vectors. | ||||||
3899 | // NOTE: this assumes that legalization never creates vector from scalars! | ||||||
3900 | if (!VTy || !LT.second.isVector()) | ||||||
3901 | // Each load/store unit costs 1. | ||||||
3902 | return LT.first * 1; | ||||||
3903 | |||||||
3904 | bool IsLoad = Opcode == Instruction::Load; | ||||||
3905 | |||||||
3906 | Type *EltTy = VTy->getElementType(); | ||||||
3907 | |||||||
3908 | const int EltTyBits = DL.getTypeSizeInBits(EltTy); | ||||||
3909 | |||||||
3910 | InstructionCost Cost = 0; | ||||||
3911 | |||||||
3912 | // Source of truth: how many elements were there in the original IR vector? | ||||||
3913 | const unsigned SrcNumElt = VTy->getNumElements(); | ||||||
3914 | |||||||
3915 | // How far have we gotten? | ||||||
3916 | int NumEltRemaining = SrcNumElt; | ||||||
3917 | // Note that we intentionally capture by-reference, NumEltRemaining changes. | ||||||
3918 | auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; }; | ||||||
3919 | |||||||
3920 | const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8); | ||||||
3921 | |||||||
3922 | // Note that even if we can store 64 bits of an XMM, we still operate on XMM. | ||||||
3923 | const unsigned XMMBits = 128; | ||||||
3924 | if (XMMBits % EltTyBits != 0) | ||||||
3925 | // Vector size must be a multiple of the element size. I.e. no padding. | ||||||
3926 | return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, | ||||||
3927 | CostKind); | ||||||
3928 | const int NumEltPerXMM = XMMBits / EltTyBits; | ||||||
3929 | |||||||
3930 | auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM); | ||||||
3931 | |||||||
3932 | for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0; | ||||||
3933 | NumEltRemaining > 0; CurrOpSizeBytes /= 2) { | ||||||
3934 | // How many elements would a single op deal with at once? | ||||||
3935 | if ((8 * CurrOpSizeBytes) % EltTyBits != 0) | ||||||
3936 | // Vector size must be a multiple of the element size. I.e. no padding. | ||||||
3937 | return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, | ||||||
3938 | CostKind); | ||||||
3939 | int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits; | ||||||
3940 | |||||||
3941 | assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?")(static_cast <bool> (CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?") ? void (0) : __assert_fail ("CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && \"How'd we get here?\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 3941, __extension__ __PRETTY_FUNCTION__)); | ||||||
3942 | assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||(static_cast <bool> ((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) || (CurrOpSizeBytes == MaxLegalOpSizeBytes )) && "Unless we haven't halved the op size yet, " "we have less than two op's sized units of work left." ) ? void (0) : __assert_fail ("(((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) || (CurrOpSizeBytes == MaxLegalOpSizeBytes)) && \"Unless we haven't halved the op size yet, \" \"we have less than two op's sized units of work left.\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 3945, __extension__ __PRETTY_FUNCTION__)) | ||||||
3943 | (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&(static_cast <bool> ((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) || (CurrOpSizeBytes == MaxLegalOpSizeBytes )) && "Unless we haven't halved the op size yet, " "we have less than two op's sized units of work left." ) ? void (0) : __assert_fail ("(((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) || (CurrOpSizeBytes == MaxLegalOpSizeBytes)) && \"Unless we haven't halved the op size yet, \" \"we have less than two op's sized units of work left.\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 3945, __extension__ __PRETTY_FUNCTION__)) | ||||||
3944 | "Unless we haven't halved the op size yet, "(static_cast <bool> ((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) || (CurrOpSizeBytes == MaxLegalOpSizeBytes )) && "Unless we haven't halved the op size yet, " "we have less than two op's sized units of work left." ) ? void (0) : __assert_fail ("(((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) || (CurrOpSizeBytes == MaxLegalOpSizeBytes)) && \"Unless we haven't halved the op size yet, \" \"we have less than two op's sized units of work left.\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 3945, __extension__ __PRETTY_FUNCTION__)) | ||||||
3945 | "we have less than two op's sized units of work left.")(static_cast <bool> ((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) || (CurrOpSizeBytes == MaxLegalOpSizeBytes )) && "Unless we haven't halved the op size yet, " "we have less than two op's sized units of work left." ) ? void (0) : __assert_fail ("(((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) || (CurrOpSizeBytes == MaxLegalOpSizeBytes)) && \"Unless we haven't halved the op size yet, \" \"we have less than two op's sized units of work left.\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 3945, __extension__ __PRETTY_FUNCTION__)); | ||||||
3946 | |||||||
3947 | auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM | ||||||
3948 | ? FixedVectorType::get(EltTy, CurrNumEltPerOp) | ||||||
3949 | : XMMVecTy; | ||||||
3950 | |||||||
3951 | assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&(static_cast <bool> (CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 && "After halving sizes, the vector elt count is no longer a multiple " "of number of elements per operation?") ? void (0) : __assert_fail ("CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 && \"After halving sizes, the vector elt count is no longer a multiple \" \"of number of elements per operation?\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 3953, __extension__ __PRETTY_FUNCTION__)) | ||||||
3952 | "After halving sizes, the vector elt count is no longer a multiple "(static_cast <bool> (CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 && "After halving sizes, the vector elt count is no longer a multiple " "of number of elements per operation?") ? void (0) : __assert_fail ("CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 && \"After halving sizes, the vector elt count is no longer a multiple \" \"of number of elements per operation?\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 3953, __extension__ __PRETTY_FUNCTION__)) | ||||||
3953 | "of number of elements per operation?")(static_cast <bool> (CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 && "After halving sizes, the vector elt count is no longer a multiple " "of number of elements per operation?") ? void (0) : __assert_fail ("CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 && \"After halving sizes, the vector elt count is no longer a multiple \" \"of number of elements per operation?\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 3953, __extension__ __PRETTY_FUNCTION__)); | ||||||
3954 | auto *CoalescedVecTy = | ||||||
3955 | CurrNumEltPerOp == 1 | ||||||
3956 | ? CurrVecTy | ||||||
3957 | : FixedVectorType::get( | ||||||
3958 | IntegerType::get(Src->getContext(), | ||||||
3959 | EltTyBits * CurrNumEltPerOp), | ||||||
3960 | CurrVecTy->getNumElements() / CurrNumEltPerOp); | ||||||
3961 | assert(DL.getTypeSizeInBits(CoalescedVecTy) ==(static_cast <bool> (DL.getTypeSizeInBits(CoalescedVecTy ) == DL.getTypeSizeInBits(CurrVecTy) && "coalesciing elements doesn't change vector width." ) ? void (0) : __assert_fail ("DL.getTypeSizeInBits(CoalescedVecTy) == DL.getTypeSizeInBits(CurrVecTy) && \"coalesciing elements doesn't change vector width.\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 3963, __extension__ __PRETTY_FUNCTION__)) | ||||||
3962 | DL.getTypeSizeInBits(CurrVecTy) &&(static_cast <bool> (DL.getTypeSizeInBits(CoalescedVecTy ) == DL.getTypeSizeInBits(CurrVecTy) && "coalesciing elements doesn't change vector width." ) ? void (0) : __assert_fail ("DL.getTypeSizeInBits(CoalescedVecTy) == DL.getTypeSizeInBits(CurrVecTy) && \"coalesciing elements doesn't change vector width.\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 3963, __extension__ __PRETTY_FUNCTION__)) | ||||||
3963 | "coalesciing elements doesn't change vector width.")(static_cast <bool> (DL.getTypeSizeInBits(CoalescedVecTy ) == DL.getTypeSizeInBits(CurrVecTy) && "coalesciing elements doesn't change vector width." ) ? void (0) : __assert_fail ("DL.getTypeSizeInBits(CoalescedVecTy) == DL.getTypeSizeInBits(CurrVecTy) && \"coalesciing elements doesn't change vector width.\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 3963, __extension__ __PRETTY_FUNCTION__)); | ||||||
3964 | |||||||
3965 | while (NumEltRemaining > 0) { | ||||||
3966 | assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumtion?")(static_cast <bool> (SubVecEltsLeft >= 0 && "Subreg element count overconsumtion?" ) ? void (0) : __assert_fail ("SubVecEltsLeft >= 0 && \"Subreg element count overconsumtion?\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 3966, __extension__ __PRETTY_FUNCTION__)); | ||||||
3967 | |||||||
3968 | // Can we use this vector size, as per the remaining element count? | ||||||
3969 | // Iff the vector is naturally aligned, we can do a wide load regardless. | ||||||
3970 | if (NumEltRemaining < CurrNumEltPerOp && | ||||||
3971 | (!IsLoad || Alignment.valueOrOne() < CurrOpSizeBytes) && | ||||||
3972 | CurrOpSizeBytes != 1) | ||||||
3973 | break; // Try smalled vector size. | ||||||
3974 | |||||||
3975 | bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0; | ||||||
3976 | |||||||
3977 | // If we have fully processed the previous reg, we need to replenish it. | ||||||
3978 | if (SubVecEltsLeft == 0) { | ||||||
3979 | SubVecEltsLeft += CurrVecTy->getNumElements(); | ||||||
3980 | // And that's free only for the 0'th subvector of a legalized vector. | ||||||
3981 | if (!Is0thSubVec) | ||||||
3982 | Cost += getShuffleCost(IsLoad ? TTI::ShuffleKind::SK_InsertSubvector | ||||||
3983 | : TTI::ShuffleKind::SK_ExtractSubvector, | ||||||
3984 | VTy, None, NumEltDone(), CurrVecTy); | ||||||
3985 | } | ||||||
3986 | |||||||
3987 | // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM, | ||||||
3988 | // for smaller widths (32/16/8) we have to insert/extract them separately. | ||||||
3989 | // Again, it's free for the 0'th subreg (if op is 32/64 bit wide, | ||||||
3990 | // but let's pretend that it is also true for 16/8 bit wide ops...) | ||||||
3991 | if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) { | ||||||
3992 | int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM; | ||||||
3993 | assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "")(static_cast <bool> (NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "") ? void (0) : __assert_fail ("NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && \"\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 3993, __extension__ __PRETTY_FUNCTION__)); | ||||||
3994 | int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp; | ||||||
3995 | APInt DemandedElts = | ||||||
3996 | APInt::getBitsSet(CoalescedVecTy->getNumElements(), | ||||||
3997 | CoalescedVecEltIdx, CoalescedVecEltIdx + 1); | ||||||
3998 | assert(DemandedElts.countPopulation() == 1 && "Inserting single value")(static_cast <bool> (DemandedElts.countPopulation() == 1 && "Inserting single value") ? void (0) : __assert_fail ("DemandedElts.countPopulation() == 1 && \"Inserting single value\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 3998, __extension__ __PRETTY_FUNCTION__)); | ||||||
3999 | Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad, | ||||||
4000 | !IsLoad); | ||||||
4001 | } | ||||||
4002 | |||||||
4003 | // This isn't exactly right. We're using slow unaligned 32-byte accesses | ||||||
4004 | // as a proxy for a double-pumped AVX memory interface such as on | ||||||
4005 | // Sandybridge. | ||||||
4006 | if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow()) | ||||||
4007 | Cost += 2; | ||||||
4008 | else | ||||||
4009 | Cost += 1; | ||||||
4010 | |||||||
4011 | SubVecEltsLeft -= CurrNumEltPerOp; | ||||||
4012 | NumEltRemaining -= CurrNumEltPerOp; | ||||||
4013 | Alignment = commonAlignment(Alignment.valueOrOne(), CurrOpSizeBytes); | ||||||
4014 | } | ||||||
4015 | } | ||||||
4016 | |||||||
4017 | assert(NumEltRemaining <= 0 && "Should have processed all the elements.")(static_cast <bool> (NumEltRemaining <= 0 && "Should have processed all the elements.") ? void (0) : __assert_fail ("NumEltRemaining <= 0 && \"Should have processed all the elements.\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4017, __extension__ __PRETTY_FUNCTION__)); | ||||||
4018 | |||||||
4019 | return Cost; | ||||||
4020 | } | ||||||
4021 | |||||||
4022 | InstructionCost | ||||||
4023 | X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment, | ||||||
4024 | unsigned AddressSpace, | ||||||
4025 | TTI::TargetCostKind CostKind) { | ||||||
4026 | bool IsLoad = (Instruction::Load == Opcode); | ||||||
4027 | bool IsStore = (Instruction::Store == Opcode); | ||||||
4028 | |||||||
4029 | auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy); | ||||||
4030 | if (!SrcVTy) | ||||||
4031 | // To calculate scalar take the regular cost, without mask | ||||||
4032 | return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind); | ||||||
4033 | |||||||
4034 | unsigned NumElem = SrcVTy->getNumElements(); | ||||||
4035 | auto *MaskTy = | ||||||
4036 | FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem); | ||||||
4037 | if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) || | ||||||
4038 | (IsStore && !isLegalMaskedStore(SrcVTy, Alignment))) { | ||||||
4039 | // Scalarization | ||||||
4040 | APInt DemandedElts = APInt::getAllOnes(NumElem); | ||||||
4041 | InstructionCost MaskSplitCost = | ||||||
4042 | getScalarizationOverhead(MaskTy, DemandedElts, false, true); | ||||||
4043 | InstructionCost ScalarCompareCost = getCmpSelInstrCost( | ||||||
4044 | Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr, | ||||||
4045 | CmpInst::BAD_ICMP_PREDICATE, CostKind); | ||||||
4046 | InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind); | ||||||
4047 | InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost); | ||||||
4048 | InstructionCost ValueSplitCost = | ||||||
4049 | getScalarizationOverhead(SrcVTy, DemandedElts, IsLoad, IsStore); | ||||||
4050 | InstructionCost MemopCost = | ||||||
4051 | NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(), | ||||||
4052 | Alignment, AddressSpace, CostKind); | ||||||
4053 | return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost; | ||||||
4054 | } | ||||||
4055 | |||||||
4056 | // Legalize the type. | ||||||
4057 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy); | ||||||
4058 | auto VT = TLI->getValueType(DL, SrcVTy); | ||||||
4059 | InstructionCost Cost = 0; | ||||||
4060 | if (VT.isSimple() && LT.second != VT.getSimpleVT() && | ||||||
4061 | LT.second.getVectorNumElements() == NumElem) | ||||||
4062 | // Promotion requires extend/truncate for data and a shuffle for mask. | ||||||
4063 | Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, None, 0, nullptr) + | ||||||
4064 | getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, None, 0, nullptr); | ||||||
4065 | |||||||
4066 | else if (LT.first * LT.second.getVectorNumElements() > NumElem) { | ||||||
4067 | auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(), | ||||||
4068 | LT.second.getVectorNumElements()); | ||||||
4069 | // Expanding requires fill mask with zeroes | ||||||
4070 | Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, None, 0, MaskTy); | ||||||
4071 | } | ||||||
4072 | |||||||
4073 | // Pre-AVX512 - each maskmov load costs 2 + store costs ~8. | ||||||
4074 | if (!ST->hasAVX512()) | ||||||
4075 | return Cost + LT.first * (IsLoad ? 2 : 8); | ||||||
4076 | |||||||
4077 | // AVX-512 masked load/store is cheapper | ||||||
4078 | return Cost + LT.first; | ||||||
4079 | } | ||||||
4080 | |||||||
4081 | InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty, | ||||||
4082 | ScalarEvolution *SE, | ||||||
4083 | const SCEV *Ptr) { | ||||||
4084 | // Address computations in vectorized code with non-consecutive addresses will | ||||||
4085 | // likely result in more instructions compared to scalar code where the | ||||||
4086 | // computation can more often be merged into the index mode. The resulting | ||||||
4087 | // extra micro-ops can significantly decrease throughput. | ||||||
4088 | const unsigned NumVectorInstToHideOverhead = 10; | ||||||
4089 | |||||||
4090 | // Cost modeling of Strided Access Computation is hidden by the indexing | ||||||
4091 | // modes of X86 regardless of the stride value. We dont believe that there | ||||||
4092 | // is a difference between constant strided access in gerenal and constant | ||||||
4093 | // strided value which is less than or equal to 64. | ||||||
4094 | // Even in the case of (loop invariant) stride whose value is not known at | ||||||
4095 | // compile time, the address computation will not incur more than one extra | ||||||
4096 | // ADD instruction. | ||||||
4097 | if (Ty->isVectorTy() && SE && !ST->hasAVX2()) { | ||||||
4098 | // TODO: AVX2 is the current cut-off because we don't have correct | ||||||
4099 | // interleaving costs for prior ISA's. | ||||||
4100 | if (!BaseT::isStridedAccess(Ptr)) | ||||||
4101 | return NumVectorInstToHideOverhead; | ||||||
4102 | if (!BaseT::getConstantStrideStep(SE, Ptr)) | ||||||
4103 | return 1; | ||||||
4104 | } | ||||||
4105 | |||||||
4106 | return BaseT::getAddressComputationCost(Ty, SE, Ptr); | ||||||
4107 | } | ||||||
4108 | |||||||
4109 | InstructionCost | ||||||
4110 | X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, | ||||||
4111 | Optional<FastMathFlags> FMF, | ||||||
4112 | TTI::TargetCostKind CostKind) { | ||||||
4113 | if (TTI::requiresOrderedReduction(FMF)) | ||||||
| |||||||
4114 | return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); | ||||||
4115 | |||||||
4116 | // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput | ||||||
4117 | // and make it as the cost. | ||||||
4118 | |||||||
4119 | static const CostTblEntry SLMCostTblNoPairWise[] = { | ||||||
4120 | { ISD::FADD, MVT::v2f64, 3 }, | ||||||
4121 | { ISD::ADD, MVT::v2i64, 5 }, | ||||||
4122 | }; | ||||||
4123 | |||||||
4124 | static const CostTblEntry SSE2CostTblNoPairWise[] = { | ||||||
4125 | { ISD::FADD, MVT::v2f64, 2 }, | ||||||
4126 | { ISD::FADD, MVT::v2f32, 2 }, | ||||||
4127 | { ISD::FADD, MVT::v4f32, 4 }, | ||||||
4128 | { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6". | ||||||
4129 | { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32 | ||||||
4130 | { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3". | ||||||
4131 | { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3". | ||||||
4132 | { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3". | ||||||
4133 | { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3". | ||||||
4134 | { ISD::ADD, MVT::v2i8, 2 }, | ||||||
4135 | { ISD::ADD, MVT::v4i8, 2 }, | ||||||
4136 | { ISD::ADD, MVT::v8i8, 2 }, | ||||||
4137 | { ISD::ADD, MVT::v16i8, 3 }, | ||||||
4138 | }; | ||||||
4139 | |||||||
4140 | static const CostTblEntry AVX1CostTblNoPairWise[] = { | ||||||
4141 | { ISD::FADD, MVT::v4f64, 3 }, | ||||||
4142 | { ISD::FADD, MVT::v4f32, 3 }, | ||||||
4143 | { ISD::FADD, MVT::v8f32, 4 }, | ||||||
4144 | { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5". | ||||||
4145 | { ISD::ADD, MVT::v4i64, 3 }, | ||||||
4146 | { ISD::ADD, MVT::v8i32, 5 }, | ||||||
4147 | { ISD::ADD, MVT::v16i16, 5 }, | ||||||
4148 | { ISD::ADD, MVT::v32i8, 4 }, | ||||||
4149 | }; | ||||||
4150 | |||||||
4151 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | ||||||
4152 | assert(ISD && "Invalid opcode")(static_cast <bool> (ISD && "Invalid opcode") ? void (0) : __assert_fail ("ISD && \"Invalid opcode\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4152, __extension__ __PRETTY_FUNCTION__)); | ||||||
4153 | |||||||
4154 | // Before legalizing the type, give a chance to look up illegal narrow types | ||||||
4155 | // in the table. | ||||||
4156 | // FIXME: Is there a better way to do this? | ||||||
4157 | EVT VT = TLI->getValueType(DL, ValTy); | ||||||
4158 | if (VT.isSimple()) { | ||||||
4159 | MVT MTy = VT.getSimpleVT(); | ||||||
4160 | if (ST->useSLMArithCosts()) | ||||||
4161 | if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy)) | ||||||
4162 | return Entry->Cost; | ||||||
4163 | |||||||
4164 | if (ST->hasAVX()) | ||||||
4165 | if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) | ||||||
4166 | return Entry->Cost; | ||||||
4167 | |||||||
4168 | if (ST->hasSSE2()) | ||||||
4169 | if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy)) | ||||||
4170 | return Entry->Cost; | ||||||
4171 | } | ||||||
4172 | |||||||
4173 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); | ||||||
4174 | |||||||
4175 | MVT MTy = LT.second; | ||||||
4176 | |||||||
4177 | auto *ValVTy = cast<FixedVectorType>(ValTy); | ||||||
4178 | |||||||
4179 | // Special case: vXi8 mul reductions are performed as vXi16. | ||||||
4180 | if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) { | ||||||
4181 | auto *WideSclTy = IntegerType::get(ValVTy->getContext(), 16); | ||||||
4182 | auto *WideVecTy = FixedVectorType::get(WideSclTy, ValVTy->getNumElements()); | ||||||
4183 | return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy, | ||||||
4184 | TargetTransformInfo::CastContextHint::None, | ||||||
4185 | CostKind) + | ||||||
4186 | getArithmeticReductionCost(Opcode, WideVecTy, FMF, CostKind); | ||||||
4187 | } | ||||||
4188 | |||||||
4189 | InstructionCost ArithmeticCost = 0; | ||||||
4190 | if (LT.first != 1 && MTy.isVector() && | ||||||
4191 | MTy.getVectorNumElements() < ValVTy->getNumElements()) { | ||||||
4192 | // Type needs to be split. We need LT.first - 1 arithmetic ops. | ||||||
4193 | auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(), | ||||||
4194 | MTy.getVectorNumElements()); | ||||||
4195 | ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind); | ||||||
4196 | ArithmeticCost *= LT.first - 1; | ||||||
4197 | } | ||||||
4198 | |||||||
4199 | if (ST->useSLMArithCosts()) | ||||||
4200 | if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy)) | ||||||
4201 | return ArithmeticCost + Entry->Cost; | ||||||
4202 | |||||||
4203 | if (ST->hasAVX()) | ||||||
4204 | if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) | ||||||
4205 | return ArithmeticCost + Entry->Cost; | ||||||
4206 | |||||||
4207 | if (ST->hasSSE2()) | ||||||
4208 | if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy)) | ||||||
4209 | return ArithmeticCost + Entry->Cost; | ||||||
4210 | |||||||
4211 | // FIXME: These assume a naive kshift+binop lowering, which is probably | ||||||
4212 | // conservative in most cases. | ||||||
4213 | static const CostTblEntry AVX512BoolReduction[] = { | ||||||
4214 | { ISD::AND, MVT::v2i1, 3 }, | ||||||
4215 | { ISD::AND, MVT::v4i1, 5 }, | ||||||
4216 | { ISD::AND, MVT::v8i1, 7 }, | ||||||
4217 | { ISD::AND, MVT::v16i1, 9 }, | ||||||
4218 | { ISD::AND, MVT::v32i1, 11 }, | ||||||
4219 | { ISD::AND, MVT::v64i1, 13 }, | ||||||
4220 | { ISD::OR, MVT::v2i1, 3 }, | ||||||
4221 | { ISD::OR, MVT::v4i1, 5 }, | ||||||
4222 | { ISD::OR, MVT::v8i1, 7 }, | ||||||
4223 | { ISD::OR, MVT::v16i1, 9 }, | ||||||
4224 | { ISD::OR, MVT::v32i1, 11 }, | ||||||
4225 | { ISD::OR, MVT::v64i1, 13 }, | ||||||
4226 | }; | ||||||
4227 | |||||||
4228 | static const CostTblEntry AVX2BoolReduction[] = { | ||||||
4229 | { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp | ||||||
4230 | { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp | ||||||
4231 | { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp | ||||||
4232 | { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp | ||||||
4233 | }; | ||||||
4234 | |||||||
4235 | static const CostTblEntry AVX1BoolReduction[] = { | ||||||
4236 | { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp | ||||||
4237 | { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp | ||||||
4238 | { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp | ||||||
4239 | { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp | ||||||
4240 | { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp | ||||||
4241 | { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp | ||||||
4242 | { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp | ||||||
4243 | { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp | ||||||
4244 | }; | ||||||
4245 | |||||||
4246 | static const CostTblEntry SSE2BoolReduction[] = { | ||||||
4247 | { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp | ||||||
4248 | { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp | ||||||
4249 | { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp | ||||||
4250 | { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp | ||||||
4251 | { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp | ||||||
4252 | { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp | ||||||
4253 | { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp | ||||||
4254 | { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp | ||||||
4255 | }; | ||||||
4256 | |||||||
4257 | // Handle bool allof/anyof patterns. | ||||||
4258 | if (ValVTy->getElementType()->isIntegerTy(1)) { | ||||||
4259 | InstructionCost ArithmeticCost = 0; | ||||||
4260 | if (LT.first != 1 && MTy.isVector() && | ||||||
4261 | MTy.getVectorNumElements() < ValVTy->getNumElements()) { | ||||||
4262 | // Type needs to be split. We need LT.first - 1 arithmetic ops. | ||||||
4263 | auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(), | ||||||
4264 | MTy.getVectorNumElements()); | ||||||
4265 | ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind); | ||||||
4266 | ArithmeticCost *= LT.first - 1; | ||||||
4267 | } | ||||||
4268 | |||||||
4269 | if (ST->hasAVX512()) | ||||||
4270 | if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy)) | ||||||
4271 | return ArithmeticCost + Entry->Cost; | ||||||
4272 | if (ST->hasAVX2()) | ||||||
4273 | if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy)) | ||||||
4274 | return ArithmeticCost + Entry->Cost; | ||||||
4275 | if (ST->hasAVX()) | ||||||
4276 | if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy)) | ||||||
4277 | return ArithmeticCost + Entry->Cost; | ||||||
4278 | if (ST->hasSSE2()) | ||||||
4279 | if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy)) | ||||||
4280 | return ArithmeticCost + Entry->Cost; | ||||||
4281 | |||||||
4282 | return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind); | ||||||
4283 | } | ||||||
4284 | |||||||
4285 | unsigned NumVecElts = ValVTy->getNumElements(); | ||||||
4286 | unsigned ScalarSize = ValVTy->getScalarSizeInBits(); | ||||||
4287 | |||||||
4288 | // Special case power of 2 reductions where the scalar type isn't changed | ||||||
4289 | // by type legalization. | ||||||
4290 | if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits()) | ||||||
4291 | return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind); | ||||||
4292 | |||||||
4293 | InstructionCost ReductionCost = 0; | ||||||
4294 | |||||||
4295 | auto *Ty = ValVTy; | ||||||
4296 | if (LT.first != 1 && MTy.isVector() && | ||||||
4297 | MTy.getVectorNumElements() < ValVTy->getNumElements()) { | ||||||
4298 | // Type needs to be split. We need LT.first - 1 arithmetic ops. | ||||||
4299 | Ty = FixedVectorType::get(ValVTy->getElementType(), | ||||||
4300 | MTy.getVectorNumElements()); | ||||||
4301 | ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind); | ||||||
4302 | ReductionCost *= LT.first - 1; | ||||||
4303 | NumVecElts = MTy.getVectorNumElements(); | ||||||
4304 | } | ||||||
4305 | |||||||
4306 | // Now handle reduction with the legal type, taking into account size changes | ||||||
4307 | // at each level. | ||||||
4308 | while (NumVecElts > 1) { | ||||||
4309 | // Determine the size of the remaining vector we need to reduce. | ||||||
4310 | unsigned Size = NumVecElts * ScalarSize; | ||||||
4311 | NumVecElts /= 2; | ||||||
4312 | // If we're reducing from 256/512 bits, use an extract_subvector. | ||||||
4313 | if (Size > 128) { | ||||||
4314 | auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts); | ||||||
4315 | ReductionCost += | ||||||
4316 | getShuffleCost(TTI::SK_ExtractSubvector, Ty, None, NumVecElts, SubTy); | ||||||
4317 | Ty = SubTy; | ||||||
4318 | } else if (Size == 128) { | ||||||
4319 | // Reducing from 128 bits is a permute of v2f64/v2i64. | ||||||
4320 | FixedVectorType *ShufTy; | ||||||
4321 | if (ValVTy->isFloatingPointTy()) | ||||||
4322 | ShufTy = | ||||||
4323 | FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2); | ||||||
4324 | else | ||||||
4325 | ShufTy = | ||||||
4326 | FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2); | ||||||
4327 | ReductionCost += | ||||||
4328 | getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr); | ||||||
4329 | } else if (Size == 64) { | ||||||
4330 | // Reducing from 64 bits is a shuffle of v4f32/v4i32. | ||||||
4331 | FixedVectorType *ShufTy; | ||||||
4332 | if (ValVTy->isFloatingPointTy()) | ||||||
4333 | ShufTy = | ||||||
4334 | FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4); | ||||||
4335 | else | ||||||
4336 | ShufTy = | ||||||
4337 | FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4); | ||||||
4338 | ReductionCost += | ||||||
4339 | getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr); | ||||||
4340 | } else { | ||||||
4341 | // Reducing from smaller size is a shift by immediate. | ||||||
4342 | auto *ShiftTy = FixedVectorType::get( | ||||||
4343 | Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size); | ||||||
4344 | ReductionCost += getArithmeticInstrCost( | ||||||
4345 | Instruction::LShr, ShiftTy, CostKind, | ||||||
4346 | TargetTransformInfo::OK_AnyValue, | ||||||
4347 | TargetTransformInfo::OK_UniformConstantValue, | ||||||
4348 | TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); | ||||||
4349 | } | ||||||
4350 | |||||||
4351 | // Add the arithmetic op for this level. | ||||||
4352 | ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind); | ||||||
4353 | } | ||||||
4354 | |||||||
4355 | // Add the final extract element to the cost. | ||||||
4356 | return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0); | ||||||
4357 | } | ||||||
4358 | |||||||
4359 | InstructionCost X86TTIImpl::getMinMaxCost(Type *Ty, Type *CondTy, | ||||||
4360 | bool IsUnsigned) { | ||||||
4361 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); | ||||||
4362 | |||||||
4363 | MVT MTy = LT.second; | ||||||
4364 | |||||||
4365 | int ISD; | ||||||
4366 | if (Ty->isIntOrIntVectorTy()) { | ||||||
4367 | ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN; | ||||||
4368 | } else { | ||||||
4369 | assert(Ty->isFPOrFPVectorTy() &&(static_cast <bool> (Ty->isFPOrFPVectorTy() && "Expected float point or integer vector type.") ? void (0) : __assert_fail ("Ty->isFPOrFPVectorTy() && \"Expected float point or integer vector type.\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4370, __extension__ __PRETTY_FUNCTION__)) | ||||||
4370 | "Expected float point or integer vector type.")(static_cast <bool> (Ty->isFPOrFPVectorTy() && "Expected float point or integer vector type.") ? void (0) : __assert_fail ("Ty->isFPOrFPVectorTy() && \"Expected float point or integer vector type.\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4370, __extension__ __PRETTY_FUNCTION__)); | ||||||
4371 | ISD = ISD::FMINNUM; | ||||||
4372 | } | ||||||
4373 | |||||||
4374 | static const CostTblEntry SSE1CostTbl[] = { | ||||||
4375 | {ISD::FMINNUM, MVT::v4f32, 1}, | ||||||
4376 | }; | ||||||
4377 | |||||||
4378 | static const CostTblEntry SSE2CostTbl[] = { | ||||||
4379 | {ISD::FMINNUM, MVT::v2f64, 1}, | ||||||
4380 | {ISD::SMIN, MVT::v8i16, 1}, | ||||||
4381 | {ISD::UMIN, MVT::v16i8, 1}, | ||||||
4382 | }; | ||||||
4383 | |||||||
4384 | static const CostTblEntry SSE41CostTbl[] = { | ||||||
4385 | {ISD::SMIN, MVT::v4i32, 1}, | ||||||
4386 | {ISD::UMIN, MVT::v4i32, 1}, | ||||||
4387 | {ISD::UMIN, MVT::v8i16, 1}, | ||||||
4388 | {ISD::SMIN, MVT::v16i8, 1}, | ||||||
4389 | }; | ||||||
4390 | |||||||
4391 | static const CostTblEntry SSE42CostTbl[] = { | ||||||
4392 | {ISD::UMIN, MVT::v2i64, 3}, // xor+pcmpgtq+blendvpd | ||||||
4393 | }; | ||||||
4394 | |||||||
4395 | static const CostTblEntry AVX1CostTbl[] = { | ||||||
4396 | {ISD::FMINNUM, MVT::v8f32, 1}, | ||||||
4397 | {ISD::FMINNUM, MVT::v4f64, 1}, | ||||||
4398 | {ISD::SMIN, MVT::v8i32, 3}, | ||||||
4399 | {ISD::UMIN, MVT::v8i32, 3}, | ||||||
4400 | {ISD::SMIN, MVT::v16i16, 3}, | ||||||
4401 | {ISD::UMIN, MVT::v16i16, 3}, | ||||||
4402 | {ISD::SMIN, MVT::v32i8, 3}, | ||||||
4403 | {ISD::UMIN, MVT::v32i8, 3}, | ||||||
4404 | }; | ||||||
4405 | |||||||
4406 | static const CostTblEntry AVX2CostTbl[] = { | ||||||
4407 | {ISD::SMIN, MVT::v8i32, 1}, | ||||||
4408 | {ISD::UMIN, MVT::v8i32, 1}, | ||||||
4409 | {ISD::SMIN, MVT::v16i16, 1}, | ||||||
4410 | {ISD::UMIN, MVT::v16i16, 1}, | ||||||
4411 | {ISD::SMIN, MVT::v32i8, 1}, | ||||||
4412 | {ISD::UMIN, MVT::v32i8, 1}, | ||||||
4413 | }; | ||||||
4414 | |||||||
4415 | static const CostTblEntry AVX512CostTbl[] = { | ||||||
4416 | {ISD::FMINNUM, MVT::v16f32, 1}, | ||||||
4417 | {ISD::FMINNUM, MVT::v8f64, 1}, | ||||||
4418 | {ISD::SMIN, MVT::v2i64, 1}, | ||||||
4419 | {ISD::UMIN, MVT::v2i64, 1}, | ||||||
4420 | {ISD::SMIN, MVT::v4i64, 1}, | ||||||
4421 | {ISD::UMIN, MVT::v4i64, 1}, | ||||||
4422 | {ISD::SMIN, MVT::v8i64, 1}, | ||||||
4423 | {ISD::UMIN, MVT::v8i64, 1}, | ||||||
4424 | {ISD::SMIN, MVT::v16i32, 1}, | ||||||
4425 | {ISD::UMIN, MVT::v16i32, 1}, | ||||||
4426 | }; | ||||||
4427 | |||||||
4428 | static const CostTblEntry AVX512BWCostTbl[] = { | ||||||
4429 | {ISD::SMIN, MVT::v32i16, 1}, | ||||||
4430 | {ISD::UMIN, MVT::v32i16, 1}, | ||||||
4431 | {ISD::SMIN, MVT::v64i8, 1}, | ||||||
4432 | {ISD::UMIN, MVT::v64i8, 1}, | ||||||
4433 | }; | ||||||
4434 | |||||||
4435 | // If we have a native MIN/MAX instruction for this type, use it. | ||||||
4436 | if (ST->hasBWI()) | ||||||
4437 | if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy)) | ||||||
4438 | return LT.first * Entry->Cost; | ||||||
4439 | |||||||
4440 | if (ST->hasAVX512()) | ||||||
4441 | if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) | ||||||
4442 | return LT.first * Entry->Cost; | ||||||
4443 | |||||||
4444 | if (ST->hasAVX2()) | ||||||
4445 | if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy)) | ||||||
4446 | return LT.first * Entry->Cost; | ||||||
4447 | |||||||
4448 | if (ST->hasAVX()) | ||||||
4449 | if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) | ||||||
4450 | return LT.first * Entry->Cost; | ||||||
4451 | |||||||
4452 | if (ST->hasSSE42()) | ||||||
4453 | if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy)) | ||||||
4454 | return LT.first * Entry->Cost; | ||||||
4455 | |||||||
4456 | if (ST->hasSSE41()) | ||||||
4457 | if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy)) | ||||||
4458 | return LT.first * Entry->Cost; | ||||||
4459 | |||||||
4460 | if (ST->hasSSE2()) | ||||||
4461 | if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy)) | ||||||
4462 | return LT.first * Entry->Cost; | ||||||
4463 | |||||||
4464 | if (ST->hasSSE1()) | ||||||
4465 | if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy)) | ||||||
4466 | return LT.first * Entry->Cost; | ||||||
4467 | |||||||
4468 | unsigned CmpOpcode; | ||||||
4469 | if (Ty->isFPOrFPVectorTy()) { | ||||||
4470 | CmpOpcode = Instruction::FCmp; | ||||||
4471 | } else { | ||||||
4472 | assert(Ty->isIntOrIntVectorTy() &&(static_cast <bool> (Ty->isIntOrIntVectorTy() && "expecting floating point or integer type for min/max reduction" ) ? void (0) : __assert_fail ("Ty->isIntOrIntVectorTy() && \"expecting floating point or integer type for min/max reduction\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4473, __extension__ __PRETTY_FUNCTION__)) | ||||||
4473 | "expecting floating point or integer type for min/max reduction")(static_cast <bool> (Ty->isIntOrIntVectorTy() && "expecting floating point or integer type for min/max reduction" ) ? void (0) : __assert_fail ("Ty->isIntOrIntVectorTy() && \"expecting floating point or integer type for min/max reduction\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4473, __extension__ __PRETTY_FUNCTION__)); | ||||||
4474 | CmpOpcode = Instruction::ICmp; | ||||||
4475 | } | ||||||
4476 | |||||||
4477 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; | ||||||
4478 | // Otherwise fall back to cmp+select. | ||||||
4479 | InstructionCost Result = | ||||||
4480 | getCmpSelInstrCost(CmpOpcode, Ty, CondTy, CmpInst::BAD_ICMP_PREDICATE, | ||||||
4481 | CostKind) + | ||||||
4482 | getCmpSelInstrCost(Instruction::Select, Ty, CondTy, | ||||||
4483 | CmpInst::BAD_ICMP_PREDICATE, CostKind); | ||||||
4484 | return Result; | ||||||
4485 | } | ||||||
4486 | |||||||
4487 | InstructionCost | ||||||
4488 | X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy, | ||||||
4489 | bool IsUnsigned, | ||||||
4490 | TTI::TargetCostKind CostKind) { | ||||||
4491 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); | ||||||
4492 | |||||||
4493 | MVT MTy = LT.second; | ||||||
4494 | |||||||
4495 | int ISD; | ||||||
4496 | if (ValTy->isIntOrIntVectorTy()) { | ||||||
4497 | ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN; | ||||||
4498 | } else { | ||||||
4499 | assert(ValTy->isFPOrFPVectorTy() &&(static_cast <bool> (ValTy->isFPOrFPVectorTy() && "Expected float point or integer vector type.") ? void (0) : __assert_fail ("ValTy->isFPOrFPVectorTy() && \"Expected float point or integer vector type.\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4500, __extension__ __PRETTY_FUNCTION__)) | ||||||
4500 | "Expected float point or integer vector type.")(static_cast <bool> (ValTy->isFPOrFPVectorTy() && "Expected float point or integer vector type.") ? void (0) : __assert_fail ("ValTy->isFPOrFPVectorTy() && \"Expected float point or integer vector type.\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4500, __extension__ __PRETTY_FUNCTION__)); | ||||||
4501 | ISD = ISD::FMINNUM; | ||||||
4502 | } | ||||||
4503 | |||||||
4504 | // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput | ||||||
4505 | // and make it as the cost. | ||||||
4506 | |||||||
4507 | static const CostTblEntry SSE2CostTblNoPairWise[] = { | ||||||
4508 | {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw | ||||||
4509 | {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw | ||||||
4510 | {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw | ||||||
4511 | }; | ||||||
4512 | |||||||
4513 | static const CostTblEntry SSE41CostTblNoPairWise[] = { | ||||||
4514 | {ISD::SMIN, MVT::v2i16, 3}, // same as sse2 | ||||||
4515 | {ISD::SMIN, MVT::v4i16, 5}, // same as sse2 | ||||||
4516 | {ISD::UMIN, MVT::v2i16, 5}, // same as sse2 | ||||||
4517 | {ISD::UMIN, MVT::v4i16, 7}, // same as sse2 | ||||||
4518 | {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor | ||||||
4519 | {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax | ||||||
4520 | {ISD::SMIN, MVT::v2i8, 3}, // pminsb | ||||||
4521 | {ISD::SMIN, MVT::v4i8, 5}, // pminsb | ||||||
4522 | {ISD::SMIN, MVT::v8i8, 7}, // pminsb | ||||||
4523 | {ISD::SMIN, MVT::v16i8, 6}, | ||||||
4524 | {ISD::UMIN, MVT::v2i8, 3}, // same as sse2 | ||||||
4525 | {ISD::UMIN, MVT::v4i8, 5}, // same as sse2 | ||||||
4526 | {ISD::UMIN, MVT::v8i8, 7}, // same as sse2 | ||||||
4527 | {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax | ||||||
4528 | }; | ||||||
4529 | |||||||
4530 | static const CostTblEntry AVX1CostTblNoPairWise[] = { | ||||||
4531 | {ISD::SMIN, MVT::v16i16, 6}, | ||||||
4532 | {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax | ||||||
4533 | {ISD::SMIN, MVT::v32i8, 8}, | ||||||
4534 | {ISD::UMIN, MVT::v32i8, 8}, | ||||||
4535 | }; | ||||||
4536 | |||||||
4537 | static const CostTblEntry AVX512BWCostTblNoPairWise[] = { | ||||||
4538 | {ISD::SMIN, MVT::v32i16, 8}, | ||||||
4539 | {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax | ||||||
4540 | {ISD::SMIN, MVT::v64i8, 10}, | ||||||
4541 | {ISD::UMIN, MVT::v64i8, 10}, | ||||||
4542 | }; | ||||||
4543 | |||||||
4544 | // Before legalizing the type, give a chance to look up illegal narrow types | ||||||
4545 | // in the table. | ||||||
4546 | // FIXME: Is there a better way to do this? | ||||||
4547 | EVT VT = TLI->getValueType(DL, ValTy); | ||||||
4548 | if (VT.isSimple()) { | ||||||
4549 | MVT MTy = VT.getSimpleVT(); | ||||||
4550 | if (ST->hasBWI()) | ||||||
4551 | if (const auto *Entry = CostTableLookup(AVX512BWCostTblNoPairWise, ISD, MTy)) | ||||||
4552 | return Entry->Cost; | ||||||
4553 | |||||||
4554 | if (ST->hasAVX()) | ||||||
4555 | if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) | ||||||
4556 | return Entry->Cost; | ||||||
4557 | |||||||
4558 | if (ST->hasSSE41()) | ||||||
4559 | if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy)) | ||||||
4560 | return Entry->Cost; | ||||||
4561 | |||||||
4562 | if (ST->hasSSE2()) | ||||||
4563 | if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy)) | ||||||
4564 | return Entry->Cost; | ||||||
4565 | } | ||||||
4566 | |||||||
4567 | auto *ValVTy = cast<FixedVectorType>(ValTy); | ||||||
4568 | unsigned NumVecElts = ValVTy->getNumElements(); | ||||||
4569 | |||||||
4570 | auto *Ty = ValVTy; | ||||||
4571 | InstructionCost MinMaxCost = 0; | ||||||
4572 | if (LT.first != 1 && MTy.isVector() && | ||||||
4573 | MTy.getVectorNumElements() < ValVTy->getNumElements()) { | ||||||
4574 | // Type needs to be split. We need LT.first - 1 operations ops. | ||||||
4575 | Ty = FixedVectorType::get(ValVTy->getElementType(), | ||||||
4576 | MTy.getVectorNumElements()); | ||||||
4577 | auto *SubCondTy = FixedVectorType::get(CondTy->getElementType(), | ||||||
4578 | MTy.getVectorNumElements()); | ||||||
4579 | MinMaxCost = getMinMaxCost(Ty, SubCondTy, IsUnsigned); | ||||||
4580 | MinMaxCost *= LT.first - 1; | ||||||
4581 | NumVecElts = MTy.getVectorNumElements(); | ||||||
4582 | } | ||||||
4583 | |||||||
4584 | if (ST->hasBWI()) | ||||||
4585 | if (const auto *Entry = CostTableLookup(AVX512BWCostTblNoPairWise, ISD, MTy)) | ||||||
4586 | return MinMaxCost + Entry->Cost; | ||||||
4587 | |||||||
4588 | if (ST->hasAVX()) | ||||||
4589 | if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) | ||||||
4590 | return MinMaxCost + Entry->Cost; | ||||||
4591 | |||||||
4592 | if (ST->hasSSE41()) | ||||||
4593 | if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy)) | ||||||
4594 | return MinMaxCost + Entry->Cost; | ||||||
4595 | |||||||
4596 | if (ST->hasSSE2()) | ||||||
4597 | if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy)) | ||||||
4598 | return MinMaxCost + Entry->Cost; | ||||||
4599 | |||||||
4600 | unsigned ScalarSize = ValTy->getScalarSizeInBits(); | ||||||
4601 | |||||||
4602 | // Special case power of 2 reductions where the scalar type isn't changed | ||||||
4603 | // by type legalization. | ||||||
4604 | if (!isPowerOf2_32(ValVTy->getNumElements()) || | ||||||
4605 | ScalarSize != MTy.getScalarSizeInBits()) | ||||||
4606 | return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsUnsigned, CostKind); | ||||||
4607 | |||||||
4608 | // Now handle reduction with the legal type, taking into account size changes | ||||||
4609 | // at each level. | ||||||
4610 | while (NumVecElts > 1) { | ||||||
4611 | // Determine the size of the remaining vector we need to reduce. | ||||||
4612 | unsigned Size = NumVecElts * ScalarSize; | ||||||
4613 | NumVecElts /= 2; | ||||||
4614 | // If we're reducing from 256/512 bits, use an extract_subvector. | ||||||
4615 | if (Size > 128) { | ||||||
4616 | auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts); | ||||||
4617 | MinMaxCost += | ||||||
4618 | getShuffleCost(TTI::SK_ExtractSubvector, Ty, None, NumVecElts, SubTy); | ||||||
4619 | Ty = SubTy; | ||||||
4620 | } else if (Size == 128) { | ||||||
4621 | // Reducing from 128 bits is a permute of v2f64/v2i64. | ||||||
4622 | VectorType *ShufTy; | ||||||
4623 | if (ValTy->isFloatingPointTy()) | ||||||
4624 | ShufTy = | ||||||
4625 | FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2); | ||||||
4626 | else | ||||||
4627 | ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2); | ||||||
4628 | MinMaxCost += | ||||||
4629 | getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr); | ||||||
4630 | } else if (Size == 64) { | ||||||
4631 | // Reducing from 64 bits is a shuffle of v4f32/v4i32. | ||||||
4632 | FixedVectorType *ShufTy; | ||||||
4633 | if (ValTy->isFloatingPointTy()) | ||||||
4634 | ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4); | ||||||
4635 | else | ||||||
4636 | ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4); | ||||||
4637 | MinMaxCost += | ||||||
4638 | getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr); | ||||||
4639 | } else { | ||||||
4640 | // Reducing from smaller size is a shift by immediate. | ||||||
4641 | auto *ShiftTy = FixedVectorType::get( | ||||||
4642 | Type::getIntNTy(ValTy->getContext(), Size), 128 / Size); | ||||||
4643 | MinMaxCost += getArithmeticInstrCost( | ||||||
4644 | Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput, | ||||||
4645 | TargetTransformInfo::OK_AnyValue, | ||||||
4646 | TargetTransformInfo::OK_UniformConstantValue, | ||||||
4647 | TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); | ||||||
4648 | } | ||||||
4649 | |||||||
4650 | // Add the arithmetic op for this level. | ||||||
4651 | auto *SubCondTy = | ||||||
4652 | FixedVectorType::get(CondTy->getElementType(), Ty->getNumElements()); | ||||||
4653 | MinMaxCost += getMinMaxCost(Ty, SubCondTy, IsUnsigned); | ||||||
4654 | } | ||||||
4655 | |||||||
4656 | // Add the final extract element to the cost. | ||||||
4657 | return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0); | ||||||
4658 | } | ||||||
4659 | |||||||
4660 | /// Calculate the cost of materializing a 64-bit value. This helper | ||||||
4661 | /// method might only calculate a fraction of a larger immediate. Therefore it | ||||||
4662 | /// is valid to return a cost of ZERO. | ||||||
4663 | InstructionCost X86TTIImpl::getIntImmCost(int64_t Val) { | ||||||
4664 | if (Val == 0) | ||||||
4665 | return TTI::TCC_Free; | ||||||
4666 | |||||||
4667 | if (isInt<32>(Val)) | ||||||
4668 | return TTI::TCC_Basic; | ||||||
4669 | |||||||
4670 | return 2 * TTI::TCC_Basic; | ||||||
4671 | } | ||||||
4672 | |||||||
4673 | InstructionCost X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, | ||||||
4674 | TTI::TargetCostKind CostKind) { | ||||||
4675 | assert(Ty->isIntegerTy())(static_cast <bool> (Ty->isIntegerTy()) ? void (0) : __assert_fail ("Ty->isIntegerTy()", "llvm/lib/Target/X86/X86TargetTransformInfo.cpp" , 4675, __extension__ __PRETTY_FUNCTION__)); | ||||||
4676 | |||||||
4677 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); | ||||||
4678 | if (BitSize == 0) | ||||||
4679 | return ~0U; | ||||||
4680 | |||||||
4681 | // Never hoist constants larger than 128bit, because this might lead to | ||||||
4682 | // incorrect code generation or assertions in codegen. | ||||||
4683 | // Fixme: Create a cost model for types larger than i128 once the codegen | ||||||
4684 | // issues have been fixed. | ||||||
4685 | if (BitSize > 128) | ||||||
4686 | return TTI::TCC_Free; | ||||||
4687 | |||||||
4688 | if (Imm == 0) | ||||||
4689 | return TTI::TCC_Free; | ||||||
4690 | |||||||
4691 | // Sign-extend all constants to a multiple of 64-bit. | ||||||
4692 | APInt ImmVal = Imm; | ||||||
4693 | if (BitSize % 64 != 0) | ||||||
4694 | ImmVal = Imm.sext(alignTo(BitSize, 64)); | ||||||
4695 | |||||||
4696 | // Split the constant into 64-bit chunks and calculate the cost for each | ||||||
4697 | // chunk. | ||||||
4698 | InstructionCost Cost = 0; | ||||||
4699 | for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) { | ||||||
4700 | APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64); | ||||||
4701 | int64_t Val = Tmp.getSExtValue(); | ||||||
4702 | Cost += getIntImmCost(Val); | ||||||
4703 | } | ||||||
4704 | // We need at least one instruction to materialize the constant. | ||||||
4705 | return std::max<InstructionCost>(1, Cost); | ||||||
4706 | } | ||||||
4707 | |||||||
4708 | InstructionCost X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, | ||||||
4709 | const APInt &Imm, Type *Ty, | ||||||
4710 | TTI::TargetCostKind CostKind, | ||||||
4711 | Instruction *Inst) { | ||||||
4712 | assert(Ty->isIntegerTy())(static_cast <bool> (Ty->isIntegerTy()) ? void (0) : __assert_fail ("Ty->isIntegerTy()", "llvm/lib/Target/X86/X86TargetTransformInfo.cpp" , 4712, __extension__ __PRETTY_FUNCTION__)); | ||||||
4713 | |||||||
4714 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); | ||||||
4715 | // There is no cost model for constants with a bit size of 0. Return TCC_Free | ||||||
4716 | // here, so that constant hoisting will ignore this constant. | ||||||
4717 | if (BitSize == 0) | ||||||
4718 | return TTI::TCC_Free; | ||||||
4719 | |||||||
4720 | unsigned ImmIdx = ~0U; | ||||||
4721 | switch (Opcode) { | ||||||
4722 | default: | ||||||
4723 | return TTI::TCC_Free; | ||||||
4724 | case Instruction::GetElementPtr: | ||||||
4725 | // Always hoist the base address of a GetElementPtr. This prevents the | ||||||
4726 | // creation of new constants for every base constant that gets constant | ||||||
4727 | // folded with the offset. | ||||||
4728 | if (Idx == 0) | ||||||
4729 | return 2 * TTI::TCC_Basic; | ||||||
4730 | return TTI::TCC_Free; | ||||||
4731 | case Instruction::Store: | ||||||
4732 | ImmIdx = 0; | ||||||
4733 | break; | ||||||
4734 | case Instruction::ICmp: | ||||||
4735 | // This is an imperfect hack to prevent constant hoisting of | ||||||
4736 | // compares that might be trying to check if a 64-bit value fits in | ||||||
4737 | // 32-bits. The backend can optimize these cases using a right shift by 32. | ||||||
4738 | // Ideally we would check the compare predicate here. There also other | ||||||
4739 | // similar immediates the backend can use shifts for. | ||||||
4740 | if (Idx == 1 && Imm.getBitWidth() == 64) { | ||||||
4741 | uint64_t ImmVal = Imm.getZExtValue(); | ||||||
4742 | if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff) | ||||||
4743 | return TTI::TCC_Free; | ||||||
4744 | } | ||||||
4745 | ImmIdx = 1; | ||||||
4746 | break; | ||||||
4747 | case Instruction::And: | ||||||
4748 | // We support 64-bit ANDs with immediates with 32-bits of leading zeroes | ||||||
4749 | // by using a 32-bit operation with implicit zero extension. Detect such | ||||||
4750 | // immediates here as the normal path expects bit 31 to be sign extended. | ||||||
4751 | if (Idx == 1 && Imm.getBitWidth() == 64 && isUInt<32>(Imm.getZExtValue())) | ||||||
4752 | return TTI::TCC_Free; | ||||||
4753 | ImmIdx = 1; | ||||||
4754 | break; | ||||||
4755 | case Instruction::Add: | ||||||
4756 | case Instruction::Sub: | ||||||
4757 | // For add/sub, we can use the opposite instruction for INT32_MIN. | ||||||
4758 | if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.getZExtValue() == 0x80000000) | ||||||
4759 | return TTI::TCC_Free; | ||||||
4760 | ImmIdx = 1; | ||||||
4761 | break; | ||||||
4762 | case Instruction::UDiv: | ||||||
4763 | case Instruction::SDiv: | ||||||
4764 | case Instruction::URem: | ||||||
4765 | case Instruction::SRem: | ||||||
4766 | // Division by constant is typically expanded later into a different | ||||||
4767 | // instruction sequence. This completely changes the constants. | ||||||
4768 | // Report them as "free" to stop ConstantHoist from marking them as opaque. | ||||||
4769 | return TTI::TCC_Free; | ||||||
4770 | case Instruction::Mul: | ||||||
4771 | case Instruction::Or: | ||||||
4772 | case Instruction::Xor: | ||||||
4773 | ImmIdx = 1; | ||||||
4774 | break; | ||||||
4775 | // Always return TCC_Free for the shift value of a shift instruction. | ||||||
4776 | case Instruction::Shl: | ||||||
4777 | case Instruction::LShr: | ||||||
4778 | case Instruction::AShr: | ||||||
4779 | if (Idx == 1) | ||||||
4780 | return TTI::TCC_Free; | ||||||
4781 | break; | ||||||
4782 | case Instruction::Trunc: | ||||||
4783 | case Instruction::ZExt: | ||||||
4784 | case Instruction::SExt: | ||||||
4785 | case Instruction::IntToPtr: | ||||||
4786 | case Instruction::PtrToInt: | ||||||
4787 | case Instruction::BitCast: | ||||||
4788 | case Instruction::PHI: | ||||||
4789 | case Instruction::Call: | ||||||
4790 | case Instruction::Select: | ||||||
4791 | case Instruction::Ret: | ||||||
4792 | case Instruction::Load: | ||||||
4793 | break; | ||||||
4794 | } | ||||||
4795 | |||||||
4796 | if (Idx == ImmIdx) { | ||||||
4797 | int NumConstants = divideCeil(BitSize, 64); | ||||||
4798 | InstructionCost Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind); | ||||||
4799 | return (Cost <= NumConstants * TTI::TCC_Basic) | ||||||
4800 | ? static_cast<int>(TTI::TCC_Free) | ||||||
4801 | : Cost; | ||||||
4802 | } | ||||||
4803 | |||||||
4804 | return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind); | ||||||
4805 | } | ||||||
4806 | |||||||
4807 | InstructionCost X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, | ||||||
4808 | const APInt &Imm, Type *Ty, | ||||||
4809 | TTI::TargetCostKind CostKind) { | ||||||
4810 | assert(Ty->isIntegerTy())(static_cast <bool> (Ty->isIntegerTy()) ? void (0) : __assert_fail ("Ty->isIntegerTy()", "llvm/lib/Target/X86/X86TargetTransformInfo.cpp" , 4810, __extension__ __PRETTY_FUNCTION__)); | ||||||
4811 | |||||||
4812 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); | ||||||
4813 | // There is no cost model for constants with a bit size of 0. Return TCC_Free | ||||||
4814 | // here, so that constant hoisting will ignore this constant. | ||||||
4815 | if (BitSize == 0) | ||||||
4816 | return TTI::TCC_Free; | ||||||
4817 | |||||||
4818 | switch (IID) { | ||||||
4819 | default: | ||||||
4820 | return TTI::TCC_Free; | ||||||
4821 | case Intrinsic::sadd_with_overflow: | ||||||
4822 | case Intrinsic::uadd_with_overflow: | ||||||
4823 | case Intrinsic::ssub_with_overflow: | ||||||
4824 | case Intrinsic::usub_with_overflow: | ||||||
4825 | case Intrinsic::smul_with_overflow: | ||||||
4826 | case Intrinsic::umul_with_overflow: | ||||||
4827 | if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue())) | ||||||
4828 | return TTI::TCC_Free; | ||||||
4829 | break; | ||||||
4830 | case Intrinsic::experimental_stackmap: | ||||||
4831 | if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) | ||||||
4832 | return TTI::TCC_Free; | ||||||
4833 | break; | ||||||
4834 | case Intrinsic::experimental_patchpoint_void: | ||||||
4835 | case Intrinsic::experimental_patchpoint_i64: | ||||||
4836 | if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) | ||||||
4837 | return TTI::TCC_Free; | ||||||
4838 | break; | ||||||
4839 | } | ||||||
4840 | return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind); | ||||||
4841 | } | ||||||
4842 | |||||||
4843 | InstructionCost X86TTIImpl::getCFInstrCost(unsigned Opcode, | ||||||
4844 | TTI::TargetCostKind CostKind, | ||||||
4845 | const Instruction *I) { | ||||||
4846 | if (CostKind != TTI::TCK_RecipThroughput) | ||||||
4847 | return Opcode == Instruction::PHI ? 0 : 1; | ||||||
4848 | // Branches are assumed to be predicted. | ||||||
4849 | return 0; | ||||||
4850 | } | ||||||
4851 | |||||||
4852 | int X86TTIImpl::getGatherOverhead() const { | ||||||
4853 | // Some CPUs have more overhead for gather. The specified overhead is relative | ||||||
4854 | // to the Load operation. "2" is the number provided by Intel architects. This | ||||||
4855 | // parameter is used for cost estimation of Gather Op and comparison with | ||||||
4856 | // other alternatives. | ||||||
4857 | // TODO: Remove the explicit hasAVX512()?, That would mean we would only | ||||||
4858 | // enable gather with a -march. | ||||||
4859 | if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather())) | ||||||
4860 | return 2; | ||||||
4861 | |||||||
4862 | return 1024; | ||||||
4863 | } | ||||||
4864 | |||||||
4865 | int X86TTIImpl::getScatterOverhead() const { | ||||||
4866 | if (ST->hasAVX512()) | ||||||
4867 | return 2; | ||||||
4868 | |||||||
4869 | return 1024; | ||||||
4870 | } | ||||||
4871 | |||||||
4872 | // Return an average cost of Gather / Scatter instruction, maybe improved later. | ||||||
4873 | // FIXME: Add TargetCostKind support. | ||||||
4874 | InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, | ||||||
4875 | const Value *Ptr, Align Alignment, | ||||||
4876 | unsigned AddressSpace) { | ||||||
4877 | |||||||
4878 | assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost")(static_cast <bool> (isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost") ? void (0) : __assert_fail ("isa<VectorType>(SrcVTy) && \"Unexpected type in getGSVectorCost\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4878, __extension__ __PRETTY_FUNCTION__)); | ||||||
4879 | unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements(); | ||||||
4880 | |||||||
4881 | // Try to reduce index size from 64 bit (default for GEP) | ||||||
4882 | // to 32. It is essential for VF 16. If the index can't be reduced to 32, the | ||||||
4883 | // operation will use 16 x 64 indices which do not fit in a zmm and needs | ||||||
4884 | // to split. Also check that the base pointer is the same for all lanes, | ||||||
4885 | // and that there's at most one variable index. | ||||||
4886 | auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) { | ||||||
4887 | unsigned IndexSize = DL.getPointerSizeInBits(); | ||||||
4888 | const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr); | ||||||
4889 | if (IndexSize < 64 || !GEP) | ||||||
4890 | return IndexSize; | ||||||
4891 | |||||||
4892 | unsigned NumOfVarIndices = 0; | ||||||
4893 | const Value *Ptrs = GEP->getPointerOperand(); | ||||||
4894 | if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs)) | ||||||
4895 | return IndexSize; | ||||||
4896 | for (unsigned i = 1; i < GEP->getNumOperands(); ++i) { | ||||||
4897 | if (isa<Constant>(GEP->getOperand(i))) | ||||||
4898 | continue; | ||||||
4899 | Type *IndxTy = GEP->getOperand(i)->getType(); | ||||||
4900 | if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy)) | ||||||
4901 | IndxTy = IndexVTy->getElementType(); | ||||||
4902 | if ((IndxTy->getPrimitiveSizeInBits() == 64 && | ||||||
4903 | !isa<SExtInst>(GEP->getOperand(i))) || | ||||||
4904 | ++NumOfVarIndices > 1) | ||||||
4905 | return IndexSize; // 64 | ||||||
4906 | } | ||||||
4907 | return (unsigned)32; | ||||||
4908 | }; | ||||||
4909 | |||||||
4910 | // Trying to reduce IndexSize to 32 bits for vector 16. | ||||||
4911 | // By default the IndexSize is equal to pointer size. | ||||||
4912 | unsigned IndexSize = (ST->hasAVX512() && VF >= 16) | ||||||
4913 | ? getIndexSizeInBits(Ptr, DL) | ||||||
4914 | : DL.getPointerSizeInBits(); | ||||||
4915 | |||||||
4916 | auto *IndexVTy = FixedVectorType::get( | ||||||
4917 | IntegerType::get(SrcVTy->getContext(), IndexSize), VF); | ||||||
4918 | std::pair<InstructionCost, MVT> IdxsLT = | ||||||
4919 | TLI->getTypeLegalizationCost(DL, IndexVTy); | ||||||
4920 | std::pair<InstructionCost, MVT> SrcLT = | ||||||
4921 | TLI->getTypeLegalizationCost(DL, SrcVTy); | ||||||
4922 | InstructionCost::CostType SplitFactor = | ||||||
4923 | *std::max(IdxsLT.first, SrcLT.first).getValue(); | ||||||
4924 | if (SplitFactor > 1) { | ||||||
4925 | // Handle splitting of vector of pointers | ||||||
4926 | auto *SplitSrcTy = | ||||||
4927 | FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor); | ||||||
4928 | return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment, | ||||||
4929 | AddressSpace); | ||||||
4930 | } | ||||||
4931 | |||||||
4932 | // The gather / scatter cost is given by Intel architects. It is a rough | ||||||
4933 | // number since we are looking at one instruction in a time. | ||||||
4934 | const int GSOverhead = (Opcode == Instruction::Load) | ||||||
4935 | ? getGatherOverhead() | ||||||
4936 | : getScatterOverhead(); | ||||||
4937 | return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(), | ||||||
4938 | MaybeAlign(Alignment), AddressSpace, | ||||||
4939 | TTI::TCK_RecipThroughput); | ||||||
4940 | } | ||||||
4941 | |||||||
4942 | /// Return the cost of full scalarization of gather / scatter operation. | ||||||
4943 | /// | ||||||
4944 | /// Opcode - Load or Store instruction. | ||||||
4945 | /// SrcVTy - The type of the data vector that should be gathered or scattered. | ||||||
4946 | /// VariableMask - The mask is non-constant at compile time. | ||||||
4947 | /// Alignment - Alignment for one element. | ||||||
4948 | /// AddressSpace - pointer[s] address space. | ||||||
4949 | /// | ||||||
4950 | /// FIXME: Add TargetCostKind support. | ||||||
4951 | InstructionCost X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy, | ||||||
4952 | bool VariableMask, Align Alignment, | ||||||
4953 | unsigned AddressSpace) { | ||||||
4954 | Type *ScalarTy = SrcVTy->getScalarType(); | ||||||
4955 | unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements(); | ||||||
4956 | APInt DemandedElts = APInt::getAllOnes(VF); | ||||||
4957 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; | ||||||
4958 | |||||||
4959 | InstructionCost MaskUnpackCost = 0; | ||||||
4960 | if (VariableMask) { | ||||||
4961 | auto *MaskTy = | ||||||
4962 | FixedVectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF); | ||||||
4963 | MaskUnpackCost = getScalarizationOverhead( | ||||||
4964 | MaskTy, DemandedElts, /*Insert=*/false, /*Extract=*/true); | ||||||
4965 | InstructionCost ScalarCompareCost = getCmpSelInstrCost( | ||||||
4966 | Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()), nullptr, | ||||||
4967 | CmpInst::BAD_ICMP_PREDICATE, CostKind); | ||||||
4968 | InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind); | ||||||
4969 | MaskUnpackCost += VF * (BranchCost + ScalarCompareCost); | ||||||
4970 | } | ||||||
4971 | |||||||
4972 | InstructionCost AddressUnpackCost = getScalarizationOverhead( | ||||||
4973 | FixedVectorType::get(ScalarTy->getPointerTo(), VF), DemandedElts, | ||||||
4974 | /*Insert=*/false, /*Extract=*/true); | ||||||
4975 | |||||||
4976 | // The cost of the scalar loads/stores. | ||||||
4977 | InstructionCost MemoryOpCost = | ||||||
4978 | VF * getMemoryOpCost(Opcode, ScalarTy, MaybeAlign(Alignment), | ||||||
4979 | AddressSpace, CostKind); | ||||||
4980 | |||||||
4981 | // The cost of forming the vector from loaded scalars/ | ||||||
4982 | // scalarizing the vector to perform scalar stores. | ||||||
4983 | InstructionCost InsertExtractCost = | ||||||
4984 | getScalarizationOverhead(cast<FixedVectorType>(SrcVTy), DemandedElts, | ||||||
4985 | /*Insert=*/Opcode == Instruction::Load, | ||||||
4986 | /*Extract=*/Opcode == Instruction::Store); | ||||||
4987 | |||||||
4988 | return AddressUnpackCost + MemoryOpCost + MaskUnpackCost + InsertExtractCost; | ||||||
4989 | } | ||||||
4990 | |||||||
4991 | /// Calculate the cost of Gather / Scatter operation | ||||||
4992 | InstructionCost X86TTIImpl::getGatherScatterOpCost( | ||||||
4993 | unsigned Opcode, Type *SrcVTy, const Value *Ptr, bool VariableMask, | ||||||
4994 | Align Alignment, TTI::TargetCostKind CostKind, | ||||||
4995 | const Instruction *I = nullptr) { | ||||||
4996 | if (CostKind != TTI::TCK_RecipThroughput) { | ||||||
4997 | if ((Opcode == Instruction::Load && | ||||||
4998 | isLegalMaskedGather(SrcVTy, Align(Alignment)) && | ||||||
4999 | !forceScalarizeMaskedGather(cast<VectorType>(SrcVTy), | ||||||
5000 | Align(Alignment))) || | ||||||
5001 | (Opcode == Instruction::Store && | ||||||
5002 | isLegalMaskedScatter(SrcVTy, Align(Alignment)) && | ||||||
5003 | !forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy), | ||||||
5004 | Align(Alignment)))) | ||||||
5005 | return 1; | ||||||
5006 | return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask, | ||||||
5007 | Alignment, CostKind, I); | ||||||
5008 | } | ||||||
5009 | |||||||
5010 | assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter")(static_cast <bool> (SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter") ? void (0) : __assert_fail ("SrcVTy->isVectorTy() && \"Unexpected data type for Gather/Scatter\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 5010, __extension__ __PRETTY_FUNCTION__)); | ||||||
5011 | PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType()); | ||||||
5012 | if (!PtrTy && Ptr->getType()->isVectorTy()) | ||||||
5013 | PtrTy = dyn_cast<PointerType>( | ||||||
5014 | cast<VectorType>(Ptr->getType())->getElementType()); | ||||||
5015 | assert(PtrTy && "Unexpected type for Ptr argument")(static_cast <bool> (PtrTy && "Unexpected type for Ptr argument" ) ? void (0) : __assert_fail ("PtrTy && \"Unexpected type for Ptr argument\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 5015, __extension__ __PRETTY_FUNCTION__)); | ||||||
5016 | unsigned AddressSpace = PtrTy->getAddressSpace(); | ||||||
5017 | |||||||
5018 | if ((Opcode == Instruction::Load && | ||||||
5019 | (!isLegalMaskedGather(SrcVTy, Align(Alignment)) || | ||||||
5020 | forceScalarizeMaskedGather(cast<VectorType>(SrcVTy), | ||||||
5021 | Align(Alignment)))) || | ||||||
5022 | (Opcode == Instruction::Store && | ||||||
5023 | (!isLegalMaskedScatter(SrcVTy, Align(Alignment)) || | ||||||
5024 | forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy), | ||||||
5025 | Align(Alignment))))) | ||||||
5026 | return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment, | ||||||
5027 | AddressSpace); | ||||||
5028 | |||||||
5029 | return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace); | ||||||
5030 | } | ||||||
5031 | |||||||
5032 | bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1, | ||||||
5033 | TargetTransformInfo::LSRCost &C2) { | ||||||
5034 | // X86 specific here are "instruction number 1st priority". | ||||||
5035 | return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost, | ||||||
5036 | C1.NumIVMuls, C1.NumBaseAdds, | ||||||
5037 | C1.ScaleCost, C1.ImmCost, C1.SetupCost) < | ||||||
5038 | std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost, | ||||||
5039 | C2.NumIVMuls, C2.NumBaseAdds, | ||||||
5040 | C2.ScaleCost, C2.ImmCost, C2.SetupCost); | ||||||
5041 | } | ||||||
5042 | |||||||
5043 | bool X86TTIImpl::canMacroFuseCmp() { | ||||||
5044 | return ST->hasMacroFusion() || ST->hasBranchFusion(); | ||||||
5045 | } | ||||||
5046 | |||||||
5047 | bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) { | ||||||
5048 | if (!ST->hasAVX()) | ||||||
5049 | return false; | ||||||
5050 | |||||||
5051 | // The backend can't handle a single element vector. | ||||||
5052 | if (isa<VectorType>(DataTy) && | ||||||
5053 | cast<FixedVectorType>(DataTy)->getNumElements() == 1) | ||||||
5054 | return false; | ||||||
5055 | Type *ScalarTy = DataTy->getScalarType(); | ||||||
5056 | |||||||
5057 | if (ScalarTy->isPointerTy()) | ||||||
5058 | return true; | ||||||
5059 | |||||||
5060 | if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) | ||||||
5061 | return true; | ||||||
5062 | |||||||
5063 | if (ScalarTy->isHalfTy() && ST->hasBWI() && ST->hasFP16()) | ||||||
5064 | return true; | ||||||
5065 | |||||||
5066 | if (!ScalarTy->isIntegerTy()) | ||||||
5067 | return false; | ||||||
5068 | |||||||
5069 | unsigned IntWidth = ScalarTy->getIntegerBitWidth(); | ||||||
5070 | return IntWidth == 32 || IntWidth == 64 || | ||||||
5071 | ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI()); | ||||||
5072 | } | ||||||
5073 | |||||||
5074 | bool X86TTIImpl::isLegalMaskedStore(Type *DataType, Align Alignment) { | ||||||
5075 | return isLegalMaskedLoad(DataType, Alignment); | ||||||
5076 | } | ||||||
5077 | |||||||
5078 | bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) { | ||||||
5079 | unsigned DataSize = DL.getTypeStoreSize(DataType); | ||||||
5080 | // The only supported nontemporal loads are for aligned vectors of 16 or 32 | ||||||
5081 | // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2 | ||||||
5082 | // (the equivalent stores only require AVX). | ||||||
5083 | if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32)) | ||||||
5084 | return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2(); | ||||||
5085 | |||||||
5086 | return false; | ||||||
5087 | } | ||||||
5088 | |||||||
5089 | bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) { | ||||||
5090 | unsigned DataSize = DL.getTypeStoreSize(DataType); | ||||||
5091 | |||||||
5092 | // SSE4A supports nontemporal stores of float and double at arbitrary | ||||||
5093 | // alignment. | ||||||
5094 | if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy())) | ||||||
5095 | return true; | ||||||
5096 | |||||||
5097 | // Besides the SSE4A subtarget exception above, only aligned stores are | ||||||
5098 | // available nontemporaly on any other subtarget. And only stores with a size | ||||||
5099 | // of 4..32 bytes (powers of 2, only) are permitted. | ||||||
5100 | if (Alignment < DataSize || DataSize < 4 || DataSize > 32 || | ||||||
5101 | !isPowerOf2_32(DataSize)) | ||||||
5102 | return false; | ||||||
5103 | |||||||
5104 | // 32-byte vector nontemporal stores are supported by AVX (the equivalent | ||||||
5105 | // loads require AVX2). | ||||||
5106 | if (DataSize == 32) | ||||||
5107 | return ST->hasAVX(); | ||||||
5108 | if (DataSize == 16) | ||||||
5109 | return ST->hasSSE1(); | ||||||
5110 | return true; | ||||||
5111 | } | ||||||
5112 | |||||||
5113 | bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) { | ||||||
5114 | if (!isa<VectorType>(DataTy)) | ||||||
5115 | return false; | ||||||
5116 | |||||||
5117 | if (!ST->hasAVX512()) | ||||||
5118 | return false; | ||||||
5119 | |||||||
5120 | // The backend can't handle a single element vector. | ||||||
5121 | if (cast<FixedVectorType>(DataTy)->getNumElements() == 1) | ||||||
5122 | return false; | ||||||
5123 | |||||||
5124 | Type *ScalarTy = cast<VectorType>(DataTy)->getElementType(); | ||||||
5125 | |||||||
5126 | if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) | ||||||
5127 | return true; | ||||||
5128 | |||||||
5129 | if (!ScalarTy->isIntegerTy()) | ||||||
5130 | return false; | ||||||
5131 | |||||||
5132 | unsigned IntWidth = ScalarTy->getIntegerBitWidth(); | ||||||
5133 | return IntWidth == 32 || IntWidth == 64 || | ||||||
5134 | ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2()); | ||||||
5135 | } | ||||||
5136 | |||||||
5137 | bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy) { | ||||||
5138 | return isLegalMaskedExpandLoad(DataTy); | ||||||
5139 | } | ||||||
5140 | |||||||
5141 | bool X86TTIImpl::supportsGather() const { | ||||||
5142 | // Some CPUs have better gather performance than others. | ||||||
5143 | // TODO: Remove the explicit ST->hasAVX512()?, That would mean we would only | ||||||
5144 | // enable gather with a -march. | ||||||
5145 | return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2()); | ||||||
5146 | } | ||||||
5147 | |||||||
5148 | bool X86TTIImpl::forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) { | ||||||
5149 | // Gather / Scatter for vector 2 is not profitable on KNL / SKX | ||||||
5150 | // Vector-4 of gather/scatter instruction does not exist on KNL. We can extend | ||||||
5151 | // it to 8 elements, but zeroing upper bits of the mask vector will add more | ||||||
5152 | // instructions. Right now we give the scalar cost of vector-4 for KNL. TODO: | ||||||
5153 | // Check, maybe the gather/scatter instruction is better in the VariableMask | ||||||
5154 | // case. | ||||||
5155 | unsigned NumElts = cast<FixedVectorType>(VTy)->getNumElements(); | ||||||
5156 | return NumElts == 1 || | ||||||
5157 | (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX()))); | ||||||
5158 | } | ||||||
5159 | |||||||
5160 | bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) { | ||||||
5161 | if (!supportsGather()) | ||||||
5162 | return false; | ||||||
5163 | Type *ScalarTy = DataTy->getScalarType(); | ||||||
5164 | if (ScalarTy->isPointerTy()) | ||||||
5165 | return true; | ||||||
5166 | |||||||
5167 | if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) | ||||||
5168 | return true; | ||||||
5169 | |||||||
5170 | if (!ScalarTy->isIntegerTy()) | ||||||
5171 | return false; | ||||||
5172 | |||||||
5173 | unsigned IntWidth = ScalarTy->getIntegerBitWidth(); | ||||||
5174 | return IntWidth == 32 || IntWidth == 64; | ||||||
5175 | } | ||||||
5176 | |||||||
5177 | bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) { | ||||||
5178 | // AVX2 doesn't support scatter | ||||||
5179 | if (!ST->hasAVX512()) | ||||||
5180 | return false; | ||||||
5181 | return isLegalMaskedGather(DataType, Alignment); | ||||||
5182 | } | ||||||
5183 | |||||||
5184 | bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) { | ||||||
5185 | EVT VT = TLI->getValueType(DL, DataType); | ||||||
5186 | return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT); | ||||||
5187 | } | ||||||
5188 | |||||||
5189 | bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) { | ||||||
5190 | return false; | ||||||
5191 | } | ||||||
5192 | |||||||
5193 | bool X86TTIImpl::areInlineCompatible(const Function *Caller, | ||||||
5194 | const Function *Callee) const { | ||||||
5195 | const TargetMachine &TM = getTLI()->getTargetMachine(); | ||||||
5196 | |||||||
5197 | // Work this as a subsetting of subtarget features. | ||||||
5198 | const FeatureBitset &CallerBits = | ||||||
5199 | TM.getSubtargetImpl(*Caller)->getFeatureBits(); | ||||||
5200 | const FeatureBitset &CalleeBits = | ||||||
5201 | TM.getSubtargetImpl(*Callee)->getFeatureBits(); | ||||||
5202 | |||||||
5203 | // Check whether features are the same (apart from the ignore list). | ||||||
5204 | FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList; | ||||||
5205 | FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList; | ||||||
5206 | if (RealCallerBits == RealCalleeBits) | ||||||
5207 | return true; | ||||||
5208 | |||||||
5209 | // If the features are a subset, we need to additionally check for calls | ||||||
5210 | // that may become ABI-incompatible as a result of inlining. | ||||||
5211 | if ((RealCallerBits & RealCalleeBits) != RealCalleeBits) | ||||||
5212 | return false; | ||||||
5213 | |||||||
5214 | for (const Instruction &I : instructions(Callee)) { | ||||||
5215 | if (const auto *CB = dyn_cast<CallBase>(&I)) { | ||||||
5216 | SmallVector<Type *, 8> Types; | ||||||
5217 | for (Value *Arg : CB->args()) | ||||||
5218 | Types.push_back(Arg->getType()); | ||||||
5219 | if (!CB->getType()->isVoidTy()) | ||||||
5220 | Types.push_back(CB->getType()); | ||||||
5221 | |||||||
5222 | // Simple types are always ABI compatible. | ||||||
5223 | auto IsSimpleTy = [](Type *Ty) { | ||||||
5224 | return !Ty->isVectorTy() && !Ty->isAggregateType(); | ||||||
5225 | }; | ||||||
5226 | if (all_of(Types, IsSimpleTy)) | ||||||
5227 | continue; | ||||||
5228 | |||||||
5229 | if (Function *NestedCallee = CB->getCalledFunction()) { | ||||||
5230 | // Assume that intrinsics are always ABI compatible. | ||||||
5231 | if (NestedCallee->isIntrinsic()) | ||||||
5232 | continue; | ||||||
5233 | |||||||
5234 | // Do a precise compatibility check. | ||||||
5235 | if (!areTypesABICompatible(Caller, NestedCallee, Types)) | ||||||
5236 | return false; | ||||||
5237 | } else { | ||||||
5238 | // We don't know the target features of the callee, | ||||||
5239 | // assume it is incompatible. | ||||||
5240 | return false; | ||||||
5241 | } | ||||||
5242 | } | ||||||
5243 | } | ||||||
5244 | return true; | ||||||
5245 | } | ||||||
5246 | |||||||
5247 | bool X86TTIImpl::areTypesABICompatible(const Function *Caller, | ||||||
5248 | const Function *Callee, | ||||||
5249 | const ArrayRef<Type *> &Types) const { | ||||||
5250 | if (!BaseT::areTypesABICompatible(Caller, Callee, Types)) | ||||||
5251 | return false; | ||||||
5252 | |||||||
5253 | // If we get here, we know the target features match. If one function | ||||||
5254 | // considers 512-bit vectors legal and the other does not, consider them | ||||||
5255 | // incompatible. | ||||||
5256 | const TargetMachine &TM = getTLI()->getTargetMachine(); | ||||||
5257 | |||||||
5258 | if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() == | ||||||
5259 | TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs()) | ||||||
5260 | return true; | ||||||
5261 | |||||||
5262 | // Consider the arguments compatible if they aren't vectors or aggregates. | ||||||
5263 | // FIXME: Look at the size of vectors. | ||||||
5264 | // FIXME: Look at the element types of aggregates to see if there are vectors. | ||||||
5265 | return llvm::none_of(Types, | ||||||
5266 | [](Type *T) { return T->isVectorTy() || T->isAggregateType(); }); | ||||||
5267 | } | ||||||
5268 | |||||||
5269 | X86TTIImpl::TTI::MemCmpExpansionOptions | ||||||
5270 | X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { | ||||||
5271 | TTI::MemCmpExpansionOptions Options; | ||||||
5272 | Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); | ||||||
5273 | Options.NumLoadsPerBlock = 2; | ||||||
5274 | // All GPR and vector loads can be unaligned. | ||||||
5275 | Options.AllowOverlappingLoads = true; | ||||||
5276 | if (IsZeroCmp) { | ||||||
5277 | // Only enable vector loads for equality comparison. Right now the vector | ||||||
5278 | // version is not as fast for three way compare (see #33329). | ||||||
5279 | const unsigned PreferredWidth = ST->getPreferVectorWidth(); | ||||||
5280 | if (PreferredWidth >= 512 && ST->hasAVX512()) Options.LoadSizes.push_back(64); | ||||||
5281 | if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32); | ||||||
5282 | if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16); | ||||||
5283 | } | ||||||
5284 | if (ST->is64Bit()) { | ||||||
5285 | Options.LoadSizes.push_back(8); | ||||||
5286 | } | ||||||
5287 | Options.LoadSizes.push_back(4); | ||||||
5288 | Options.LoadSizes.push_back(2); | ||||||
5289 | Options.LoadSizes.push_back(1); | ||||||
5290 | return Options; | ||||||
5291 | } | ||||||
5292 | |||||||
5293 | bool X86TTIImpl::prefersVectorizedAddressing() const { | ||||||
5294 | return supportsGather(); | ||||||
5295 | } | ||||||
5296 | |||||||
5297 | bool X86TTIImpl::supportsEfficientVectorElementLoadStore() const { | ||||||
5298 | return false; | ||||||
5299 | } | ||||||
5300 | |||||||
5301 | bool X86TTIImpl::enableInterleavedAccessVectorization() { | ||||||
5302 | // TODO: We expect this to be beneficial regardless of arch, | ||||||
5303 | // but there are currently some unexplained performance artifacts on Atom. | ||||||
5304 | // As a temporary solution, disable on Atom. | ||||||
5305 | return !(ST->isAtom()); | ||||||
5306 | } | ||||||
5307 | |||||||
5308 | // Get estimation for interleaved load/store operations and strided load. | ||||||
5309 | // \p Indices contains indices for strided load. | ||||||
5310 | // \p Factor - the factor of interleaving. | ||||||
5311 | // AVX-512 provides 3-src shuffles that significantly reduces the cost. | ||||||
5312 | InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512( | ||||||
5313 | unsigned Opcode, FixedVectorType *VecTy, unsigned Factor, | ||||||
5314 | ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace, | ||||||
5315 | TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) { | ||||||
5316 | // VecTy for interleave memop is <VF*Factor x Elt>. | ||||||
5317 | // So, for VF=4, Interleave Factor = 3, Element type = i32 we have | ||||||
5318 | // VecTy = <12 x i32>. | ||||||
5319 | |||||||
5320 | // Calculate the number of memory operations (NumOfMemOps), required | ||||||
5321 | // for load/store the VecTy. | ||||||
5322 | MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second; | ||||||
5323 | unsigned VecTySize = DL.getTypeStoreSize(VecTy); | ||||||
5324 | unsigned LegalVTSize = LegalVT.getStoreSize(); | ||||||
5325 | unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize; | ||||||
5326 | |||||||
5327 | // Get the cost of one memory operation. | ||||||
5328 | auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(), | ||||||
5329 | LegalVT.getVectorNumElements()); | ||||||
5330 | InstructionCost MemOpCost; | ||||||
5331 | bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps; | ||||||
5332 | if (UseMaskedMemOp) | ||||||
5333 | MemOpCost = getMaskedMemoryOpCost(Opcode, SingleMemOpTy, Alignment, | ||||||
5334 | AddressSpace, CostKind); | ||||||
5335 | else | ||||||
5336 | MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, MaybeAlign(Alignment), | ||||||
5337 | AddressSpace, CostKind); | ||||||
5338 | |||||||
5339 | unsigned VF = VecTy->getNumElements() / Factor; | ||||||
5340 | MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF); | ||||||
5341 | |||||||
5342 | InstructionCost MaskCost; | ||||||
5343 | if (UseMaskedMemOp) { | ||||||
5344 | APInt DemandedLoadStoreElts = APInt::getZero(VecTy->getNumElements()); | ||||||
5345 | for (unsigned Index : Indices) { | ||||||
5346 | assert(Index < Factor && "Invalid index for interleaved memory op")(static_cast <bool> (Index < Factor && "Invalid index for interleaved memory op" ) ? void (0) : __assert_fail ("Index < Factor && \"Invalid index for interleaved memory op\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 5346, __extension__ __PRETTY_FUNCTION__)); | ||||||
5347 | for (unsigned Elm = 0; Elm < VF; Elm++) | ||||||
5348 | DemandedLoadStoreElts.setBit(Index + Elm * Factor); | ||||||
5349 | } | ||||||
5350 | |||||||
5351 | Type *I1Type = Type::getInt1Ty(VecTy->getContext()); | ||||||
5352 | |||||||
5353 | MaskCost = getReplicationShuffleCost( | ||||||
5354 | I1Type, Factor, VF, | ||||||
5355 | UseMaskForGaps ? DemandedLoadStoreElts | ||||||
5356 | : APInt::getAllOnes(VecTy->getNumElements()), | ||||||
5357 | CostKind); | ||||||
5358 | |||||||
5359 | // The Gaps mask is invariant and created outside the loop, therefore the | ||||||
5360 | // cost of creating it is not accounted for here. However if we have both | ||||||
5361 | // a MaskForGaps and some other mask that guards the execution of the | ||||||
5362 | // memory access, we need to account for the cost of And-ing the two masks | ||||||
5363 | // inside the loop. | ||||||
5364 | if (UseMaskForGaps) { | ||||||
5365 | auto *MaskVT = FixedVectorType::get(I1Type, VecTy->getNumElements()); | ||||||
5366 | MaskCost += getArithmeticInstrCost(BinaryOperator::And, MaskVT, CostKind); | ||||||
5367 | } | ||||||
5368 | } | ||||||
5369 | |||||||
5370 | if (Opcode == Instruction::Load) { | ||||||
5371 | // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl) | ||||||
5372 | // contain the cost of the optimized shuffle sequence that the | ||||||
5373 | // X86InterleavedAccess pass will generate. | ||||||
5374 | // The cost of loads and stores are computed separately from the table. | ||||||
5375 | |||||||
5376 | // X86InterleavedAccess support only the following interleaved-access group. | ||||||
5377 | static const CostTblEntry AVX512InterleavedLoadTbl[] = { | ||||||
5378 | {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8 | ||||||
5379 | {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8 | ||||||
5380 | {3, MVT::v64i8, 22}, //(load 96i8 and) deinterleave into 3 x 32i8 | ||||||
5381 | }; | ||||||
5382 | |||||||
5383 | if (const auto *Entry = | ||||||
5384 | CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT)) | ||||||
5385 | return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost; | ||||||
5386 | //If an entry does not exist, fallback to the default implementation. | ||||||
5387 | |||||||
5388 | // Kind of shuffle depends on number of loaded values. | ||||||
5389 | // If we load the entire data in one register, we can use a 1-src shuffle. | ||||||
5390 | // Otherwise, we'll merge 2 sources in each operation. | ||||||
5391 | TTI::ShuffleKind ShuffleKind = | ||||||
5392 | (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc; | ||||||
5393 | |||||||
5394 | InstructionCost ShuffleCost = | ||||||
5395 | getShuffleCost(ShuffleKind, SingleMemOpTy, None, 0, nullptr); | ||||||
5396 | |||||||
5397 | unsigned NumOfLoadsInInterleaveGrp = | ||||||
5398 | Indices.size() ? Indices.size() : Factor; | ||||||
5399 | auto *ResultTy = FixedVectorType::get(VecTy->getElementType(), | ||||||
5400 | VecTy->getNumElements() / Factor); | ||||||
5401 | InstructionCost NumOfResults = | ||||||
5402 | getTLI()->getTypeLegalizationCost(DL, ResultTy).first * | ||||||
5403 | NumOfLoadsInInterleaveGrp; | ||||||
5404 | |||||||
5405 | // About a half of the loads may be folded in shuffles when we have only | ||||||
5406 | // one result. If we have more than one result, or the loads are masked, | ||||||
5407 | // we do not fold loads at all. | ||||||
5408 | unsigned NumOfUnfoldedLoads = | ||||||
5409 | UseMaskedMemOp || NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2; | ||||||
5410 | |||||||
5411 | // Get a number of shuffle operations per result. | ||||||
5412 | unsigned NumOfShufflesPerResult = | ||||||
5413 | std::max((unsigned)1, (unsigned)(NumOfMemOps - 1)); | ||||||
5414 | |||||||
5415 | // The SK_MergeTwoSrc shuffle clobbers one of src operands. | ||||||
5416 | // When we have more than one destination, we need additional instructions | ||||||
5417 | // to keep sources. | ||||||
5418 | InstructionCost NumOfMoves = 0; | ||||||
5419 | if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc) | ||||||
5420 | NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2; | ||||||
5421 | |||||||
5422 | InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost + | ||||||
5423 | MaskCost + NumOfUnfoldedLoads * MemOpCost + | ||||||
5424 | NumOfMoves; | ||||||
5425 | |||||||
5426 | return Cost; | ||||||
5427 | } | ||||||
5428 | |||||||
5429 | // Store. | ||||||
5430 | assert(Opcode == Instruction::Store &&(static_cast <bool> (Opcode == Instruction::Store && "Expected Store Instruction at this point") ? void (0) : __assert_fail ("Opcode == Instruction::Store && \"Expected Store Instruction at this point\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 5431, __extension__ __PRETTY_FUNCTION__)) | ||||||
5431 | "Expected Store Instruction at this point")(static_cast <bool> (Opcode == Instruction::Store && "Expected Store Instruction at this point") ? void (0) : __assert_fail ("Opcode == Instruction::Store && \"Expected Store Instruction at this point\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 5431, __extension__ __PRETTY_FUNCTION__)); | ||||||
5432 | // X86InterleavedAccess support only the following interleaved-access group. | ||||||
5433 | static const CostTblEntry AVX512InterleavedStoreTbl[] = { | ||||||
5434 | {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store) | ||||||
5435 | {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store) | ||||||
5436 | {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 96i8 (and store) | ||||||
5437 | |||||||
5438 | {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store) | ||||||
5439 | {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store) | ||||||
5440 | {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store) | ||||||
5441 | {4, MVT::v64i8, 24} // interleave 4 x 32i8 into 256i8 (and store) | ||||||
5442 | }; | ||||||
5443 | |||||||
5444 | if (const auto *Entry = | ||||||
5445 | CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT)) | ||||||
5446 | return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost; | ||||||
5447 | //If an entry does not exist, fallback to the default implementation. | ||||||
5448 | |||||||
5449 | // There is no strided stores meanwhile. And store can't be folded in | ||||||
5450 | // shuffle. | ||||||
5451 | unsigned NumOfSources = Factor; // The number of values to be merged. | ||||||
5452 | InstructionCost ShuffleCost = | ||||||
5453 | getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, None, 0, nullptr); | ||||||
5454 | unsigned NumOfShufflesPerStore = NumOfSources - 1; | ||||||
5455 | |||||||
5456 | // The SK_MergeTwoSrc shuffle clobbers one of src operands. | ||||||
5457 | // We need additional instructions to keep sources. | ||||||
5458 | unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2; | ||||||
5459 | InstructionCost Cost = | ||||||
5460 | MaskCost + | ||||||
5461 | NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) + | ||||||
5462 | NumOfMoves; | ||||||
5463 | return Cost; | ||||||
5464 | } | ||||||
5465 | |||||||
5466 | InstructionCost X86TTIImpl::getInterleavedMemoryOpCost( | ||||||
5467 | unsigned Opcode, Type *BaseTy, unsigned Factor, ArrayRef<unsigned> Indices, | ||||||
5468 | Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, | ||||||
5469 | bool UseMaskForCond, bool UseMaskForGaps) { | ||||||
5470 | auto *VecTy = cast<FixedVectorType>(BaseTy); | ||||||
5471 | |||||||
5472 | auto isSupportedOnAVX512 = [&](Type *VecTy, bool HasBW) { | ||||||
5473 | Type *EltTy = cast<VectorType>(VecTy)->getElementType(); | ||||||
5474 | if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) || | ||||||
5475 | EltTy->isIntegerTy(32) || EltTy->isPointerTy()) | ||||||
5476 | return true; | ||||||
5477 | if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || | ||||||
5478 | (!ST->useSoftFloat() && ST->hasFP16() && EltTy->isHalfTy())) | ||||||
5479 | return HasBW; | ||||||
5480 | return false; | ||||||
5481 | }; | ||||||
5482 | if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI())) | ||||||
5483 | return getInterleavedMemoryOpCostAVX512( | ||||||
5484 | Opcode, VecTy, Factor, Indices, Alignment, | ||||||
5485 | AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps); | ||||||
5486 | |||||||
5487 | if (UseMaskForCond || UseMaskForGaps) | ||||||
5488 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, | ||||||
5489 | Alignment, AddressSpace, CostKind, | ||||||
5490 | UseMaskForCond, UseMaskForGaps); | ||||||
5491 | |||||||
5492 | // Get estimation for interleaved load/store operations for SSE-AVX2. | ||||||
5493 | // As opposed to AVX-512, SSE-AVX2 do not have generic shuffles that allow | ||||||
5494 | // computing the cost using a generic formula as a function of generic | ||||||
5495 | // shuffles. We therefore use a lookup table instead, filled according to | ||||||
5496 | // the instruction sequences that codegen currently generates. | ||||||
5497 | |||||||
5498 | // VecTy for interleave memop is <VF*Factor x Elt>. | ||||||
5499 | // So, for VF=4, Interleave Factor = 3, Element type = i32 we have | ||||||
5500 | // VecTy = <12 x i32>. | ||||||
5501 | MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second; | ||||||
5502 | |||||||
5503 | // This function can be called with VecTy=<6xi128>, Factor=3, in which case | ||||||
5504 | // the VF=2, while v2i128 is an unsupported MVT vector type | ||||||
5505 | // (see MachineValueType.h::getVectorVT()). | ||||||
5506 | if (!LegalVT.isVector()) | ||||||
5507 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, | ||||||
5508 | Alignment, AddressSpace, CostKind); | ||||||
5509 | |||||||
5510 | unsigned VF = VecTy->getNumElements() / Factor; | ||||||
5511 | Type *ScalarTy = VecTy->getElementType(); | ||||||
5512 | // Deduplicate entries, model floats/pointers as appropriately-sized integers. | ||||||
5513 | if (!ScalarTy->isIntegerTy()) | ||||||
5514 | ScalarTy = | ||||||
5515 | Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy)); | ||||||
5516 | |||||||
5517 | // Get the cost of all the memory operations. | ||||||
5518 | // FIXME: discount dead loads. | ||||||
5519 | InstructionCost MemOpCosts = getMemoryOpCost( | ||||||
5520 | Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, CostKind); | ||||||
5521 | |||||||
5522 | auto *VT = FixedVectorType::get(ScalarTy, VF); | ||||||
5523 | EVT ETy = TLI->getValueType(DL, VT); | ||||||
5524 | if (!ETy.isSimple()) | ||||||
5525 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, | ||||||
5526 | Alignment, AddressSpace, CostKind); | ||||||
5527 | |||||||
5528 | // TODO: Complete for other data-types and strides. | ||||||
5529 | // Each combination of Stride, element bit width and VF results in a different | ||||||
5530 | // sequence; The cost tables are therefore accessed with: | ||||||
5531 | // Factor (stride) and VectorType=VFxiN. | ||||||
5532 | // The Cost accounts only for the shuffle sequence; | ||||||
5533 | // The cost of the loads/stores is accounted for separately. | ||||||
5534 | // | ||||||
5535 | static const CostTblEntry AVX2InterleavedLoadTbl[] = { | ||||||
5536 | {2, MVT::v2i8, 2}, // (load 4i8 and) deinterleave into 2 x 2i8 | ||||||
5537 | {2, MVT::v4i8, 2}, // (load 8i8 and) deinterleave into 2 x 4i8 | ||||||
5538 | {2, MVT::v8i8, 2}, // (load 16i8 and) deinterleave into 2 x 8i8 | ||||||
5539 | {2, MVT::v16i8, 4}, // (load 32i8 and) deinterleave into 2 x 16i8 | ||||||
5540 | {2, MVT::v32i8, 6}, // (load 64i8 and) deinterleave into 2 x 32i8 | ||||||
5541 | |||||||
5542 | {2, MVT::v8i16, 6}, // (load 16i16 and) deinterleave into 2 x 8i16 | ||||||
5543 | {2, MVT::v16i16, 9}, // (load 32i16 and) deinterleave into 2 x 16i16 | ||||||
5544 | {2, MVT::v32i16, 18}, // (load 64i16 and) deinterleave into 2 x 32i16 | ||||||
5545 | |||||||
5546 | {2, MVT::v8i32, 4}, // (load 16i32 and) deinterleave into 2 x 8i32 | ||||||
5547 | {2, MVT::v16i32, 8}, // (load 32i32 and) deinterleave into 2 x 16i32 | ||||||
5548 | {2, MVT::v32i32, 16}, // (load 64i32 and) deinterleave into 2 x 32i32 | ||||||
5549 | |||||||
5550 | {2, MVT::v4i64, 4}, // (load 8i64 and) deinterleave into 2 x 4i64 | ||||||
5551 | {2, MVT::v8i64, 8}, // (load 16i64 and) deinterleave into 2 x 8i64 | ||||||
5552 | {2, MVT::v16i64, 16}, // (load 32i64 and) deinterleave into 2 x 16i64 | ||||||
5553 | {2, MVT::v32i64, 32}, // (load 64i64 and) deinterleave into 2 x 32i64 | ||||||
5554 | |||||||
5555 | {3, MVT::v2i8, 3}, // (load 6i8 and) deinterleave into 3 x 2i8 | ||||||
5556 | {3, MVT::v4i8, 3}, // (load 12i8 and) deinterleave into 3 x 4i8 | ||||||
5557 | {3, MVT::v8i8, 6}, // (load 24i8 and) deinterleave into 3 x 8i8 | ||||||
5558 | {3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8 | ||||||
5559 | {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8 | ||||||
5560 | |||||||
5561 | {3, MVT::v2i16, 5}, // (load 6i16 and) deinterleave into 3 x 2i16 | ||||||
5562 | {3, MVT::v4i16, 7}, // (load 12i16 and) deinterleave into 3 x 4i16 | ||||||
5563 | {3, MVT::v8i16, 9}, // (load 24i16 and) deinterleave into 3 x 8i16 | ||||||
5564 | {3, MVT::v16i16, 28}, // (load 48i16 and) deinterleave into 3 x 16i16 | ||||||
5565 | {3, MVT::v32i16, 56}, // (load 96i16 and) deinterleave into 3 x 32i16 | ||||||
5566 | |||||||
5567 | {3, MVT::v2i32, 3}, // (load 6i32 and) deinterleave into 3 x 2i32 | ||||||
5568 | {3, MVT::v4i32, 3}, // (load 12i32 and) deinterleave into 3 x 4i32 | ||||||
5569 | {3, MVT::v8i32, 7}, // (load 24i32 and) deinterleave into 3 x 8i32 | ||||||
5570 | {3, MVT::v16i32, 14}, // (load 48i32 and) deinterleave into 3 x 16i32 | ||||||
5571 | {3, MVT::v32i32, 32}, // (load 96i32 and) deinterleave into 3 x 32i32 | ||||||
5572 | |||||||
5573 | {3, MVT::v2i64, 1}, // (load 6i64 and) deinterleave into 3 x 2i64 | ||||||
5574 | {3, MVT::v4i64, 5}, // (load 12i64 and) deinterleave into 3 x 4i64 | ||||||
5575 | {3, MVT::v8i64, 10}, // (load 24i64 and) deinterleave into 3 x 8i64 | ||||||
5576 | {3, MVT::v16i64, 20}, // (load 48i64 and) deinterleave into 3 x 16i64 | ||||||
5577 | |||||||
5578 | {4, MVT::v2i8, 4}, // (load 8i8 and) deinterleave into 4 x 2i8 | ||||||
5579 | {4, MVT::v4i8, 4}, // (load 16i8 and) deinterleave into 4 x 4i8 | ||||||
5580 | {4, MVT::v8i8, 12}, // (load 32i8 and) deinterleave into 4 x 8i8 | ||||||
5581 | {4, MVT::v16i8, 24}, // (load 64i8 and) deinterleave into 4 x 16i8 | ||||||
5582 | {4, MVT::v32i8, 56}, // (load 128i8 and) deinterleave into 4 x 32i8 | ||||||
5583 | |||||||
5584 | {4, MVT::v2i16, 6}, // (load 8i16 and) deinterleave into 4 x 2i16 | ||||||
5585 | {4, MVT::v4i16, 17}, // (load 16i16 and) deinterleave into 4 x 4i16 | ||||||
5586 | {4, MVT::v8i16, 33}, // (load 32i16 and) deinterleave into 4 x 8i16 | ||||||
5587 | {4, MVT::v16i16, 75}, // (load 64i16 and) deinterleave into 4 x 16i16 | ||||||
5588 | {4, MVT::v32i16, 150}, // (load 128i16 and) deinterleave into 4 x 32i16 | ||||||
5589 | |||||||
5590 | {4, MVT::v2i32, 4}, // (load 8i32 and) deinterleave into 4 x 2i32 | ||||||
5591 | {4, MVT::v4i32, 8}, // (load 16i32 and) deinterleave into 4 x 4i32 | ||||||
5592 | {4, MVT::v8i32, 16}, // (load 32i32 and) deinterleave into 4 x 8i32 | ||||||
5593 | {4, MVT::v16i32, 32}, // (load 64i32 and) deinterleave into 4 x 16i32 | ||||||
5594 | {4, MVT::v32i32, 68}, // (load 128i32 and) deinterleave into 4 x 32i32 | ||||||
5595 | |||||||
5596 | {4, MVT::v2i64, 6}, // (load 8i64 and) deinterleave into 4 x 2i64 | ||||||
5597 | {4, MVT::v4i64, 8}, // (load 16i64 and) deinterleave into 4 x 4i64 | ||||||
5598 | {4, MVT::v8i64, 20}, // (load 32i64 and) deinterleave into 4 x 8i64 | ||||||
5599 | {4, MVT::v16i64, 40}, // (load 64i64 and) deinterleave into 4 x 16i64 | ||||||
5600 | |||||||
5601 | {6, MVT::v2i8, 6}, // (load 12i8 and) deinterleave into 6 x 2i8 | ||||||
5602 | {6, MVT::v4i8, 14}, // (load 24i8 and) deinterleave into 6 x 4i8 | ||||||
5603 | {6, MVT::v8i8, 18}, // (load 48i8 and) deinterleave into 6 x 8i8 | ||||||
5604 | {6, MVT::v16i8, 43}, // (load 96i8 and) deinterleave into 6 x 16i8 | ||||||
5605 | {6, MVT::v32i8, 82}, // (load 192i8 and) deinterleave into 6 x 32i8 | ||||||
5606 | |||||||
5607 | {6, MVT::v2i16, 13}, // (load 12i16 and) deinterleave into 6 x 2i16 | ||||||
5608 | {6, MVT::v4i16, 9}, // (load 24i16 and) deinterleave into 6 x 4i16 | ||||||
5609 | {6, MVT::v8i16, 39}, // (load 48i16 and) deinterleave into 6 x 8i16 | ||||||
5610 | {6, MVT::v16i16, 106}, // (load 96i16 and) deinterleave into 6 x 16i16 | ||||||
5611 | {6, MVT::v32i16, 212}, // (load 192i16 and) deinterleave into 6 x 32i16 | ||||||
5612 | |||||||
5613 | {6, MVT::v2i32, 6}, // (load 12i32 and) deinterleave into 6 x 2i32 | ||||||
5614 | {6, MVT::v4i32, 15}, // (load 24i32 and) deinterleave into 6 x 4i32 | ||||||
5615 | {6, MVT::v8i32, 31}, // (load 48i32 and) deinterleave into 6 x 8i32 | ||||||
5616 | {6, MVT::v16i32, 64}, // (load 96i32 and) deinterleave into 6 x 16i32 | ||||||
5617 | |||||||
5618 | {6, MVT::v2i64, 6}, // (load 12i64 and) deinterleave into 6 x 2i64 | ||||||
5619 | {6, MVT::v4i64, 18}, // (load 24i64 and) deinterleave into 6 x 4i64 | ||||||
5620 | {6, MVT::v8i64, 36}, // (load 48i64 and) deinterleave into 6 x 8i64 | ||||||
5621 | |||||||
5622 | {8, MVT::v8i32, 40} // (load 64i32 and) deinterleave into 8 x 8i32 | ||||||
5623 | }; | ||||||
5624 | |||||||
5625 | static const CostTblEntry SSSE3InterleavedLoadTbl[] = { | ||||||
5626 | {2, MVT::v4i16, 2}, // (load 8i16 and) deinterleave into 2 x 4i16 | ||||||
5627 | }; | ||||||
5628 | |||||||
5629 | static const CostTblEntry SSE2InterleavedLoadTbl[] = { | ||||||
5630 | {2, MVT::v2i16, 2}, // (load 4i16 and) deinterleave into 2 x 2i16 | ||||||
5631 | {2, MVT::v4i16, 7}, // (load 8i16 and) deinterleave into 2 x 4i16 | ||||||
5632 | |||||||
5633 | {2, MVT::v2i32, 2}, // (load 4i32 and) deinterleave into 2 x 2i32 | ||||||
5634 | {2, MVT::v4i32, 2}, // (load 8i32 and) deinterleave into 2 x 4i32 | ||||||
5635 | |||||||
5636 | {2, MVT::v2i64, 2}, // (load 4i64 and) deinterleave into 2 x 2i64 | ||||||
5637 | }; | ||||||
5638 | |||||||
5639 | static const CostTblEntry AVX2InterleavedStoreTbl[] = { | ||||||
5640 | {2, MVT::v16i8, 3}, // interleave 2 x 16i8 into 32i8 (and store) | ||||||
5641 | {2, MVT::v32i8, 4}, // interleave 2 x 32i8 into 64i8 (and store) | ||||||
5642 | |||||||
5643 | {2, MVT::v8i16, 3}, // interleave 2 x 8i16 into 16i16 (and store) | ||||||
5644 | {2, MVT::v16i16, 4}, // interleave 2 x 16i16 into 32i16 (and store) | ||||||
5645 | {2, MVT::v32i16, 8}, // interleave 2 x 32i16 into 64i16 (and store) | ||||||
5646 | |||||||
5647 | {2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32 (and store) | ||||||
5648 | {2, MVT::v8i32, 4}, // interleave 2 x 8i32 into 16i32 (and store) | ||||||
5649 | {2, MVT::v16i32, 8}, // interleave 2 x 16i32 into 32i32 (and store) | ||||||
5650 | {2, MVT::v32i32, 16}, // interleave 2 x 32i32 into 64i32 (and store) | ||||||
5651 | |||||||
5652 | {2, MVT::v2i64, 2}, // interleave 2 x 2i64 into 4i64 (and store) | ||||||
5653 | {2, MVT::v4i64, 4}, // interleave 2 x 4i64 into 8i64 (and store) | ||||||
5654 | {2, MVT::v8i64, 8}, // interleave 2 x 8i64 into 16i64 (and store) | ||||||
5655 | {2, MVT::v16i64, 16}, // interleave 2 x 16i64 into 32i64 (and store) | ||||||
5656 | {2, MVT::v32i64, 32}, // interleave 2 x 32i64 into 64i64 (and store) | ||||||
5657 | |||||||
5658 | {3, MVT::v2i8, 4}, // interleave 3 x 2i8 into 6i8 (and store) | ||||||
5659 | {3, MVT::v4i8, 4}, // interleave 3 x 4i8 into 12i8 (and store) | ||||||
5660 | {3, MVT::v8i8, 6}, // interleave 3 x 8i8 into 24i8 (and store) | ||||||
5661 | {3, MVT::v16i8, 11}, // interleave 3 x 16i8 into 48i8 (and store) | ||||||
5662 | {3, MVT::v32i8, 13}, // interleave 3 x 32i8 into 96i8 (and store) | ||||||
5663 | |||||||
5664 | {3, MVT::v2i16, 4}, // interleave 3 x 2i16 into 6i16 (and store) | ||||||
5665 | {3, MVT::v4i16, 6}, // interleave 3 x 4i16 into 12i16 (and store) | ||||||
5666 | {3, MVT::v8i16, 12}, // interleave 3 x 8i16 into 24i16 (and store) | ||||||
5667 | {3, MVT::v16i16, 27}, // interleave 3 x 16i16 into 48i16 (and store) | ||||||
5668 | {3, MVT::v32i16, 54}, // interleave 3 x 32i16 into 96i16 (and store) | ||||||
5669 | |||||||
5670 | {3, MVT::v2i32, 4}, // interleave 3 x 2i32 into 6i32 (and store) | ||||||
5671 | {3, MVT::v4i32, 5}, // interleave 3 x 4i32 into 12i32 (and store) | ||||||
5672 | {3, MVT::v8i32, 11}, // interleave 3 x 8i32 into 24i32 (and store) | ||||||
5673 | {3, MVT::v16i32, 22}, // interleave 3 x 16i32 into 48i32 (and store) | ||||||
5674 | {3, MVT::v32i32, 48}, // interleave 3 x 32i32 into 96i32 (and store) | ||||||
5675 | |||||||
5676 | {3, MVT::v2i64, 4}, // interleave 3 x 2i64 into 6i64 (and store) | ||||||
5677 | {3, MVT::v4i64, 6}, // interleave 3 x 4i64 into 12i64 (and store) | ||||||
5678 | {3, MVT::v8i64, 12}, // interleave 3 x 8i64 into 24i64 (and store) | ||||||
5679 | {3, MVT::v16i64, 24}, // interleave 3 x 16i64 into 48i64 (and store) | ||||||
5680 | |||||||
5681 | {4, MVT::v2i8, 4}, // interleave 4 x 2i8 into 8i8 (and store) | ||||||
5682 | {4, MVT::v4i8, 4}, // interleave 4 x 4i8 into 16i8 (and store) | ||||||
5683 | {4, MVT::v8i8, 4}, // interleave 4 x 8i8 into 32i8 (and store) | ||||||
5684 | {4, MVT::v16i8, 8}, // interleave 4 x 16i8 into 64i8 (and store) | ||||||
5685 | {4, MVT::v32i8, 12}, // interleave 4 x 32i8 into 128i8 (and store) | ||||||
5686 | |||||||
5687 | {4, MVT::v2i16, 2}, // interleave 4 x 2i16 into 8i16 (and store) | ||||||
5688 | {4, MVT::v4i16, 6}, // interleave 4 x 4i16 into 16i16 (and store) | ||||||
5689 | {4, MVT::v8i16, 10}, // interleave 4 x 8i16 into 32i16 (and store) | ||||||
5690 | {4, MVT::v16i16, 32}, // interleave 4 x 16i16 into 64i16 (and store) | ||||||
5691 | {4, MVT::v32i16, 64}, // interleave 4 x 32i16 into 128i16 (and store) | ||||||
5692 | |||||||
5693 | {4, MVT::v2i32, 5}, // interleave 4 x 2i32 into 8i32 (and store) | ||||||
5694 | {4, MVT::v4i32, 6}, // interleave 4 x 4i32 into 16i32 (and store) | ||||||
5695 | {4, MVT::v8i32, 16}, // interleave 4 x 8i32 into 32i32 (and store) | ||||||
5696 | {4, MVT::v16i32, 32}, // interleave 4 x 16i32 into 64i32 (and store) | ||||||
5697 | {4, MVT::v32i32, 64}, // interleave 4 x 32i32 into 128i32 (and store) | ||||||
5698 | |||||||
5699 | {4, MVT::v2i64, 6}, // interleave 4 x 2i64 into 8i64 (and store) | ||||||
5700 | {4, MVT::v4i64, 8}, // interleave 4 x 4i64 into 16i64 (and store) | ||||||
5701 | {4, MVT::v8i64, 20}, // interleave 4 x 8i64 into 32i64 (and store) | ||||||
5702 | {4, MVT::v16i64, 40}, // interleave 4 x 16i64 into 64i64 (and store) | ||||||
5703 | |||||||
5704 | {6, MVT::v2i8, 7}, // interleave 6 x 2i8 into 12i8 (and store) | ||||||
5705 | {6, MVT::v4i8, 9}, // interleave 6 x 4i8 into 24i8 (and store) | ||||||
5706 | {6, MVT::v8i8, 16}, // interleave 6 x 8i8 into 48i8 (and store) | ||||||
5707 | {6, MVT::v16i8, 27}, // interleave 6 x 16i8 into 96i8 (and store) | ||||||
5708 | {6, MVT::v32i8, 90}, // interleave 6 x 32i8 into 192i8 (and store) | ||||||
5709 | |||||||
5710 | {6, MVT::v2i16, 10}, // interleave 6 x 2i16 into 12i16 (and store) | ||||||
5711 | {6, MVT::v4i16, 15}, // interleave 6 x 4i16 into 24i16 (and store) | ||||||
5712 | {6, MVT::v8i16, 21}, // interleave 6 x 8i16 into 48i16 (and store) | ||||||
5713 | {6, MVT::v16i16, 58}, // interleave 6 x 16i16 into 96i16 (and store) | ||||||
5714 | {6, MVT::v32i16, 90}, // interleave 6 x 32i16 into 192i16 (and store) | ||||||
5715 | |||||||
5716 | {6, MVT::v2i32, 9}, // interleave 6 x 2i32 into 12i32 (and store) | ||||||
5717 | {6, MVT::v4i32, 12}, // interleave 6 x 4i32 into 24i32 (and store) | ||||||
5718 | {6, MVT::v8i32, 33}, // interleave 6 x 8i32 into 48i32 (and store) | ||||||
5719 | {6, MVT::v16i32, 66}, // interleave 6 x 16i32 into 96i32 (and store) | ||||||
5720 | |||||||
5721 | {6, MVT::v2i64, 8}, // interleave 6 x 2i64 into 12i64 (and store) | ||||||
5722 | {6, MVT::v4i64, 15}, // interleave 6 x 4i64 into 24i64 (and store) | ||||||
5723 | {6, MVT::v8i64, 30}, // interleave 6 x 8i64 into 48i64 (and store) | ||||||
5724 | }; | ||||||
5725 | |||||||
5726 | static const CostTblEntry SSE2InterleavedStoreTbl[] = { | ||||||
5727 | {2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8 (and store) | ||||||
5728 | {2, MVT::v4i8, 1}, // interleave 2 x 4i8 into 8i8 (and store) | ||||||
5729 | {2, MVT::v8i8, 1}, // interleave 2 x 8i8 into 16i8 (and store) | ||||||
5730 | |||||||
5731 | {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16 (and store) | ||||||
5732 | {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16 (and store) | ||||||
5733 | |||||||
5734 | {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32 (and store) | ||||||
5735 | }; | ||||||
5736 | |||||||
5737 | if (Opcode == Instruction::Load) { | ||||||
5738 | auto GetDiscountedCost = [Factor, NumMembers = Indices.size(), | ||||||
5739 | MemOpCosts](const CostTblEntry *Entry) { | ||||||
5740 | // NOTE: this is just an approximation! | ||||||
5741 | // It can over/under -estimate the cost! | ||||||
5742 | return MemOpCosts + divideCeil(NumMembers * Entry->Cost, Factor); | ||||||
5743 | }; | ||||||
5744 | |||||||
5745 | if (ST->hasAVX2()) | ||||||
5746 | if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor, | ||||||
5747 | ETy.getSimpleVT())) | ||||||
5748 | return GetDiscountedCost(Entry); | ||||||
5749 | |||||||
5750 | if (ST->hasSSSE3()) | ||||||
5751 | if (const auto *Entry = CostTableLookup(SSSE3InterleavedLoadTbl, Factor, | ||||||
5752 | ETy.getSimpleVT())) | ||||||
5753 | return GetDiscountedCost(Entry); | ||||||
5754 | |||||||
5755 | if (ST->hasSSE2()) | ||||||
5756 | if (const auto *Entry = CostTableLookup(SSE2InterleavedLoadTbl, Factor, | ||||||
5757 | ETy.getSimpleVT())) | ||||||
5758 | return GetDiscountedCost(Entry); | ||||||
5759 | } else { | ||||||
5760 | assert(Opcode == Instruction::Store &&(static_cast <bool> (Opcode == Instruction::Store && "Expected Store Instruction at this point") ? void (0) : __assert_fail ("Opcode == Instruction::Store && \"Expected Store Instruction at this point\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 5761, __extension__ __PRETTY_FUNCTION__)) | ||||||
5761 | "Expected Store Instruction at this point")(static_cast <bool> (Opcode == Instruction::Store && "Expected Store Instruction at this point") ? void (0) : __assert_fail ("Opcode == Instruction::Store && \"Expected Store Instruction at this point\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 5761, __extension__ __PRETTY_FUNCTION__)); | ||||||
5762 | assert((!Indices.size() || Indices.size() == Factor) &&(static_cast <bool> ((!Indices.size() || Indices.size() == Factor) && "Interleaved store only supports fully-interleaved groups." ) ? void (0) : __assert_fail ("(!Indices.size() || Indices.size() == Factor) && \"Interleaved store only supports fully-interleaved groups.\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 5763, __extension__ __PRETTY_FUNCTION__)) | ||||||
5763 | "Interleaved store only supports fully-interleaved groups.")(static_cast <bool> ((!Indices.size() || Indices.size() == Factor) && "Interleaved store only supports fully-interleaved groups." ) ? void (0) : __assert_fail ("(!Indices.size() || Indices.size() == Factor) && \"Interleaved store only supports fully-interleaved groups.\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 5763, __extension__ __PRETTY_FUNCTION__)); | ||||||
5764 | if (ST->hasAVX2()) | ||||||
5765 | if (const auto *Entry = CostTableLookup(AVX2InterleavedStoreTbl, Factor, | ||||||
5766 | ETy.getSimpleVT())) | ||||||
5767 | return MemOpCosts + Entry->Cost; | ||||||
5768 | |||||||
5769 | if (ST->hasSSE2()) | ||||||
5770 | if (const auto *Entry = CostTableLookup(SSE2InterleavedStoreTbl, Factor, | ||||||
5771 | ETy.getSimpleVT())) | ||||||
5772 | return MemOpCosts + Entry->Cost; | ||||||
5773 | } | ||||||
5774 | |||||||
5775 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, | ||||||
5776 | Alignment, AddressSpace, CostKind, | ||||||
5777 | UseMaskForCond, UseMaskForGaps); | ||||||
5778 | } |
1 | //===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | /// \file |
10 | /// This file provides a helper that implements much of the TTI interface in |
11 | /// terms of the target-independent code generator and TargetLowering |
12 | /// interfaces. |
13 | // |
14 | //===----------------------------------------------------------------------===// |
15 | |
16 | #ifndef LLVM_CODEGEN_BASICTTIIMPL_H |
17 | #define LLVM_CODEGEN_BASICTTIIMPL_H |
18 | |
19 | #include "llvm/ADT/APInt.h" |
20 | #include "llvm/ADT/ArrayRef.h" |
21 | #include "llvm/ADT/BitVector.h" |
22 | #include "llvm/ADT/SmallPtrSet.h" |
23 | #include "llvm/ADT/SmallVector.h" |
24 | #include "llvm/Analysis/LoopInfo.h" |
25 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
26 | #include "llvm/Analysis/TargetTransformInfo.h" |
27 | #include "llvm/Analysis/TargetTransformInfoImpl.h" |
28 | #include "llvm/CodeGen/ISDOpcodes.h" |
29 | #include "llvm/CodeGen/TargetLowering.h" |
30 | #include "llvm/CodeGen/TargetSubtargetInfo.h" |
31 | #include "llvm/CodeGen/ValueTypes.h" |
32 | #include "llvm/IR/BasicBlock.h" |
33 | #include "llvm/IR/Constant.h" |
34 | #include "llvm/IR/Constants.h" |
35 | #include "llvm/IR/DataLayout.h" |
36 | #include "llvm/IR/DerivedTypes.h" |
37 | #include "llvm/IR/InstrTypes.h" |
38 | #include "llvm/IR/Instruction.h" |
39 | #include "llvm/IR/Instructions.h" |
40 | #include "llvm/IR/Intrinsics.h" |
41 | #include "llvm/IR/Operator.h" |
42 | #include "llvm/IR/Type.h" |
43 | #include "llvm/IR/Value.h" |
44 | #include "llvm/Support/Casting.h" |
45 | #include "llvm/Support/CommandLine.h" |
46 | #include "llvm/Support/ErrorHandling.h" |
47 | #include "llvm/Support/MachineValueType.h" |
48 | #include "llvm/Support/MathExtras.h" |
49 | #include "llvm/Target/TargetMachine.h" |
50 | #include <algorithm> |
51 | #include <cassert> |
52 | #include <cstdint> |
53 | #include <limits> |
54 | #include <utility> |
55 | |
56 | namespace llvm { |
57 | |
58 | class Function; |
59 | class GlobalValue; |
60 | class LLVMContext; |
61 | class ScalarEvolution; |
62 | class SCEV; |
63 | class TargetMachine; |
64 | |
65 | extern cl::opt<unsigned> PartialUnrollingThreshold; |
66 | |
67 | /// Base class which can be used to help build a TTI implementation. |
68 | /// |
69 | /// This class provides as much implementation of the TTI interface as is |
70 | /// possible using the target independent parts of the code generator. |
71 | /// |
72 | /// In order to subclass it, your class must implement a getST() method to |
73 | /// return the subtarget, and a getTLI() method to return the target lowering. |
74 | /// We need these methods implemented in the derived class so that this class |
75 | /// doesn't have to duplicate storage for them. |
76 | template <typename T> |
77 | class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { |
78 | private: |
79 | using BaseT = TargetTransformInfoImplCRTPBase<T>; |
80 | using TTI = TargetTransformInfo; |
81 | |
82 | /// Helper function to access this as a T. |
83 | T *thisT() { return static_cast<T *>(this); } |
84 | |
85 | /// Estimate a cost of Broadcast as an extract and sequence of insert |
86 | /// operations. |
/// \param VTy the fixed-width vector type being broadcast.
/// \returns the cost of one ExtractElement at index 0 plus one InsertElement
/// per element of \p VTy, each queried via thisT()->getVectorInstrCost().
87 | InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy) { |
88 | InstructionCost Cost = 0; |
89 | // Broadcast cost is equal to the cost of extracting the zero'th element |
90 | // plus the cost of inserting it into every element of the result vector. |
91 | Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, 0); |
92 | |
// Add one InsertElement cost for every element index of VTy.
93 | for (int i = 0, e = VTy->getNumElements(); i < e; ++i) { |
94 | Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i); |
95 | } |
96 | return Cost; |
97 | } |
98 | |
99 | /// Estimate a cost of shuffle as a sequence of extract and insert |
100 | /// operations. |
/// \param VTy the fixed-width vector type being shuffled.
/// \returns the sum, over every element index of \p VTy, of one InsertElement
/// and one ExtractElement cost queried via thisT()->getVectorInstrCost().
101 | InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy) { |
102 | InstructionCost Cost = 0; |
103 | // Shuffle cost is equal to the cost of extracting element from its argument |
104 | // plus the cost of inserting them onto the result vector. |
105 | |
106 | // e.g. <4 x float> has a mask of <0,5,2,7> i.e we need to extract from |
107 | // index 0 of first vector, index 1 of second vector,index 2 of first |
108 | // vector and finally index 3 of second vector and insert them at index |
109 | // <0,1,2,3> of result vector. |
110 | for (int i = 0, e = VTy->getNumElements(); i < e; ++i) { |
111 | Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i); |
112 | Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, i); |
113 | } |
114 | return Cost; |
115 | } |
116 | |
117 | /// Estimate a cost of subvector extraction as a sequence of extract and |
118 | /// insert operations. |
/// \param VTy the source vector type; must be non-null.
/// \param Index element offset in \p VTy at which the subvector starts.
/// \param SubVTy the fixed-width result subvector type; must be non-null.
/// \returns the sum, for each element of \p SubVTy, of an ExtractElement from
/// \p VTy at (i + Index) and an InsertElement into \p SubVTy at i.
119 | InstructionCost getExtractSubvectorOverhead(VectorType *VTy, int Index, |
120 | FixedVectorType *SubVTy) { |
// NOTE(review): the text after each assert fragment below appears to be the
// analyzer report's inline rendering of the expanded assert macro.
121 | assert(VTy && SubVTy &&(static_cast <bool> (VTy && SubVTy && "Can only extract subvectors from vectors" ) ? void (0) : __assert_fail ("VTy && SubVTy && \"Can only extract subvectors from vectors\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 122, __extension__ __PRETTY_FUNCTION__)) |
122 | "Can only extract subvectors from vectors")(static_cast <bool> (VTy && SubVTy && "Can only extract subvectors from vectors" ) ? void (0) : __assert_fail ("VTy && SubVTy && \"Can only extract subvectors from vectors\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 122, __extension__ __PRETTY_FUNCTION__)); |
123 | int NumSubElts = SubVTy->getNumElements(); |
// The range check is only applied when VTy is a FixedVectorType.
124 | assert((!isa<FixedVectorType>(VTy) ||(static_cast <bool> ((!isa<FixedVectorType>(VTy) || (Index + NumSubElts) <= (int)cast<FixedVectorType>( VTy)->getNumElements()) && "SK_ExtractSubvector index out of range" ) ? void (0) : __assert_fail ("(!isa<FixedVectorType>(VTy) || (Index + NumSubElts) <= (int)cast<FixedVectorType>(VTy)->getNumElements()) && \"SK_ExtractSubvector index out of range\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 127, __extension__ __PRETTY_FUNCTION__)) |
125 | (Index + NumSubElts) <=(static_cast <bool> ((!isa<FixedVectorType>(VTy) || (Index + NumSubElts) <= (int)cast<FixedVectorType>( VTy)->getNumElements()) && "SK_ExtractSubvector index out of range" ) ? void (0) : __assert_fail ("(!isa<FixedVectorType>(VTy) || (Index + NumSubElts) <= (int)cast<FixedVectorType>(VTy)->getNumElements()) && \"SK_ExtractSubvector index out of range\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 127, __extension__ __PRETTY_FUNCTION__)) |
126 | (int)cast<FixedVectorType>(VTy)->getNumElements()) &&(static_cast <bool> ((!isa<FixedVectorType>(VTy) || (Index + NumSubElts) <= (int)cast<FixedVectorType>( VTy)->getNumElements()) && "SK_ExtractSubvector index out of range" ) ? void (0) : __assert_fail ("(!isa<FixedVectorType>(VTy) || (Index + NumSubElts) <= (int)cast<FixedVectorType>(VTy)->getNumElements()) && \"SK_ExtractSubvector index out of range\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 127, __extension__ __PRETTY_FUNCTION__)) |
127 | "SK_ExtractSubvector index out of range")(static_cast <bool> ((!isa<FixedVectorType>(VTy) || (Index + NumSubElts) <= (int)cast<FixedVectorType>( VTy)->getNumElements()) && "SK_ExtractSubvector index out of range" ) ? void (0) : __assert_fail ("(!isa<FixedVectorType>(VTy) || (Index + NumSubElts) <= (int)cast<FixedVectorType>(VTy)->getNumElements()) && \"SK_ExtractSubvector index out of range\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 127, __extension__ __PRETTY_FUNCTION__)); |
128 | |
129 | InstructionCost Cost = 0; |
130 | // Subvector extraction cost is equal to the cost of extracting element from |
131 | // the source type plus the cost of inserting them into the result vector |
132 | // type. |
133 | for (int i = 0; i != NumSubElts; ++i) { |
134 | Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, |
135 | i + Index); |
136 | Cost += |
137 | thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy, i); |
138 | } |
139 | return Cost; |
140 | } |
141 | |
142 | /// Estimate a cost of subvector insertion as a sequence of extract and |
143 | /// insert operations. |
/// \param VTy the destination vector type; must be non-null.
/// \param Index element offset in \p VTy at which the subvector is placed.
/// \param SubVTy the fixed-width subvector type being inserted; must be
/// non-null.
/// \returns the sum, for each element of \p SubVTy, of an ExtractElement from
/// \p SubVTy at i and an InsertElement into \p VTy at (i + Index).
144 | InstructionCost getInsertSubvectorOverhead(VectorType *VTy, int Index, |
145 | FixedVectorType *SubVTy) { |
// NOTE(review): the text after each assert fragment below appears to be the
// analyzer report's inline rendering of the expanded assert macro.
146 | assert(VTy && SubVTy &&(static_cast <bool> (VTy && SubVTy && "Can only insert subvectors into vectors" ) ? void (0) : __assert_fail ("VTy && SubVTy && \"Can only insert subvectors into vectors\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 147, __extension__ __PRETTY_FUNCTION__)) |
147 | "Can only insert subvectors into vectors")(static_cast <bool> (VTy && SubVTy && "Can only insert subvectors into vectors" ) ? void (0) : __assert_fail ("VTy && SubVTy && \"Can only insert subvectors into vectors\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 147, __extension__ __PRETTY_FUNCTION__)); |
148 | int NumSubElts = SubVTy->getNumElements(); |
// The range check is only applied when VTy is a FixedVectorType.
149 | assert((!isa<FixedVectorType>(VTy) ||(static_cast <bool> ((!isa<FixedVectorType>(VTy) || (Index + NumSubElts) <= (int)cast<FixedVectorType>( VTy)->getNumElements()) && "SK_InsertSubvector index out of range" ) ? void (0) : __assert_fail ("(!isa<FixedVectorType>(VTy) || (Index + NumSubElts) <= (int)cast<FixedVectorType>(VTy)->getNumElements()) && \"SK_InsertSubvector index out of range\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 152, __extension__ __PRETTY_FUNCTION__)) |
150 | (Index + NumSubElts) <=(static_cast <bool> ((!isa<FixedVectorType>(VTy) || (Index + NumSubElts) <= (int)cast<FixedVectorType>( VTy)->getNumElements()) && "SK_InsertSubvector index out of range" ) ? void (0) : __assert_fail ("(!isa<FixedVectorType>(VTy) || (Index + NumSubElts) <= (int)cast<FixedVectorType>(VTy)->getNumElements()) && \"SK_InsertSubvector index out of range\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 152, __extension__ __PRETTY_FUNCTION__)) |
151 | (int)cast<FixedVectorType>(VTy)->getNumElements()) &&(static_cast <bool> ((!isa<FixedVectorType>(VTy) || (Index + NumSubElts) <= (int)cast<FixedVectorType>( VTy)->getNumElements()) && "SK_InsertSubvector index out of range" ) ? void (0) : __assert_fail ("(!isa<FixedVectorType>(VTy) || (Index + NumSubElts) <= (int)cast<FixedVectorType>(VTy)->getNumElements()) && \"SK_InsertSubvector index out of range\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 152, __extension__ __PRETTY_FUNCTION__)) |
152 | "SK_InsertSubvector index out of range")(static_cast <bool> ((!isa<FixedVectorType>(VTy) || (Index + NumSubElts) <= (int)cast<FixedVectorType>( VTy)->getNumElements()) && "SK_InsertSubvector index out of range" ) ? void (0) : __assert_fail ("(!isa<FixedVectorType>(VTy) || (Index + NumSubElts) <= (int)cast<FixedVectorType>(VTy)->getNumElements()) && \"SK_InsertSubvector index out of range\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 152, __extension__ __PRETTY_FUNCTION__)); |
153 | |
154 | InstructionCost Cost = 0; |
155 | // Subvector insertion cost is equal to the cost of extracting element from |
156 | // the source type plus the cost of inserting them into the result vector |
157 | // type. |
158 | for (int i = 0; i != NumSubElts; ++i) { |
159 | Cost += |
160 | thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy, i); |
161 | Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, |
162 | i + Index); |
163 | } |
164 | return Cost; |
165 | } |
166 | |
167 | /// Local query method delegates up to T which *must* implement this! |
/// \returns the TargetSubtargetInfo pointer produced by T::getST(), reached by
/// casting this object to const T *.
168 | const TargetSubtargetInfo *getST() const { |
169 | return static_cast<const T *>(this)->getST(); |
170 | } |
171 | |
172 | /// Local query method delegates up to T which *must* implement this! |
/// \returns the TargetLoweringBase pointer produced by T::getTLI(), reached
/// by casting this object to const T *.
173 | const TargetLoweringBase *getTLI() const { |
174 | return static_cast<const T *>(this)->getTLI(); |
175 | } |
176 | |
/// Translate a TTI::MemIndexedMode enumerator into the ISD::MemIndexedMode
/// enumerator of the same kind (unindexed, pre/post increment/decrement).
/// Falling out of the switch is treated as unreachable.
177 | static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) { |
178 | switch (M) { |
179 | case TTI::MIM_Unindexed: |
180 | return ISD::UNINDEXED; |
181 | case TTI::MIM_PreInc: |
182 | return ISD::PRE_INC; |
183 | case TTI::MIM_PreDec: |
184 | return ISD::PRE_DEC; |
185 | case TTI::MIM_PostInc: |
186 | return ISD::POST_INC; |
187 | case TTI::MIM_PostDec: |
188 | return ISD::POST_DEC; |
189 | } |
190 | llvm_unreachable("Unexpected MemIndexedMode")::llvm::llvm_unreachable_internal("Unexpected MemIndexedMode" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 190); |
191 | } |
192 | |
/// Rough cost estimate for a masked or gather/scatter memory operation that
/// is modelled as one scalar memory operation per vector element.
/// \param Opcode memory opcode; compared against Instruction::Store below.
/// \param DataTy data type; cast<> to FixedVectorType, so it must be one.
/// \param Alignment alignment forwarded to the per-element getMemoryOpCost().
/// \param VariableMask when true, a per-element conditional-execution cost is
/// added.
/// \param IsGatherScatter when true, a per-element address-extract cost is
/// added.
/// \param CostKind cost kind forwarded to the nested cost queries.
/// \returns LoadCost + PackingCost + ConditionalCost as computed below.
193 | InstructionCost getCommonMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, |
194 | Align Alignment, |
195 | bool VariableMask, |
196 | bool IsGatherScatter, |
197 | TTI::TargetCostKind CostKind) { |
198 | auto *VT = cast<FixedVectorType>(DataTy); |
199 | // Assume the target does not have support for gather/scatter operations |
200 | // and provide a rough estimate. |
201 | // |
202 | // First, compute the cost of the individual memory operations. |
// For gather/scatter, each element also pays for extracting its address from
// a vector of pointers (address space 0) with as many elements as VT.
203 | InstructionCost AddrExtractCost = |
204 | IsGatherScatter |
205 | ? getVectorInstrCost(Instruction::ExtractElement, |
206 | FixedVectorType::get( |
207 | PointerType::get(VT->getElementType(), 0), |
208 | VT->getNumElements()), |
209 | -1) |
210 | : 0; |
// Despite the name, LoadCost is computed for whatever Opcode was passed in.
211 | InstructionCost LoadCost = |
212 | VT->getNumElements() * |
213 | (AddrExtractCost + |
214 | getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind)); |
215 | |
216 | // Next, compute the cost of packing the result in a vector. |
// NOTE(review): the two flags presumably select insert vs. extract overhead
// (non-store vs. store) -- confirm against getScalarizationOverhead().
217 | InstructionCost PackingCost = getScalarizationOverhead( |
218 | VT, Opcode != Instruction::Store, Opcode == Instruction::Store); |
219 | |
220 | InstructionCost ConditionalCost = 0; |
221 | if (VariableMask) { |
222 | // Compute the cost of conditionally executing the memory operations with |
223 | // variable masks. This includes extracting the individual conditions, |
224 | // branches and PHIs to combine the results. |
225 | // NOTE: Estimating the cost of conditionally executing the memory |
226 | // operations accurately is quite difficult and the current solution |
227 | // provides a very rough estimate only. |
228 | ConditionalCost = |
229 | VT->getNumElements() * |
230 | (getVectorInstrCost( |
231 | Instruction::ExtractElement, |
232 | FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()), |
233 | VT->getNumElements()), |
234 | -1) + |
235 | getCFInstrCost(Instruction::Br, CostKind) + |
236 | getCFInstrCost(Instruction::PHI, CostKind)); |
237 | } |
238 | |
239 | return LoadCost + PackingCost + ConditionalCost; |
240 | } |
241 | |
242 | protected: |
/// Construct the base by forwarding \p DL to BaseT. \p TM is accepted but not
/// used in this initializer list.
243 | explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL) |
244 | : BaseT(DL) {} |
/// Virtual, defaulted destructor.
245 | virtual ~BasicTTIImplBase() = default; |
246 | |
247 | using TargetTransformInfoImplBase::DL; |
248 | |
249 | public: |
250 | /// \name Scalar TTI Implementations |
251 | /// @{ |
/// Ask the target lowering whether a misaligned access of an integer type
/// \p BitWidth bits wide is allowed.
/// \param Context context used to build the integer EVT.
/// \param BitWidth width in bits of the integer type queried.
/// \param AddressSpace address space forwarded to the target lowering.
/// \param Alignment alignment forwarded to the target lowering.
/// \param Fast forwarded unchanged to the target lowering query.
/// \returns the target lowering's answer; the query is made with
/// MachineMemOperand::MONone flags.
252 | bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth, |
253 | unsigned AddressSpace, Align Alignment, |
254 | bool *Fast) const { |
255 | EVT E = EVT::getIntegerVT(Context, BitWidth); |
256 | return getTLI()->allowsMisalignedMemoryAccesses( |
257 | E, AddressSpace, Alignment, MachineMemOperand::MONone, Fast); |
258 | } |
259 | |
/// Default implementation: always returns false.
260 | bool hasBranchDivergence() { return false; } |
261 | |
/// Default implementation: always returns false.
262 | bool useGPUDivergenceAnalysis() { return false; } |
263 | |
/// Default implementation: returns false for every value; \p V is not read.
264 | bool isSourceOfDivergence(const Value *V) { return false; } |
265 | |
/// Default implementation: returns false for every value; \p V is not read.
266 | bool isAlwaysUniform(const Value *V) { return false; } |
267 | |
/// Default implementation: reports no flat address space.
/// \returns -1 converted to unsigned, used here as the invalid marker.
268 | unsigned getFlatAddressSpace() { |
269 | // Return an invalid address space. |
270 | return -1; |
271 | } |
272 | |
/// Default implementation: returns false and leaves \p OpIndexes untouched;
/// \p IID is not read.
273 | bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes, |
274 | Intrinsic::ID IID) const { |
275 | return false; |
276 | } |
277 | |
/// Forward the query to the TargetMachine obtained from the target lowering.
/// \returns TargetMachine::isNoopAddrSpaceCast(FromAS, ToAS).
278 | bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const { |
279 | return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS); |
280 | } |
281 | |
/// Forward the query to the TargetMachine obtained from the target lowering.
/// \returns TargetMachine::getAssumedAddrSpace(V).
282 | unsigned getAssumedAddrSpace(const Value *V) const { |
283 | return getTLI()->getTargetMachine().getAssumedAddrSpace(V); |
284 | } |
285 | |
/// Forward the query to the TargetMachine obtained from the target lowering.
/// \returns the (Value, address space) pair produced by
/// TargetMachine::getPredicatedAddrSpace(V).
286 | std::pair<const Value *, unsigned> |
287 | getPredicatedAddrSpace(const Value *V) const { |
288 | return getTLI()->getTargetMachine().getPredicatedAddrSpace(V); |
289 | } |
290 | |
/// Default implementation: performs no rewrite and returns nullptr; none of
/// the arguments are read.
291 | Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, |
292 | Value *NewV) const { |
293 | return nullptr; |
294 | } |
295 | |
/// \returns the target lowering's isLegalAddImmediate() answer for \p imm.
296 | bool isLegalAddImmediate(int64_t imm) { |
297 | return getTLI()->isLegalAddImmediate(imm); |
298 | } |
299 | |
/// \returns the target lowering's isLegalICmpImmediate() answer for \p imm.
300 | bool isLegalICmpImmediate(int64_t imm) { |
301 | return getTLI()->isLegalICmpImmediate(imm); |
302 | } |
303 | |
/// Pack the addressing-mode components into a TargetLoweringBase::AddrMode
/// and ask the target lowering whether that mode is legal.
/// \param Ty, AddrSpace, I forwarded unchanged to the target lowering query.
/// \param BaseGV, BaseOffset, HasBaseReg, Scale copied into the AddrMode's
/// BaseGV, BaseOffs, HasBaseReg and Scale fields respectively.
/// \returns the target lowering's isLegalAddressingMode() answer.
304 | bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, |
305 | bool HasBaseReg, int64_t Scale, |
306 | unsigned AddrSpace, Instruction *I = nullptr) { |
307 | TargetLoweringBase::AddrMode AM; |
308 | AM.BaseGV = BaseGV; |
309 | AM.BaseOffs = BaseOffset; |
310 | AM.HasBaseReg = HasBaseReg; |
311 | AM.Scale = Scale; |
312 | return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I); |
313 | } |
314 | |
/// Convert \p Ty to an EVT and \p M to its ISD indexed mode, then ask the
/// target lowering whether an indexed load of that mode and type is legal.
315 | bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty, |
316 | const DataLayout &DL) const { |
317 | EVT VT = getTLI()->getValueType(DL, Ty); |
318 | return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT); |
319 | } |
320 | |
/// Convert \p Ty to an EVT and \p M to its ISD indexed mode, then ask the
/// target lowering whether an indexed store of that mode and type is legal.
321 | bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty, |
322 | const DataLayout &DL) const { |
323 | EVT VT = getTLI()->getValueType(DL, Ty); |
324 | return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT); |
325 | } |
326 | |
/// Forwards to TargetTransformInfoImplBase::isLSRCostLess(C1, C2).
327 | bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2) { |
328 | return TargetTransformInfoImplBase::isLSRCostLess(C1, C2); |
329 | } |
330 | |
/// Forwards to TargetTransformInfoImplBase::isNumRegsMajorCostOfLSR().
331 | bool isNumRegsMajorCostOfLSR() { |
332 | return TargetTransformInfoImplBase::isNumRegsMajorCostOfLSR(); |
333 | } |
334 | |
/// Forwards to TargetTransformInfoImplBase::isProfitableLSRChainElement(I).
335 | bool isProfitableLSRChainElement(Instruction *I) { |
336 | return TargetTransformInfoImplBase::isProfitableLSRChainElement(I); |
337 | } |
338 | |
/// Pack the addressing-mode components into a TargetLoweringBase::AddrMode
/// and return the target lowering's getScalingFactorCost() for it.
/// \param Ty, AddrSpace forwarded unchanged to the target lowering query.
/// \param BaseGV, BaseOffset, HasBaseReg, Scale copied into the AddrMode's
/// BaseGV, BaseOffs, HasBaseReg and Scale fields respectively.
339 | InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, |
340 | int64_t BaseOffset, bool HasBaseReg, |
341 | int64_t Scale, unsigned AddrSpace) { |
342 | TargetLoweringBase::AddrMode AM; |
343 | AM.BaseGV = BaseGV; |
344 | AM.BaseOffs = BaseOffset; |
345 | AM.HasBaseReg = HasBaseReg; |
346 | AM.Scale = Scale; |
347 | return getTLI()->getScalingFactorCost(DL, AM, Ty, AddrSpace); |
348 | } |
349 | |
/// \returns the target lowering's isTruncateFree(Ty1, Ty2) answer.
350 | bool isTruncateFree(Type *Ty1, Type *Ty2) { |
351 | return getTLI()->isTruncateFree(Ty1, Ty2); |
352 | } |
353 | |
/// \returns the target lowering's isProfitableToHoist(I) answer.
354 | bool isProfitableToHoist(Instruction *I) { |
355 | return getTLI()->isProfitableToHoist(I); |
356 | } |
357 | |
/// \returns the subtarget's useAA() answer.
358 | bool useAA() const { return getST()->useAA(); } |
359 | |
/// Convert \p Ty to an EVT using the data layout and return whether the
/// target lowering reports that value type as legal.
360 | bool isTypeLegal(Type *Ty) { |
361 | EVT VT = getTLI()->getValueType(DL, Ty); |
362 | return getTLI()->isTypeLegal(VT); |
363 | } |
364 | |
/// \returns the first member of the pair produced by the target lowering's
/// getTypeLegalizationCost(DL, Ty); asserted to be non-negative.
365 | InstructionCost getRegUsageForType(Type *Ty) { |
366 | InstructionCost Val = getTLI()->getTypeLegalizationCost(DL, Ty).first; |
367 | assert(Val >= 0 && "Negative cost!")(static_cast <bool> (Val >= 0 && "Negative cost!" ) ? void (0) : __assert_fail ("Val >= 0 && \"Negative cost!\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 367, __extension__ __PRETTY_FUNCTION__)); |
368 | return Val; |
369 | } |
370 | |
/// Forwards all arguments unchanged to BaseT::getGEPCost().
371 | InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, |
372 | ArrayRef<const Value *> Operands, |
373 | TTI::TargetCostKind CostKind) { |
374 | return BaseT::getGEPCost(PointeeType, Ptr, Operands, CostKind); |
375 | } |
376 | |
/// Estimate how many case clusters the switch \p SI would lower to.
/// \param SI the switch instruction being estimated.
/// \param JumpTableSize out-parameter; reset to 0 on entry and set to the
/// case-value range only when a jump table is judged suitable.
/// \param PSI, BFI forwarded to the jump-table suitability query.
/// \returns 1 when the target lowering reports the switch suitable for bit
/// tests or for a jump table; otherwise the number of cases.
377 | unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI, |
378 | unsigned &JumpTableSize, |
379 | ProfileSummaryInfo *PSI, |
380 | BlockFrequencyInfo *BFI) { |
381 | /// Try to find the estimated number of clusters. Note that the number of |
382 | /// clusters identified in this function could be different from the actual |
383 | /// numbers found in lowering. This function ignore switches that are |
384 | /// lowered with a mix of jump table / bit test / BTree. This function was |
385 | /// initially intended to be used when estimating the cost of switch in |
386 | /// inline cost heuristic, but it's a generic cost model to be used in other |
387 | /// places (e.g., in loop unrolling). |
388 | unsigned N = SI.getNumCases(); |
389 | const TargetLoweringBase *TLI = getTLI(); |
390 | const DataLayout &DL = this->getDataLayout(); |
391 | |
392 | JumpTableSize = 0; |
393 | bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent()); |
394 | |
395 | // Early exit if both a jump table and bit test are not allowed. |
396 | if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N)) |
397 | return N; |
398 | |
// Find the signed minimum and maximum case values, seeded from the first case.
399 | APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue(); |
400 | APInt MinCaseVal = MaxCaseVal; |
401 | for (auto CI : SI.cases()) { |
402 | const APInt &CaseVal = CI.getCaseValue()->getValue(); |
403 | if (CaseVal.sgt(MaxCaseVal)) |
404 | MaxCaseVal = CaseVal; |
405 | if (CaseVal.slt(MinCaseVal)) |
406 | MinCaseVal = CaseVal; |
407 | } |
408 | |
409 | // Check if suitable for a bit test |
410 | if (N <= DL.getIndexSizeInBits(0u)) { |
// Count distinct successor blocks; the set deduplicates shared destinations.
411 | SmallPtrSet<const BasicBlock *, 4> Dests; |
412 | for (auto I : SI.cases()) |
413 | Dests.insert(I.getCaseSuccessor()); |
414 | |
415 | if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal, |
416 | DL)) |
417 | return 1; |
418 | } |
419 | |
420 | // Check if suitable for a jump table. |
421 | if (IsJTAllowed) { |
422 | if (N < 2 || N < TLI->getMinimumJumpTableEntries()) |
423 | return N; |
// Range = (max - min) + 1; the difference is clamped to UINT64_MAX - 1 first
// so that adding 1 cannot wrap.
424 | uint64_t Range = |
425 | (MaxCaseVal - MinCaseVal) |
426 | .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1; |
427 | // Check whether a range of clusters is dense enough for a jump table |
428 | if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) { |
429 | JumpTableSize = Range; |
430 | return 1; |
431 | } |
432 | } |
433 | return N; |
434 | } |
435 | |
/// \returns true when the target lowering reports ISD::BR_JT or ISD::BRIND
/// as legal or custom for MVT::Other.
436 | bool shouldBuildLookupTables() { |
437 | const TargetLoweringBase *TLI = getTLI(); |
438 | return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) || |
439 | TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other); |
440 | } |
441 | |
/// Decide whether relative lookup tables should be generated.
/// \returns false for non-PIC code, for the Medium and Large code models, for
/// non-64-bit target triples, and for aarch64 on Darwin; true otherwise.
442 | bool shouldBuildRelLookupTables() const { |
443 | const TargetMachine &TM = getTLI()->getTargetMachine(); |
444 | // If non-PIC mode, do not generate a relative lookup table. |
445 | if (!TM.isPositionIndependent()) |
446 | return false; |
447 | |
448 | /// Relative lookup table entries consist of 32-bit offsets. |
449 | /// Do not generate relative lookup tables for large code models |
450 | /// in 64-bit architectures where 32-bit offsets might not be enough. |
451 | if (TM.getCodeModel() == CodeModel::Medium || |
452 | TM.getCodeModel() == CodeModel::Large) |
453 | return false; |
454 | |
455 | Triple TargetTriple = TM.getTargetTriple(); |
456 | if (!TargetTriple.isArch64Bit()) |
457 | return false; |
458 | |
459 | // TODO: Triggers issues on aarch64 on darwin, so temporarily disable it |
460 | // there. |
461 | if (TargetTriple.getArch() == Triple::aarch64 && TargetTriple.isOSDarwin()) |
462 | return false; |
463 | |
464 | return true; |
465 | } |
466 | |
/// \returns true when the EVT for \p Ty is legal and ISD::FSQRT is legal or
/// custom for it, according to the target lowering.
467 | bool haveFastSqrt(Type *Ty) { |
468 | const TargetLoweringBase *TLI = getTLI(); |
469 | EVT VT = TLI->getValueType(DL, Ty); |
470 | return TLI->isTypeLegal(VT) && |
471 | TLI->isOperationLegalOrCustom(ISD::FSQRT, VT); |
472 | } |
473 | |
/// Default implementation: always returns true; \p Ty is not read.
474 | bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) { |
475 | return true; |
476 | } |
477 | |
/// \returns TargetTransformInfo::TCC_Basic when ISD::FADD is legal, custom or
/// promoted for the EVT of \p Ty, and TCC_Expensive otherwise.
478 | InstructionCost getFPOpCost(Type *Ty) { |
479 | // Check whether FADD is available, as a proxy for floating-point in |
480 | // general. |
481 | const TargetLoweringBase *TLI = getTLI(); |
482 | EVT VT = TLI->getValueType(DL, Ty); |
483 | if (TLI->isOperationLegalOrCustomOrPromote(ISD::FADD, VT)) |
484 | return TargetTransformInfo::TCC_Basic; |
485 | return TargetTransformInfo::TCC_Expensive; |
486 | } |
487 | |
/// Default implementation: always returns 1.
488 | unsigned getInliningThresholdMultiplier() { return 1; } |
/// Default implementation: always returns 0; \p CB is not read.
489 | unsigned adjustInliningThreshold(const CallBase *CB) { return 0; } |
490 | |
/// Default implementation: always returns 150.
491 | int getInlinerVectorBonusPercent() { return 150; } |
492 | |
493 | void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, |
494 | TTI::UnrollingPreferences &UP, |
495 | OptimizationRemarkEmitter *ORE) { |
496 | // This unrolling functionality is target independent, but to provide some |
497 | // motivation for its intended use, for x86: |
498 | |
499 | // According to the Intel 64 and IA-32 Architectures Optimization Reference |
500 | // Manual, Intel Core models and later have a loop stream detector (and |
501 | // associated uop queue) that can benefit from partial unrolling. |
502 | // The relevant requirements are: |
503 | // - The loop must have no more than 4 (8 for Nehalem and later) branches |
504 | // taken, and none of them may be calls. |
505 | // - The loop can have no more than 18 (28 for Nehalem and later) uops. |
506 | |
507 | // According to the Software Optimization Guide for AMD Family 15h |
508 | // Processors, models 30h-4fh (Steamroller and later) have a loop predictor |
509 | // and loop buffer which can benefit from partial unrolling. |
510 | // The relevant requirements are: |
511 | // - The loop must have fewer than 16 branches |
512 | // - The loop must have less than 40 uops in all executed loop branches |
513 | |
514 | // The number of taken branches in a loop is hard to estimate here, and |
515 | // benchmarking has revealed that it is better not to be conservative when |
516 | // estimating the branch count. As a result, we'll ignore the branch limits |
517 | // until someone finds a case where it matters in practice. |
518 | |
519 | unsigned MaxOps; |
520 | const TargetSubtargetInfo *ST = getST(); |
521 | if (PartialUnrollingThreshold.getNumOccurrences() > 0) |
522 | MaxOps = PartialUnrollingThreshold; |
523 | else if (ST->getSchedModel().LoopMicroOpBufferSize > 0) |
524 | MaxOps = ST->getSchedModel().LoopMicroOpBufferSize; |
525 | else |
526 | return; |
527 | |
528 | // Scan the loop: don't unroll loops with calls. |
529 | for (BasicBlock *BB : L->blocks()) { |
530 | for (Instruction &I : *BB) { |
531 | if (isa<CallInst>(I) || isa<InvokeInst>(I)) { |
532 | if (const Function *F = cast<CallBase>(I).getCalledFunction()) { |
533 | if (!thisT()->isLoweredToCall(F)) |
534 | continue; |
535 | } |
536 | |
537 | if (ORE) { |
538 | ORE->emit([&]() { |
539 | return OptimizationRemark("TTI", "DontUnroll", L->getStartLoc(), |
540 | L->getHeader()) |
541 | << "advising against unrolling the loop because it " |
542 | "contains a " |
543 | << ore::NV("Call", &I); |
544 | }); |
545 | } |
546 | return; |
547 | } |
548 | } |
549 | } |
550 | |
551 | // Enable runtime and partial unrolling up to the specified size. |
552 | // Enable using trip count upper bound to unroll loops. |
553 | UP.Partial = UP.Runtime = UP.UpperBound = true; |
554 | UP.PartialThreshold = MaxOps; |
555 | |
556 | // Avoid unrolling when optimizing for size. |
557 | UP.OptSizeThreshold = 0; |
558 | UP.PartialOptSizeThreshold = 0; |
559 | |
560 | // Set number of instructions optimized when "back edge" |
561 | // becomes "fall through" to default value of 2. |
562 | UP.BEInsns = 2; |
563 | } |
564 | |
565 | void getPeelingPreferences(Loop *L, ScalarEvolution &SE, |
566 | TTI::PeelingPreferences &PP) { |
567 | PP.PeelCount = 0; |
568 | PP.AllowPeeling = true; |
569 | PP.AllowLoopNestsPeeling = false; |
570 | PP.PeelProfiledIterations = true; |
571 | } |
572 | |
573 | bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, |
574 | AssumptionCache &AC, |
575 | TargetLibraryInfo *LibInfo, |
576 | HardwareLoopInfo &HWLoopInfo) { |
577 | return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo); |
578 | } |
579 | |
580 | bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE, |
581 | AssumptionCache &AC, TargetLibraryInfo *TLI, |
582 | DominatorTree *DT, |
583 | const LoopAccessInfo *LAI) { |
584 | return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI); |
585 | } |
586 | |
587 | bool emitGetActiveLaneMask() { |
588 | return BaseT::emitGetActiveLaneMask(); |
589 | } |
590 | |
591 | Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC, |
592 | IntrinsicInst &II) { |
593 | return BaseT::instCombineIntrinsic(IC, II); |
594 | } |
595 | |
596 | Optional<Value *> simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, |
597 | IntrinsicInst &II, |
598 | APInt DemandedMask, |
599 | KnownBits &Known, |
600 | bool &KnownBitsComputed) { |
601 | return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known, |
602 | KnownBitsComputed); |
603 | } |
604 | |
605 | Optional<Value *> simplifyDemandedVectorEltsIntrinsic( |
606 | InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, |
607 | APInt &UndefElts2, APInt &UndefElts3, |
608 | std::function<void(Instruction *, unsigned, APInt, APInt &)> |
609 | SimplifyAndSetOp) { |
610 | return BaseT::simplifyDemandedVectorEltsIntrinsic( |
611 | IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3, |
612 | SimplifyAndSetOp); |
613 | } |
614 | |
615 | InstructionCost getInstructionLatency(const Instruction *I) { |
616 | if (isa<LoadInst>(I)) |
617 | return getST()->getSchedModel().DefaultLoadLatency; |
618 | |
619 | return BaseT::getInstructionLatency(I); |
620 | } |
621 | |
622 | virtual Optional<unsigned> |
623 | getCacheSize(TargetTransformInfo::CacheLevel Level) const { |
624 | return Optional<unsigned>( |
625 | getST()->getCacheSize(static_cast<unsigned>(Level))); |
626 | } |
627 | |
628 | virtual Optional<unsigned> |
629 | getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const { |
630 | Optional<unsigned> TargetResult = |
631 | getST()->getCacheAssociativity(static_cast<unsigned>(Level)); |
632 | |
633 | if (TargetResult) |
634 | return TargetResult; |
635 | |
636 | return BaseT::getCacheAssociativity(Level); |
637 | } |
638 | |
639 | virtual unsigned getCacheLineSize() const { |
640 | return getST()->getCacheLineSize(); |
641 | } |
642 | |
643 | virtual unsigned getPrefetchDistance() const { |
644 | return getST()->getPrefetchDistance(); |
645 | } |
646 | |
647 | virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses, |
648 | unsigned NumStridedMemAccesses, |
649 | unsigned NumPrefetches, |
650 | bool HasCall) const { |
651 | return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses, |
652 | NumPrefetches, HasCall); |
653 | } |
654 | |
655 | virtual unsigned getMaxPrefetchIterationsAhead() const { |
656 | return getST()->getMaxPrefetchIterationsAhead(); |
657 | } |
658 | |
659 | virtual bool enableWritePrefetching() const { |
660 | return getST()->enableWritePrefetching(); |
661 | } |
662 | |
663 | /// @} |
664 | |
665 | /// \name Vector TTI Implementations |
666 | /// @{ |
667 | |
668 | TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { |
669 | return TypeSize::getFixed(32); |
670 | } |
671 | |
672 | Optional<unsigned> getMaxVScale() const { return None; } |
673 | Optional<unsigned> getVScaleForTuning() const { return None; } |
674 | |
675 | /// Estimate the overhead of scalarizing an instruction. Insert and Extract |
676 | /// are set if the demanded result elements need to be inserted and/or |
677 | /// extracted from vectors. |
678 | InstructionCost getScalarizationOverhead(VectorType *InTy, |
679 | const APInt &DemandedElts, |
680 | bool Insert, bool Extract) { |
681 | /// FIXME: a bitfield is not a reasonable abstraction for talking about |
682 | /// which elements are needed from a scalable vector |
683 | auto *Ty = cast<FixedVectorType>(InTy); |
684 | |
685 | assert(DemandedElts.getBitWidth() == Ty->getNumElements() &&(static_cast <bool> (DemandedElts.getBitWidth() == Ty-> getNumElements() && "Vector size mismatch") ? void (0 ) : __assert_fail ("DemandedElts.getBitWidth() == Ty->getNumElements() && \"Vector size mismatch\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 686, __extension__ __PRETTY_FUNCTION__)) |
686 | "Vector size mismatch")(static_cast <bool> (DemandedElts.getBitWidth() == Ty-> getNumElements() && "Vector size mismatch") ? void (0 ) : __assert_fail ("DemandedElts.getBitWidth() == Ty->getNumElements() && \"Vector size mismatch\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 686, __extension__ __PRETTY_FUNCTION__)); |
687 | |
688 | InstructionCost Cost = 0; |
689 | |
690 | for (int i = 0, e = Ty->getNumElements(); i < e; ++i) { |
691 | if (!DemandedElts[i]) |
692 | continue; |
693 | if (Insert) |
694 | Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty, i); |
695 | if (Extract) |
696 | Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, i); |
697 | } |
698 | |
699 | return Cost; |
700 | } |
701 | |
702 | /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead. |
703 | InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert, |
704 | bool Extract) { |
705 | auto *Ty = cast<FixedVectorType>(InTy); |
706 | |
707 | APInt DemandedElts = APInt::getAllOnes(Ty->getNumElements()); |
708 | return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract); |
709 | } |
710 | |
711 | /// Estimate the overhead of scalarizing an instructions unique |
712 | /// non-constant operands. The (potentially vector) types to use for each of |
713 | /// argument are passes via Tys. |
714 | InstructionCost getOperandsScalarizationOverhead(ArrayRef<const Value *> Args, |
715 | ArrayRef<Type *> Tys) { |
716 | assert(Args.size() == Tys.size() && "Expected matching Args and Tys")(static_cast <bool> (Args.size() == Tys.size() && "Expected matching Args and Tys") ? void (0) : __assert_fail ("Args.size() == Tys.size() && \"Expected matching Args and Tys\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 716, __extension__ __PRETTY_FUNCTION__)); |
717 | |
718 | InstructionCost Cost = 0; |
719 | SmallPtrSet<const Value*, 4> UniqueOperands; |
720 | for (int I = 0, E = Args.size(); I != E; I++) { |
721 | // Disregard things like metadata arguments. |
722 | const Value *A = Args[I]; |
723 | Type *Ty = Tys[I]; |
724 | if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() && |
725 | !Ty->isPtrOrPtrVectorTy()) |
726 | continue; |
727 | |
728 | if (!isa<Constant>(A) && UniqueOperands.insert(A).second) { |
729 | if (auto *VecTy = dyn_cast<VectorType>(Ty)) |
730 | Cost += getScalarizationOverhead(VecTy, false, true); |
731 | } |
732 | } |
733 | |
734 | return Cost; |
735 | } |
736 | |
737 | /// Estimate the overhead of scalarizing the inputs and outputs of an |
738 | /// instruction, with return type RetTy and arguments Args of type Tys. If |
739 | /// Args are unknown (empty), then the cost associated with one argument is |
740 | /// added as a heuristic. |
741 | InstructionCost getScalarizationOverhead(VectorType *RetTy, |
742 | ArrayRef<const Value *> Args, |
743 | ArrayRef<Type *> Tys) { |
744 | InstructionCost Cost = getScalarizationOverhead(RetTy, true, false); |
745 | if (!Args.empty()) |
746 | Cost += getOperandsScalarizationOverhead(Args, Tys); |
747 | else |
748 | // When no information on arguments is provided, we add the cost |
749 | // associated with one argument as a heuristic. |
750 | Cost += getScalarizationOverhead(RetTy, false, true); |
751 | |
752 | return Cost; |
753 | } |
754 | |
/// Maximum interleave factor; this implementation always returns 1 and does
/// not look at \p VF.
unsigned getMaxInterleaveFactor(unsigned VF) {
  return 1;
}
756 | |
757 | InstructionCost getArithmeticInstrCost( |
758 | unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, |
759 | TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, |
760 | TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, |
761 | TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, |
762 | TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, |
763 | ArrayRef<const Value *> Args = ArrayRef<const Value *>(), |
764 | const Instruction *CxtI = nullptr) { |
765 | // Check if any of the operands are vector operands. |
766 | const TargetLoweringBase *TLI = getTLI(); |
767 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
768 | assert(ISD && "Invalid opcode")(static_cast <bool> (ISD && "Invalid opcode") ? void (0) : __assert_fail ("ISD && \"Invalid opcode\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 768, __extension__ __PRETTY_FUNCTION__)); |
769 | |
770 | // TODO: Handle more cost kinds. |
771 | if (CostKind != TTI::TCK_RecipThroughput) |
772 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, |
773 | Opd1Info, Opd2Info, |
774 | Opd1PropInfo, Opd2PropInfo, |
775 | Args, CxtI); |
776 | |
777 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); |
778 | |
779 | bool IsFloat = Ty->isFPOrFPVectorTy(); |
780 | // Assume that floating point arithmetic operations cost twice as much as |
781 | // integer operations. |
782 | InstructionCost OpCost = (IsFloat ? 2 : 1); |
783 | |
784 | if (TLI->isOperationLegalOrPromote(ISD, LT.second)) { |
785 | // The operation is legal. Assume it costs 1. |
786 | // TODO: Once we have extract/insert subvector cost we need to use them. |
787 | return LT.first * OpCost; |
788 | } |
789 | |
790 | if (!TLI->isOperationExpand(ISD, LT.second)) { |
791 | // If the operation is custom lowered, then assume that the code is twice |
792 | // as expensive. |
793 | return LT.first * 2 * OpCost; |
794 | } |
795 | |
796 | // An 'Expand' of URem and SRem is special because it may default |
797 | // to expanding the operation into a sequence of sub-operations |
798 | // i.e. X % Y -> X-(X/Y)*Y. |
799 | if (ISD == ISD::UREM || ISD == ISD::SREM) { |
800 | bool IsSigned = ISD == ISD::SREM; |
801 | if (TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, |
802 | LT.second) || |
803 | TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIV : ISD::UDIV, |
804 | LT.second)) { |
805 | unsigned DivOpc = IsSigned ? Instruction::SDiv : Instruction::UDiv; |
806 | InstructionCost DivCost = thisT()->getArithmeticInstrCost( |
807 | DivOpc, Ty, CostKind, Opd1Info, Opd2Info, Opd1PropInfo, |
808 | Opd2PropInfo); |
809 | InstructionCost MulCost = |
810 | thisT()->getArithmeticInstrCost(Instruction::Mul, Ty, CostKind); |
811 | InstructionCost SubCost = |
812 | thisT()->getArithmeticInstrCost(Instruction::Sub, Ty, CostKind); |
813 | return DivCost + MulCost + SubCost; |
814 | } |
815 | } |
816 | |
817 | // We cannot scalarize scalable vectors, so return Invalid. |
818 | if (isa<ScalableVectorType>(Ty)) |
819 | return InstructionCost::getInvalid(); |
820 | |
821 | // Else, assume that we need to scalarize this op. |
822 | // TODO: If one of the types get legalized by splitting, handle this |
823 | // similarly to what getCastInstrCost() does. |
824 | if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) { |
825 | InstructionCost Cost = thisT()->getArithmeticInstrCost( |
826 | Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info, |
827 | Opd1PropInfo, Opd2PropInfo, Args, CxtI); |
828 | // Return the cost of multiple scalar invocation plus the cost of |
829 | // inserting and extracting the values. |
830 | SmallVector<Type *> Tys(Args.size(), Ty); |
831 | return getScalarizationOverhead(VTy, Args, Tys) + |
832 | VTy->getNumElements() * Cost; |
833 | } |
834 | |
835 | // We don't know anything about this scalar instruction. |
836 | return OpCost; |
837 | } |
838 | |
839 | TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, |
840 | ArrayRef<int> Mask) const { |
841 | int Limit = Mask.size() * 2; |
842 | if (Mask.empty() || |
843 | // Extra check required by isSingleSourceMaskImpl function (called by |
844 | // ShuffleVectorInst::isSingleSourceMask). |
845 | any_of(Mask, [Limit](int I) { return I >= Limit; })) |
846 | return Kind; |
847 | switch (Kind) { |
848 | case TTI::SK_PermuteSingleSrc: |
849 | if (ShuffleVectorInst::isReverseMask(Mask)) |
850 | return TTI::SK_Reverse; |
851 | if (ShuffleVectorInst::isZeroEltSplatMask(Mask)) |
852 | return TTI::SK_Broadcast; |
853 | break; |
854 | case TTI::SK_PermuteTwoSrc: |
855 | if (ShuffleVectorInst::isSelectMask(Mask)) |
856 | return TTI::SK_Select; |
857 | if (ShuffleVectorInst::isTransposeMask(Mask)) |
858 | return TTI::SK_Transpose; |
859 | break; |
860 | case TTI::SK_Select: |
861 | case TTI::SK_Reverse: |
862 | case TTI::SK_Broadcast: |
863 | case TTI::SK_Transpose: |
864 | case TTI::SK_InsertSubvector: |
865 | case TTI::SK_ExtractSubvector: |
866 | case TTI::SK_Splice: |
867 | break; |
868 | } |
869 | return Kind; |
870 | } |
871 | |
872 | InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, |
873 | ArrayRef<int> Mask, int Index, |
874 | VectorType *SubTp) { |
875 | |
876 | switch (improveShuffleKindFromMask(Kind, Mask)) { |
877 | case TTI::SK_Broadcast: |
878 | if (auto *FVT = dyn_cast<FixedVectorType>(Tp)) |
879 | return getBroadcastShuffleOverhead(FVT); |
880 | return InstructionCost::getInvalid(); |
881 | case TTI::SK_Select: |
882 | case TTI::SK_Splice: |
883 | case TTI::SK_Reverse: |
884 | case TTI::SK_Transpose: |
885 | case TTI::SK_PermuteSingleSrc: |
886 | case TTI::SK_PermuteTwoSrc: |
887 | if (auto *FVT = dyn_cast<FixedVectorType>(Tp)) |
888 | return getPermuteShuffleOverhead(FVT); |
889 | return InstructionCost::getInvalid(); |
890 | case TTI::SK_ExtractSubvector: |
891 | return getExtractSubvectorOverhead(Tp, Index, |
892 | cast<FixedVectorType>(SubTp)); |
893 | case TTI::SK_InsertSubvector: |
894 | return getInsertSubvectorOverhead(Tp, Index, |
895 | cast<FixedVectorType>(SubTp)); |
896 | } |
897 | llvm_unreachable("Unknown TTI::ShuffleKind")::llvm::llvm_unreachable_internal("Unknown TTI::ShuffleKind", "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 897); |
898 | } |
899 | |
900 | InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, |
901 | TTI::CastContextHint CCH, |
902 | TTI::TargetCostKind CostKind, |
903 | const Instruction *I = nullptr) { |
904 | if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0) |
905 | return 0; |
906 | |
907 | const TargetLoweringBase *TLI = getTLI(); |
908 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
909 | assert(ISD && "Invalid opcode")(static_cast <bool> (ISD && "Invalid opcode") ? void (0) : __assert_fail ("ISD && \"Invalid opcode\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 909, __extension__ __PRETTY_FUNCTION__)); |
910 | std::pair<InstructionCost, MVT> SrcLT = |
911 | TLI->getTypeLegalizationCost(DL, Src); |
912 | std::pair<InstructionCost, MVT> DstLT = |
913 | TLI->getTypeLegalizationCost(DL, Dst); |
914 | |
915 | TypeSize SrcSize = SrcLT.second.getSizeInBits(); |
916 | TypeSize DstSize = DstLT.second.getSizeInBits(); |
917 | bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy(); |
918 | bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy(); |
919 | |
920 | switch (Opcode) { |
921 | default: |
922 | break; |
923 | case Instruction::Trunc: |
924 | // Check for NOOP conversions. |
925 | if (TLI->isTruncateFree(SrcLT.second, DstLT.second)) |
926 | return 0; |
927 | LLVM_FALLTHROUGH[[gnu::fallthrough]]; |
928 | case Instruction::BitCast: |
929 | // Bitcast between types that are legalized to the same type are free and |
930 | // assume int to/from ptr of the same size is also free. |
931 | if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst && |
932 | SrcSize == DstSize) |
933 | return 0; |
934 | break; |
935 | case Instruction::FPExt: |
936 | if (I && getTLI()->isExtFree(I)) |
937 | return 0; |
938 | break; |
939 | case Instruction::ZExt: |
940 | if (TLI->isZExtFree(SrcLT.second, DstLT.second)) |
941 | return 0; |
942 | LLVM_FALLTHROUGH[[gnu::fallthrough]]; |
943 | case Instruction::SExt: |
944 | if (I && getTLI()->isExtFree(I)) |
945 | return 0; |
946 | |
947 | // If this is a zext/sext of a load, return 0 if the corresponding |
948 | // extending load exists on target and the result type is legal. |
949 | if (CCH == TTI::CastContextHint::Normal) { |
950 | EVT ExtVT = EVT::getEVT(Dst); |
951 | EVT LoadVT = EVT::getEVT(Src); |
952 | unsigned LType = |
953 | ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD); |
954 | if (DstLT.first == SrcLT.first && |
955 | TLI->isLoadExtLegal(LType, ExtVT, LoadVT)) |
956 | return 0; |
957 | } |
958 | break; |
959 | case Instruction::AddrSpaceCast: |
960 | if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(), |
961 | Dst->getPointerAddressSpace())) |
962 | return 0; |
963 | break; |
964 | } |
965 | |
966 | auto *SrcVTy = dyn_cast<VectorType>(Src); |
967 | auto *DstVTy = dyn_cast<VectorType>(Dst); |
968 | |
969 | // If the cast is marked as legal (or promote) then assume low cost. |
970 | if (SrcLT.first == DstLT.first && |
971 | TLI->isOperationLegalOrPromote(ISD, DstLT.second)) |
972 | return SrcLT.first; |
973 | |
974 | // Handle scalar conversions. |
975 | if (!SrcVTy && !DstVTy) { |
976 | // Just check the op cost. If the operation is legal then assume it costs |
977 | // 1. |
978 | if (!TLI->isOperationExpand(ISD, DstLT.second)) |
979 | return 1; |
980 | |
981 | // Assume that illegal scalar instruction are expensive. |
982 | return 4; |
983 | } |
984 | |
985 | // Check vector-to-vector casts. |
986 | if (DstVTy && SrcVTy) { |
987 | // If the cast is between same-sized registers, then the check is simple. |
988 | if (SrcLT.first == DstLT.first && SrcSize == DstSize) { |
989 | |
990 | // Assume that Zext is done using AND. |
991 | if (Opcode == Instruction::ZExt) |
992 | return SrcLT.first; |
993 | |
994 | // Assume that sext is done using SHL and SRA. |
995 | if (Opcode == Instruction::SExt) |
996 | return SrcLT.first * 2; |
997 | |
998 | // Just check the op cost. If the operation is legal then assume it |
999 | // costs |
1000 | // 1 and multiply by the type-legalization overhead. |
1001 | if (!TLI->isOperationExpand(ISD, DstLT.second)) |
1002 | return SrcLT.first * 1; |
1003 | } |
1004 | |
1005 | // If we are legalizing by splitting, query the concrete TTI for the cost |
1006 | // of casting the original vector twice. We also need to factor in the |
1007 | // cost of the split itself. Count that as 1, to be consistent with |
1008 | // TLI->getTypeLegalizationCost(). |
1009 | bool SplitSrc = |
1010 | TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) == |
1011 | TargetLowering::TypeSplitVector; |
1012 | bool SplitDst = |
1013 | TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) == |
1014 | TargetLowering::TypeSplitVector; |
1015 | if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isVector() && |
1016 | DstVTy->getElementCount().isVector()) { |
1017 | Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy); |
1018 | Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy); |
1019 | T *TTI = static_cast<T *>(this); |
1020 | // If both types need to be split then the split is free. |
1021 | InstructionCost SplitCost = |
1022 | (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0; |
1023 | return SplitCost + |
1024 | (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH, |
1025 | CostKind, I)); |
1026 | } |
1027 | |
1028 | // Scalarization cost is Invalid, can't assume any num elements. |
1029 | if (isa<ScalableVectorType>(DstVTy)) |
1030 | return InstructionCost::getInvalid(); |
1031 | |
1032 | // In other cases where the source or destination are illegal, assume |
1033 | // the operation will get scalarized. |
1034 | unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements(); |
1035 | InstructionCost Cost = thisT()->getCastInstrCost( |
1036 | Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I); |
1037 | |
1038 | // Return the cost of multiple scalar invocation plus the cost of |
1039 | // inserting and extracting the values. |
1040 | return getScalarizationOverhead(DstVTy, true, true) + Num * Cost; |
1041 | } |
1042 | |
1043 | // We already handled vector-to-vector and scalar-to-scalar conversions. |
1044 | // This |
1045 | // is where we handle bitcast between vectors and scalars. We need to assume |
1046 | // that the conversion is scalarized in one way or another. |
1047 | if (Opcode == Instruction::BitCast) { |
1048 | // Illegal bitcasts are done by storing and loading from a stack slot. |
1049 | return (SrcVTy ? getScalarizationOverhead(SrcVTy, false, true) : 0) + |
1050 | (DstVTy ? getScalarizationOverhead(DstVTy, true, false) : 0); |
1051 | } |
1052 | |
1053 | llvm_unreachable("Unhandled cast")::llvm::llvm_unreachable_internal("Unhandled cast", "llvm/include/llvm/CodeGen/BasicTTIImpl.h" , 1053); |
1054 | } |
1055 | |
1056 | InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, |
1057 | VectorType *VecTy, unsigned Index) { |
1058 | return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy, |
1059 | Index) + |
1060 | thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(), |
1061 | TTI::CastContextHint::None, |
1062 | TTI::TCK_RecipThroughput); |
1063 | } |
1064 | |
1065 | InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, |
1066 | const Instruction *I = nullptr) { |
1067 | return BaseT::getCFInstrCost(Opcode, CostKind, I); |
1068 | } |
1069 | |
1070 | InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, |
1071 | CmpInst::Predicate VecPred, |
1072 | TTI::TargetCostKind CostKind, |
1073 | const Instruction *I = nullptr) { |
1074 | const TargetLoweringBase *TLI = getTLI(); |
1075 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
1076 | assert(ISD && "Invalid opcode")(static_cast <bool> (ISD && "Invalid opcode") ? void (0) : __assert_fail ("ISD && \"Invalid opcode\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 1076, __extension__ __PRETTY_FUNCTION__)); |
1077 | |
1078 | // TODO: Handle other cost kinds. |
1079 | if (CostKind != TTI::TCK_RecipThroughput) |
1080 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, |
1081 | I); |
1082 | |
1083 | // Selects on vectors are actually vector selects. |
1084 | if (ISD == ISD::SELECT) { |
1085 | assert(CondTy && "CondTy must exist")(static_cast <bool> (CondTy && "CondTy must exist" ) ? void (0) : __assert_fail ("CondTy && \"CondTy must exist\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 1085, __extension__ __PRETTY_FUNCTION__)); |
1086 | if (CondTy->isVectorTy()) |
1087 | ISD = ISD::VSELECT; |
1088 | } |
1089 | std::pair<InstructionCost, MVT> LT = |
1090 | TLI->getTypeLegalizationCost(DL, ValTy); |
1091 | |
1092 | if (!(ValTy->isVectorTy() && !LT.second.isVector()) && |
1093 | !TLI->isOperationExpand(ISD, LT.second)) { |
1094 | // The operation is legal. Assume it costs 1. Multiply |
1095 | // by the type-legalization overhead. |
1096 | return LT.first * 1; |
1097 | } |
1098 | |
1099 | // Otherwise, assume that the cast is scalarized. |
1100 | // TODO: If one of the types get legalized by splitting, handle this |
1101 | // similarly to what getCastInstrCost() does. |
1102 | if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) { |
1103 | unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements(); |
1104 | if (CondTy) |
1105 | CondTy = CondTy->getScalarType(); |
1106 | InstructionCost Cost = thisT()->getCmpSelInstrCost( |
1107 | Opcode, ValVTy->getScalarType(), CondTy, VecPred, CostKind, I); |
1108 | |
1109 | // Return the cost of multiple scalar invocation plus the cost of |
1110 | // inserting and extracting the values. |
1111 | return getScalarizationOverhead(ValVTy, true, false) + Num * Cost; |
1112 | } |
1113 | |
1114 | // Unknown scalar opcode. |
1115 | return 1; |
1116 | } |
1117 | |
1118 | InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, |
1119 | unsigned Index) { |
1120 | std::pair<InstructionCost, MVT> LT = |
1121 | getTLI()->getTypeLegalizationCost(DL, Val->getScalarType()); |
1122 | |
1123 | return LT.first; |
1124 | } |
1125 | |
1126 | InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, |
1127 | int VF, |
1128 | const APInt &DemandedDstElts, |
1129 | TTI::TargetCostKind CostKind) { |
1130 | assert(DemandedDstElts.getBitWidth() == (unsigned)VF * ReplicationFactor &&(static_cast <bool> (DemandedDstElts.getBitWidth() == ( unsigned)VF * ReplicationFactor && "Unexpected size of DemandedDstElts." ) ? void (0) : __assert_fail ("DemandedDstElts.getBitWidth() == (unsigned)VF * ReplicationFactor && \"Unexpected size of DemandedDstElts.\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 1131, __extension__ __PRETTY_FUNCTION__)) |
1131 | "Unexpected size of DemandedDstElts.")(static_cast <bool> (DemandedDstElts.getBitWidth() == ( unsigned)VF * ReplicationFactor && "Unexpected size of DemandedDstElts." ) ? void (0) : __assert_fail ("DemandedDstElts.getBitWidth() == (unsigned)VF * ReplicationFactor && \"Unexpected size of DemandedDstElts.\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 1131, __extension__ __PRETTY_FUNCTION__)); |
1132 | |
1133 | InstructionCost Cost; |
1134 | |
1135 | auto *SrcVT = FixedVectorType::get(EltTy, VF); |
1136 | auto *ReplicatedVT = FixedVectorType::get(EltTy, VF * ReplicationFactor); |
1137 | |
1138 | // The Mask shuffling cost is extract all the elements of the Mask |
1139 | // and insert each of them Factor times into the wide vector: |
1140 | // |
1141 | // E.g. an interleaved group with factor 3: |
1142 | // %mask = icmp ult <8 x i32> %vec1, %vec2 |
1143 | // %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef, |
1144 | // <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7> |
1145 | // The cost is estimated as extract all mask elements from the <8xi1> mask |
1146 | // vector and insert them factor times into the <24xi1> shuffled mask |
1147 | // vector. |
1148 | APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedDstElts, VF); |
1149 | Cost += thisT()->getScalarizationOverhead(SrcVT, DemandedSrcElts, |
1150 | /*Insert*/ false, |
1151 | /*Extract*/ true); |
1152 | Cost += |
1153 | thisT()->getScalarizationOverhead(ReplicatedVT, DemandedDstElts, |
1154 | /*Insert*/ true, /*Extract*/ false); |
1155 | |
1156 | return Cost; |
1157 | } |
1158 | |
1159 | InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, |
1160 | MaybeAlign Alignment, unsigned AddressSpace, |
1161 | TTI::TargetCostKind CostKind, |
1162 | const Instruction *I = nullptr) { |
1163 | assert(!Src->isVoidTy() && "Invalid type")(static_cast <bool> (!Src->isVoidTy() && "Invalid type" ) ? void (0) : __assert_fail ("!Src->isVoidTy() && \"Invalid type\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 1163, __extension__ __PRETTY_FUNCTION__)); |
1164 | // Assume types, such as structs, are expensive. |
1165 | if (getTLI()->getValueType(DL, Src, true) == MVT::Other) |
1166 | return 4; |
1167 | std::pair<InstructionCost, MVT> LT = |
1168 | getTLI()->getTypeLegalizationCost(DL, Src); |
1169 | |
1170 | // Assuming that all loads of legal types cost 1. |
1171 | InstructionCost Cost = LT.first; |
1172 | if (CostKind != TTI::TCK_RecipThroughput) |
1173 | return Cost; |
1174 | |
1175 | if (Src->isVectorTy() && |
1176 | // In practice it's not currently possible to have a change in lane |
1177 | // length for extending loads or truncating stores so both types should |
1178 | // have the same scalable property. |
1179 | TypeSize::isKnownLT(Src->getPrimitiveSizeInBits(), |
1180 | LT.second.getSizeInBits())) { |
1181 | // This is a vector load that legalizes to a larger type than the vector |
1182 | // itself. Unless the corresponding extending load or truncating store is |
1183 | // legal, then this will scalarize. |
1184 | TargetLowering::LegalizeAction LA = TargetLowering::Expand; |
1185 | EVT MemVT = getTLI()->getValueType(DL, Src); |
1186 | if (Opcode == Instruction::Store) |
1187 | LA = getTLI()->getTruncStoreAction(LT.second, MemVT); |
1188 | else |
1189 | LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT); |
1190 | |
1191 | if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) { |
1192 | // This is a vector load/store for some illegal type that is scalarized. |
1193 | // We must account for the cost of building or decomposing the vector. |
1194 | Cost += getScalarizationOverhead(cast<VectorType>(Src), |
1195 | Opcode != Instruction::Store, |
1196 | Opcode == Instruction::Store); |
1197 | } |
1198 | } |
1199 | |
1200 | return Cost; |
1201 | } |
1202 | |
1203 | InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, |
1204 | Align Alignment, unsigned AddressSpace, |
1205 | TTI::TargetCostKind CostKind) { |
1206 | return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false, |
1207 | CostKind); |
1208 | } |
1209 | |
1210 | InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, |
1211 | const Value *Ptr, bool VariableMask, |
1212 | Align Alignment, |
1213 | TTI::TargetCostKind CostKind, |
1214 | const Instruction *I = nullptr) { |
1215 | return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask, |
1216 | true, CostKind); |
1217 | } |
1218 | |
1219 | InstructionCost getInterleavedMemoryOpCost( |
1220 | unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, |
1221 | Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, |
1222 | bool UseMaskForCond = false, bool UseMaskForGaps = false) { |
1223 | auto *VT = cast<FixedVectorType>(VecTy); |
1224 | |
1225 | unsigned NumElts = VT->getNumElements(); |
1226 | assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor")(static_cast <bool> (Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor") ? void ( 0) : __assert_fail ("Factor > 1 && NumElts % Factor == 0 && \"Invalid interleave factor\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 1226, __extension__ __PRETTY_FUNCTION__)); |
1227 | |
1228 | unsigned NumSubElts = NumElts / Factor; |
1229 | auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts); |
1230 | |
1231 | // Firstly, the cost of load/store operation. |
1232 | InstructionCost Cost; |
1233 | if (UseMaskForCond || UseMaskForGaps) |
1234 | Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment, |
1235 | AddressSpace, CostKind); |
1236 | else |
1237 | Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, |
1238 | CostKind); |
1239 | |
1240 | // Legalize the vector type, and get the legalized and unlegalized type |
1241 | // sizes. |
1242 | MVT VecTyLT = getTLI()->getTypeLegalizationCost(DL, VecTy).second; |
1243 | unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy); |
1244 | unsigned VecTyLTSize = VecTyLT.getStoreSize(); |
1245 | |
1246 | // Scale the cost of the memory operation by the fraction of legalized |
1247 | // instructions that will actually be used. We shouldn't account for the |
1248 | // cost of dead instructions since they will be removed. |
1249 | // |
1250 | // E.g., An interleaved load of factor 8: |
1251 | // %vec = load <16 x i64>, <16 x i64>* %ptr |
1252 | // %v0 = shufflevector %vec, undef, <0, 8> |
1253 | // |
1254 | // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be |
1255 | // used (those corresponding to elements [0:1] and [8:9] of the unlegalized |
1256 | // type). The other loads are unused. |
1257 | // |
1258 | // TODO: Note that legalization can turn masked loads/stores into unmasked |
1259 | // (legalized) loads/stores. This can be reflected in the cost. |
1260 | if (Cost.isValid() && VecTySize > VecTyLTSize) { |
1261 | // The number of loads of a legal type it will take to represent a load |
1262 | // of the unlegalized vector type. |
1263 | unsigned NumLegalInsts = divideCeil(VecTySize, VecTyLTSize); |
1264 | |
1265 | // The number of elements of the unlegalized type that correspond to a |
1266 | // single legal instruction. |
1267 | unsigned NumEltsPerLegalInst = divideCeil(NumElts, NumLegalInsts); |
1268 | |
1269 | // Determine which legal instructions will be used. |
1270 | BitVector UsedInsts(NumLegalInsts, false); |
1271 | for (unsigned Index : Indices) |
1272 | for (unsigned Elt = 0; Elt < NumSubElts; ++Elt) |
1273 | UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst); |
1274 | |
1275 | // Scale the cost of the load by the fraction of legal instructions that |
1276 | // will be used. |
1277 | Cost = divideCeil(UsedInsts.count() * Cost.getValue().getValue(), |
1278 | NumLegalInsts); |
1279 | } |
1280 | |
1281 | // Then plus the cost of interleave operation. |
1282 | assert(Indices.size() <= Factor &&(static_cast <bool> (Indices.size() <= Factor && "Interleaved memory op has too many members") ? void (0) : __assert_fail ("Indices.size() <= Factor && \"Interleaved memory op has too many members\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 1283, __extension__ __PRETTY_FUNCTION__)) |
1283 | "Interleaved memory op has too many members")(static_cast <bool> (Indices.size() <= Factor && "Interleaved memory op has too many members") ? void (0) : __assert_fail ("Indices.size() <= Factor && \"Interleaved memory op has too many members\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 1283, __extension__ __PRETTY_FUNCTION__)); |
1284 | |
1285 | const APInt DemandedAllSubElts = APInt::getAllOnes(NumSubElts); |
1286 | const APInt DemandedAllResultElts = APInt::getAllOnes(NumElts); |
1287 | |
1288 | APInt DemandedLoadStoreElts = APInt::getZero(NumElts); |
1289 | for (unsigned Index : Indices) { |
1290 | assert(Index < Factor && "Invalid index for interleaved memory op")(static_cast <bool> (Index < Factor && "Invalid index for interleaved memory op" ) ? void (0) : __assert_fail ("Index < Factor && \"Invalid index for interleaved memory op\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 1290, __extension__ __PRETTY_FUNCTION__)); |
1291 | for (unsigned Elm = 0; Elm < NumSubElts; Elm++) |
1292 | DemandedLoadStoreElts.setBit(Index + Elm * Factor); |
1293 | } |
1294 | |
1295 | if (Opcode == Instruction::Load) { |
1296 | // The interleave cost is similar to extract sub vectors' elements |
1297 | // from the wide vector, and insert them into sub vectors. |
1298 | // |
1299 | // E.g. An interleaved load of factor 2 (with one member of index 0): |
1300 | // %vec = load <8 x i32>, <8 x i32>* %ptr |
1301 | // %v0 = shuffle %vec, undef, <0, 2, 4, 6> ; Index 0 |
1302 | // The cost is estimated as extract elements at 0, 2, 4, 6 from the |
1303 | // <8 x i32> vector and insert them into a <4 x i32> vector. |
1304 | InstructionCost InsSubCost = |
1305 | thisT()->getScalarizationOverhead(SubVT, DemandedAllSubElts, |
1306 | /*Insert*/ true, /*Extract*/ false); |
1307 | Cost += Indices.size() * InsSubCost; |
1308 | Cost += |
1309 | thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts, |
1310 | /*Insert*/ false, /*Extract*/ true); |
1311 | } else { |
1312 | // The interleave cost is extract elements from sub vectors, and |
1313 | // insert them into the wide vector. |
1314 | // |
1315 | // E.g. An interleaved store of factor 3 with 2 members at indices 0,1: |
1316 | // (using VF=4): |
1317 | // %v0_v1 = shuffle %v0, %v1, <0,4,undef,1,5,undef,2,6,undef,3,7,undef> |
1318 | // %gaps.mask = <true, true, false, true, true, false, |
1319 | // true, true, false, true, true, false> |
1320 | // call llvm.masked.store <12 x i32> %v0_v1, <12 x i32>* %ptr, |
1321 | // i32 Align, <12 x i1> %gaps.mask |
1322 | // The cost is estimated as extract all elements (of actual members, |
1323 | // excluding gaps) from both <4 x i32> vectors and insert into the <12 x |
1324 | // i32> vector. |
1325 | InstructionCost ExtSubCost = |
1326 | thisT()->getScalarizationOverhead(SubVT, DemandedAllSubElts, |
1327 | /*Insert*/ false, /*Extract*/ true); |
1328 | Cost += ExtSubCost * Indices.size(); |
1329 | Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts, |
1330 | /*Insert*/ true, |
1331 | /*Extract*/ false); |
1332 | } |
1333 | |
1334 | if (!UseMaskForCond) |
1335 | return Cost; |
1336 | |
1337 | Type *I8Type = Type::getInt8Ty(VT->getContext()); |
1338 | |
1339 | Cost += thisT()->getReplicationShuffleCost( |
1340 | I8Type, Factor, NumSubElts, |
1341 | UseMaskForGaps ? DemandedLoadStoreElts : DemandedAllResultElts, |
1342 | CostKind); |
1343 | |
1344 | // The Gaps mask is invariant and created outside the loop, therefore the |
1345 | // cost of creating it is not accounted for here. However if we have both |
1346 | // a MaskForGaps and some other mask that guards the execution of the |
1347 | // memory access, we need to account for the cost of And-ing the two masks |
1348 | // inside the loop. |
1349 | if (UseMaskForGaps) { |
1350 | auto *MaskVT = FixedVectorType::get(I8Type, NumElts); |
1351 | Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT, |
1352 | CostKind); |
1353 | } |
1354 | |
1355 | return Cost; |
1356 | } |
1357 | |
1358 | /// Get intrinsic cost based on arguments. |
1359 | InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, |
1360 | TTI::TargetCostKind CostKind) { |
1361 | // Check for generically free intrinsics. |
1362 | if (BaseT::getIntrinsicInstrCost(ICA, CostKind) == 0) |
1363 | return 0; |
1364 | |
1365 | // Assume that target intrinsics are cheap. |
1366 | Intrinsic::ID IID = ICA.getID(); |
1367 | if (Function::isTargetIntrinsic(IID)) |
1368 | return TargetTransformInfo::TCC_Basic; |
1369 | |
1370 | if (ICA.isTypeBasedOnly()) |
1371 | return getTypeBasedIntrinsicInstrCost(ICA, CostKind); |
1372 | |
1373 | Type *RetTy = ICA.getReturnType(); |
1374 | |
1375 | ElementCount RetVF = |
1376 | (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount() |
1377 | : ElementCount::getFixed(1)); |
1378 | const IntrinsicInst *I = ICA.getInst(); |
1379 | const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); |
1380 | FastMathFlags FMF = ICA.getFlags(); |
1381 | switch (IID) { |
1382 | default: |
1383 | break; |
1384 | |
1385 | case Intrinsic::cttz: |
1386 | // FIXME: If necessary, this should go in target-specific overrides. |
1387 | if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz()) |
1388 | return TargetTransformInfo::TCC_Basic; |
1389 | break; |
1390 | |
1391 | case Intrinsic::ctlz: |
1392 | // FIXME: If necessary, this should go in target-specific overrides. |
1393 | if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz()) |
1394 | return TargetTransformInfo::TCC_Basic; |
1395 | break; |
1396 | |
1397 | case Intrinsic::memcpy: |
1398 | return thisT()->getMemcpyCost(ICA.getInst()); |
1399 | |
1400 | case Intrinsic::masked_scatter: { |
1401 | const Value *Mask = Args[3]; |
1402 | bool VarMask = !isa<Constant>(Mask); |
1403 | Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue(); |
1404 | return thisT()->getGatherScatterOpCost(Instruction::Store, |
1405 | ICA.getArgTypes()[0], Args[1], |
1406 | VarMask, Alignment, CostKind, I); |
1407 | } |
1408 | case Intrinsic::masked_gather: { |
1409 | const Value *Mask = Args[2]; |
1410 | bool VarMask = !isa<Constant>(Mask); |
1411 | Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue(); |
1412 | return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0], |
1413 | VarMask, Alignment, CostKind, I); |
1414 | } |
1415 | case Intrinsic::experimental_stepvector: { |
1416 | if (isa<ScalableVectorType>(RetTy)) |
1417 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); |
1418 | // The cost of materialising a constant integer vector. |
1419 | return TargetTransformInfo::TCC_Basic; |
1420 | } |
1421 | case Intrinsic::experimental_vector_extract: { |
1422 | // FIXME: Handle case where a scalable vector is extracted from a scalable |
1423 | // vector |
1424 | if (isa<ScalableVectorType>(RetTy)) |
1425 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); |
1426 | unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue(); |
1427 | return thisT()->getShuffleCost(TTI::SK_ExtractSubvector, |
1428 | cast<VectorType>(Args[0]->getType()), None, |
1429 | Index, cast<VectorType>(RetTy)); |
1430 | } |
1431 | case Intrinsic::experimental_vector_insert: { |
1432 | // FIXME: Handle case where a scalable vector is inserted into a scalable |
1433 | // vector |
1434 | if (isa<ScalableVectorType>(Args[1]->getType())) |
1435 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); |
1436 | unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue(); |
1437 | return thisT()->getShuffleCost( |
1438 | TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()), None, |
1439 | Index, cast<VectorType>(Args[1]->getType())); |
1440 | } |
1441 | case Intrinsic::experimental_vector_reverse: { |
1442 | return thisT()->getShuffleCost(TTI::SK_Reverse, |
1443 | cast<VectorType>(Args[0]->getType()), None, |
1444 | 0, cast<VectorType>(RetTy)); |
1445 | } |
1446 | case Intrinsic::experimental_vector_splice: { |
1447 | unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue(); |
1448 | return thisT()->getShuffleCost(TTI::SK_Splice, |
1449 | cast<VectorType>(Args[0]->getType()), None, |
1450 | Index, cast<VectorType>(RetTy)); |
1451 | } |
1452 | case Intrinsic::vector_reduce_add: |
1453 | case Intrinsic::vector_reduce_mul: |
1454 | case Intrinsic::vector_reduce_and: |
1455 | case Intrinsic::vector_reduce_or: |
1456 | case Intrinsic::vector_reduce_xor: |
1457 | case Intrinsic::vector_reduce_smax: |
1458 | case Intrinsic::vector_reduce_smin: |
1459 | case Intrinsic::vector_reduce_fmax: |
1460 | case Intrinsic::vector_reduce_fmin: |
1461 | case Intrinsic::vector_reduce_umax: |
1462 | case Intrinsic::vector_reduce_umin: { |
1463 | IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1); |
1464 | return getTypeBasedIntrinsicInstrCost(Attrs, CostKind); |
1465 | } |
1466 | case Intrinsic::vector_reduce_fadd: |
1467 | case Intrinsic::vector_reduce_fmul: { |
1468 | IntrinsicCostAttributes Attrs( |
1469 | IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1); |
1470 | return getTypeBasedIntrinsicInstrCost(Attrs, CostKind); |
1471 | } |
1472 | case Intrinsic::fshl: |
1473 | case Intrinsic::fshr: { |
1474 | if (isa<ScalableVectorType>(RetTy)) |
1475 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); |
1476 | const Value *X = Args[0]; |
1477 | const Value *Y = Args[1]; |
1478 | const Value *Z = Args[2]; |
1479 | TTI::OperandValueProperties OpPropsX, OpPropsY, OpPropsZ, OpPropsBW; |
1480 | TTI::OperandValueKind OpKindX = TTI::getOperandInfo(X, OpPropsX); |
1481 | TTI::OperandValueKind OpKindY = TTI::getOperandInfo(Y, OpPropsY); |
1482 | TTI::OperandValueKind OpKindZ = TTI::getOperandInfo(Z, OpPropsZ); |
1483 | TTI::OperandValueKind OpKindBW = TTI::OK_UniformConstantValue; |
1484 | OpPropsBW = isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2 |
1485 | : TTI::OP_None; |
1486 | // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW))) |
1487 | // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW)) |
1488 | InstructionCost Cost = 0; |
1489 | Cost += |
1490 | thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind); |
1491 | Cost += |
1492 | thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind); |
1493 | Cost += thisT()->getArithmeticInstrCost( |
1494 | BinaryOperator::Shl, RetTy, CostKind, OpKindX, OpKindZ, OpPropsX); |
1495 | Cost += thisT()->getArithmeticInstrCost( |
1496 | BinaryOperator::LShr, RetTy, CostKind, OpKindY, OpKindZ, OpPropsY); |
1497 | // Non-constant shift amounts requires a modulo. |
1498 | if (OpKindZ != TTI::OK_UniformConstantValue && |
1499 | OpKindZ != TTI::OK_NonUniformConstantValue) |
1500 | Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy, |
1501 | CostKind, OpKindZ, OpKindBW, |
1502 | OpPropsZ, OpPropsBW); |
1503 | // For non-rotates (X != Y) we must add shift-by-zero handling costs. |
1504 | if (X != Y) { |
1505 | Type *CondTy = RetTy->getWithNewBitWidth(1); |
1506 | Cost += |
1507 | thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, |
1508 | CmpInst::ICMP_EQ, CostKind); |
1509 | Cost += |
1510 | thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, |
1511 | CmpInst::ICMP_EQ, CostKind); |
1512 | } |
1513 | return Cost; |
1514 | } |
1515 | } |
1516 | |
1517 | // Assume that we need to scalarize this intrinsic. |
1518 | // Compute the scalarization overhead based on Args for a vector |
1519 | // intrinsic. |
1520 | InstructionCost ScalarizationCost = InstructionCost::getInvalid(); |
1521 | if (RetVF.isVector() && !RetVF.isScalable()) { |
1522 | ScalarizationCost = 0; |
1523 | if (!RetTy->isVoidTy()) |
1524 | ScalarizationCost += |
1525 | getScalarizationOverhead(cast<VectorType>(RetTy), true, false); |
1526 | ScalarizationCost += |
1527 | getOperandsScalarizationOverhead(Args, ICA.getArgTypes()); |
1528 | } |
1529 | |
1530 | IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I, |
1531 | ScalarizationCost); |
1532 | return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind); |
1533 | } |
1534 | |
1535 | /// Get intrinsic cost based on argument types. |
1536 | /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the |
1537 | /// cost of scalarizing the arguments and the return value will be computed |
1538 | /// based on types. |
1539 | InstructionCost |
1540 | getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, |
1541 | TTI::TargetCostKind CostKind) { |
1542 | Intrinsic::ID IID = ICA.getID(); |
1543 | Type *RetTy = ICA.getReturnType(); |
1544 | const SmallVectorImpl<Type *> &Tys = ICA.getArgTypes(); |
1545 | FastMathFlags FMF = ICA.getFlags(); |
1546 | InstructionCost ScalarizationCostPassed = ICA.getScalarizationCost(); |
1547 | bool SkipScalarizationCost = ICA.skipScalarizationCost(); |
1548 | |
1549 | VectorType *VecOpTy = nullptr; |
1550 | if (!Tys.empty()) { |
1551 | // The vector reduction operand is operand 0 except for fadd/fmul. |
1552 | // Their operand 0 is a scalar start value, so the vector op is operand 1. |
1553 | unsigned VecTyIndex = 0; |
1554 | if (IID == Intrinsic::vector_reduce_fadd || |
1555 | IID == Intrinsic::vector_reduce_fmul) |
1556 | VecTyIndex = 1; |
1557 | assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes")(static_cast <bool> (Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes") ? void (0) : __assert_fail ("Tys.size() > VecTyIndex && \"Unexpected IntrinsicCostAttributes\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 1557, __extension__ __PRETTY_FUNCTION__)); |
1558 | VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]); |
1559 | } |
1560 | |
1561 | // Library call cost - other than size, make it expensive. |
1562 | unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10; |
1563 | SmallVector<unsigned, 2> ISDs; |
1564 | switch (IID) { |
1565 | default: { |
1566 | // Scalable vectors cannot be scalarized, so return Invalid. |
1567 | if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) { |
1568 | return isa<ScalableVectorType>(Ty); |
1569 | })) |
1570 | return InstructionCost::getInvalid(); |
1571 | |
1572 | // Assume that we need to scalarize this intrinsic. |
1573 | InstructionCost ScalarizationCost = |
1574 | SkipScalarizationCost ? ScalarizationCostPassed : 0; |
1575 | unsigned ScalarCalls = 1; |
1576 | Type *ScalarRetTy = RetTy; |
1577 | if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) { |
1578 | if (!SkipScalarizationCost) |
1579 | ScalarizationCost = getScalarizationOverhead(RetVTy, true, false); |
1580 | ScalarCalls = std::max(ScalarCalls, |
1581 | cast<FixedVectorType>(RetVTy)->getNumElements()); |
1582 | ScalarRetTy = RetTy->getScalarType(); |
1583 | } |
1584 | SmallVector<Type *, 4> ScalarTys; |
1585 | for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { |
1586 | Type *Ty = Tys[i]; |
1587 | if (auto *VTy = dyn_cast<VectorType>(Ty)) { |
1588 | if (!SkipScalarizationCost) |
1589 | ScalarizationCost += getScalarizationOverhead(VTy, false, true); |
1590 | ScalarCalls = std::max(ScalarCalls, |
1591 | cast<FixedVectorType>(VTy)->getNumElements()); |
1592 | Ty = Ty->getScalarType(); |
1593 | } |
1594 | ScalarTys.push_back(Ty); |
1595 | } |
1596 | if (ScalarCalls == 1) |
1597 | return 1; // Return cost of a scalar intrinsic. Assume it to be cheap. |
1598 | |
1599 | IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF); |
1600 | InstructionCost ScalarCost = |
1601 | thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind); |
1602 | |
1603 | return ScalarCalls * ScalarCost + ScalarizationCost; |
1604 | } |
1605 | // Look for intrinsics that can be lowered directly or turned into a scalar |
1606 | // intrinsic call. |
1607 | case Intrinsic::sqrt: |
1608 | ISDs.push_back(ISD::FSQRT); |
1609 | break; |
1610 | case Intrinsic::sin: |
1611 | ISDs.push_back(ISD::FSIN); |
1612 | break; |
1613 | case Intrinsic::cos: |
1614 | ISDs.push_back(ISD::FCOS); |
1615 | break; |
1616 | case Intrinsic::exp: |
1617 | ISDs.push_back(ISD::FEXP); |
1618 | break; |
1619 | case Intrinsic::exp2: |
1620 | ISDs.push_back(ISD::FEXP2); |
1621 | break; |
1622 | case Intrinsic::log: |
1623 | ISDs.push_back(ISD::FLOG); |
1624 | break; |
1625 | case Intrinsic::log10: |
1626 | ISDs.push_back(ISD::FLOG10); |
1627 | break; |
1628 | case Intrinsic::log2: |
1629 | ISDs.push_back(ISD::FLOG2); |
1630 | break; |
1631 | case Intrinsic::fabs: |
1632 | ISDs.push_back(ISD::FABS); |
1633 | break; |
1634 | case Intrinsic::canonicalize: |
1635 | ISDs.push_back(ISD::FCANONICALIZE); |
1636 | break; |
1637 | case Intrinsic::minnum: |
1638 | ISDs.push_back(ISD::FMINNUM); |
1639 | break; |
1640 | case Intrinsic::maxnum: |
1641 | ISDs.push_back(ISD::FMAXNUM); |
1642 | break; |
1643 | case Intrinsic::minimum: |
1644 | ISDs.push_back(ISD::FMINIMUM); |
1645 | break; |
1646 | case Intrinsic::maximum: |
1647 | ISDs.push_back(ISD::FMAXIMUM); |
1648 | break; |
1649 | case Intrinsic::copysign: |
1650 | ISDs.push_back(ISD::FCOPYSIGN); |
1651 | break; |
1652 | case Intrinsic::floor: |
1653 | ISDs.push_back(ISD::FFLOOR); |
1654 | break; |
1655 | case Intrinsic::ceil: |
1656 | ISDs.push_back(ISD::FCEIL); |
1657 | break; |
1658 | case Intrinsic::trunc: |
1659 | ISDs.push_back(ISD::FTRUNC); |
1660 | break; |
1661 | case Intrinsic::nearbyint: |
1662 | ISDs.push_back(ISD::FNEARBYINT); |
1663 | break; |
1664 | case Intrinsic::rint: |
1665 | ISDs.push_back(ISD::FRINT); |
1666 | break; |
1667 | case Intrinsic::round: |
1668 | ISDs.push_back(ISD::FROUND); |
1669 | break; |
1670 | case Intrinsic::roundeven: |
1671 | ISDs.push_back(ISD::FROUNDEVEN); |
1672 | break; |
1673 | case Intrinsic::pow: |
1674 | ISDs.push_back(ISD::FPOW); |
1675 | break; |
1676 | case Intrinsic::fma: |
1677 | ISDs.push_back(ISD::FMA); |
1678 | break; |
1679 | case Intrinsic::fmuladd: |
1680 | ISDs.push_back(ISD::FMA); |
1681 | break; |
1682 | case Intrinsic::experimental_constrained_fmuladd: |
1683 | ISDs.push_back(ISD::STRICT_FMA); |
1684 | break; |
1685 | // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free. |
1686 | case Intrinsic::lifetime_start: |
1687 | case Intrinsic::lifetime_end: |
1688 | case Intrinsic::sideeffect: |
1689 | case Intrinsic::pseudoprobe: |
1690 | case Intrinsic::arithmetic_fence: |
1691 | return 0; |
1692 | case Intrinsic::masked_store: { |
1693 | Type *Ty = Tys[0]; |
1694 | Align TyAlign = thisT()->DL.getABITypeAlign(Ty); |
1695 | return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0, |
1696 | CostKind); |
1697 | } |
1698 | case Intrinsic::masked_load: { |
1699 | Type *Ty = RetTy; |
1700 | Align TyAlign = thisT()->DL.getABITypeAlign(Ty); |
1701 | return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0, |
1702 | CostKind); |
1703 | } |
1704 | case Intrinsic::vector_reduce_add: |
1705 | return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy, |
1706 | None, CostKind); |
1707 | case Intrinsic::vector_reduce_mul: |
1708 | return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy, |
1709 | None, CostKind); |
1710 | case Intrinsic::vector_reduce_and: |
1711 | return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy, |
1712 | None, CostKind); |
1713 | case Intrinsic::vector_reduce_or: |
1714 | return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy, None, |
1715 | CostKind); |
1716 | case Intrinsic::vector_reduce_xor: |
1717 | return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy, |
1718 | None, CostKind); |
1719 | case Intrinsic::vector_reduce_fadd: |
1720 | return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy, |
1721 | FMF, CostKind); |
1722 | case Intrinsic::vector_reduce_fmul: |
1723 | return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy, |
1724 | FMF, CostKind); |
1725 | case Intrinsic::vector_reduce_smax: |
1726 | case Intrinsic::vector_reduce_smin: |
1727 | case Intrinsic::vector_reduce_fmax: |
1728 | case Intrinsic::vector_reduce_fmin: |
1729 | return thisT()->getMinMaxReductionCost( |
1730 | VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)), |
1731 | /*IsUnsigned=*/false, CostKind); |
1732 | case Intrinsic::vector_reduce_umax: |
1733 | case Intrinsic::vector_reduce_umin: |
1734 | return thisT()->getMinMaxReductionCost( |
1735 | VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)), |
1736 | /*IsUnsigned=*/true, CostKind); |
1737 | case Intrinsic::abs: { |
1738 | // abs(X) = select(icmp(X,0),X,sub(0,X)) |
1739 | Type *CondTy = RetTy->getWithNewBitWidth(1); |
1740 | CmpInst::Predicate Pred = CmpInst::ICMP_SGT; |
1741 | InstructionCost Cost = 0; |
1742 | Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, |
1743 | Pred, CostKind); |
1744 | Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, |
1745 | Pred, CostKind); |
1746 | // TODO: Should we add an OperandValueProperties::OP_Zero property? |
1747 | Cost += thisT()->getArithmeticInstrCost( |
1748 | BinaryOperator::Sub, RetTy, CostKind, TTI::OK_UniformConstantValue); |
1749 | return Cost; |
1750 | } |
1751 | case Intrinsic::smax: |
1752 | case Intrinsic::smin: |
1753 | case Intrinsic::umax: |
1754 | case Intrinsic::umin: { |
1755 | // minmax(X,Y) = select(icmp(X,Y),X,Y) |
1756 | Type *CondTy = RetTy->getWithNewBitWidth(1); |
1757 | bool IsUnsigned = IID == Intrinsic::umax || IID == Intrinsic::umin; |
1758 | CmpInst::Predicate Pred = |
1759 | IsUnsigned ? CmpInst::ICMP_UGT : CmpInst::ICMP_SGT; |
1760 | InstructionCost Cost = 0; |
1761 | Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, |
1762 | Pred, CostKind); |
1763 | Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, |
1764 | Pred, CostKind); |
1765 | return Cost; |
1766 | } |
1767 | case Intrinsic::sadd_sat: |
1768 | case Intrinsic::ssub_sat: { |
1769 | Type *CondTy = RetTy->getWithNewBitWidth(1); |
1770 | |
1771 | Type *OpTy = StructType::create({RetTy, CondTy}); |
1772 | Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat |
1773 | ? Intrinsic::sadd_with_overflow |
1774 | : Intrinsic::ssub_with_overflow; |
1775 | CmpInst::Predicate Pred = CmpInst::ICMP_SGT; |
1776 | |
1777 | // SatMax -> Overflow && SumDiff < 0 |
1778 | // SatMin -> Overflow && SumDiff >= 0 |
1779 | InstructionCost Cost = 0; |
1780 | IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF, |
1781 | nullptr, ScalarizationCostPassed); |
1782 | Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind); |
1783 | Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, |
1784 | Pred, CostKind); |
1785 | Cost += 2 * thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, |
1786 | CondTy, Pred, CostKind); |
1787 | return Cost; |
1788 | } |
1789 | case Intrinsic::uadd_sat: |
1790 | case Intrinsic::usub_sat: { |
1791 | Type *CondTy = RetTy->getWithNewBitWidth(1); |
1792 | |
1793 | Type *OpTy = StructType::create({RetTy, CondTy}); |
1794 | Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat |
1795 | ? Intrinsic::uadd_with_overflow |
1796 | : Intrinsic::usub_with_overflow; |
1797 | |
1798 | InstructionCost Cost = 0; |
1799 | IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF, |
1800 | nullptr, ScalarizationCostPassed); |
1801 | Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind); |
1802 | Cost += |
1803 | thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, |
1804 | CmpInst::BAD_ICMP_PREDICATE, CostKind); |
1805 | return Cost; |
1806 | } |
1807 | case Intrinsic::smul_fix: |
1808 | case Intrinsic::umul_fix: { |
1809 | unsigned ExtSize = RetTy->getScalarSizeInBits() * 2; |
1810 | Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize); |
1811 | |
1812 | unsigned ExtOp = |
1813 | IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt; |
1814 | TTI::CastContextHint CCH = TTI::CastContextHint::None; |
1815 | |
1816 | InstructionCost Cost = 0; |
1817 | Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind); |
1818 | Cost += |
1819 | thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); |
1820 | Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy, |
1821 | CCH, CostKind); |
1822 | Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, RetTy, |
1823 | CostKind, TTI::OK_AnyValue, |
1824 | TTI::OK_UniformConstantValue); |
1825 | Cost += thisT()->getArithmeticInstrCost(Instruction::Shl, RetTy, CostKind, |
1826 | TTI::OK_AnyValue, |
1827 | TTI::OK_UniformConstantValue); |
1828 | Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind); |
1829 | return Cost; |
1830 | } |
1831 | case Intrinsic::sadd_with_overflow: |
1832 | case Intrinsic::ssub_with_overflow: { |
1833 | Type *SumTy = RetTy->getContainedType(0); |
1834 | Type *OverflowTy = RetTy->getContainedType(1); |
1835 | unsigned Opcode = IID == Intrinsic::sadd_with_overflow |
1836 | ? BinaryOperator::Add |
1837 | : BinaryOperator::Sub; |
1838 | |
1839 | // Add: |
1840 | // Overflow -> (Result < LHS) ^ (RHS < 0) |
1841 | // Sub: |
1842 | // Overflow -> (Result < LHS) ^ (RHS > 0) |
1843 | InstructionCost Cost = 0; |
1844 | Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind); |
1845 | Cost += 2 * thisT()->getCmpSelInstrCost( |
1846 | Instruction::ICmp, SumTy, OverflowTy, |
1847 | CmpInst::ICMP_SGT, CostKind); |
1848 | Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Xor, OverflowTy, |
1849 | CostKind); |
1850 | return Cost; |
1851 | } |
1852 | case Intrinsic::uadd_with_overflow: |
1853 | case Intrinsic::usub_with_overflow: { |
1854 | Type *SumTy = RetTy->getContainedType(0); |
1855 | Type *OverflowTy = RetTy->getContainedType(1); |
1856 | unsigned Opcode = IID == Intrinsic::uadd_with_overflow |
1857 | ? BinaryOperator::Add |
1858 | : BinaryOperator::Sub; |
1859 | CmpInst::Predicate Pred = IID == Intrinsic::uadd_with_overflow |
1860 | ? CmpInst::ICMP_ULT |
1861 | : CmpInst::ICMP_UGT; |
1862 | |
1863 | InstructionCost Cost = 0; |
1864 | Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind); |
1865 | Cost += |
1866 | thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, OverflowTy, |
1867 | Pred, CostKind); |
1868 | return Cost; |
1869 | } |
1870 | case Intrinsic::smul_with_overflow: |
1871 | case Intrinsic::umul_with_overflow: { |
1872 | Type *MulTy = RetTy->getContainedType(0); |
1873 | Type *OverflowTy = RetTy->getContainedType(1); |
1874 | unsigned ExtSize = MulTy->getScalarSizeInBits() * 2; |
1875 | Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize); |
1876 | bool IsSigned = IID == Intrinsic::smul_with_overflow; |
1877 | |
1878 | unsigned ExtOp = IsSigned ? Instruction::SExt : Instruction::ZExt; |
1879 | TTI::CastContextHint CCH = TTI::CastContextHint::None; |
1880 | |
1881 | InstructionCost Cost = 0; |
1882 | Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind); |
1883 | Cost += |
1884 | thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); |
1885 | Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy, |
1886 | CCH, CostKind); |
1887 | Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, ExtTy, |
1888 | CostKind, TTI::OK_AnyValue, |
1889 | TTI::OK_UniformConstantValue); |
1890 | |
1891 | if (IsSigned) |
1892 | Cost += thisT()->getArithmeticInstrCost(Instruction::AShr, MulTy, |
1893 | CostKind, TTI::OK_AnyValue, |
1894 | TTI::OK_UniformConstantValue); |
1895 | |
1896 | Cost += thisT()->getCmpSelInstrCost( |
1897 | BinaryOperator::ICmp, MulTy, OverflowTy, CmpInst::ICMP_NE, CostKind); |
1898 | return Cost; |
1899 | } |
1900 | case Intrinsic::ctpop: |
1901 | ISDs.push_back(ISD::CTPOP); |
1902 | // In case of legalization use TCC_Expensive. This is cheaper than a |
1903 | // library call but still not a cheap instruction. |
1904 | SingleCallCost = TargetTransformInfo::TCC_Expensive; |
1905 | break; |
1906 | case Intrinsic::ctlz: |
1907 | ISDs.push_back(ISD::CTLZ); |
1908 | break; |
1909 | case Intrinsic::cttz: |
1910 | ISDs.push_back(ISD::CTTZ); |
1911 | break; |
1912 | case Intrinsic::bswap: |
1913 | ISDs.push_back(ISD::BSWAP); |
1914 | break; |
1915 | case Intrinsic::bitreverse: |
1916 | ISDs.push_back(ISD::BITREVERSE); |
1917 | break; |
1918 | } |
1919 | |
1920 | const TargetLoweringBase *TLI = getTLI(); |
1921 | std::pair<InstructionCost, MVT> LT = |
1922 | TLI->getTypeLegalizationCost(DL, RetTy); |
1923 | |
1924 | SmallVector<InstructionCost, 2> LegalCost; |
1925 | SmallVector<InstructionCost, 2> CustomCost; |
1926 | for (unsigned ISD : ISDs) { |
1927 | if (TLI->isOperationLegalOrPromote(ISD, LT.second)) { |
1928 | if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() && |
1929 | TLI->isFAbsFree(LT.second)) { |
1930 | return 0; |
1931 | } |
1932 | |
1933 | // The operation is legal. Assume it costs 1. |
1934 | // If the type is split to multiple registers, assume that there is some |
1935 | // overhead to this. |
1936 | // TODO: Once we have extract/insert subvector cost we need to use them. |
1937 | if (LT.first > 1) |
1938 | LegalCost.push_back(LT.first * 2); |
1939 | else |
1940 | LegalCost.push_back(LT.first * 1); |
1941 | } else if (!TLI->isOperationExpand(ISD, LT.second)) { |
1942 | // If the operation is custom lowered then assume |
1943 | // that the code is twice as expensive. |
1944 | CustomCost.push_back(LT.first * 2); |
1945 | } |
1946 | } |
1947 | |
1948 | auto *MinLegalCostI = std::min_element(LegalCost.begin(), LegalCost.end()); |
1949 | if (MinLegalCostI != LegalCost.end()) |
1950 | return *MinLegalCostI; |
1951 | |
1952 | auto MinCustomCostI = |
1953 | std::min_element(CustomCost.begin(), CustomCost.end()); |
1954 | if (MinCustomCostI != CustomCost.end()) |
1955 | return *MinCustomCostI; |
1956 | |
1957 | // If we can't lower fmuladd into an FMA estimate the cost as a floating |
1958 | // point mul followed by an add. |
1959 | if (IID == Intrinsic::fmuladd) |
1960 | return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy, |
1961 | CostKind) + |
1962 | thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy, |
1963 | CostKind); |
1964 | if (IID == Intrinsic::experimental_constrained_fmuladd) { |
1965 | IntrinsicCostAttributes FMulAttrs( |
1966 | Intrinsic::experimental_constrained_fmul, RetTy, Tys); |
1967 | IntrinsicCostAttributes FAddAttrs( |
1968 | Intrinsic::experimental_constrained_fadd, RetTy, Tys); |
1969 | return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) + |
1970 | thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind); |
1971 | } |
1972 | |
1973 | // Else, assume that we need to scalarize this intrinsic. For math builtins |
1974 | // this will emit a costly libcall, adding call overhead and spills. Make it |
1975 | // very expensive. |
1976 | if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) { |
1977 | // Scalable vectors cannot be scalarized, so return Invalid. |
1978 | if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) { |
1979 | return isa<ScalableVectorType>(Ty); |
1980 | })) |
1981 | return InstructionCost::getInvalid(); |
1982 | |
1983 | InstructionCost ScalarizationCost = |
1984 | SkipScalarizationCost ? ScalarizationCostPassed |
1985 | : getScalarizationOverhead(RetVTy, true, false); |
1986 | |
1987 | unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements(); |
1988 | SmallVector<Type *, 4> ScalarTys; |
1989 | for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { |
1990 | Type *Ty = Tys[i]; |
1991 | if (Ty->isVectorTy()) |
1992 | Ty = Ty->getScalarType(); |
1993 | ScalarTys.push_back(Ty); |
1994 | } |
1995 | IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF); |
1996 | InstructionCost ScalarCost = |
1997 | thisT()->getIntrinsicInstrCost(Attrs, CostKind); |
1998 | for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { |
1999 | if (auto *VTy = dyn_cast<VectorType>(Tys[i])) { |
2000 | if (!ICA.skipScalarizationCost()) |
2001 | ScalarizationCost += getScalarizationOverhead(VTy, false, true); |
2002 | ScalarCalls = std::max(ScalarCalls, |
2003 | cast<FixedVectorType>(VTy)->getNumElements()); |
2004 | } |
2005 | } |
2006 | return ScalarCalls * ScalarCost + ScalarizationCost; |
2007 | } |
2008 | |
2009 | // This is going to be turned into a library call, make it expensive. |
2010 | return SingleCallCost; |
2011 | } |
2012 | |
2013 | /// Compute a cost of the given call instruction. |
2014 | /// |
2015 | /// Compute the cost of calling function F with return type RetTy and |
2016 | /// argument types Tys. F might be nullptr, in this case the cost of an |
2017 | /// arbitrary call with the specified signature will be returned. |
2018 | /// This is used, for instance, when we estimate call of a vector |
2019 | /// counterpart of the given function. |
2020 | /// \param F Called function, might be nullptr. |
2021 | /// \param RetTy Return value types. |
2022 | /// \param Tys Argument types. |
2023 | /// \returns The cost of Call instruction. |
2024 | InstructionCost getCallInstrCost(Function *F, Type *RetTy, |
2025 | ArrayRef<Type *> Tys, |
2026 | TTI::TargetCostKind CostKind) { |
2027 | return 10; |
2028 | } |
2029 | |
2030 | unsigned getNumberOfParts(Type *Tp) { |
2031 | std::pair<InstructionCost, MVT> LT = |
2032 | getTLI()->getTypeLegalizationCost(DL, Tp); |
2033 | return LT.first.isValid() ? *LT.first.getValue() : 0; |
2034 | } |
2035 | |
2036 | InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *, |
2037 | const SCEV *) { |
2038 | return 0; |
2039 | } |
2040 | |
2041 | /// Try to calculate arithmetic and shuffle op costs for reduction intrinsics. |
2042 | /// We're assuming that reduction operation are performing the following way: |
2043 | /// |
2044 | /// %val1 = shufflevector<n x t> %val, <n x t> %undef, |
2045 | /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n, i32 undef, ..., i32 undef> |
2046 | /// \----------------v-------------/ \----------v------------/ |
2047 | /// n/2 elements n/2 elements |
2048 | /// %red1 = op <n x t> %val, <n x t> val1 |
2049 | /// After this operation we have a vector %red1 where only the first n/2 |
2050 | /// elements are meaningful, the second n/2 elements are undefined and can be |
2051 | /// dropped. All other operations are actually working with the vector of |
2052 | /// length n/2, not n, though the real vector length is still n. |
2053 | /// %val2 = shufflevector<n x t> %red1, <n x t> %undef, |
2054 | /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2, i32 undef, ..., i32 undef> |
2055 | /// \----------------v-------------/ \----------v------------/ |
2056 | /// n/4 elements 3*n/4 elements |
2057 | /// %red2 = op <n x t> %red1, <n x t> val2 - working with the vector of |
2058 | /// length n/2, the resulting vector has length n/4 etc. |
2059 | /// |
2060 | /// The cost model should take into account that the actual length of the |
2061 | /// vector is reduced on each iteration. |
2062 | InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty, |
2063 | TTI::TargetCostKind CostKind) { |
2064 | Type *ScalarTy = Ty->getElementType(); |
2065 | unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements(); |
2066 | if ((Opcode == Instruction::Or || Opcode == Instruction::And) && |
2067 | ScalarTy == IntegerType::getInt1Ty(Ty->getContext()) && |
2068 | NumVecElts >= 2) { |
2069 | // Or reduction for i1 is represented as: |
2070 | // %val = bitcast <ReduxWidth x i1> to iReduxWidth |
2071 | // %res = cmp ne iReduxWidth %val, 0 |
2072 | // And reduction for i1 is represented as: |
2073 | // %val = bitcast <ReduxWidth x i1> to iReduxWidth |
2074 | // %res = cmp eq iReduxWidth %val, 11111 |
2075 | Type *ValTy = IntegerType::get(Ty->getContext(), NumVecElts); |
2076 | return thisT()->getCastInstrCost(Instruction::BitCast, ValTy, Ty, |
2077 | TTI::CastContextHint::None, CostKind) + |
2078 | thisT()->getCmpSelInstrCost(Instruction::ICmp, ValTy, |
2079 | CmpInst::makeCmpResultType(ValTy), |
2080 | CmpInst::BAD_ICMP_PREDICATE, CostKind); |
2081 | } |
2082 | unsigned NumReduxLevels = Log2_32(NumVecElts); |
2083 | InstructionCost ArithCost = 0; |
2084 | InstructionCost ShuffleCost = 0; |
2085 | std::pair<InstructionCost, MVT> LT = |
2086 | thisT()->getTLI()->getTypeLegalizationCost(DL, Ty); |
2087 | unsigned LongVectorCount = 0; |
2088 | unsigned MVTLen = |
2089 | LT.second.isVector() ? LT.second.getVectorNumElements() : 1; |
2090 | while (NumVecElts > MVTLen) { |
2091 | NumVecElts /= 2; |
2092 | VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts); |
2093 | ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None, |
2094 | NumVecElts, SubTy); |
2095 | ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind); |
2096 | Ty = SubTy; |
2097 | ++LongVectorCount; |
2098 | } |
2099 | |
2100 | NumReduxLevels -= LongVectorCount; |
2101 | |
2102 | // The minimal length of the vector is limited by the real length of vector |
2103 | // operations performed on the current platform. That's why several final |
2104 | // reduction operations are performed on the vectors with the same |
2105 | // architecture-dependent length. |
2106 | |
2107 | // By default reductions need one shuffle per reduction level. |
2108 | ShuffleCost += NumReduxLevels * thisT()->getShuffleCost( |
2109 | TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty); |
2110 | ArithCost += |
2111 | NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty, CostKind); |
2112 | return ShuffleCost + ArithCost + |
2113 | thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0); |
2114 | } |
2115 | |
2116 | /// Try to calculate the cost of performing strict (in-order) reductions, |
2117 | /// which involves doing a sequence of floating point additions in lane |
2118 | /// order, starting with an initial value. For example, consider a scalar |
2119 | /// initial value 'InitVal' of type float and a vector of type <4 x float>: |
2120 | /// |
2121 | /// Vector = <float %v0, float %v1, float %v2, float %v3> |
2122 | /// |
2123 | /// %add1 = %InitVal + %v0 |
2124 | /// %add2 = %add1 + %v1 |
2125 | /// %add3 = %add2 + %v2 |
2126 | /// %add4 = %add3 + %v3 |
2127 | /// |
2128 | /// As a simple estimate we can say the cost of such a reduction is 4 times |
2129 | /// the cost of a scalar FP addition. We can only estimate the costs for |
2130 | /// fixed-width vectors here because for scalable vectors we do not know the |
2131 | /// runtime number of operations. |
2132 | InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty, |
2133 | TTI::TargetCostKind CostKind) { |
2134 | // Targets must implement a default value for the scalable case, since |
2135 | // we don't know how many lanes the vector has. |
2136 | if (isa<ScalableVectorType>(Ty)) |
2137 | return InstructionCost::getInvalid(); |
2138 | |
2139 | auto *VTy = cast<FixedVectorType>(Ty); |
2140 | InstructionCost ExtractCost = |
2141 | getScalarizationOverhead(VTy, /*Insert=*/false, /*Extract=*/true); |
2142 | InstructionCost ArithCost = thisT()->getArithmeticInstrCost( |
2143 | Opcode, VTy->getElementType(), CostKind); |
2144 | ArithCost *= VTy->getNumElements(); |
2145 | |
2146 | return ExtractCost + ArithCost; |
2147 | } |
2148 | |
2149 | InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, |
2150 | Optional<FastMathFlags> FMF, |
2151 | TTI::TargetCostKind CostKind) { |
2152 | if (TTI::requiresOrderedReduction(FMF)) |
2153 | return getOrderedReductionCost(Opcode, Ty, CostKind); |
2154 | return getTreeReductionCost(Opcode, Ty, CostKind); |
2155 | } |
2156 | |
2157 | /// Try to calculate op costs for min/max reduction operations. |
2158 | /// \param CondTy Conditional type for the Select instruction. |
2159 | InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, |
2160 | bool IsUnsigned, |
2161 | TTI::TargetCostKind CostKind) { |
2162 | Type *ScalarTy = Ty->getElementType(); |
2163 | Type *ScalarCondTy = CondTy->getElementType(); |
2164 | unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements(); |
2165 | unsigned NumReduxLevels = Log2_32(NumVecElts); |
2166 | unsigned CmpOpcode; |
2167 | if (Ty->isFPOrFPVectorTy()) { |
2168 | CmpOpcode = Instruction::FCmp; |
2169 | } else { |
2170 | assert(Ty->isIntOrIntVectorTy() &&(static_cast <bool> (Ty->isIntOrIntVectorTy() && "expecting floating point or integer type for min/max reduction" ) ? void (0) : __assert_fail ("Ty->isIntOrIntVectorTy() && \"expecting floating point or integer type for min/max reduction\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 2171, __extension__ __PRETTY_FUNCTION__)) |
2171 | "expecting floating point or integer type for min/max reduction")(static_cast <bool> (Ty->isIntOrIntVectorTy() && "expecting floating point or integer type for min/max reduction" ) ? void (0) : __assert_fail ("Ty->isIntOrIntVectorTy() && \"expecting floating point or integer type for min/max reduction\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 2171, __extension__ __PRETTY_FUNCTION__)); |
2172 | CmpOpcode = Instruction::ICmp; |
2173 | } |
2174 | InstructionCost MinMaxCost = 0; |
2175 | InstructionCost ShuffleCost = 0; |
2176 | std::pair<InstructionCost, MVT> LT = |
2177 | thisT()->getTLI()->getTypeLegalizationCost(DL, Ty); |
2178 | unsigned LongVectorCount = 0; |
2179 | unsigned MVTLen = |
2180 | LT.second.isVector() ? LT.second.getVectorNumElements() : 1; |
2181 | while (NumVecElts > MVTLen) { |
2182 | NumVecElts /= 2; |
2183 | auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts); |
2184 | CondTy = FixedVectorType::get(ScalarCondTy, NumVecElts); |
2185 | |
2186 | ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None, |
2187 | NumVecElts, SubTy); |
2188 | MinMaxCost += |
2189 | thisT()->getCmpSelInstrCost(CmpOpcode, SubTy, CondTy, |
2190 | CmpInst::BAD_ICMP_PREDICATE, CostKind) + |
2191 | thisT()->getCmpSelInstrCost(Instruction::Select, SubTy, CondTy, |
2192 | CmpInst::BAD_ICMP_PREDICATE, CostKind); |
2193 | Ty = SubTy; |
2194 | ++LongVectorCount; |
2195 | } |
2196 | |
2197 | NumReduxLevels -= LongVectorCount; |
2198 | |
2199 | // The minimal length of the vector is limited by the real length of vector |
2200 | // operations performed on the current platform. That's why several final |
2201 | // reduction opertions are perfomed on the vectors with the same |
2202 | // architecture-dependent length. |
2203 | ShuffleCost += NumReduxLevels * thisT()->getShuffleCost( |
2204 | TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty); |
2205 | MinMaxCost += |
2206 | NumReduxLevels * |
2207 | (thisT()->getCmpSelInstrCost(CmpOpcode, Ty, CondTy, |
2208 | CmpInst::BAD_ICMP_PREDICATE, CostKind) + |
2209 | thisT()->getCmpSelInstrCost(Instruction::Select, Ty, CondTy, |
2210 | CmpInst::BAD_ICMP_PREDICATE, CostKind)); |
2211 | // The last min/max should be in vector registers and we counted it above. |
2212 | // So just need a single extractelement. |
2213 | return ShuffleCost + MinMaxCost + |
2214 | thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0); |
2215 | } |
2216 | |
2217 | InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned, |
2218 | Type *ResTy, VectorType *Ty, |
2219 | TTI::TargetCostKind CostKind) { |
2220 | // Without any native support, this is equivalent to the cost of |
2221 | // vecreduce.add(ext) or if IsMLA vecreduce.add(mul(ext, ext)) |
2222 | VectorType *ExtTy = VectorType::get(ResTy, Ty); |
2223 | InstructionCost RedCost = thisT()->getArithmeticReductionCost( |
2224 | Instruction::Add, ExtTy, None, CostKind); |
2225 | InstructionCost MulCost = 0; |
2226 | InstructionCost ExtCost = thisT()->getCastInstrCost( |
2227 | IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty, |
2228 | TTI::CastContextHint::None, CostKind); |
2229 | if (IsMLA) { |
2230 | MulCost = |
2231 | thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); |
2232 | ExtCost *= 2; |
2233 | } |
2234 | |
2235 | return RedCost + MulCost + ExtCost; |
2236 | } |
2237 | |
2238 | InstructionCost getVectorSplitCost() { return 1; } |
2239 | |
2240 | /// @} |
2241 | }; |
2242 | |
2243 | /// Concrete BasicTTIImpl that can be used if no further customization |
2244 | /// is needed. |
2245 | class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> { |
2246 | using BaseT = BasicTTIImplBase<BasicTTIImpl>; |
2247 | |
2248 | friend class BasicTTIImplBase<BasicTTIImpl>; |
2249 | |
2250 | const TargetSubtargetInfo *ST; |
2251 | const TargetLoweringBase *TLI; |
2252 | |
2253 | const TargetSubtargetInfo *getST() const { return ST; } |
2254 | const TargetLoweringBase *getTLI() const { return TLI; } |
2255 | |
2256 | public: |
2257 | explicit BasicTTIImpl(const TargetMachine *TM, const Function &F); |
2258 | }; |
2259 | |
2260 | } // end namespace llvm |
2261 | |
2262 | #endif // LLVM_CODEGEN_BASICTTIIMPL_H |
1 | //===- Support/MachineValueType.h - Machine-Level types ---------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file defines the set of machine-level target independent types which |
10 | // legal values in the code generator use. |
11 | // |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #ifndef LLVM_SUPPORT_MACHINEVALUETYPE_H |
15 | #define LLVM_SUPPORT_MACHINEVALUETYPE_H |
16 | |
17 | #include "llvm/ADT/Sequence.h" |
18 | #include "llvm/ADT/iterator_range.h" |
19 | #include "llvm/Support/ErrorHandling.h" |
20 | #include "llvm/Support/MathExtras.h" |
21 | #include "llvm/Support/TypeSize.h" |
22 | #include <cassert> |
23 | |
24 | namespace llvm { |
25 | |
26 | class Type; |
27 | |
28 | /// Machine Value Type. Every type that is supported natively by some |
29 | /// processor targeted by LLVM occurs here. This means that any legal value |
30 | /// type can be represented by an MVT. |
31 | class MVT { |
32 | public: |
33 | enum SimpleValueType : uint8_t { |
34 | // clang-format off |
35 | |
36 | // Simple value types that aren't explicitly part of this enumeration |
37 | // are considered extended value types. |
38 | INVALID_SIMPLE_VALUE_TYPE = 0, |
39 | |
40 | // If you change this numbering, you must change the values in |
41 | // ValueTypes.td as well! |
42 | Other = 1, // This is a non-standard value |
43 | i1 = 2, // This is a 1 bit integer value |
44 | i8 = 3, // This is an 8 bit integer value |
45 | i16 = 4, // This is a 16 bit integer value |
46 | i32 = 5, // This is a 32 bit integer value |
47 | i64 = 6, // This is a 64 bit integer value |
48 | i128 = 7, // This is a 128 bit integer value |
49 | |
50 | FIRST_INTEGER_VALUETYPE = i1, |
51 | LAST_INTEGER_VALUETYPE = i128, |
52 | |
53 | bf16 = 8, // This is a 16 bit brain floating point value |
54 | f16 = 9, // This is a 16 bit floating point value |
55 | f32 = 10, // This is a 32 bit floating point value |
56 | f64 = 11, // This is a 64 bit floating point value |
57 | f80 = 12, // This is a 80 bit floating point value |
58 | f128 = 13, // This is a 128 bit floating point value |
59 | ppcf128 = 14, // This is a PPC 128-bit floating point value |
60 | |
61 | FIRST_FP_VALUETYPE = bf16, |
62 | LAST_FP_VALUETYPE = ppcf128, |
63 | |
64 | v1i1 = 15, // 1 x i1 |
65 | v2i1 = 16, // 2 x i1 |
66 | v4i1 = 17, // 4 x i1 |
67 | v8i1 = 18, // 8 x i1 |
68 | v16i1 = 19, // 16 x i1 |
69 | v32i1 = 20, // 32 x i1 |
70 | v64i1 = 21, // 64 x i1 |
71 | v128i1 = 22, // 128 x i1 |
72 | v256i1 = 23, // 256 x i1 |
73 | v512i1 = 24, // 512 x i1 |
74 | v1024i1 = 25, // 1024 x i1 |
75 | |
76 | v1i8 = 26, // 1 x i8 |
77 | v2i8 = 27, // 2 x i8 |
78 | v4i8 = 28, // 4 x i8 |
79 | v8i8 = 29, // 8 x i8 |
80 | v16i8 = 30, // 16 x i8 |
81 | v32i8 = 31, // 32 x i8 |
82 | v64i8 = 32, // 64 x i8 |
83 | v128i8 = 33, // 128 x i8 |
84 | v256i8 = 34, // 256 x i8 |
85 | v512i8 = 35, // 512 x i8 |
86 | v1024i8 = 36, // 1024 x i8 |
87 | |
88 | v1i16 = 37, // 1 x i16 |
89 | v2i16 = 38, // 2 x i16 |
90 | v3i16 = 39, // 3 x i16 |
91 | v4i16 = 40, // 4 x i16 |
92 | v8i16 = 41, // 8 x i16 |
93 | v16i16 = 42, // 16 x i16 |
94 | v32i16 = 43, // 32 x i16 |
95 | v64i16 = 44, // 64 x i16 |
96 | v128i16 = 45, // 128 x i16 |
97 | v256i16 = 46, // 256 x i16 |
98 | v512i16 = 47, // 512 x i16 |
99 | |
100 | v1i32 = 48, // 1 x i32 |
101 | v2i32 = 49, // 2 x i32 |
102 | v3i32 = 50, // 3 x i32 |
103 | v4i32 = 51, // 4 x i32 |
104 | v5i32 = 52, // 5 x i32 |
105 | v6i32 = 53, // 6 x i32 |
106 | v7i32 = 54, // 7 x i32 |
107 | v8i32 = 55, // 8 x i32 |
108 | v16i32 = 56, // 16 x i32 |
109 | v32i32 = 57, // 32 x i32 |
110 | v64i32 = 58, // 64 x i32 |
111 | v128i32 = 59, // 128 x i32 |
112 | v256i32 = 60, // 256 x i32 |
113 | v512i32 = 61, // 512 x i32 |
114 | v1024i32 = 62, // 1024 x i32 |
115 | v2048i32 = 63, // 2048 x i32 |
116 | |
117 | v1i64 = 64, // 1 x i64 |
118 | v2i64 = 65, // 2 x i64 |
119 | v3i64 = 66, // 3 x i64 |
120 | v4i64 = 67, // 4 x i64 |
121 | v8i64 = 68, // 8 x i64 |
122 | v16i64 = 69, // 16 x i64 |
123 | v32i64 = 70, // 32 x i64 |
124 | v64i64 = 71, // 64 x i64 |
125 | v128i64 = 72, // 128 x i64 |
126 | v256i64 = 73, // 256 x i64 |
127 | |
128 | v1i128 = 74, // 1 x i128 |
129 | |
130 | FIRST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE = v1i1, |
131 | LAST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE = v1i128, |
132 | |
133 | v1f16 = 75, // 1 x f16 |
134 | v2f16 = 76, // 2 x f16 |
135 | v3f16 = 77, // 3 x f16 |
136 | v4f16 = 78, // 4 x f16 |
137 | v8f16 = 79, // 8 x f16 |
138 | v16f16 = 80, // 16 x f16 |
139 | v32f16 = 81, // 32 x f16 |
140 | v64f16 = 82, // 64 x f16 |
141 | v128f16 = 83, // 128 x f16 |
142 | v256f16 = 84, // 256 x f16 |
143 | v512f16 = 85, // 512 x f16 |
144 | |
145 | v2bf16 = 86, // 2 x bf16 |
146 | v3bf16 = 87, // 3 x bf16 |
147 | v4bf16 = 88, // 4 x bf16 |
148 | v8bf16 = 89, // 8 x bf16 |
149 | v16bf16 = 90, // 16 x bf16 |
150 | v32bf16 = 91, // 32 x bf16 |
151 | v64bf16 = 92, // 64 x bf16 |
152 | v128bf16 = 93, // 128 x bf16 |
153 | |
154 | v1f32 = 94, // 1 x f32 |
155 | v2f32 = 95, // 2 x f32 |
156 | v3f32 = 96, // 3 x f32 |
157 | v4f32 = 97, // 4 x f32 |
158 | v5f32 = 98, // 5 x f32 |
159 | v6f32 = 99, // 6 x f32 |
160 | v7f32 = 100, // 7 x f32 |
161 | v8f32 = 101, // 8 x f32 |
162 | v16f32 = 102, // 16 x f32 |
163 | v32f32 = 103, // 32 x f32 |
164 | v64f32 = 104, // 64 x f32 |
165 | v128f32 = 105, // 128 x f32 |
166 | v256f32 = 106, // 256 x f32 |
167 | v512f32 = 107, // 512 x f32 |
168 | v1024f32 = 108, // 1024 x f32 |
169 | v2048f32 = 109, // 2048 x f32 |
170 | |
171 | v1f64 = 110, // 1 x f64 |
172 | v2f64 = 111, // 2 x f64 |
173 | v3f64 = 112, // 3 x f64 |
174 | v4f64 = 113, // 4 x f64 |
175 | v8f64 = 114, // 8 x f64 |
176 | v16f64 = 115, // 16 x f64 |
177 | v32f64 = 116, // 32 x f64 |
178 | v64f64 = 117, // 64 x f64 |
179 | v128f64 = 118, // 128 x f64 |
180 | v256f64 = 119, // 256 x f64 |
181 | |
182 | FIRST_FP_FIXEDLEN_VECTOR_VALUETYPE = v1f16, |
183 | LAST_FP_FIXEDLEN_VECTOR_VALUETYPE = v256f64, |
184 | |
185 | FIRST_FIXEDLEN_VECTOR_VALUETYPE = v1i1, |
186 | LAST_FIXEDLEN_VECTOR_VALUETYPE = v256f64, |
187 | |
188 | nxv1i1 = 120, // n x 1 x i1 |
189 | nxv2i1 = 121, // n x 2 x i1 |
190 | nxv4i1 = 122, // n x 4 x i1 |
191 | nxv8i1 = 123, // n x 8 x i1 |
192 | nxv16i1 = 124, // n x 16 x i1 |
193 | nxv32i1 = 125, // n x 32 x i1 |
194 | nxv64i1 = 126, // n x 64 x i1 |
195 | |
196 | nxv1i8 = 127, // n x 1 x i8 |
197 | nxv2i8 = 128, // n x 2 x i8 |
198 | nxv4i8 = 129, // n x 4 x i8 |
199 | nxv8i8 = 130, // n x 8 x i8 |
200 | nxv16i8 = 131, // n x 16 x i8 |
201 | nxv32i8 = 132, // n x 32 x i8 |
202 | nxv64i8 = 133, // n x 64 x i8 |
203 | |
204 | nxv1i16 = 134, // n x 1 x i16 |
205 | nxv2i16 = 135, // n x 2 x i16 |
206 | nxv4i16 = 136, // n x 4 x i16 |
207 | nxv8i16 = 137, // n x 8 x i16 |
208 | nxv16i16 = 138, // n x 16 x i16 |
209 | nxv32i16 = 139, // n x 32 x i16 |
210 | |
211 | nxv1i32 = 140, // n x 1 x i32 |
212 | nxv2i32 = 141, // n x 2 x i32 |
213 | nxv4i32 = 142, // n x 4 x i32 |
214 | nxv8i32 = 143, // n x 8 x i32 |
215 | nxv16i32 = 144, // n x 16 x i32 |
216 | nxv32i32 = 145, // n x 32 x i32 |
217 | |
218 | nxv1i64 = 146, // n x 1 x i64 |
219 | nxv2i64 = 147, // n x 2 x i64 |
220 | nxv4i64 = 148, // n x 4 x i64 |
221 | nxv8i64 = 149, // n x 8 x i64 |
222 | nxv16i64 = 150, // n x 16 x i64 |
223 | nxv32i64 = 151, // n x 32 x i64 |
224 | |
225 | FIRST_INTEGER_SCALABLE_VECTOR_VALUETYPE = nxv1i1, |
226 | LAST_INTEGER_SCALABLE_VECTOR_VALUETYPE = nxv32i64, |
227 | |
228 | nxv1f16 = 152, // n x 1 x f16 |
229 | nxv2f16 = 153, // n x 2 x f16 |
230 | nxv4f16 = 154, // n x 4 x f16 |
231 | nxv8f16 = 155, // n x 8 x f16 |
232 | nxv16f16 = 156, // n x 16 x f16 |
233 | nxv32f16 = 157, // n x 32 x f16 |
234 | |
235 | nxv1bf16 = 158, // n x 1 x bf16 |
236 | nxv2bf16 = 159, // n x 2 x bf16 |
237 | nxv4bf16 = 160, // n x 4 x bf16 |
238 | nxv8bf16 = 161, // n x 8 x bf16 |
239 | |
240 | nxv1f32 = 162, // n x 1 x f32 |
241 | nxv2f32 = 163, // n x 2 x f32 |
242 | nxv4f32 = 164, // n x 4 x f32 |
243 | nxv8f32 = 165, // n x 8 x f32 |
244 | nxv16f32 = 166, // n x 16 x f32 |
245 | |
246 | nxv1f64 = 167, // n x 1 x f64 |
247 | nxv2f64 = 168, // n x 2 x f64 |
248 | nxv4f64 = 169, // n x 4 x f64 |
249 | nxv8f64 = 170, // n x 8 x f64 |
250 | |
251 | FIRST_FP_SCALABLE_VECTOR_VALUETYPE = nxv1f16, |
252 | LAST_FP_SCALABLE_VECTOR_VALUETYPE = nxv8f64, |
253 | |
254 | FIRST_SCALABLE_VECTOR_VALUETYPE = nxv1i1, |
255 | LAST_SCALABLE_VECTOR_VALUETYPE = nxv8f64, |
256 | |
257 | FIRST_VECTOR_VALUETYPE = v1i1, |
258 | LAST_VECTOR_VALUETYPE = nxv8f64, |
259 | |
260 | x86mmx = 171, // This is an X86 MMX value |
261 | |
262 | Glue = 172, // This glues nodes together during pre-RA sched |
263 | |
264 | isVoid = 173, // This has no value |
265 | |
266 | Untyped = 174, // This value takes a register, but has |
267 | // unspecified type. The register class |
268 | // will be determined by the opcode. |
269 | |
270 | funcref = 175, // WebAssembly's funcref type |
271 | externref = 176, // WebAssembly's externref type |
272 | x86amx = 177, // This is an X86 AMX value |
273 | i64x8 = 178, // 8 Consecutive GPRs (AArch64) |
274 | |
275 | FIRST_VALUETYPE = 1, // This is always the beginning of the list. |
276 | LAST_VALUETYPE = i64x8, // This always remains at the end of the list. |
277 | VALUETYPE_SIZE = LAST_VALUETYPE + 1, |
278 | |
279 | // This is the current maximum for LAST_VALUETYPE. |
280 | // MVT::MAX_ALLOWED_VALUETYPE is used for asserts and to size bit vectors |
281 | // This value must be a multiple of 32. |
282 | MAX_ALLOWED_VALUETYPE = 192, |
283 | |
284 | // A value of type llvm::TokenTy |
285 | token = 248, |
286 | |
287 | // This is MDNode or MDString. |
288 | Metadata = 249, |
289 | |
290 | // An int value the size of the pointer of the current |
291 | // target to any address space. This must only be used internal to |
292 | // tblgen. Other than for overloading, we treat iPTRAny the same as iPTR. |
293 | iPTRAny = 250, |
294 | |
295 | // A vector with any length and element size. This is used |
296 | // for intrinsics that have overloadings based on vector types. |
297 | // This is only for tblgen's consumption! |
298 | vAny = 251, |
299 | |
300 | // Any floating-point or vector floating-point value. This is used |
301 | // for intrinsics that have overloadings based on floating-point types. |
302 | // This is only for tblgen's consumption! |
303 | fAny = 252, |
304 | |
305 | // An integer or vector integer value of any bit width. This is |
306 | // used for intrinsics that have overloadings based on integer bit widths. |
307 | // This is only for tblgen's consumption! |
308 | iAny = 253, |
309 | |
310 | // An int value the size of the pointer of the current |
311 | // target. This should only be used internal to tblgen! |
312 | iPTR = 254, |
313 | |
314 | // Any type. This is used for intrinsics that have overloadings. |
315 | // This is only for tblgen's consumption! |
316 | Any = 255 |
317 | |
318 | // clang-format on |
319 | }; |
320 | |
321 | SimpleValueType SimpleTy = INVALID_SIMPLE_VALUE_TYPE; |
322 | |
323 | constexpr MVT() = default; |
324 | constexpr MVT(SimpleValueType SVT) : SimpleTy(SVT) {} |
325 | |
326 | bool operator>(const MVT& S) const { return SimpleTy > S.SimpleTy; } |
327 | bool operator<(const MVT& S) const { return SimpleTy < S.SimpleTy; } |
328 | bool operator==(const MVT& S) const { return SimpleTy == S.SimpleTy; } |
329 | bool operator!=(const MVT& S) const { return SimpleTy != S.SimpleTy; } |
330 | bool operator>=(const MVT& S) const { return SimpleTy >= S.SimpleTy; } |
331 | bool operator<=(const MVT& S) const { return SimpleTy <= S.SimpleTy; } |
332 | |
333 | /// Return true if this is a valid simple valuetype. |
334 | bool isValid() const { |
335 | return (SimpleTy >= MVT::FIRST_VALUETYPE && |
336 | SimpleTy <= MVT::LAST_VALUETYPE); |
337 | } |
338 | |
339 | /// Return true if this is a FP or a vector FP type. |
340 | bool isFloatingPoint() const { |
341 | return ((SimpleTy >= MVT::FIRST_FP_VALUETYPE && |
342 | SimpleTy <= MVT::LAST_FP_VALUETYPE) || |
343 | (SimpleTy >= MVT::FIRST_FP_FIXEDLEN_VECTOR_VALUETYPE && |
344 | SimpleTy <= MVT::LAST_FP_FIXEDLEN_VECTOR_VALUETYPE) || |
345 | (SimpleTy >= MVT::FIRST_FP_SCALABLE_VECTOR_VALUETYPE && |
346 | SimpleTy <= MVT::LAST_FP_SCALABLE_VECTOR_VALUETYPE)); |
347 | } |
348 | |
349 | /// Return true if this is an integer or a vector integer type. |
350 | bool isInteger() const { |
351 | return ((SimpleTy >= MVT::FIRST_INTEGER_VALUETYPE && |
352 | SimpleTy <= MVT::LAST_INTEGER_VALUETYPE) || |
353 | (SimpleTy >= MVT::FIRST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE && |
354 | SimpleTy <= MVT::LAST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE) || |
355 | (SimpleTy >= MVT::FIRST_INTEGER_SCALABLE_VECTOR_VALUETYPE && |
356 | SimpleTy <= MVT::LAST_INTEGER_SCALABLE_VECTOR_VALUETYPE)); |
357 | } |
358 | |
359 | /// Return true if this is an integer, not including vectors. |
360 | bool isScalarInteger() const { |
361 | return (SimpleTy >= MVT::FIRST_INTEGER_VALUETYPE && |
362 | SimpleTy <= MVT::LAST_INTEGER_VALUETYPE); |
363 | } |
364 | |
365 | /// Return true if this is a vector value type. |
366 | bool isVector() const { |
367 | return (SimpleTy >= MVT::FIRST_VECTOR_VALUETYPE && |
368 | SimpleTy <= MVT::LAST_VECTOR_VALUETYPE); |
369 | } |
370 | |
371 | /// Return true if this is a vector value type where the |
372 | /// runtime length is machine dependent |
373 | bool isScalableVector() const { |
374 | return (SimpleTy >= MVT::FIRST_SCALABLE_VECTOR_VALUETYPE && |
375 | SimpleTy <= MVT::LAST_SCALABLE_VECTOR_VALUETYPE); |
376 | } |
377 | |
378 | bool isFixedLengthVector() const { |
379 | return (SimpleTy >= MVT::FIRST_FIXEDLEN_VECTOR_VALUETYPE && |
380 | SimpleTy <= MVT::LAST_FIXEDLEN_VECTOR_VALUETYPE); |
381 | } |
382 | |
383 | /// Return true if this is a 16-bit vector type. |
384 | bool is16BitVector() const { |
385 | return (SimpleTy == MVT::v2i8 || SimpleTy == MVT::v1i16 || |
386 | SimpleTy == MVT::v16i1 || SimpleTy == MVT::v1f16); |
387 | } |
388 | |
389 | /// Return true if this is a 32-bit vector type. |
390 | bool is32BitVector() const { |
391 | return (SimpleTy == MVT::v32i1 || SimpleTy == MVT::v4i8 || |
392 | SimpleTy == MVT::v2i16 || SimpleTy == MVT::v1i32 || |
393 | SimpleTy == MVT::v2f16 || SimpleTy == MVT::v2bf16 || |
394 | SimpleTy == MVT::v1f32); |
395 | } |
396 | |
397 | /// Return true if this is a 64-bit vector type. |
398 | bool is64BitVector() const { |
399 | return (SimpleTy == MVT::v64i1 || SimpleTy == MVT::v8i8 || |
400 | SimpleTy == MVT::v4i16 || SimpleTy == MVT::v2i32 || |
401 | SimpleTy == MVT::v1i64 || SimpleTy == MVT::v4f16 || |
402 | SimpleTy == MVT::v4bf16 ||SimpleTy == MVT::v2f32 || |
403 | SimpleTy == MVT::v1f64); |
404 | } |
405 | |
406 | /// Return true if this is a 128-bit vector type. |
407 | bool is128BitVector() const { |
408 | return (SimpleTy == MVT::v128i1 || SimpleTy == MVT::v16i8 || |
409 | SimpleTy == MVT::v8i16 || SimpleTy == MVT::v4i32 || |
410 | SimpleTy == MVT::v2i64 || SimpleTy == MVT::v1i128 || |
411 | SimpleTy == MVT::v8f16 || SimpleTy == MVT::v8bf16 || |
412 | SimpleTy == MVT::v4f32 || SimpleTy == MVT::v2f64); |
413 | } |
414 | |
415 | /// Return true if this is a 256-bit vector type. |
416 | bool is256BitVector() const { |
417 | return (SimpleTy == MVT::v16f16 || SimpleTy == MVT::v16bf16 || |
418 | SimpleTy == MVT::v8f32 || SimpleTy == MVT::v4f64 || |
419 | SimpleTy == MVT::v32i8 || SimpleTy == MVT::v16i16 || |
420 | SimpleTy == MVT::v8i32 || SimpleTy == MVT::v4i64 || |
421 | SimpleTy == MVT::v256i1); |
422 | } |
423 | |
424 | /// Return true if this is a 512-bit vector type. |
425 | bool is512BitVector() const { |
426 | return (SimpleTy == MVT::v32f16 || SimpleTy == MVT::v32bf16 || |
427 | SimpleTy == MVT::v16f32 || SimpleTy == MVT::v8f64 || |
428 | SimpleTy == MVT::v512i1 || SimpleTy == MVT::v64i8 || |
429 | SimpleTy == MVT::v32i16 || SimpleTy == MVT::v16i32 || |
430 | SimpleTy == MVT::v8i64); |
431 | } |
432 | |
433 | /// Return true if this is a 1024-bit vector type. |
434 | bool is1024BitVector() const { |
435 | return (SimpleTy == MVT::v1024i1 || SimpleTy == MVT::v128i8 || |
436 | SimpleTy == MVT::v64i16 || SimpleTy == MVT::v32i32 || |
437 | SimpleTy == MVT::v16i64 || SimpleTy == MVT::v64f16 || |
438 | SimpleTy == MVT::v32f32 || SimpleTy == MVT::v16f64 || |
439 | SimpleTy == MVT::v64bf16); |
440 | } |
441 | |
442 | /// Return true if this is a 2048-bit vector type. |
443 | bool is2048BitVector() const { |
444 | return (SimpleTy == MVT::v256i8 || SimpleTy == MVT::v128i16 || |
445 | SimpleTy == MVT::v64i32 || SimpleTy == MVT::v32i64 || |
446 | SimpleTy == MVT::v128f16 || SimpleTy == MVT::v64f32 || |
447 | SimpleTy == MVT::v32f64 || SimpleTy == MVT::v128bf16); |
448 | } |
449 | |
450 | /// Return true if this is an overloaded type for TableGen. |
451 | bool isOverloaded() const { |
452 | return (SimpleTy == MVT::Any || SimpleTy == MVT::iAny || |
453 | SimpleTy == MVT::fAny || SimpleTy == MVT::vAny || |
454 | SimpleTy == MVT::iPTRAny); |
455 | } |
456 | |
457 | /// Return a vector with the same number of elements as this vector, but |
458 | /// with the element type converted to an integer type with the same |
459 | /// bitwidth. |
460 | MVT changeVectorElementTypeToInteger() const { |
461 | MVT EltTy = getVectorElementType(); |
462 | MVT IntTy = MVT::getIntegerVT(EltTy.getSizeInBits()); |
463 | MVT VecTy = MVT::getVectorVT(IntTy, getVectorElementCount()); |
464 | assert(VecTy.SimpleTy != MVT::INVALID_SIMPLE_VALUE_TYPE &&(static_cast <bool> (VecTy.SimpleTy != MVT::INVALID_SIMPLE_VALUE_TYPE && "Simple vector VT not representable by simple integer vector VT!" ) ? void (0) : __assert_fail ("VecTy.SimpleTy != MVT::INVALID_SIMPLE_VALUE_TYPE && \"Simple vector VT not representable by simple integer vector VT!\"" , "llvm/include/llvm/Support/MachineValueType.h", 465, __extension__ __PRETTY_FUNCTION__)) |
465 | "Simple vector VT not representable by simple integer vector VT!")(static_cast <bool> (VecTy.SimpleTy != MVT::INVALID_SIMPLE_VALUE_TYPE && "Simple vector VT not representable by simple integer vector VT!" ) ? void (0) : __assert_fail ("VecTy.SimpleTy != MVT::INVALID_SIMPLE_VALUE_TYPE && \"Simple vector VT not representable by simple integer vector VT!\"" , "llvm/include/llvm/Support/MachineValueType.h", 465, __extension__ __PRETTY_FUNCTION__)); |
466 | return VecTy; |
467 | } |
468 | |
469 | /// Return a VT for a vector type whose attributes match ourselves |
470 | /// with the exception of the element type that is chosen by the caller. |
471 | MVT changeVectorElementType(MVT EltVT) const { |
472 | MVT VecTy = MVT::getVectorVT(EltVT, getVectorElementCount()); |
473 | assert(VecTy.SimpleTy != MVT::INVALID_SIMPLE_VALUE_TYPE &&(static_cast <bool> (VecTy.SimpleTy != MVT::INVALID_SIMPLE_VALUE_TYPE && "Simple vector VT not representable by simple integer vector VT!" ) ? void (0) : __assert_fail ("VecTy.SimpleTy != MVT::INVALID_SIMPLE_VALUE_TYPE && \"Simple vector VT not representable by simple integer vector VT!\"" , "llvm/include/llvm/Support/MachineValueType.h", 474, __extension__ __PRETTY_FUNCTION__)) |
474 | "Simple vector VT not representable by simple integer vector VT!")(static_cast <bool> (VecTy.SimpleTy != MVT::INVALID_SIMPLE_VALUE_TYPE && "Simple vector VT not representable by simple integer vector VT!" ) ? void (0) : __assert_fail ("VecTy.SimpleTy != MVT::INVALID_SIMPLE_VALUE_TYPE && \"Simple vector VT not representable by simple integer vector VT!\"" , "llvm/include/llvm/Support/MachineValueType.h", 474, __extension__ __PRETTY_FUNCTION__)); |
475 | return VecTy; |
476 | } |
477 | |
478 | /// Return the type converted to an equivalently sized integer or vector |
479 | /// with integer element type. Similar to changeVectorElementTypeToInteger, |
480 | /// but also handles scalars. |
481 | MVT changeTypeToInteger() { |
482 | if (isVector()) |
483 | return changeVectorElementTypeToInteger(); |
484 | return MVT::getIntegerVT(getSizeInBits()); |
485 | } |
486 | |
487 | /// Return a VT for a vector type with the same element type but |
488 | /// half the number of elements. |
489 | MVT getHalfNumVectorElementsVT() const { |
490 | MVT EltVT = getVectorElementType(); |
491 | auto EltCnt = getVectorElementCount(); |
492 | assert(EltCnt.isKnownEven() && "Splitting vector, but not in half!")(static_cast <bool> (EltCnt.isKnownEven() && "Splitting vector, but not in half!" ) ? void (0) : __assert_fail ("EltCnt.isKnownEven() && \"Splitting vector, but not in half!\"" , "llvm/include/llvm/Support/MachineValueType.h", 492, __extension__ __PRETTY_FUNCTION__)); |
493 | return getVectorVT(EltVT, EltCnt.divideCoefficientBy(2)); |
494 | } |
495 | |
496 | /// Returns true if the given vector is a power of 2. |
497 | bool isPow2VectorType() const { |
498 | unsigned NElts = getVectorMinNumElements(); |
499 | return !(NElts & (NElts - 1)); |
500 | } |
501 | |
502 | /// Widens the length of the given vector MVT up to the nearest power of 2 |
503 | /// and returns that type. |
504 | MVT getPow2VectorType() const { |
505 | if (isPow2VectorType()) |
506 | return *this; |
507 | |
508 | ElementCount NElts = getVectorElementCount(); |
509 | unsigned NewMinCount = 1 << Log2_32_Ceil(NElts.getKnownMinValue()); |
510 | NElts = ElementCount::get(NewMinCount, NElts.isScalable()); |
511 | return MVT::getVectorVT(getVectorElementType(), NElts); |
512 | } |
513 | |
514 | /// If this is a vector, return the element type, otherwise return this. |
515 | MVT getScalarType() const { |
516 | return isVector() ? getVectorElementType() : *this; |
517 | } |
518 | |
519 | MVT getVectorElementType() const { |
520 | switch (SimpleTy) { |
521 | default: |
522 | llvm_unreachable("Not a vector MVT!")::llvm::llvm_unreachable_internal("Not a vector MVT!", "llvm/include/llvm/Support/MachineValueType.h" , 522); |
523 | case v1i1: |
524 | case v2i1: |
525 | case v4i1: |
526 | case v8i1: |
527 | case v16i1: |
528 | case v32i1: |
529 | case v64i1: |
530 | case v128i1: |
531 | case v256i1: |
532 | case v512i1: |
533 | case v1024i1: |
534 | case nxv1i1: |
535 | case nxv2i1: |
536 | case nxv4i1: |
537 | case nxv8i1: |
538 | case nxv16i1: |
539 | case nxv32i1: |
540 | case nxv64i1: return i1; |
541 | case v1i8: |
542 | case v2i8: |
543 | case v4i8: |
544 | case v8i8: |
545 | case v16i8: |
546 | case v32i8: |
547 | case v64i8: |
548 | case v128i8: |
549 | case v256i8: |
550 | case v512i8: |
551 | case v1024i8: |
552 | case nxv1i8: |
553 | case nxv2i8: |
554 | case nxv4i8: |
555 | case nxv8i8: |
556 | case nxv16i8: |
557 | case nxv32i8: |
558 | case nxv64i8: return i8; |
559 | case v1i16: |
560 | case v2i16: |
561 | case v3i16: |
562 | case v4i16: |
563 | case v8i16: |
564 | case v16i16: |
565 | case v32i16: |
566 | case v64i16: |
567 | case v128i16: |
568 | case v256i16: |
569 | case v512i16: |
570 | case nxv1i16: |
571 | case nxv2i16: |
572 | case nxv4i16: |
573 | case nxv8i16: |
574 | case nxv16i16: |
575 | case nxv32i16: return i16; |
576 | case v1i32: |
577 | case v2i32: |
578 | case v3i32: |
579 | case v4i32: |
580 | case v5i32: |
581 | case v6i32: |
582 | case v7i32: |
583 | case v8i32: |
584 | case v16i32: |
585 | case v32i32: |
586 | case v64i32: |
587 | case v128i32: |
588 | case v256i32: |
589 | case v512i32: |
590 | case v1024i32: |
591 | case v2048i32: |
592 | case nxv1i32: |
593 | case nxv2i32: |
594 | case nxv4i32: |
595 | case nxv8i32: |
596 | case nxv16i32: |
597 | case nxv32i32: return i32; |
598 | case v1i64: |
599 | case v2i64: |
600 | case v3i64: |
601 | case v4i64: |
602 | case v8i64: |
603 | case v16i64: |
604 | case v32i64: |
605 | case v64i64: |
606 | case v128i64: |
607 | case v256i64: |
608 | case nxv1i64: |
609 | case nxv2i64: |
610 | case nxv4i64: |
611 | case nxv8i64: |
612 | case nxv16i64: |
613 | case nxv32i64: return i64; |
614 | case v1i128: return i128; |
615 | case v1f16: |
616 | case v2f16: |
617 | case v3f16: |
618 | case v4f16: |
619 | case v8f16: |
620 | case v16f16: |
621 | case v32f16: |
622 | case v64f16: |
623 | case v128f16: |
624 | case v256f16: |
625 | case v512f16: |
626 | case nxv1f16: |
627 | case nxv2f16: |
628 | case nxv4f16: |
629 | case nxv8f16: |
630 | case nxv16f16: |
631 | case nxv32f16: return f16; |
632 | case v2bf16: |
633 | case v3bf16: |
634 | case v4bf16: |
635 | case v8bf16: |
636 | case v16bf16: |
637 | case v32bf16: |
638 | case v64bf16: |
639 | case v128bf16: |
640 | case nxv1bf16: |
641 | case nxv2bf16: |
642 | case nxv4bf16: |
643 | case nxv8bf16: return bf16; |
644 | case v1f32: |
645 | case v2f32: |
646 | case v3f32: |
647 | case v4f32: |
648 | case v5f32: |
649 | case v6f32: |
650 | case v7f32: |
651 | case v8f32: |
652 | case v16f32: |
653 | case v32f32: |
654 | case v64f32: |
655 | case v128f32: |
656 | case v256f32: |
657 | case v512f32: |
658 | case v1024f32: |
659 | case v2048f32: |
660 | case nxv1f32: |
661 | case nxv2f32: |
662 | case nxv4f32: |
663 | case nxv8f32: |
664 | case nxv16f32: return f32; |
665 | case v1f64: |
666 | case v2f64: |
667 | case v3f64: |
668 | case v4f64: |
669 | case v8f64: |
670 | case v16f64: |
671 | case v32f64: |
672 | case v64f64: |
673 | case v128f64: |
674 | case v256f64: |
675 | case nxv1f64: |
676 | case nxv2f64: |
677 | case nxv4f64: |
678 | case nxv8f64: return f64; |
679 | } |
680 | } |
681 | |
682 | /// Given a vector type, return the minimum number of elements it contains. |
683 | unsigned getVectorMinNumElements() const { |
684 | switch (SimpleTy) { |
685 | default: |
686 | llvm_unreachable("Not a vector MVT!")::llvm::llvm_unreachable_internal("Not a vector MVT!", "llvm/include/llvm/Support/MachineValueType.h" , 686); |
687 | case v2048i32: |
688 | case v2048f32: return 2048; |
689 | case v1024i1: |
690 | case v1024i8: |
691 | case v1024i32: |
692 | case v1024f32: return 1024; |
693 | case v512i1: |
694 | case v512i8: |
695 | case v512i16: |
696 | case v512i32: |
697 | case v512f16: |
698 | case v512f32: return 512; |
699 | case v256i1: |
700 | case v256i8: |
701 | case v256i16: |
702 | case v256f16: |
703 | case v256i32: |
704 | case v256i64: |
705 | case v256f32: |
706 | case v256f64: return 256; |
707 | case v128i1: |
708 | case v128i8: |
709 | case v128i16: |
710 | case v128i32: |
711 | case v128i64: |
712 | case v128f16: |
713 | case v128bf16: |
714 | case v128f32: |
715 | case v128f64: return 128; |
716 | case v64i1: |
717 | case v64i8: |
718 | case v64i16: |
719 | case v64i32: |
720 | case v64i64: |
721 | case v64f16: |
722 | case v64bf16: |
723 | case v64f32: |
724 | case v64f64: |
725 | case nxv64i1: |
726 | case nxv64i8: return 64; |
727 | case v32i1: |
728 | case v32i8: |
729 | case v32i16: |
730 | case v32i32: |
731 | case v32i64: |
732 | case v32f16: |
733 | case v32bf16: |
734 | case v32f32: |
735 | case v32f64: |
736 | case nxv32i1: |
737 | case nxv32i8: |
738 | case nxv32i16: |
739 | case nxv32i32: |
740 | case nxv32i64: |
741 | case nxv32f16: return 32; |
742 | case v16i1: |
743 | case v16i8: |
744 | case v16i16: |
745 | case v16i32: |
746 | case v16i64: |
747 | case v16f16: |
748 | case v16bf16: |
749 | case v16f32: |
750 | case v16f64: |
751 | case nxv16i1: |
752 | case nxv16i8: |
753 | case nxv16i16: |
754 | case nxv16i32: |
755 | case nxv16i64: |
756 | case nxv16f16: |
757 | case nxv16f32: return 16; |
758 | case v8i1: |
759 | case v8i8: |
760 | case v8i16: |
761 | case v8i32: |
762 | case v8i64: |
763 | case v8f16: |
764 | case v8bf16: |
765 | case v8f32: |
766 | case v8f64: |
767 | case nxv8i1: |
768 | case nxv8i8: |
769 | case nxv8i16: |
770 | case nxv8i32: |
771 | case nxv8i64: |
772 | case nxv8f16: |
773 | case nxv8bf16: |
774 | case nxv8f32: |
775 | case nxv8f64: return 8; |
776 | case v7i32: |
777 | case v7f32: return 7; |
778 | case v6i32: |
779 | case v6f32: return 6; |
780 | case v5i32: |
781 | case v5f32: return 5; |
782 | case v4i1: |
783 | case v4i8: |
784 | case v4i16: |
785 | case v4i32: |
786 | case v4i64: |
787 | case v4f16: |
788 | case v4bf16: |
789 | case v4f32: |
790 | case v4f64: |
791 | case nxv4i1: |
792 | case nxv4i8: |
793 | case nxv4i16: |
794 | case nxv4i32: |
795 | case nxv4i64: |
796 | case nxv4f16: |
797 | case nxv4bf16: |
798 | case nxv4f32: |
799 | case nxv4f64: return 4; |
800 | case v3i16: |
801 | case v3i32: |
802 | case v3i64: |
803 | case v3f16: |
804 | case v3bf16: |
805 | case v3f32: |
806 | case v3f64: return 3; |
807 | case v2i1: |
808 | case v2i8: |
809 | case v2i16: |
810 | case v2i32: |
811 | case v2i64: |
812 | case v2f16: |
813 | case v2bf16: |
814 | case v2f32: |
815 | case v2f64: |
816 | case nxv2i1: |
817 | case nxv2i8: |
818 | case nxv2i16: |
819 | case nxv2i32: |
820 | case nxv2i64: |
821 | case nxv2f16: |
822 | case nxv2bf16: |
823 | case nxv2f32: |
824 | case nxv2f64: return 2; |
825 | case v1i1: |
826 | case v1i8: |
827 | case v1i16: |
828 | case v1i32: |
829 | case v1i64: |
830 | case v1i128: |
831 | case v1f16: |
832 | case v1f32: |
833 | case v1f64: |
834 | case nxv1i1: |
835 | case nxv1i8: |
836 | case nxv1i16: |
837 | case nxv1i32: |
838 | case nxv1i64: |
839 | case nxv1f16: |
840 | case nxv1bf16: |
841 | case nxv1f32: |
842 | case nxv1f64: return 1; |
843 | } |
844 | } |
845 | |
846 | ElementCount getVectorElementCount() const { |
847 | return ElementCount::get(getVectorMinNumElements(), isScalableVector()); |
848 | } |
849 | |
850 | unsigned getVectorNumElements() const { |
851 | if (isScalableVector()) |
852 | llvm::reportInvalidSizeRequest( |
853 | "Possible incorrect use of MVT::getVectorNumElements() for " |
854 | "scalable vector. Scalable flag may be dropped, use " |
855 | "MVT::getVectorElementCount() instead"); |
856 | return getVectorMinNumElements(); |
857 | } |
858 | |
859 | /// Returns the size of the specified MVT in bits. |
860 | /// |
861 | /// If the value type is a scalable vector type, the scalable property will |
862 | /// be set and the runtime size will be a positive integer multiple of the |
863 | /// base size. |
864 | TypeSize getSizeInBits() const { |
865 | switch (SimpleTy) { |
866 | default: |
867 | llvm_unreachable("getSizeInBits called on extended MVT.")::llvm::llvm_unreachable_internal("getSizeInBits called on extended MVT." , "llvm/include/llvm/Support/MachineValueType.h", 867); |
868 | case Other: |
869 | llvm_unreachable("Value type is non-standard value, Other.")::llvm::llvm_unreachable_internal("Value type is non-standard value, Other." , "llvm/include/llvm/Support/MachineValueType.h", 869); |
870 | case iPTR: |
871 | llvm_unreachable("Value type size is target-dependent. Ask TLI.")::llvm::llvm_unreachable_internal("Value type size is target-dependent. Ask TLI." , "llvm/include/llvm/Support/MachineValueType.h", 871); |
872 | case iPTRAny: |
873 | case iAny: |
874 | case fAny: |
875 | case vAny: |
876 | case Any: |
877 | llvm_unreachable("Value type is overloaded.")::llvm::llvm_unreachable_internal("Value type is overloaded." , "llvm/include/llvm/Support/MachineValueType.h", 877); |
878 | case token: |
879 | llvm_unreachable("Token type is a sentinel that cannot be used "::llvm::llvm_unreachable_internal("Token type is a sentinel that cannot be used " "in codegen and has no size", "llvm/include/llvm/Support/MachineValueType.h" , 880) |
880 | "in codegen and has no size")::llvm::llvm_unreachable_internal("Token type is a sentinel that cannot be used " "in codegen and has no size", "llvm/include/llvm/Support/MachineValueType.h" , 880); |
881 | case Metadata: |
882 | llvm_unreachable("Value type is metadata.")::llvm::llvm_unreachable_internal("Value type is metadata.", "llvm/include/llvm/Support/MachineValueType.h" , 882); |
883 | case i1: |
884 | case v1i1: return TypeSize::Fixed(1); |
885 | case nxv1i1: return TypeSize::Scalable(1); |
886 | case v2i1: return TypeSize::Fixed(2); |
887 | case nxv2i1: return TypeSize::Scalable(2); |
888 | case v4i1: return TypeSize::Fixed(4); |
889 | case nxv4i1: return TypeSize::Scalable(4); |
890 | case i8 : |
891 | case v1i8: |
892 | case v8i1: return TypeSize::Fixed(8); |
893 | case nxv1i8: |
894 | case nxv8i1: return TypeSize::Scalable(8); |
895 | case i16 : |
896 | case f16: |
897 | case bf16: |
898 | case v16i1: |
899 | case v2i8: |
900 | case v1i16: |
901 | case v1f16: return TypeSize::Fixed(16); |
902 | case nxv16i1: |
903 | case nxv2i8: |
904 | case nxv1i16: |
905 | case nxv1bf16: |
906 | case nxv1f16: return TypeSize::Scalable(16); |
907 | case f32 : |
908 | case i32 : |
909 | case v32i1: |
910 | case v4i8: |
911 | case v2i16: |
912 | case v2f16: |
913 | case v2bf16: |
914 | case v1f32: |
915 | case v1i32: return TypeSize::Fixed(32); |
916 | case nxv32i1: |
917 | case nxv4i8: |
918 | case nxv2i16: |
919 | case nxv1i32: |
920 | case nxv2f16: |
921 | case nxv2bf16: |
922 | case nxv1f32: return TypeSize::Scalable(32); |
923 | case v3i16: |
924 | case v3f16: |
925 | case v3bf16: return TypeSize::Fixed(48); |
926 | case x86mmx: |
927 | case f64 : |
928 | case i64 : |
929 | case v64i1: |
930 | case v8i8: |
931 | case v4i16: |
932 | case v2i32: |
933 | case v1i64: |
934 | case v4f16: |
935 | case v4bf16: |
936 | case v2f32: |
937 | case v1f64: return TypeSize::Fixed(64); |
938 | case nxv64i1: |
939 | case nxv8i8: |
940 | case nxv4i16: |
941 | case nxv2i32: |
942 | case nxv1i64: |
943 | case nxv4f16: |
944 | case nxv4bf16: |
945 | case nxv2f32: |
946 | case nxv1f64: return TypeSize::Scalable(64); |
947 | case f80 : return TypeSize::Fixed(80); |
948 | case v3i32: |
949 | case v3f32: return TypeSize::Fixed(96); |
950 | case f128: |
951 | case ppcf128: |
952 | case i128: |
953 | case v128i1: |
954 | case v16i8: |
955 | case v8i16: |
956 | case v4i32: |
957 | case v2i64: |
958 | case v1i128: |
959 | case v8f16: |
960 | case v8bf16: |
961 | case v4f32: |
962 | case v2f64: return TypeSize::Fixed(128); |
963 | case nxv16i8: |
964 | case nxv8i16: |
965 | case nxv4i32: |
966 | case nxv2i64: |
967 | case nxv8f16: |
968 | case nxv8bf16: |
969 | case nxv4f32: |
970 | case nxv2f64: return TypeSize::Scalable(128); |
971 | case v5i32: |
972 | case v5f32: return TypeSize::Fixed(160); |
973 | case v6i32: |
974 | case v3i64: |
975 | case v6f32: |
976 | case v3f64: return TypeSize::Fixed(192); |
977 | case v7i32: |
978 | case v7f32: return TypeSize::Fixed(224); |
979 | case v256i1: |
980 | case v32i8: |
981 | case v16i16: |
982 | case v8i32: |
983 | case v4i64: |
984 | case v16f16: |
985 | case v16bf16: |
986 | case v8f32: |
987 | case v4f64: return TypeSize::Fixed(256); |
988 | case nxv32i8: |
989 | case nxv16i16: |
990 | case nxv8i32: |
991 | case nxv4i64: |
992 | case nxv16f16: |
993 | case nxv8f32: |
994 | case nxv4f64: return TypeSize::Scalable(256); |
995 | case i64x8: |
996 | case v512i1: |
997 | case v64i8: |
998 | case v32i16: |
999 | case v16i32: |
1000 | case v8i64: |
1001 | case v32f16: |
1002 | case v32bf16: |
1003 | case v16f32: |
1004 | case v8f64: return TypeSize::Fixed(512); |
1005 | case nxv64i8: |
1006 | case nxv32i16: |
1007 | case nxv16i32: |
1008 | case nxv8i64: |
1009 | case nxv32f16: |
1010 | case nxv16f32: |
1011 | case nxv8f64: return TypeSize::Scalable(512); |
1012 | case v1024i1: |
1013 | case v128i8: |
1014 | case v64i16: |
1015 | case v32i32: |
1016 | case v16i64: |
1017 | case v64f16: |
1018 | case v64bf16: |
1019 | case v32f32: |
1020 | case v16f64: return TypeSize::Fixed(1024); |
1021 | case nxv32i32: |
1022 | case nxv16i64: return TypeSize::Scalable(1024); |
1023 | case v256i8: |
1024 | case v128i16: |
1025 | case v64i32: |
1026 | case v32i64: |
1027 | case v128f16: |
1028 | case v128bf16: |
1029 | case v64f32: |
1030 | case v32f64: return TypeSize::Fixed(2048); |
1031 | case nxv32i64: return TypeSize::Scalable(2048); |
1032 | case v512i8: |
1033 | case v256i16: |
1034 | case v128i32: |
1035 | case v64i64: |
1036 | case v256f16: |
1037 | case v128f32: |
1038 | case v64f64: return TypeSize::Fixed(4096); |
1039 | case v1024i8: |
1040 | case v512i16: |
1041 | case v256i32: |
1042 | case v128i64: |
1043 | case v512f16: |
1044 | case v256f32: |
1045 | case x86amx: |
1046 | case v128f64: return TypeSize::Fixed(8192); |
1047 | case v512i32: |
1048 | case v256i64: |
1049 | case v512f32: |
1050 | case v256f64: return TypeSize::Fixed(16384); |
1051 | case v1024i32: |
1052 | case v1024f32: return TypeSize::Fixed(32768); |
1053 | case v2048i32: |
1054 | case v2048f32: return TypeSize::Fixed(65536); |
1055 | case funcref: |
1056 | case externref: return TypeSize::Fixed(0); // opaque type |
1057 | } |
1058 | } |
1059 | |
1060 | /// Return the size of the specified fixed width value type in bits. The |
1061 | /// function will assert if the type is scalable. |
1062 | uint64_t getFixedSizeInBits() const { |
1063 | return getSizeInBits().getFixedSize(); |
1064 | } |
1065 | |
1066 | uint64_t getScalarSizeInBits() const { |
1067 | return getScalarType().getSizeInBits().getFixedSize(); |
1068 | } |
1069 | |
1070 | /// Return the number of bytes overwritten by a store of the specified value |
1071 | /// type. |
1072 | /// |
1073 | /// If the value type is a scalable vector type, the scalable property will |
1074 | /// be set and the runtime size will be a positive integer multiple of the |
1075 | /// base size. |
1076 | TypeSize getStoreSize() const { |
1077 | TypeSize BaseSize = getSizeInBits(); |
1078 | return {(BaseSize.getKnownMinSize() + 7) / 8, BaseSize.isScalable()}; |
1079 | } |
1080 | |
1081 | /// Return the number of bits overwritten by a store of the specified value |
1082 | /// type. |
1083 | /// |
1084 | /// If the value type is a scalable vector type, the scalable property will |
1085 | /// be set and the runtime size will be a positive integer multiple of the |
1086 | /// base size. |
1087 | TypeSize getStoreSizeInBits() const { |
1088 | return getStoreSize() * 8; |
1089 | } |
1090 | |
1091 | /// Returns true if the number of bits for the type is a multiple of an |
1092 | /// 8-bit byte. |
1093 | bool isByteSized() const { return getSizeInBits().isKnownMultipleOf(8); } |
1094 | |
1095 | /// Return true if we know at compile time this has more bits than VT. |
1096 | bool knownBitsGT(MVT VT) const { |
1097 | return TypeSize::isKnownGT(getSizeInBits(), VT.getSizeInBits()); |
1098 | } |
1099 | |
1100 | /// Return true if we know at compile time this has more than or the same |
1101 | /// bits as VT. |
1102 | bool knownBitsGE(MVT VT) const { |
1103 | return TypeSize::isKnownGE(getSizeInBits(), VT.getSizeInBits()); |
1104 | } |
1105 | |
1106 | /// Return true if we know at compile time this has fewer bits than VT. |
1107 | bool knownBitsLT(MVT VT) const { |
1108 | return TypeSize::isKnownLT(getSizeInBits(), VT.getSizeInBits()); |
1109 | } |
1110 | |
1111 | /// Return true if we know at compile time this has fewer than or the same |
1112 | /// bits as VT. |
1113 | bool knownBitsLE(MVT VT) const { |
1114 | return TypeSize::isKnownLE(getSizeInBits(), VT.getSizeInBits()); |
1115 | } |
1116 | |
1117 | /// Return true if this has more bits than VT. |
1118 | bool bitsGT(MVT VT) const { |
1119 | assert(isScalableVector() == VT.isScalableVector() &&(static_cast <bool> (isScalableVector() == VT.isScalableVector () && "Comparison between scalable and fixed types") ? void (0) : __assert_fail ("isScalableVector() == VT.isScalableVector() && \"Comparison between scalable and fixed types\"" , "llvm/include/llvm/Support/MachineValueType.h", 1120, __extension__ __PRETTY_FUNCTION__)) |
1120 | "Comparison between scalable and fixed types")(static_cast <bool> (isScalableVector() == VT.isScalableVector () && "Comparison between scalable and fixed types") ? void (0) : __assert_fail ("isScalableVector() == VT.isScalableVector() && \"Comparison between scalable and fixed types\"" , "llvm/include/llvm/Support/MachineValueType.h", 1120, __extension__ __PRETTY_FUNCTION__)); |
1121 | return knownBitsGT(VT); |
1122 | } |
1123 | |
1124 | /// Return true if this has no less bits than VT. |
1125 | bool bitsGE(MVT VT) const { |
1126 | assert(isScalableVector() == VT.isScalableVector() &&(static_cast <bool> (isScalableVector() == VT.isScalableVector () && "Comparison between scalable and fixed types") ? void (0) : __assert_fail ("isScalableVector() == VT.isScalableVector() && \"Comparison between scalable and fixed types\"" , "llvm/include/llvm/Support/MachineValueType.h", 1127, __extension__ __PRETTY_FUNCTION__)) |
1127 | "Comparison between scalable and fixed types")(static_cast <bool> (isScalableVector() == VT.isScalableVector () && "Comparison between scalable and fixed types") ? void (0) : __assert_fail ("isScalableVector() == VT.isScalableVector() && \"Comparison between scalable and fixed types\"" , "llvm/include/llvm/Support/MachineValueType.h", 1127, __extension__ __PRETTY_FUNCTION__)); |
1128 | return knownBitsGE(VT); |
1129 | } |
1130 | |
1131 | /// Return true if this has less bits than VT. |
1132 | bool bitsLT(MVT VT) const { |
1133 | assert(isScalableVector() == VT.isScalableVector() &&(static_cast <bool> (isScalableVector() == VT.isScalableVector () && "Comparison between scalable and fixed types") ? void (0) : __assert_fail ("isScalableVector() == VT.isScalableVector() && \"Comparison between scalable and fixed types\"" , "llvm/include/llvm/Support/MachineValueType.h", 1134, __extension__ __PRETTY_FUNCTION__)) |
1134 | "Comparison between scalable and fixed types")(static_cast <bool> (isScalableVector() == VT.isScalableVector () && "Comparison between scalable and fixed types") ? void (0) : __assert_fail ("isScalableVector() == VT.isScalableVector() && \"Comparison between scalable and fixed types\"" , "llvm/include/llvm/Support/MachineValueType.h", 1134, __extension__ __PRETTY_FUNCTION__)); |
1135 | return knownBitsLT(VT); |
1136 | } |
1137 | |
1138 | /// Return true if this has no more bits than VT. |
1139 | bool bitsLE(MVT VT) const { |
1140 | assert(isScalableVector() == VT.isScalableVector() &&(static_cast <bool> (isScalableVector() == VT.isScalableVector () && "Comparison between scalable and fixed types") ? void (0) : __assert_fail ("isScalableVector() == VT.isScalableVector() && \"Comparison between scalable and fixed types\"" , "llvm/include/llvm/Support/MachineValueType.h", 1141, __extension__ __PRETTY_FUNCTION__)) |
1141 | "Comparison between scalable and fixed types")(static_cast <bool> (isScalableVector() == VT.isScalableVector () && "Comparison between scalable and fixed types") ? void (0) : __assert_fail ("isScalableVector() == VT.isScalableVector() && \"Comparison between scalable and fixed types\"" , "llvm/include/llvm/Support/MachineValueType.h", 1141, __extension__ __PRETTY_FUNCTION__)); |
1142 | return knownBitsLE(VT); |
1143 | } |
1144 | |
1145 | static MVT getFloatingPointVT(unsigned BitWidth) { |
1146 | switch (BitWidth) { |
1147 | default: |
1148 | llvm_unreachable("Bad bit width!")::llvm::llvm_unreachable_internal("Bad bit width!", "llvm/include/llvm/Support/MachineValueType.h" , 1148); |
1149 | case 16: |
1150 | return MVT::f16; |
1151 | case 32: |
1152 | return MVT::f32; |
1153 | case 64: |
1154 | return MVT::f64; |
1155 | case 80: |
1156 | return MVT::f80; |
1157 | case 128: |
1158 | return MVT::f128; |
1159 | } |
1160 | } |
1161 | |
1162 | static MVT getIntegerVT(unsigned BitWidth) { |
1163 | switch (BitWidth) { |
1164 | default: |
1165 | return (MVT::SimpleValueType)(MVT::INVALID_SIMPLE_VALUE_TYPE); |
1166 | case 1: |
1167 | return MVT::i1; |
1168 | case 8: |
1169 | return MVT::i8; |
1170 | case 16: |
1171 | return MVT::i16; |
1172 | case 32: |
1173 | return MVT::i32; |
1174 | case 64: |
1175 | return MVT::i64; |
1176 | case 128: |
1177 | return MVT::i128; |
1178 | } |
1179 | } |
1180 | |
1181 | static MVT getVectorVT(MVT VT, unsigned NumElements) { |
1182 | switch (VT.SimpleTy) { |
1183 | default: |
1184 | break; |
1185 | case MVT::i1: |
1186 | if (NumElements == 1) return MVT::v1i1; |
1187 | if (NumElements == 2) return MVT::v2i1; |
1188 | if (NumElements == 4) return MVT::v4i1; |
1189 | if (NumElements == 8) return MVT::v8i1; |
1190 | if (NumElements == 16) return MVT::v16i1; |
1191 | if (NumElements == 32) return MVT::v32i1; |
1192 | if (NumElements == 64) return MVT::v64i1; |
1193 | if (NumElements == 128) return MVT::v128i1; |
1194 | if (NumElements == 256) return MVT::v256i1; |
1195 | if (NumElements == 512) return MVT::v512i1; |
1196 | if (NumElements == 1024) return MVT::v1024i1; |
1197 | break; |
1198 | case MVT::i8: |
1199 | if (NumElements == 1) return MVT::v1i8; |
1200 | if (NumElements == 2) return MVT::v2i8; |
1201 | if (NumElements == 4) return MVT::v4i8; |
1202 | if (NumElements == 8) return MVT::v8i8; |
1203 | if (NumElements == 16) return MVT::v16i8; |
1204 | if (NumElements == 32) return MVT::v32i8; |
1205 | if (NumElements == 64) return MVT::v64i8; |
1206 | if (NumElements == 128) return MVT::v128i8; |
1207 | if (NumElements == 256) return MVT::v256i8; |
1208 | if (NumElements == 512) return MVT::v512i8; |
1209 | if (NumElements == 1024) return MVT::v1024i8; |
1210 | break; |
1211 | case MVT::i16: |
1212 | if (NumElements == 1) return MVT::v1i16; |
1213 | if (NumElements == 2) return MVT::v2i16; |
1214 | if (NumElements == 3) return MVT::v3i16; |
1215 | if (NumElements == 4) return MVT::v4i16; |
1216 | if (NumElements == 8) return MVT::v8i16; |
1217 | if (NumElements == 16) return MVT::v16i16; |
1218 | if (NumElements == 32) return MVT::v32i16; |
1219 | if (NumElements == 64) return MVT::v64i16; |
1220 | if (NumElements == 128) return MVT::v128i16; |
1221 | if (NumElements == 256) return MVT::v256i16; |
1222 | if (NumElements == 512) return MVT::v512i16; |
1223 | break; |
1224 | case MVT::i32: |
1225 | if (NumElements == 1) return MVT::v1i32; |
1226 | if (NumElements == 2) return MVT::v2i32; |
1227 | if (NumElements == 3) return MVT::v3i32; |
1228 | if (NumElements == 4) return MVT::v4i32; |
1229 | if (NumElements == 5) return MVT::v5i32; |
1230 | if (NumElements == 6) return MVT::v6i32; |
1231 | if (NumElements == 7) return MVT::v7i32; |
1232 | if (NumElements == 8) return MVT::v8i32; |
1233 | if (NumElements == 16) return MVT::v16i32; |
1234 | if (NumElements == 32) return MVT::v32i32; |
1235 | if (NumElements == 64) return MVT::v64i32; |
1236 | if (NumElements == 128) return MVT::v128i32; |
1237 | if (NumElements == 256) return MVT::v256i32; |
1238 | if (NumElements == 512) return MVT::v512i32; |
1239 | if (NumElements == 1024) return MVT::v1024i32; |
1240 | if (NumElements == 2048) return MVT::v2048i32; |
1241 | break; |
1242 | case MVT::i64: |
1243 | if (NumElements == 1) return MVT::v1i64; |
1244 | if (NumElements == 2) return MVT::v2i64; |
1245 | if (NumElements == 3) return MVT::v3i64; |
1246 | if (NumElements == 4) return MVT::v4i64; |
1247 | if (NumElements == 8) return MVT::v8i64; |
1248 | if (NumElements == 16) return MVT::v16i64; |
1249 | if (NumElements == 32) return MVT::v32i64; |
1250 | if (NumElements == 64) return MVT::v64i64; |
1251 | if (NumElements == 128) return MVT::v128i64; |
1252 | if (NumElements == 256) return MVT::v256i64; |
1253 | break; |
1254 | case MVT::i128: |
1255 | if (NumElements == 1) return MVT::v1i128; |
1256 | break; |
1257 | case MVT::f16: |
1258 | if (NumElements == 1) return MVT::v1f16; |
1259 | if (NumElements == 2) return MVT::v2f16; |
1260 | if (NumElements == 3) return MVT::v3f16; |
1261 | if (NumElements == 4) return MVT::v4f16; |
1262 | if (NumElements == 8) return MVT::v8f16; |
1263 | if (NumElements == 16) return MVT::v16f16; |
1264 | if (NumElements == 32) return MVT::v32f16; |
1265 | if (NumElements == 64) return MVT::v64f16; |
1266 | if (NumElements == 128) return MVT::v128f16; |
1267 | if (NumElements == 256) return MVT::v256f16; |
1268 | if (NumElements == 512) return MVT::v512f16; |
1269 | break; |
1270 | case MVT::bf16: |
1271 | if (NumElements == 2) return MVT::v2bf16; |
1272 | if (NumElements == 3) return MVT::v3bf16; |
1273 | if (NumElements == 4) return MVT::v4bf16; |
1274 | if (NumElements == 8) return MVT::v8bf16; |
1275 | if (NumElements == 16) return MVT::v16bf16; |
1276 | if (NumElements == 32) return MVT::v32bf16; |
1277 | if (NumElements == 64) return MVT::v64bf16; |
1278 | if (NumElements == 128) return MVT::v128bf16; |
1279 | break; |
1280 | case MVT::f32: |
1281 | if (NumElements == 1) return MVT::v1f32; |
1282 | if (NumElements == 2) return MVT::v2f32; |
1283 | if (NumElements == 3) return MVT::v3f32; |
1284 | if (NumElements == 4) return MVT::v4f32; |
1285 | if (NumElements == 5) return MVT::v5f32; |
1286 | if (NumElements == 6) return MVT::v6f32; |
1287 | if (NumElements == 7) return MVT::v7f32; |
1288 | if (NumElements == 8) return MVT::v8f32; |
1289 | if (NumElements == 16) return MVT::v16f32; |
1290 | if (NumElements == 32) return MVT::v32f32; |
1291 | if (NumElements == 64) return MVT::v64f32; |
1292 | if (NumElements == 128) return MVT::v128f32; |
1293 | if (NumElements == 256) return MVT::v256f32; |
1294 | if (NumElements == 512) return MVT::v512f32; |
1295 | if (NumElements == 1024) return MVT::v1024f32; |
1296 | if (NumElements == 2048) return MVT::v2048f32; |
1297 | break; |
1298 | case MVT::f64: |
1299 | if (NumElements == 1) return MVT::v1f64; |
1300 | if (NumElements == 2) return MVT::v2f64; |
1301 | if (NumElements == 3) return MVT::v3f64; |
1302 | if (NumElements == 4) return MVT::v4f64; |
1303 | if (NumElements == 8) return MVT::v8f64; |
1304 | if (NumElements == 16) return MVT::v16f64; |
1305 | if (NumElements == 32) return MVT::v32f64; |
1306 | if (NumElements == 64) return MVT::v64f64; |
1307 | if (NumElements == 128) return MVT::v128f64; |
1308 | if (NumElements == 256) return MVT::v256f64; |
1309 | break; |
1310 | } |
1311 | return (MVT::SimpleValueType)(MVT::INVALID_SIMPLE_VALUE_TYPE); |
1312 | } |
1313 | |
1314 | static MVT getScalableVectorVT(MVT VT, unsigned NumElements) { |
1315 | switch(VT.SimpleTy) { |
1316 | default: |
1317 | break; |
1318 | case MVT::i1: |
1319 | if (NumElements == 1) return MVT::nxv1i1; |
1320 | if (NumElements == 2) return MVT::nxv2i1; |
1321 | if (NumElements == 4) return MVT::nxv4i1; |
1322 | if (NumElements == 8) return MVT::nxv8i1; |
1323 | if (NumElements == 16) return MVT::nxv16i1; |
1324 | if (NumElements == 32) return MVT::nxv32i1; |
1325 | if (NumElements == 64) return MVT::nxv64i1; |
1326 | break; |
1327 | case MVT::i8: |
1328 | if (NumElements == 1) return MVT::nxv1i8; |
1329 | if (NumElements == 2) return MVT::nxv2i8; |
1330 | if (NumElements == 4) return MVT::nxv4i8; |
1331 | if (NumElements == 8) return MVT::nxv8i8; |
1332 | if (NumElements == 16) return MVT::nxv16i8; |
1333 | if (NumElements == 32) return MVT::nxv32i8; |
1334 | if (NumElements == 64) return MVT::nxv64i8; |
1335 | break; |
1336 | case MVT::i16: |
1337 | if (NumElements == 1) return MVT::nxv1i16; |
1338 | if (NumElements == 2) return MVT::nxv2i16; |
1339 | if (NumElements == 4) return MVT::nxv4i16; |
1340 | if (NumElements == 8) return MVT::nxv8i16; |
1341 | if (NumElements == 16) return MVT::nxv16i16; |
1342 | if (NumElements == 32) return MVT::nxv32i16; |
1343 | break; |
1344 | case MVT::i32: |
1345 | if (NumElements == 1) return MVT::nxv1i32; |
1346 | if (NumElements == 2) return MVT::nxv2i32; |
1347 | if (NumElements == 4) return MVT::nxv4i32; |
1348 | if (NumElements == 8) return MVT::nxv8i32; |
1349 | if (NumElements == 16) return MVT::nxv16i32; |
1350 | if (NumElements == 32) return MVT::nxv32i32; |
1351 | break; |
1352 | case MVT::i64: |
1353 | if (NumElements == 1) return MVT::nxv1i64; |
1354 | if (NumElements == 2) return MVT::nxv2i64; |
1355 | if (NumElements == 4) return MVT::nxv4i64; |
1356 | if (NumElements == 8) return MVT::nxv8i64; |
1357 | if (NumElements == 16) return MVT::nxv16i64; |
1358 | if (NumElements == 32) return MVT::nxv32i64; |
1359 | break; |
1360 | case MVT::f16: |
1361 | if (NumElements == 1) return MVT::nxv1f16; |
1362 | if (NumElements == 2) return MVT::nxv2f16; |
1363 | if (NumElements == 4) return MVT::nxv4f16; |
1364 | if (NumElements == 8) return MVT::nxv8f16; |
1365 | if (NumElements == 16) return MVT::nxv16f16; |
1366 | if (NumElements == 32) return MVT::nxv32f16; |
1367 | break; |
1368 | case MVT::bf16: |
1369 | if (NumElements == 1) return MVT::nxv1bf16; |
1370 | if (NumElements == 2) return MVT::nxv2bf16; |
1371 | if (NumElements == 4) return MVT::nxv4bf16; |
1372 | if (NumElements == 8) return MVT::nxv8bf16; |
1373 | break; |
1374 | case MVT::f32: |
1375 | if (NumElements == 1) return MVT::nxv1f32; |
1376 | if (NumElements == 2) return MVT::nxv2f32; |
1377 | if (NumElements == 4) return MVT::nxv4f32; |
1378 | if (NumElements == 8) return MVT::nxv8f32; |
1379 | if (NumElements == 16) return MVT::nxv16f32; |
1380 | break; |
1381 | case MVT::f64: |
1382 | if (NumElements == 1) return MVT::nxv1f64; |
1383 | if (NumElements == 2) return MVT::nxv2f64; |
1384 | if (NumElements == 4) return MVT::nxv4f64; |
1385 | if (NumElements == 8) return MVT::nxv8f64; |
1386 | break; |
1387 | } |
1388 | return (MVT::SimpleValueType)(MVT::INVALID_SIMPLE_VALUE_TYPE); |
1389 | } |
1390 | |
1391 | static MVT getVectorVT(MVT VT, unsigned NumElements, bool IsScalable) { |
1392 | if (IsScalable) |
1393 | return getScalableVectorVT(VT, NumElements); |
1394 | return getVectorVT(VT, NumElements); |
1395 | } |
1396 | |
1397 | static MVT getVectorVT(MVT VT, ElementCount EC) { |
1398 | if (EC.isScalable()) |
1399 | return getScalableVectorVT(VT, EC.getKnownMinValue()); |
1400 | return getVectorVT(VT, EC.getKnownMinValue()); |
1401 | } |
1402 | |
1403 | /// Return the value type corresponding to the specified type. This returns |
1404 | /// all pointers as iPTR. If HandleUnknown is true, unknown types are |
1405 | /// returned as Other, otherwise they are invalid. |
1406 | static MVT getVT(Type *Ty, bool HandleUnknown = false); |
1407 | |
1408 | public: |
1409 | /// SimpleValueType Iteration |
1410 | /// @{ |
1411 | static auto all_valuetypes() { |
1412 | return enum_seq_inclusive(MVT::FIRST_VALUETYPE, MVT::LAST_VALUETYPE, |
1413 | force_iteration_on_noniterable_enum); |
1414 | } |
1415 | |
1416 | static auto integer_valuetypes() { |
1417 | return enum_seq_inclusive(MVT::FIRST_INTEGER_VALUETYPE, |
1418 | MVT::LAST_INTEGER_VALUETYPE, |
1419 | force_iteration_on_noniterable_enum); |
1420 | } |
1421 | |
1422 | static auto fp_valuetypes() { |
1423 | return enum_seq_inclusive(MVT::FIRST_FP_VALUETYPE, MVT::LAST_FP_VALUETYPE, |
1424 | force_iteration_on_noniterable_enum); |
1425 | } |
1426 | |
1427 | static auto vector_valuetypes() { |
1428 | return enum_seq_inclusive(MVT::FIRST_VECTOR_VALUETYPE, |
1429 | MVT::LAST_VECTOR_VALUETYPE, |
1430 | force_iteration_on_noniterable_enum); |
1431 | } |
1432 | |
1433 | static auto fixedlen_vector_valuetypes() { |
1434 | return enum_seq_inclusive(MVT::FIRST_FIXEDLEN_VECTOR_VALUETYPE, |
1435 | MVT::LAST_FIXEDLEN_VECTOR_VALUETYPE, |
1436 | force_iteration_on_noniterable_enum); |
1437 | } |
1438 | |
1439 | static auto scalable_vector_valuetypes() { |
1440 | return enum_seq_inclusive(MVT::FIRST_SCALABLE_VECTOR_VALUETYPE, |
1441 | MVT::LAST_SCALABLE_VECTOR_VALUETYPE, |
1442 | force_iteration_on_noniterable_enum); |
1443 | } |
1444 | |
1445 | static auto integer_fixedlen_vector_valuetypes() { |
1446 | return enum_seq_inclusive(MVT::FIRST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE, |
1447 | MVT::LAST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE, |
1448 | force_iteration_on_noniterable_enum); |
1449 | } |
1450 | |
1451 | static auto fp_fixedlen_vector_valuetypes() { |
1452 | return enum_seq_inclusive(MVT::FIRST_FP_FIXEDLEN_VECTOR_VALUETYPE, |
1453 | MVT::LAST_FP_FIXEDLEN_VECTOR_VALUETYPE, |
1454 | force_iteration_on_noniterable_enum); |
1455 | } |
1456 | |
1457 | static auto integer_scalable_vector_valuetypes() { |
1458 | return enum_seq_inclusive(MVT::FIRST_INTEGER_SCALABLE_VECTOR_VALUETYPE, |
1459 | MVT::LAST_INTEGER_SCALABLE_VECTOR_VALUETYPE, |
1460 | force_iteration_on_noniterable_enum); |
1461 | } |
1462 | |
1463 | static auto fp_scalable_vector_valuetypes() { |
1464 | return enum_seq_inclusive(MVT::FIRST_FP_SCALABLE_VECTOR_VALUETYPE, |
1465 | MVT::LAST_FP_SCALABLE_VECTOR_VALUETYPE, |
1466 | force_iteration_on_noniterable_enum); |
1467 | } |
1468 | /// @} |
1469 | }; |
1470 | |
1471 | } // end namespace llvm |
1472 | |
1473 | #endif // LLVM_SUPPORT_MACHINEVALUETYPE_H |