File: | build/source/llvm/lib/Target/X86/X86TargetTransformInfo.cpp |
Warning: | line 4329, column 15 Division by zero |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
1 | //===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===// | |||
2 | // | |||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | |||
4 | // See https://llvm.org/LICENSE.txt for license information. | |||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | |||
6 | // | |||
7 | //===----------------------------------------------------------------------===// | |||
8 | /// \file | |||
9 | /// This file implements a TargetTransformInfo analysis pass specific to the | |||
10 | /// X86 target machine. It uses the target's detailed information to provide | |||
11 | /// more precise answers to certain TTI queries, while letting the target | |||
12 | /// independent and default TTI implementations handle the rest. | |||
13 | /// | |||
14 | //===----------------------------------------------------------------------===// | |||
15 | /// About Cost Model numbers used below it's necessary to say the following: | |||
16 | /// the numbers correspond to some "generic" X86 CPU instead of usage of a | |||
17 | /// specific CPU model. Usually the numbers correspond to the CPU where the | |||
18 | /// feature first appeared. For example, if we do Subtarget.hasSSE42() in | |||
19 | /// the lookups below the cost is based on Nehalem as that was the first CPU | |||
20 | /// to support that feature level and thus has most likely the worst case cost, | |||
21 | /// although we may discard an outlying worst cost from one CPU (e.g. Atom). | |||
22 | /// | |||
23 | /// Some examples of other technologies/CPUs: | |||
24 | /// SSE 3 - Pentium4 / Athlon64 | |||
25 | /// SSE 4.1 - Penryn | |||
26 | /// SSE 4.2 - Nehalem / Silvermont | |||
27 | /// AVX - Sandy Bridge / Jaguar / Bulldozer | |||
28 | /// AVX2 - Haswell / Ryzen | |||
29 | /// AVX-512 - Xeon Phi / Skylake | |||
30 | /// | |||
31 | /// And some examples of instruction target dependent costs (latency) | |||
32 | /// divss sqrtss rsqrtss | |||
33 | /// AMD K7 11-16 19 3 | |||
34 | /// Piledriver 9-24 13-15 5 | |||
35 | /// Jaguar 14 16 2 | |||
36 | /// Pentium II,III 18 30 2 | |||
37 | /// Nehalem 7-14 7-18 3 | |||
38 | /// Haswell 10-13 11 5 | |||
39 | /// | |||
40 | /// Interpreting the 4 TargetCostKind types: | |||
41 | /// TCK_RecipThroughput and TCK_Latency should try to match the worst case | |||
42 | /// values reported by the CPU scheduler models (and llvm-mca). | |||
43 | /// TCK_CodeSize should match the instruction count (e.g. divss = 1), NOT the | |||
44 | /// actual encoding size of the instruction. | |||
45 | /// TCK_SizeAndLatency should match the worst case micro-op counts reported by | |||
46 | /// by the CPU scheduler models (and llvm-mca), to ensure that they are | |||
47 | /// compatible with the MicroOpBufferSize and LoopMicroOpBufferSize values which are | |||
48 | /// often used as the cost thresholds where TCK_SizeAndLatency is requested. | |||
49 | //===----------------------------------------------------------------------===// | |||
50 | ||||
51 | #include "X86TargetTransformInfo.h" | |||
52 | #include "llvm/Analysis/TargetTransformInfo.h" | |||
53 | #include "llvm/CodeGen/BasicTTIImpl.h" | |||
54 | #include "llvm/CodeGen/CostTable.h" | |||
55 | #include "llvm/CodeGen/TargetLowering.h" | |||
56 | #include "llvm/IR/InstIterator.h" | |||
57 | #include "llvm/IR/IntrinsicInst.h" | |||
58 | #include "llvm/Support/Debug.h" | |||
59 | #include <optional> | |||
60 | ||||
61 | using namespace llvm; | |||
62 | ||||
#define DEBUG_TYPE "x86tti"
64 | ||||
65 | //===----------------------------------------------------------------------===// | |||
66 | // | |||
67 | // X86 cost model. | |||
68 | // | |||
69 | //===----------------------------------------------------------------------===// | |||
70 | ||||
71 | // Helper struct to store/access costs for each cost kind. | |||
72 | // TODO: Move this to allow other targets to use it? | |||
73 | struct CostKindCosts { | |||
74 | unsigned RecipThroughputCost = ~0U; | |||
75 | unsigned LatencyCost = ~0U; | |||
76 | unsigned CodeSizeCost = ~0U; | |||
77 | unsigned SizeAndLatencyCost = ~0U; | |||
78 | ||||
79 | std::optional<unsigned> | |||
80 | operator[](TargetTransformInfo::TargetCostKind Kind) const { | |||
81 | unsigned Cost = ~0U; | |||
82 | switch (Kind) { | |||
83 | case TargetTransformInfo::TCK_RecipThroughput: | |||
84 | Cost = RecipThroughputCost; | |||
85 | break; | |||
86 | case TargetTransformInfo::TCK_Latency: | |||
87 | Cost = LatencyCost; | |||
88 | break; | |||
89 | case TargetTransformInfo::TCK_CodeSize: | |||
90 | Cost = CodeSizeCost; | |||
91 | break; | |||
92 | case TargetTransformInfo::TCK_SizeAndLatency: | |||
93 | Cost = SizeAndLatencyCost; | |||
94 | break; | |||
95 | } | |||
96 | if (Cost == ~0U) | |||
97 | return std::nullopt; | |||
98 | return Cost; | |||
99 | } | |||
100 | }; | |||
101 | using CostKindTblEntry = CostTblEntryT<CostKindCosts>; | |||
102 | ||||
103 | TargetTransformInfo::PopcntSupportKind | |||
104 | X86TTIImpl::getPopcntSupport(unsigned TyWidth) { | |||
105 | assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2")(static_cast <bool> (isPowerOf2_32(TyWidth) && "Ty width must be power of 2" ) ? void (0) : __assert_fail ("isPowerOf2_32(TyWidth) && \"Ty width must be power of 2\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 105, __extension__ __PRETTY_FUNCTION__)); | |||
106 | // TODO: Currently the __builtin_popcount() implementation using SSE3 | |||
107 | // instructions is inefficient. Once the problem is fixed, we should | |||
108 | // call ST->hasSSE3() instead of ST->hasPOPCNT(). | |||
109 | return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software; | |||
110 | } | |||
111 | ||||
112 | std::optional<unsigned> X86TTIImpl::getCacheSize( | |||
113 | TargetTransformInfo::CacheLevel Level) const { | |||
114 | switch (Level) { | |||
115 | case TargetTransformInfo::CacheLevel::L1D: | |||
116 | // - Penryn | |||
117 | // - Nehalem | |||
118 | // - Westmere | |||
119 | // - Sandy Bridge | |||
120 | // - Ivy Bridge | |||
121 | // - Haswell | |||
122 | // - Broadwell | |||
123 | // - Skylake | |||
124 | // - Kabylake | |||
125 | return 32 * 1024; // 32 KByte | |||
126 | case TargetTransformInfo::CacheLevel::L2D: | |||
127 | // - Penryn | |||
128 | // - Nehalem | |||
129 | // - Westmere | |||
130 | // - Sandy Bridge | |||
131 | // - Ivy Bridge | |||
132 | // - Haswell | |||
133 | // - Broadwell | |||
134 | // - Skylake | |||
135 | // - Kabylake | |||
136 | return 256 * 1024; // 256 KByte | |||
137 | } | |||
138 | ||||
139 | llvm_unreachable("Unknown TargetTransformInfo::CacheLevel")::llvm::llvm_unreachable_internal("Unknown TargetTransformInfo::CacheLevel" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 139); | |||
140 | } | |||
141 | ||||
142 | std::optional<unsigned> X86TTIImpl::getCacheAssociativity( | |||
143 | TargetTransformInfo::CacheLevel Level) const { | |||
144 | // - Penryn | |||
145 | // - Nehalem | |||
146 | // - Westmere | |||
147 | // - Sandy Bridge | |||
148 | // - Ivy Bridge | |||
149 | // - Haswell | |||
150 | // - Broadwell | |||
151 | // - Skylake | |||
152 | // - Kabylake | |||
153 | switch (Level) { | |||
154 | case TargetTransformInfo::CacheLevel::L1D: | |||
155 | [[fallthrough]]; | |||
156 | case TargetTransformInfo::CacheLevel::L2D: | |||
157 | return 8; | |||
158 | } | |||
159 | ||||
160 | llvm_unreachable("Unknown TargetTransformInfo::CacheLevel")::llvm::llvm_unreachable_internal("Unknown TargetTransformInfo::CacheLevel" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 160); | |||
161 | } | |||
162 | ||||
163 | unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const { | |||
164 | bool Vector = (ClassID == 1); | |||
165 | if (Vector && !ST->hasSSE1()) | |||
166 | return 0; | |||
167 | ||||
168 | if (ST->is64Bit()) { | |||
169 | if (Vector && ST->hasAVX512()) | |||
170 | return 32; | |||
171 | return 16; | |||
172 | } | |||
173 | return 8; | |||
174 | } | |||
175 | ||||
176 | TypeSize | |||
177 | X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { | |||
178 | unsigned PreferVectorWidth = ST->getPreferVectorWidth(); | |||
179 | switch (K) { | |||
180 | case TargetTransformInfo::RGK_Scalar: | |||
181 | return TypeSize::getFixed(ST->is64Bit() ? 64 : 32); | |||
182 | case TargetTransformInfo::RGK_FixedWidthVector: | |||
183 | if (ST->hasAVX512() && PreferVectorWidth >= 512) | |||
184 | return TypeSize::getFixed(512); | |||
185 | if (ST->hasAVX() && PreferVectorWidth >= 256) | |||
186 | return TypeSize::getFixed(256); | |||
187 | if (ST->hasSSE1() && PreferVectorWidth >= 128) | |||
188 | return TypeSize::getFixed(128); | |||
189 | return TypeSize::getFixed(0); | |||
190 | case TargetTransformInfo::RGK_ScalableVector: | |||
191 | return TypeSize::getScalable(0); | |||
192 | } | |||
193 | ||||
194 | llvm_unreachable("Unsupported register kind")::llvm::llvm_unreachable_internal("Unsupported register kind" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 194); | |||
195 | } | |||
196 | ||||
197 | unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const { | |||
198 | return getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) | |||
199 | .getFixedSize(); | |||
200 | } | |||
201 | ||||
202 | unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) { | |||
203 | // If the loop will not be vectorized, don't interleave the loop. | |||
204 | // Let regular unroll to unroll the loop, which saves the overflow | |||
205 | // check and memory check cost. | |||
206 | if (VF == 1) | |||
207 | return 1; | |||
208 | ||||
209 | if (ST->isAtom()) | |||
210 | return 1; | |||
211 | ||||
212 | // Sandybridge and Haswell have multiple execution ports and pipelined | |||
213 | // vector units. | |||
214 | if (ST->hasAVX()) | |||
215 | return 4; | |||
216 | ||||
217 | return 2; | |||
218 | } | |||
219 | ||||
220 | InstructionCost X86TTIImpl::getArithmeticInstrCost( | |||
221 | unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, | |||
222 | TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, | |||
223 | ArrayRef<const Value *> Args, | |||
224 | const Instruction *CxtI) { | |||
225 | ||||
226 | // vXi8 multiplications are always promoted to vXi16. | |||
227 | if (Opcode == Instruction::Mul && Ty->isVectorTy() && | |||
228 | Ty->getScalarSizeInBits() == 8) { | |||
229 | Type *WideVecTy = | |||
230 | VectorType::getExtendedElementVectorType(cast<VectorType>(Ty)); | |||
231 | return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty, | |||
232 | TargetTransformInfo::CastContextHint::None, | |||
233 | CostKind) + | |||
234 | getCastInstrCost(Instruction::Trunc, Ty, WideVecTy, | |||
235 | TargetTransformInfo::CastContextHint::None, | |||
236 | CostKind) + | |||
237 | getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info); | |||
238 | } | |||
239 | ||||
240 | // Legalize the type. | |||
241 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); | |||
242 | ||||
243 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | |||
244 | assert(ISD && "Invalid opcode")(static_cast <bool> (ISD && "Invalid opcode") ? void (0) : __assert_fail ("ISD && \"Invalid opcode\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 244, __extension__ __PRETTY_FUNCTION__)); | |||
245 | ||||
246 | if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() && | |||
247 | LT.second.getScalarType() == MVT::i32) { | |||
248 | // Check if the operands can be represented as a smaller datatype. | |||
249 | bool Op1Signed = false, Op2Signed = false; | |||
250 | unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed); | |||
251 | unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed); | |||
252 | unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize); | |||
253 | bool SignedMode = Op1Signed || Op2Signed; | |||
254 | ||||
255 | // If both are representable as i15 and at least one is constant, | |||
256 | // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we | |||
257 | // can treat this as PMADDWD which has the same costs as a vXi16 multiply. | |||
258 | if (OpMinSize <= 15 && !ST->isPMADDWDSlow()) { | |||
259 | bool Op1Constant = | |||
260 | isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]); | |||
261 | bool Op2Constant = | |||
262 | isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]); | |||
263 | bool Op1Sext = isa<SExtInst>(Args[0]) && | |||
264 | (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41())); | |||
265 | bool Op2Sext = isa<SExtInst>(Args[1]) && | |||
266 | (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41())); | |||
267 | ||||
268 | bool IsZeroExtended = !Op1Signed || !Op2Signed; | |||
269 | bool IsConstant = Op1Constant || Op2Constant; | |||
270 | bool IsSext = Op1Sext || Op2Sext; | |||
271 | if (IsConstant || IsZeroExtended || IsSext) | |||
272 | LT.second = | |||
273 | MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements()); | |||
274 | } | |||
275 | ||||
276 | // Check if the vXi32 operands can be shrunk into a smaller datatype. | |||
277 | // This should match the codegen from reduceVMULWidth. | |||
278 | // TODO: Make this generic (!ST->SSE41 || ST->isPMULLDSlow()). | |||
279 | if (ST->useSLMArithCosts() && LT.second == MVT::v4i32) { | |||
280 | if (OpMinSize <= 7) | |||
281 | return LT.first * 3; // pmullw/sext | |||
282 | if (!SignedMode && OpMinSize <= 8) | |||
283 | return LT.first * 3; // pmullw/zext | |||
284 | if (OpMinSize <= 15) | |||
285 | return LT.first * 5; // pmullw/pmulhw/pshuf | |||
286 | if (!SignedMode && OpMinSize <= 16) | |||
287 | return LT.first * 5; // pmullw/pmulhw/pshuf | |||
288 | } | |||
289 | } | |||
290 | ||||
291 | // Vector multiply by pow2 will be simplified to shifts. | |||
292 | // Vector multiply by -pow2 will be simplified to shifts/negates. | |||
293 | if (ISD == ISD::MUL && Op2Info.isConstant() && | |||
294 | (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2())) { | |||
295 | InstructionCost Cost = | |||
296 | getArithmeticInstrCost(Instruction::Shl, Ty, CostKind, | |||
297 | Op1Info.getNoProps(), Op2Info.getNoProps()); | |||
298 | if (Op2Info.isNegatedPowerOf2()) | |||
299 | Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind); | |||
300 | return Cost; | |||
301 | } | |||
302 | ||||
303 | // On X86, vector signed division by constants power-of-two are | |||
304 | // normally expanded to the sequence SRA + SRL + ADD + SRA. | |||
305 | // The OperandValue properties may not be the same as that of the previous | |||
306 | // operation; conservatively assume OP_None. | |||
307 | if ((ISD == ISD::SDIV || ISD == ISD::SREM) && | |||
308 | Op2Info.isConstant() && Op2Info.isPowerOf2()) { | |||
309 | InstructionCost Cost = | |||
310 | 2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, | |||
311 | Op1Info.getNoProps(), Op2Info.getNoProps()); | |||
312 | Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, | |||
313 | Op1Info.getNoProps(), Op2Info.getNoProps()); | |||
314 | Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind, | |||
315 | Op1Info.getNoProps(), Op2Info.getNoProps()); | |||
316 | ||||
317 | if (ISD == ISD::SREM) { | |||
318 | // For SREM: (X % C) is the equivalent of (X - (X/C)*C) | |||
319 | Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(), | |||
320 | Op2Info.getNoProps()); | |||
321 | Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info.getNoProps(), | |||
322 | Op2Info.getNoProps()); | |||
323 | } | |||
324 | ||||
325 | return Cost; | |||
326 | } | |||
327 | ||||
328 | // Vector unsigned division/remainder will be simplified to shifts/masks. | |||
329 | if ((ISD == ISD::UDIV || ISD == ISD::UREM) && | |||
330 | Op2Info.isConstant() && Op2Info.isPowerOf2()) { | |||
331 | if (ISD == ISD::UDIV) | |||
332 | return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, | |||
333 | Op1Info.getNoProps(), Op2Info.getNoProps()); | |||
334 | // UREM | |||
335 | return getArithmeticInstrCost(Instruction::And, Ty, CostKind, | |||
336 | Op1Info.getNoProps(), Op2Info.getNoProps()); | |||
337 | } | |||
338 | ||||
339 | static const CostKindTblEntry AVX512BWUniformConstCostTable[] = { | |||
340 | { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand. | |||
341 | { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand. | |||
342 | { ISD::SRA, MVT::v16i8, { 1, 8, 4, 5 } }, // psrlw, pand, pxor, psubb. | |||
343 | { ISD::SHL, MVT::v32i8, { 1, 8, 2, 3 } }, // psllw + pand. | |||
344 | { ISD::SRL, MVT::v32i8, { 1, 8, 2, 3 } }, // psrlw + pand. | |||
345 | { ISD::SRA, MVT::v32i8, { 1, 9, 4, 5 } }, // psrlw, pand, pxor, psubb. | |||
346 | { ISD::SHL, MVT::v64i8, { 1, 8, 2, 3 } }, // psllw + pand. | |||
347 | { ISD::SRL, MVT::v64i8, { 1, 8, 2, 3 } }, // psrlw + pand. | |||
348 | { ISD::SRA, MVT::v64i8, { 1, 9, 4, 6 } }, // psrlw, pand, pxor, psubb. | |||
349 | ||||
350 | { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // psllw | |||
351 | { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw | |||
352 | { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw | |||
353 | { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // psllw | |||
354 | { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw | |||
355 | { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw | |||
356 | }; | |||
357 | ||||
358 | if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasBWI()) | |||
359 | if (const auto *Entry = | |||
360 | CostTableLookup(AVX512BWUniformConstCostTable, ISD, LT.second)) | |||
361 | if (auto KindCost = Entry->Cost[CostKind]) | |||
362 | return LT.first * *KindCost; | |||
363 | ||||
364 | static const CostKindTblEntry AVX512UniformConstCostTable[] = { | |||
365 | { ISD::SHL, MVT::v64i8, { 2, 12, 5, 6 } }, // psllw + pand. | |||
366 | { ISD::SRL, MVT::v64i8, { 2, 12, 5, 6 } }, // psrlw + pand. | |||
367 | { ISD::SRA, MVT::v64i8, { 3, 10, 12, 12 } }, // psrlw, pand, pxor, psubb. | |||
368 | ||||
369 | { ISD::SHL, MVT::v16i16, { 2, 7, 4, 4 } }, // psllw + split. | |||
370 | { ISD::SRL, MVT::v16i16, { 2, 7, 4, 4 } }, // psrlw + split. | |||
371 | { ISD::SRA, MVT::v16i16, { 2, 7, 4, 4 } }, // psraw + split. | |||
372 | ||||
373 | { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } }, // pslld | |||
374 | { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } }, // psrld | |||
375 | { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } }, // psrad | |||
376 | { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } }, // pslld | |||
377 | { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } }, // psrld | |||
378 | { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } }, // psrad | |||
379 | ||||
380 | { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } }, // psraq | |||
381 | { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } }, // psllq | |||
382 | { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } }, // psrlq | |||
383 | { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } }, // psraq | |||
384 | { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } }, // psllq | |||
385 | { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } }, // psrlq | |||
386 | { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } }, // psraq | |||
387 | ||||
388 | { ISD::SDIV, MVT::v16i32, { 6 } }, // pmuludq sequence | |||
389 | { ISD::SREM, MVT::v16i32, { 8 } }, // pmuludq+mul+sub sequence | |||
390 | { ISD::UDIV, MVT::v16i32, { 5 } }, // pmuludq sequence | |||
391 | { ISD::UREM, MVT::v16i32, { 7 } }, // pmuludq+mul+sub sequence | |||
392 | }; | |||
393 | ||||
394 | if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX512()) | |||
395 | if (const auto *Entry = | |||
396 | CostTableLookup(AVX512UniformConstCostTable, ISD, LT.second)) | |||
397 | if (auto KindCost = Entry->Cost[CostKind]) | |||
398 | return LT.first * *KindCost; | |||
399 | ||||
400 | static const CostKindTblEntry AVX2UniformConstCostTable[] = { | |||
401 | { ISD::SHL, MVT::v16i8, { 1, 8, 2, 3 } }, // psllw + pand. | |||
402 | { ISD::SRL, MVT::v16i8, { 1, 8, 2, 3 } }, // psrlw + pand. | |||
403 | { ISD::SRA, MVT::v16i8, { 2, 10, 5, 6 } }, // psrlw, pand, pxor, psubb. | |||
404 | { ISD::SHL, MVT::v32i8, { 2, 8, 2, 4 } }, // psllw + pand. | |||
405 | { ISD::SRL, MVT::v32i8, { 2, 8, 2, 4 } }, // psrlw + pand. | |||
406 | { ISD::SRA, MVT::v32i8, { 3, 10, 5, 9 } }, // psrlw, pand, pxor, psubb. | |||
407 | ||||
408 | { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw | |||
409 | { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw | |||
410 | { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw | |||
411 | { ISD::SHL, MVT::v16i16,{ 2, 2, 1, 2 } }, // psllw | |||
412 | { ISD::SRL, MVT::v16i16,{ 2, 2, 1, 2 } }, // psrlw | |||
413 | { ISD::SRA, MVT::v16i16,{ 2, 2, 1, 2 } }, // psraw | |||
414 | ||||
415 | { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld | |||
416 | { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld | |||
417 | { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad | |||
418 | { ISD::SHL, MVT::v8i32, { 2, 2, 1, 2 } }, // pslld | |||
419 | { ISD::SRL, MVT::v8i32, { 2, 2, 1, 2 } }, // psrld | |||
420 | { ISD::SRA, MVT::v8i32, { 2, 2, 1, 2 } }, // psrad | |||
421 | ||||
422 | { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq | |||
423 | { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq | |||
424 | { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle. | |||
425 | { ISD::SHL, MVT::v4i64, { 2, 2, 1, 2 } }, // psllq | |||
426 | { ISD::SRL, MVT::v4i64, { 2, 2, 1, 2 } }, // psrlq | |||
427 | { ISD::SRA, MVT::v4i64, { 4, 4, 3, 6 } }, // psrad + shuffle + split. | |||
428 | ||||
429 | { ISD::SDIV, MVT::v8i32, { 6 } }, // pmuludq sequence | |||
430 | { ISD::SREM, MVT::v8i32, { 8 } }, // pmuludq+mul+sub sequence | |||
431 | { ISD::UDIV, MVT::v8i32, { 5 } }, // pmuludq sequence | |||
432 | { ISD::UREM, MVT::v8i32, { 7 } }, // pmuludq+mul+sub sequence | |||
433 | }; | |||
434 | ||||
435 | if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX2()) | |||
436 | if (const auto *Entry = | |||
437 | CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second)) | |||
438 | if (auto KindCost = Entry->Cost[CostKind]) | |||
439 | return LT.first * *KindCost; | |||
440 | ||||
441 | static const CostKindTblEntry AVXUniformConstCostTable[] = { | |||
442 | { ISD::SHL, MVT::v16i8, { 2, 7, 2, 3 } }, // psllw + pand. | |||
443 | { ISD::SRL, MVT::v16i8, { 2, 7, 2, 3 } }, // psrlw + pand. | |||
444 | { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb. | |||
445 | { ISD::SHL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psllw + pand) + split. | |||
446 | { ISD::SRL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psrlw + pand) + split. | |||
447 | { ISD::SRA, MVT::v32i8, { 7, 7, 12, 13 } }, // 2*(psrlw, pand, pxor, psubb) + split. | |||
448 | ||||
449 | { ISD::SHL, MVT::v8i16, { 1, 2, 1, 1 } }, // psllw. | |||
450 | { ISD::SRL, MVT::v8i16, { 1, 2, 1, 1 } }, // psrlw. | |||
451 | { ISD::SRA, MVT::v8i16, { 1, 2, 1, 1 } }, // psraw. | |||
452 | { ISD::SHL, MVT::v16i16,{ 3, 6, 4, 5 } }, // psllw + split. | |||
453 | { ISD::SRL, MVT::v16i16,{ 3, 6, 4, 5 } }, // psrlw + split. | |||
454 | { ISD::SRA, MVT::v16i16,{ 3, 6, 4, 5 } }, // psraw + split. | |||
455 | ||||
456 | { ISD::SHL, MVT::v4i32, { 1, 2, 1, 1 } }, // pslld. | |||
457 | { ISD::SRL, MVT::v4i32, { 1, 2, 1, 1 } }, // psrld. | |||
458 | { ISD::SRA, MVT::v4i32, { 1, 2, 1, 1 } }, // psrad. | |||
459 | { ISD::SHL, MVT::v8i32, { 3, 6, 4, 5 } }, // pslld + split. | |||
460 | { ISD::SRL, MVT::v8i32, { 3, 6, 4, 5 } }, // psrld + split. | |||
461 | { ISD::SRA, MVT::v8i32, { 3, 6, 4, 5 } }, // psrad + split. | |||
462 | ||||
463 | { ISD::SHL, MVT::v2i64, { 1, 2, 1, 1 } }, // psllq. | |||
464 | { ISD::SRL, MVT::v2i64, { 1, 2, 1, 1 } }, // psrlq. | |||
465 | { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle. | |||
466 | { ISD::SHL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psllq + split. | |||
467 | { ISD::SRL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psllq + split. | |||
468 | { ISD::SRA, MVT::v4i64, { 5, 7, 8, 9 } }, // 2 x psrad + shuffle + split. | |||
469 | ||||
470 | { ISD::SDIV, MVT::v8i32, { 14 } }, // 2*pmuludq sequence + split. | |||
471 | { ISD::SREM, MVT::v8i32, { 18 } }, // 2*pmuludq+mul+sub sequence + split. | |||
472 | { ISD::UDIV, MVT::v8i32, { 12 } }, // 2*pmuludq sequence + split. | |||
473 | { ISD::UREM, MVT::v8i32, { 16 } }, // 2*pmuludq+mul+sub sequence + split. | |||
474 | }; | |||
475 | ||||
476 | // XOP has faster vXi8 shifts. | |||
477 | if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX() && | |||
478 | (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8)) | |||
479 | if (const auto *Entry = | |||
480 | CostTableLookup(AVXUniformConstCostTable, ISD, LT.second)) | |||
481 | if (auto KindCost = Entry->Cost[CostKind]) | |||
482 | return LT.first * *KindCost; | |||
483 | ||||
484 | static const CostKindTblEntry SSE2UniformConstCostTable[] = { | |||
485 | { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand. | |||
486 | { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand. | |||
487 | { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb. | |||
488 | ||||
489 | { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw. | |||
490 | { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw. | |||
491 | { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw. | |||
492 | ||||
493 | { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld | |||
494 | { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld. | |||
495 | { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad. | |||
496 | ||||
497 | { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq. | |||
498 | { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq. | |||
499 | { ISD::SRA, MVT::v2i64, { 3, 5, 6, 6 } }, // 2 x psrad + shuffle. | |||
500 | ||||
501 | { ISD::SDIV, MVT::v4i32, { 6 } }, // pmuludq sequence | |||
502 | { ISD::SREM, MVT::v4i32, { 8 } }, // pmuludq+mul+sub sequence | |||
503 | { ISD::UDIV, MVT::v4i32, { 5 } }, // pmuludq sequence | |||
504 | { ISD::UREM, MVT::v4i32, { 7 } }, // pmuludq+mul+sub sequence | |||
505 | }; | |||
506 | ||||
507 | // XOP has faster vXi8 shifts. | |||
508 | if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasSSE2() && | |||
509 | (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8)) | |||
510 | if (const auto *Entry = | |||
511 | CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second)) | |||
512 | if (auto KindCost = Entry->Cost[CostKind]) | |||
513 | return LT.first * *KindCost; | |||
514 | ||||
515 | static const CostKindTblEntry AVX512BWConstCostTable[] = { | |||
516 | { ISD::SDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence | |||
517 | { ISD::SREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence | |||
518 | { ISD::UDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence | |||
519 | { ISD::UREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence | |||
520 | ||||
521 | { ISD::SDIV, MVT::v32i16, { 6 } }, // vpmulhw sequence | |||
522 | { ISD::SREM, MVT::v32i16, { 8 } }, // vpmulhw+mul+sub sequence | |||
523 | { ISD::UDIV, MVT::v32i16, { 6 } }, // vpmulhuw sequence | |||
524 | { ISD::UREM, MVT::v32i16, { 8 } }, // vpmulhuw+mul+sub sequence | |||
525 | }; | |||
526 | ||||
527 | if (Op2Info.isConstant() && ST->hasBWI()) | |||
528 | if (const auto *Entry = | |||
529 | CostTableLookup(AVX512BWConstCostTable, ISD, LT.second)) | |||
530 | if (auto KindCost = Entry->Cost[CostKind]) | |||
531 | return LT.first * *KindCost; | |||
532 | ||||
533 | static const CostKindTblEntry AVX512ConstCostTable[] = { | |||
534 | { ISD::SDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence | |||
535 | { ISD::SREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence | |||
536 | { ISD::UDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence | |||
537 | { ISD::UREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence | |||
538 | ||||
539 | { ISD::SDIV, MVT::v32i16, { 12 } }, // 2*vpmulhw sequence | |||
540 | { ISD::SREM, MVT::v32i16, { 16 } }, // 2*vpmulhw+mul+sub sequence | |||
541 | { ISD::UDIV, MVT::v32i16, { 12 } }, // 2*vpmulhuw sequence | |||
542 | { ISD::UREM, MVT::v32i16, { 16 } }, // 2*vpmulhuw+mul+sub sequence | |||
543 | ||||
544 | { ISD::SDIV, MVT::v16i32, { 15 } }, // vpmuldq sequence | |||
545 | { ISD::SREM, MVT::v16i32, { 17 } }, // vpmuldq+mul+sub sequence | |||
546 | { ISD::UDIV, MVT::v16i32, { 15 } }, // vpmuludq sequence | |||
547 | { ISD::UREM, MVT::v16i32, { 17 } }, // vpmuludq+mul+sub sequence | |||
548 | }; | |||
549 | ||||
550 | if (Op2Info.isConstant() && ST->hasAVX512()) | |||
551 | if (const auto *Entry = | |||
552 | CostTableLookup(AVX512ConstCostTable, ISD, LT.second)) | |||
553 | if (auto KindCost = Entry->Cost[CostKind]) | |||
554 | return LT.first * *KindCost; | |||
555 | ||||
556 | static const CostKindTblEntry AVX2ConstCostTable[] = { | |||
557 | { ISD::SDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence | |||
558 | { ISD::SREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence | |||
559 | { ISD::UDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence | |||
560 | { ISD::UREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence | |||
561 | ||||
562 | { ISD::SDIV, MVT::v16i16, { 6 } }, // vpmulhw sequence | |||
563 | { ISD::SREM, MVT::v16i16, { 8 } }, // vpmulhw+mul+sub sequence | |||
564 | { ISD::UDIV, MVT::v16i16, { 6 } }, // vpmulhuw sequence | |||
565 | { ISD::UREM, MVT::v16i16, { 8 } }, // vpmulhuw+mul+sub sequence | |||
566 | ||||
567 | { ISD::SDIV, MVT::v8i32, { 15 } }, // vpmuldq sequence | |||
568 | { ISD::SREM, MVT::v8i32, { 19 } }, // vpmuldq+mul+sub sequence | |||
569 | { ISD::UDIV, MVT::v8i32, { 15 } }, // vpmuludq sequence | |||
570 | { ISD::UREM, MVT::v8i32, { 19 } }, // vpmuludq+mul+sub sequence | |||
571 | }; | |||
572 | ||||
573 | if (Op2Info.isConstant() && ST->hasAVX2()) | |||
574 | if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second)) | |||
575 | if (auto KindCost = Entry->Cost[CostKind]) | |||
576 | return LT.first * *KindCost; | |||
577 | ||||
578 | static const CostKindTblEntry AVXConstCostTable[] = { | |||
579 | { ISD::SDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split. | |||
580 | { ISD::SREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split. | |||
581 | { ISD::UDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split. | |||
582 | { ISD::UREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split. | |||
583 | ||||
584 | { ISD::SDIV, MVT::v16i16, { 14 } }, // 2*pmulhw sequence + split. | |||
585 | { ISD::SREM, MVT::v16i16, { 18 } }, // 2*pmulhw+mul+sub sequence + split. | |||
586 | { ISD::UDIV, MVT::v16i16, { 14 } }, // 2*pmulhuw sequence + split. | |||
587 | { ISD::UREM, MVT::v16i16, { 18 } }, // 2*pmulhuw+mul+sub sequence + split. | |||
588 | ||||
589 | { ISD::SDIV, MVT::v8i32, { 32 } }, // vpmuludq sequence | |||
590 | { ISD::SREM, MVT::v8i32, { 38 } }, // vpmuludq+mul+sub sequence | |||
591 | { ISD::UDIV, MVT::v8i32, { 32 } }, // 2*pmuludq sequence + split. | |||
592 | { ISD::UREM, MVT::v8i32, { 42 } }, // 2*pmuludq+mul+sub sequence + split. | |||
593 | }; | |||
594 | ||||
595 | if (Op2Info.isConstant() && ST->hasAVX()) | |||
596 | if (const auto *Entry = CostTableLookup(AVXConstCostTable, ISD, LT.second)) | |||
597 | if (auto KindCost = Entry->Cost[CostKind]) | |||
598 | return LT.first * *KindCost; | |||
599 | ||||
600 | static const CostKindTblEntry SSE41ConstCostTable[] = { | |||
601 | { ISD::SDIV, MVT::v4i32, { 15 } }, // vpmuludq sequence | |||
602 | { ISD::SREM, MVT::v4i32, { 20 } }, // vpmuludq+mul+sub sequence | |||
603 | }; | |||
604 | ||||
605 | if (Op2Info.isConstant() && ST->hasSSE41()) | |||
606 | if (const auto *Entry = | |||
607 | CostTableLookup(SSE41ConstCostTable, ISD, LT.second)) | |||
608 | if (auto KindCost = Entry->Cost[CostKind]) | |||
609 | return LT.first * *KindCost; | |||
610 | ||||
611 | static const CostKindTblEntry SSE2ConstCostTable[] = { | |||
612 | { ISD::SDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence | |||
613 | { ISD::SREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence | |||
614 | { ISD::UDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence | |||
615 | { ISD::UREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence | |||
616 | ||||
617 | { ISD::SDIV, MVT::v8i16, { 6 } }, // pmulhw sequence | |||
618 | { ISD::SREM, MVT::v8i16, { 8 } }, // pmulhw+mul+sub sequence | |||
619 | { ISD::UDIV, MVT::v8i16, { 6 } }, // pmulhuw sequence | |||
620 | { ISD::UREM, MVT::v8i16, { 8 } }, // pmulhuw+mul+sub sequence | |||
621 | ||||
622 | { ISD::SDIV, MVT::v4i32, { 19 } }, // pmuludq sequence | |||
623 | { ISD::SREM, MVT::v4i32, { 24 } }, // pmuludq+mul+sub sequence | |||
624 | { ISD::UDIV, MVT::v4i32, { 15 } }, // pmuludq sequence | |||
625 | { ISD::UREM, MVT::v4i32, { 20 } }, // pmuludq+mul+sub sequence | |||
626 | }; | |||
627 | ||||
628 | if (Op2Info.isConstant() && ST->hasSSE2()) | |||
629 | if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second)) | |||
630 | if (auto KindCost = Entry->Cost[CostKind]) | |||
631 | return LT.first * *KindCost; | |||
632 | ||||
633 | static const CostKindTblEntry AVX512BWUniformCostTable[] = { | |||
634 | { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand. | |||
635 | { ISD::SRL, MVT::v16i8, { 3,10, 5, 8 } }, // psrlw + pand. | |||
636 | { ISD::SRA, MVT::v16i8, { 4,12, 8,12 } }, // psrlw, pand, pxor, psubb. | |||
637 | { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand. | |||
638 | { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand. | |||
639 | { ISD::SRA, MVT::v32i8, { 5,10,10,13 } }, // psrlw, pand, pxor, psubb. | |||
640 | { ISD::SHL, MVT::v64i8, { 4, 7, 6, 8 } }, // psllw + pand. | |||
641 | { ISD::SRL, MVT::v64i8, { 4, 8, 7,10 } }, // psrlw + pand. | |||
642 | { ISD::SRA, MVT::v64i8, { 5,10,10,15 } }, // psrlw, pand, pxor, psubb. | |||
643 | ||||
644 | { ISD::SHL, MVT::v32i16, { 2, 4, 2, 3 } }, // psllw | |||
645 | { ISD::SRL, MVT::v32i16, { 2, 4, 2, 3 } }, // psrlw | |||
646 | { ISD::SRA, MVT::v32i16, { 2, 4, 2, 3 } }, // psraw | |||
647 | }; | |||
648 | ||||
649 | if (ST->hasBWI() && Op2Info.isUniform()) | |||
650 | if (const auto *Entry = | |||
651 | CostTableLookup(AVX512BWUniformCostTable, ISD, LT.second)) | |||
652 | if (auto KindCost = Entry->Cost[CostKind]) | |||
653 | return LT.first * *KindCost; | |||
654 | ||||
655 | static const CostKindTblEntry AVX512UniformCostTable[] = { | |||
656 | { ISD::SHL, MVT::v32i16, { 5,10, 5, 7 } }, // psllw + split. | |||
657 | { ISD::SRL, MVT::v32i16, { 5,10, 5, 7 } }, // psrlw + split. | |||
658 | { ISD::SRA, MVT::v32i16, { 5,10, 5, 7 } }, // psraw + split. | |||
659 | ||||
660 | { ISD::SHL, MVT::v16i32, { 2, 4, 2, 3 } }, // pslld | |||
661 | { ISD::SRL, MVT::v16i32, { 2, 4, 2, 3 } }, // psrld | |||
662 | { ISD::SRA, MVT::v16i32, { 2, 4, 2, 3 } }, // psrad | |||
663 | ||||
664 | { ISD::SRA, MVT::v2i64, { 1, 2, 1, 2 } }, // psraq | |||
665 | { ISD::SHL, MVT::v4i64, { 1, 4, 1, 2 } }, // psllq | |||
666 | { ISD::SRL, MVT::v4i64, { 1, 4, 1, 2 } }, // psrlq | |||
667 | { ISD::SRA, MVT::v4i64, { 1, 4, 1, 2 } }, // psraq | |||
668 | { ISD::SHL, MVT::v8i64, { 1, 4, 1, 2 } }, // psllq | |||
669 | { ISD::SRL, MVT::v8i64, { 1, 4, 1, 2 } }, // psrlq | |||
670 | { ISD::SRA, MVT::v8i64, { 1, 4, 1, 2 } }, // psraq | |||
671 | }; | |||
672 | ||||
673 | if (ST->hasAVX512() && Op2Info.isUniform()) | |||
674 | if (const auto *Entry = | |||
675 | CostTableLookup(AVX512UniformCostTable, ISD, LT.second)) | |||
676 | if (auto KindCost = Entry->Cost[CostKind]) | |||
677 | return LT.first * *KindCost; | |||
678 | ||||
679 | static const CostKindTblEntry AVX2UniformCostTable[] = { | |||
680 | // Uniform splats are cheaper for the following instructions. | |||
681 | { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand. | |||
682 | { ISD::SRL, MVT::v16i8, { 3, 9, 5, 8 } }, // psrlw + pand. | |||
683 | { ISD::SRA, MVT::v16i8, { 4, 5, 9,13 } }, // psrlw, pand, pxor, psubb. | |||
684 | { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand. | |||
685 | { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand. | |||
686 | { ISD::SRA, MVT::v32i8, { 6, 9,11,16 } }, // psrlw, pand, pxor, psubb. | |||
687 | ||||
688 | { ISD::SHL, MVT::v8i16, { 1, 2, 1, 2 } }, // psllw. | |||
689 | { ISD::SRL, MVT::v8i16, { 1, 2, 1, 2 } }, // psrlw. | |||
690 | { ISD::SRA, MVT::v8i16, { 1, 2, 1, 2 } }, // psraw. | |||
691 | { ISD::SHL, MVT::v16i16, { 2, 4, 2, 3 } }, // psllw. | |||
692 | { ISD::SRL, MVT::v16i16, { 2, 4, 2, 3 } }, // psrlw. | |||
693 | { ISD::SRA, MVT::v16i16, { 2, 4, 2, 3 } }, // psraw. | |||
694 | ||||
695 | { ISD::SHL, MVT::v4i32, { 1, 2, 1, 2 } }, // pslld | |||
696 | { ISD::SRL, MVT::v4i32, { 1, 2, 1, 2 } }, // psrld | |||
697 | { ISD::SRA, MVT::v4i32, { 1, 2, 1, 2 } }, // psrad | |||
698 | { ISD::SHL, MVT::v8i32, { 2, 4, 2, 3 } }, // pslld | |||
699 | { ISD::SRL, MVT::v8i32, { 2, 4, 2, 3 } }, // psrld | |||
700 | { ISD::SRA, MVT::v8i32, { 2, 4, 2, 3 } }, // psrad | |||
701 | ||||
702 | { ISD::SHL, MVT::v2i64, { 1, 2, 1, 2 } }, // psllq | |||
703 | { ISD::SRL, MVT::v2i64, { 1, 2, 1, 2 } }, // psrlq | |||
704 | { ISD::SRA, MVT::v2i64, { 2, 4, 5, 7 } }, // 2 x psrad + shuffle. | |||
705 | { ISD::SHL, MVT::v4i64, { 2, 4, 1, 2 } }, // psllq | |||
706 | { ISD::SRL, MVT::v4i64, { 2, 4, 1, 2 } }, // psrlq | |||
707 | { ISD::SRA, MVT::v4i64, { 4, 6, 5, 9 } }, // 2 x psrad + shuffle. | |||
708 | }; | |||
709 | ||||
710 | if (ST->hasAVX2() && Op2Info.isUniform()) | |||
711 | if (const auto *Entry = | |||
712 | CostTableLookup(AVX2UniformCostTable, ISD, LT.second)) | |||
713 | if (auto KindCost = Entry->Cost[CostKind]) | |||
714 | return LT.first * *KindCost; | |||
715 | ||||
716 | static const CostKindTblEntry AVXUniformCostTable[] = { | |||
717 | { ISD::SHL, MVT::v16i8, { 4, 4, 6, 8 } }, // psllw + pand. | |||
718 | { ISD::SRL, MVT::v16i8, { 4, 8, 5, 8 } }, // psrlw + pand. | |||
719 | { ISD::SRA, MVT::v16i8, { 6, 6, 9,13 } }, // psrlw, pand, pxor, psubb. | |||
720 | { ISD::SHL, MVT::v32i8, { 7, 8,11,14 } }, // psllw + pand + split. | |||
721 | { ISD::SRL, MVT::v32i8, { 7, 9,10,14 } }, // psrlw + pand + split. | |||
722 | { ISD::SRA, MVT::v32i8, { 10,11,16,21 } }, // psrlw, pand, pxor, psubb + split. | |||
723 | ||||
724 | { ISD::SHL, MVT::v8i16, { 1, 3, 1, 2 } }, // psllw. | |||
725 | { ISD::SRL, MVT::v8i16, { 1, 3, 1, 2 } }, // psrlw. | |||
726 | { ISD::SRA, MVT::v8i16, { 1, 3, 1, 2 } }, // psraw. | |||
727 | { ISD::SHL, MVT::v16i16, { 3, 7, 5, 7 } }, // psllw + split. | |||
728 | { ISD::SRL, MVT::v16i16, { 3, 7, 5, 7 } }, // psrlw + split. | |||
729 | { ISD::SRA, MVT::v16i16, { 3, 7, 5, 7 } }, // psraw + split. | |||
730 | ||||
731 | { ISD::SHL, MVT::v4i32, { 1, 3, 1, 2 } }, // pslld. | |||
732 | { ISD::SRL, MVT::v4i32, { 1, 3, 1, 2 } }, // psrld. | |||
733 | { ISD::SRA, MVT::v4i32, { 1, 3, 1, 2 } }, // psrad. | |||
734 | { ISD::SHL, MVT::v8i32, { 3, 7, 5, 7 } }, // pslld + split. | |||
735 | { ISD::SRL, MVT::v8i32, { 3, 7, 5, 7 } }, // psrld + split. | |||
736 | { ISD::SRA, MVT::v8i32, { 3, 7, 5, 7 } }, // psrad + split. | |||
737 | ||||
738 | { ISD::SHL, MVT::v2i64, { 1, 3, 1, 2 } }, // psllq. | |||
739 | { ISD::SRL, MVT::v2i64, { 1, 3, 1, 2 } }, // psrlq. | |||
740 | { ISD::SRA, MVT::v2i64, { 3, 4, 5, 7 } }, // 2 x psrad + shuffle. | |||
741 | { ISD::SHL, MVT::v4i64, { 3, 7, 4, 6 } }, // psllq + split. | |||
742 | { ISD::SRL, MVT::v4i64, { 3, 7, 4, 6 } }, // psrlq + split. | |||
743 | { ISD::SRA, MVT::v4i64, { 6, 7,10,13 } }, // 2 x (2 x psrad + shuffle) + split. | |||
744 | }; | |||
745 | ||||
746 | // XOP has faster vXi8 shifts. | |||
747 | if (ST->hasAVX() && Op2Info.isUniform() && | |||
748 | (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8)) | |||
749 | if (const auto *Entry = | |||
750 | CostTableLookup(AVXUniformCostTable, ISD, LT.second)) | |||
751 | if (auto KindCost = Entry->Cost[CostKind]) | |||
752 | return LT.first * *KindCost; | |||
753 | ||||
754 | static const CostKindTblEntry SSE2UniformCostTable[] = { | |||
755 | // Uniform splats are cheaper for the following instructions. | |||
756 | { ISD::SHL, MVT::v16i8, { 9, 10, 6, 9 } }, // psllw + pand. | |||
757 | { ISD::SRL, MVT::v16i8, { 9, 13, 5, 9 } }, // psrlw + pand. | |||
758 | { ISD::SRA, MVT::v16i8, { 11, 15, 9,13 } }, // pcmpgtb sequence. | |||
759 | ||||
760 | { ISD::SHL, MVT::v8i16, { 2, 2, 1, 2 } }, // psllw. | |||
761 | { ISD::SRL, MVT::v8i16, { 2, 2, 1, 2 } }, // psrlw. | |||
762 | { ISD::SRA, MVT::v8i16, { 2, 2, 1, 2 } }, // psraw. | |||
763 | ||||
764 | { ISD::SHL, MVT::v4i32, { 2, 2, 1, 2 } }, // pslld | |||
765 | { ISD::SRL, MVT::v4i32, { 2, 2, 1, 2 } }, // psrld. | |||
766 | { ISD::SRA, MVT::v4i32, { 2, 2, 1, 2 } }, // psrad. | |||
767 | ||||
768 | { ISD::SHL, MVT::v2i64, { 2, 2, 1, 2 } }, // psllq. | |||
769 | { ISD::SRL, MVT::v2i64, { 2, 2, 1, 2 } }, // psrlq. | |||
770 | { ISD::SRA, MVT::v2i64, { 5, 9, 5, 7 } }, // 2*psrlq + xor + sub. | |||
771 | }; | |||
772 | ||||
773 | if (ST->hasSSE2() && Op2Info.isUniform() && | |||
774 | (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8)) | |||
775 | if (const auto *Entry = | |||
776 | CostTableLookup(SSE2UniformCostTable, ISD, LT.second)) | |||
777 | if (auto KindCost = Entry->Cost[CostKind]) | |||
778 | return LT.first * *KindCost; | |||
779 | ||||
780 | static const CostKindTblEntry AVX512DQCostTable[] = { | |||
781 | { ISD::MUL, MVT::v2i64, { 2, 15, 1, 3 } }, // pmullq | |||
782 | { ISD::MUL, MVT::v4i64, { 2, 15, 1, 3 } }, // pmullq | |||
783 | { ISD::MUL, MVT::v8i64, { 3, 15, 1, 3 } } // pmullq | |||
784 | }; | |||
785 | ||||
786 | // Look for AVX512DQ lowering tricks for custom cases. | |||
787 | if (ST->hasDQI()) | |||
788 | if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second)) | |||
789 | if (auto KindCost = Entry->Cost[CostKind]) | |||
790 | return LT.first * *KindCost; | |||
791 | ||||
792 | static const CostKindTblEntry AVX512BWCostTable[] = { | |||
793 | { ISD::SHL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsllvw/pack sequence. | |||
794 | { ISD::SRL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsrlvw/pack sequence. | |||
795 | { ISD::SRA, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsravw/pack sequence. | |||
796 | { ISD::SHL, MVT::v32i8, { 4, 23,11,16 } }, // extend/vpsllvw/pack sequence. | |||
797 | { ISD::SRL, MVT::v32i8, { 4, 30,12,18 } }, // extend/vpsrlvw/pack sequence. | |||
798 | { ISD::SRA, MVT::v32i8, { 6, 13,24,30 } }, // extend/vpsravw/pack sequence. | |||
799 | { ISD::SHL, MVT::v64i8, { 6, 19,13,15 } }, // extend/vpsllvw/pack sequence. | |||
800 | { ISD::SRL, MVT::v64i8, { 7, 27,15,18 } }, // extend/vpsrlvw/pack sequence. | |||
801 | { ISD::SRA, MVT::v64i8, { 15, 15,30,30 } }, // extend/vpsravw/pack sequence. | |||
802 | ||||
803 | { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsllvw | |||
804 | { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsrlvw | |||
805 | { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsravw | |||
806 | { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsllvw | |||
807 | { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsrlvw | |||
808 | { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsravw | |||
809 | { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsllvw | |||
810 | { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsrlvw | |||
811 | { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsravw | |||
812 | ||||
813 | { ISD::ADD, MVT::v64i8, { 1, 1, 1, 1 } }, // paddb | |||
814 | { ISD::ADD, MVT::v32i16, { 1, 1, 1, 1 } }, // paddw | |||
815 | ||||
816 | { ISD::ADD, MVT::v32i8, { 1, 1, 1, 1 } }, // paddb | |||
817 | { ISD::ADD, MVT::v16i16, { 1, 1, 1, 1 } }, // paddw | |||
818 | { ISD::ADD, MVT::v8i32, { 1, 1, 1, 1 } }, // paddd | |||
819 | { ISD::ADD, MVT::v4i64, { 1, 1, 1, 1 } }, // paddq | |||
820 | ||||
821 | { ISD::SUB, MVT::v64i8, { 1, 1, 1, 1 } }, // psubb | |||
822 | { ISD::SUB, MVT::v32i16, { 1, 1, 1, 1 } }, // psubw | |||
823 | ||||
824 | { ISD::MUL, MVT::v32i16, { 1, 5, 1, 1 } }, // pmullw | |||
825 | ||||
826 | { ISD::SUB, MVT::v32i8, { 1, 1, 1, 1 } }, // psubb | |||
827 | { ISD::SUB, MVT::v16i16, { 1, 1, 1, 1 } }, // psubw | |||
828 | { ISD::SUB, MVT::v8i32, { 1, 1, 1, 1 } }, // psubd | |||
829 | { ISD::SUB, MVT::v4i64, { 1, 1, 1, 1 } }, // psubq | |||
830 | }; | |||
831 | ||||
832 | // Look for AVX512BW lowering tricks for custom cases. | |||
833 | if (ST->hasBWI()) | |||
834 | if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second)) | |||
835 | if (auto KindCost = Entry->Cost[CostKind]) | |||
836 | return LT.first * *KindCost; | |||
837 | ||||
838 | static const CostKindTblEntry AVX512CostTable[] = { | |||
839 | { ISD::SHL, MVT::v64i8, { 15, 19,27,33 } }, // vpblendv+split sequence. | |||
840 | { ISD::SRL, MVT::v64i8, { 15, 19,30,36 } }, // vpblendv+split sequence. | |||
841 | { ISD::SRA, MVT::v64i8, { 37, 37,51,63 } }, // vpblendv+split sequence. | |||
842 | ||||
843 | { ISD::SHL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsllvd/pack sequence. | |||
844 | { ISD::SRL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence. | |||
845 | { ISD::SRA, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsravd/pack sequence. | |||
846 | ||||
847 | { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, | |||
848 | { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, | |||
849 | { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, | |||
850 | { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } }, | |||
851 | { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } }, | |||
852 | { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } }, | |||
853 | { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } }, | |||
854 | { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } }, | |||
855 | { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } }, | |||
856 | ||||
857 | { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, | |||
858 | { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, | |||
859 | { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } }, | |||
860 | { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } }, | |||
861 | { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } }, | |||
862 | { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } }, | |||
863 | { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } }, | |||
864 | { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } }, | |||
865 | { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } }, | |||
866 | ||||
867 | { ISD::ADD, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*paddb + split | |||
868 | { ISD::ADD, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*paddw + split | |||
869 | ||||
870 | { ISD::SUB, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*psubb + split | |||
871 | { ISD::SUB, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*psubw + split | |||
872 | ||||
873 | { ISD::AND, MVT::v32i8, { 1, 1, 1, 1 } }, | |||
874 | { ISD::AND, MVT::v16i16, { 1, 1, 1, 1 } }, | |||
875 | { ISD::AND, MVT::v8i32, { 1, 1, 1, 1 } }, | |||
876 | { ISD::AND, MVT::v4i64, { 1, 1, 1, 1 } }, | |||
877 | ||||
878 | { ISD::OR, MVT::v32i8, { 1, 1, 1, 1 } }, | |||
879 | { ISD::OR, MVT::v16i16, { 1, 1, 1, 1 } }, | |||
880 | { ISD::OR, MVT::v8i32, { 1, 1, 1, 1 } }, | |||
881 | { ISD::OR, MVT::v4i64, { 1, 1, 1, 1 } }, | |||
882 | ||||
883 | { ISD::XOR, MVT::v32i8, { 1, 1, 1, 1 } }, | |||
884 | { ISD::XOR, MVT::v16i16, { 1, 1, 1, 1 } }, | |||
885 | { ISD::XOR, MVT::v8i32, { 1, 1, 1, 1 } }, | |||
886 | { ISD::XOR, MVT::v4i64, { 1, 1, 1, 1 } }, | |||
887 | ||||
888 | { ISD::MUL, MVT::v16i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org) | |||
889 | { ISD::MUL, MVT::v8i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org) | |||
890 | { ISD::MUL, MVT::v4i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org) | |||
891 | { ISD::MUL, MVT::v8i64, { 6, 9, 8, 8 } }, // 3*pmuludq/3*shift/2*add | |||
892 | { ISD::MUL, MVT::i64, { 1 } }, // Skylake from http://www.agner.org/ | |||
893 | ||||
894 | { ISD::FNEG, MVT::v8f64, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/ | |||
895 | { ISD::FADD, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ | |||
896 | { ISD::FADD, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ | |||
897 | { ISD::FSUB, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ | |||
898 | { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ | |||
899 | { ISD::FMUL, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ | |||
900 | { ISD::FMUL, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ | |||
901 | { ISD::FMUL, MVT::v2f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ | |||
902 | { ISD::FMUL, MVT::f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ | |||
903 | ||||
904 | { ISD::FDIV, MVT::f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/ | |||
905 | { ISD::FDIV, MVT::v2f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/ | |||
906 | { ISD::FDIV, MVT::v4f64, { 8, 14, 1, 1 } }, // Skylake from http://www.agner.org/ | |||
907 | { ISD::FDIV, MVT::v8f64, { 16, 23, 1, 3 } }, // Skylake from http://www.agner.org/ | |||
908 | ||||
909 | { ISD::FNEG, MVT::v16f32, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/ | |||
910 | { ISD::FADD, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ | |||
911 | { ISD::FADD, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ | |||
912 | { ISD::FSUB, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ | |||
913 | { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ | |||
914 | { ISD::FMUL, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ | |||
915 | { ISD::FMUL, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ | |||
916 | { ISD::FMUL, MVT::v4f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ | |||
917 | { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ | |||
918 | ||||
919 | { ISD::FDIV, MVT::f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/ | |||
920 | { ISD::FDIV, MVT::v4f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/ | |||
921 | { ISD::FDIV, MVT::v8f32, { 5, 11, 1, 1 } }, // Skylake from http://www.agner.org/ | |||
922 | { ISD::FDIV, MVT::v16f32, { 10, 18, 1, 3 } }, // Skylake from http://www.agner.org/ | |||
923 | }; | |||
924 | ||||
925 | if (ST->hasAVX512()) | |||
926 | if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second)) | |||
927 | if (auto KindCost = Entry->Cost[CostKind]) | |||
928 | return LT.first * *KindCost; | |||
929 | ||||
930 | static const CostKindTblEntry AVX2ShiftCostTable[] = { | |||
931 | // Shifts on vXi64/vXi32 on AVX2 are legal even though we declare them | |||
932 | // Custom, so we can detect the cases where the shift amount is a scalar one. | |||
933 | { ISD::SHL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsllvd (Haswell from agner.org) | |||
934 | { ISD::SRL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsrlvd (Haswell from agner.org) | |||
935 | { ISD::SRA, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsravd (Haswell from agner.org) | |||
936 | { ISD::SHL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsllvd (Haswell from agner.org) | |||
937 | { ISD::SRL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsrlvd (Haswell from agner.org) | |||
938 | { ISD::SRA, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsravd (Haswell from agner.org) | |||
939 | { ISD::SHL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsllvq (Haswell from agner.org) | |||
940 | { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsrlvq (Haswell from agner.org) | |||
941 | { ISD::SHL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsllvq (Haswell from agner.org) | |||
942 | { ISD::SRL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsrlvq (Haswell from agner.org) | |||
943 | }; | |||
944 | ||||
945 | if (ST->hasAVX512()) { | |||
946 | if (ISD == ISD::SHL && LT.second == MVT::v32i16 && Op2Info.isConstant()) | |||
947 | // On AVX512, a packed v32i16 shift left by a constant build_vector | |||
948 | // is lowered into a vector multiply (vpmullw). | |||
949 | return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, | |||
950 | Op1Info.getNoProps(), Op2Info.getNoProps()); | |||
951 | } | |||
952 | ||||
953 | // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts). | |||
954 | if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) { | |||
955 | if (ISD == ISD::SHL && LT.second == MVT::v16i16 && | |||
956 | Op2Info.isConstant()) | |||
957 | // On AVX2, a packed v16i16 shift left by a constant build_vector | |||
958 | // is lowered into a vector multiply (vpmullw). | |||
959 | return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, | |||
960 | Op1Info.getNoProps(), Op2Info.getNoProps()); | |||
961 | ||||
962 | if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second)) | |||
963 | if (auto KindCost = Entry->Cost[CostKind]) | |||
964 | return LT.first * *KindCost; | |||
965 | } | |||
966 | ||||
967 | static const CostKindTblEntry XOPShiftCostTable[] = { | |||
968 | // 128bit shifts take 1cy, but right shifts require negation beforehand. | |||
969 | { ISD::SHL, MVT::v16i8, { 1, 3, 1, 1 } }, | |||
970 | { ISD::SRL, MVT::v16i8, { 2, 3, 1, 1 } }, | |||
971 | { ISD::SRA, MVT::v16i8, { 2, 3, 1, 1 } }, | |||
972 | { ISD::SHL, MVT::v8i16, { 1, 3, 1, 1 } }, | |||
973 | { ISD::SRL, MVT::v8i16, { 2, 3, 1, 1 } }, | |||
974 | { ISD::SRA, MVT::v8i16, { 2, 3, 1, 1 } }, | |||
975 | { ISD::SHL, MVT::v4i32, { 1, 3, 1, 1 } }, | |||
976 | { ISD::SRL, MVT::v4i32, { 2, 3, 1, 1 } }, | |||
977 | { ISD::SRA, MVT::v4i32, { 2, 3, 1, 1 } }, | |||
978 | { ISD::SHL, MVT::v2i64, { 1, 3, 1, 1 } }, | |||
979 | { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } }, | |||
980 | { ISD::SRA, MVT::v2i64, { 2, 3, 1, 1 } }, | |||
981 | // 256bit shifts require splitting if AVX2 didn't catch them above. | |||
982 | { ISD::SHL, MVT::v32i8, { 4, 7, 5, 6 } }, | |||
983 | { ISD::SRL, MVT::v32i8, { 6, 7, 5, 6 } }, | |||
984 | { ISD::SRA, MVT::v32i8, { 6, 7, 5, 6 } }, | |||
985 | { ISD::SHL, MVT::v16i16, { 4, 7, 5, 6 } }, | |||
986 | { ISD::SRL, MVT::v16i16, { 6, 7, 5, 6 } }, | |||
987 | { ISD::SRA, MVT::v16i16, { 6, 7, 5, 6 } }, | |||
988 | { ISD::SHL, MVT::v8i32, { 4, 7, 5, 6 } }, | |||
989 | { ISD::SRL, MVT::v8i32, { 6, 7, 5, 6 } }, | |||
990 | { ISD::SRA, MVT::v8i32, { 6, 7, 5, 6 } }, | |||
991 | { ISD::SHL, MVT::v4i64, { 4, 7, 5, 6 } }, | |||
992 | { ISD::SRL, MVT::v4i64, { 6, 7, 5, 6 } }, | |||
993 | { ISD::SRA, MVT::v4i64, { 6, 7, 5, 6 } }, | |||
994 | }; | |||
995 | ||||
996 | // Look for XOP lowering tricks. | |||
997 | if (ST->hasXOP()) { | |||
998 | // If the right shift is constant then we'll fold the negation so | |||
999 | // it's as cheap as a left shift. | |||
1000 | int ShiftISD = ISD; | |||
1001 | if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) && Op2Info.isConstant()) | |||
1002 | ShiftISD = ISD::SHL; | |||
1003 | if (const auto *Entry = | |||
1004 | CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second)) | |||
1005 | if (auto KindCost = Entry->Cost[CostKind]) | |||
1006 | return LT.first * *KindCost; | |||
1007 | } | |||
1008 | ||||
1009 | if (ISD == ISD::SHL && !Op2Info.isUniform() && Op2Info.isConstant()) { | |||
1010 | MVT VT = LT.second; | |||
1011 | // Vector shift left by non uniform constant can be lowered | |||
1012 | // into vector multiply. | |||
1013 | if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) || | |||
1014 | ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX())) | |||
1015 | ISD = ISD::MUL; | |||
1016 | } | |||
1017 | ||||
1018 | static const CostKindTblEntry GLMCostTable[] = { | |||
1019 | { ISD::FDIV, MVT::f32, { 18, 19, 1, 1 } }, // divss | |||
1020 | { ISD::FDIV, MVT::v4f32, { 35, 36, 1, 1 } }, // divps | |||
1021 | { ISD::FDIV, MVT::f64, { 33, 34, 1, 1 } }, // divsd | |||
1022 | { ISD::FDIV, MVT::v2f64, { 65, 66, 1, 1 } }, // divpd | |||
1023 | }; | |||
1024 | ||||
1025 | if (ST->useGLMDivSqrtCosts()) | |||
1026 | if (const auto *Entry = CostTableLookup(GLMCostTable, ISD, LT.second)) | |||
1027 | if (auto KindCost = Entry->Cost[CostKind]) | |||
1028 | return LT.first * *KindCost; | |||
1029 | ||||
1030 | static const CostKindTblEntry SLMCostTable[] = { | |||
1031 | { ISD::MUL, MVT::v4i32, { 11, 11, 1, 7 } }, // pmulld | |||
1032 | { ISD::MUL, MVT::v8i16, { 2, 5, 1, 1 } }, // pmullw | |||
1033 | { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // mulsd | |||
1034 | { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // mulss | |||
1035 | { ISD::FMUL, MVT::v2f64, { 4, 7, 1, 1 } }, // mulpd | |||
1036 | { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // mulps | |||
1037 | { ISD::FDIV, MVT::f32, { 17, 19, 1, 1 } }, // divss | |||
1038 | { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 6 } }, // divps | |||
1039 | { ISD::FDIV, MVT::f64, { 32, 34, 1, 1 } }, // divsd | |||
1040 | { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 6 } }, // divpd | |||
1041 | { ISD::FADD, MVT::v2f64, { 2, 4, 1, 1 } }, // addpd | |||
1042 | { ISD::FSUB, MVT::v2f64, { 2, 4, 1, 1 } }, // subpd | |||
1043 | // v2i64/v4i64 mul is custom lowered as a series of long: | |||
1044 | // multiplies(3), shifts(3) and adds(2) | |||
1045 | // slm muldq version throughput is 2 and addq throughput 4 | |||
1046 | // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) + | |||
1047 | // 3X4 (addq throughput) = 17 | |||
1048 | { ISD::MUL, MVT::v2i64, { 17, 22, 9, 9 } }, | |||
1049 | // slm addq\subq throughput is 4 | |||
1050 | { ISD::ADD, MVT::v2i64, { 4, 2, 1, 2 } }, | |||
1051 | { ISD::SUB, MVT::v2i64, { 4, 2, 1, 2 } }, | |||
1052 | }; | |||
1053 | ||||
1054 | if (ST->useSLMArithCosts()) | |||
1055 | if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, LT.second)) | |||
1056 | if (auto KindCost = Entry->Cost[CostKind]) | |||
1057 | return LT.first * *KindCost; | |||
1058 | ||||
1059 | static const CostKindTblEntry AVX2CostTable[] = { | |||
1060 | { ISD::SHL, MVT::v16i8, { 6, 21,11,16 } }, // vpblendvb sequence. | |||
1061 | { ISD::SHL, MVT::v32i8, { 6, 23,11,22 } }, // vpblendvb sequence. | |||
1062 | { ISD::SHL, MVT::v8i16, { 5, 18, 5,10 } }, // extend/vpsllvd/pack sequence. | |||
1063 | { ISD::SHL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsllvd/pack sequence. | |||
1064 | ||||
1065 | { ISD::SRL, MVT::v16i8, { 6, 27,12,18 } }, // vpblendvb sequence. | |||
1066 | { ISD::SRL, MVT::v32i8, { 8, 30,12,24 } }, // vpblendvb sequence. | |||
1067 | { ISD::SRL, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsrlvd/pack sequence. | |||
1068 | { ISD::SRL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsrlvd/pack sequence. | |||
1069 | ||||
1070 | { ISD::SRA, MVT::v16i8, { 17, 17,24,30 } }, // vpblendvb sequence. | |||
1071 | { ISD::SRA, MVT::v32i8, { 18, 20,24,43 } }, // vpblendvb sequence. | |||
1072 | { ISD::SRA, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsravd/pack sequence. | |||
1073 | { ISD::SRA, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsravd/pack sequence. | |||
1074 | { ISD::SRA, MVT::v2i64, { 4, 5, 5, 5 } }, // srl/xor/sub sequence. | |||
1075 | { ISD::SRA, MVT::v4i64, { 8, 8, 5, 9 } }, // srl/xor/sub sequence. | |||
1076 | ||||
1077 | { ISD::SUB, MVT::v32i8, { 1, 1, 1, 2 } }, // psubb | |||
1078 | { ISD::ADD, MVT::v32i8, { 1, 1, 1, 2 } }, // paddb | |||
1079 | { ISD::SUB, MVT::v16i16, { 1, 1, 1, 2 } }, // psubw | |||
1080 | { ISD::ADD, MVT::v16i16, { 1, 1, 1, 2 } }, // paddw | |||
1081 | { ISD::SUB, MVT::v8i32, { 1, 1, 1, 2 } }, // psubd | |||
1082 | { ISD::ADD, MVT::v8i32, { 1, 1, 1, 2 } }, // paddd | |||
1083 | { ISD::SUB, MVT::v4i64, { 1, 1, 1, 2 } }, // psubq | |||
1084 | { ISD::ADD, MVT::v4i64, { 1, 1, 1, 2 } }, // paddq | |||
1085 | ||||
1086 | { ISD::MUL, MVT::v16i16, { 2, 5, 1, 1 } }, // pmullw | |||
1087 | { ISD::MUL, MVT::v8i32, { 4, 10, 1, 2 } }, // pmulld | |||
1088 | { ISD::MUL, MVT::v4i32, { 2, 10, 1, 2 } }, // pmulld | |||
1089 | { ISD::MUL, MVT::v4i64, { 6, 10, 8,13 } }, // 3*pmuludq/3*shift/2*add | |||
1090 | { ISD::MUL, MVT::v2i64, { 6, 10, 8, 8 } }, // 3*pmuludq/3*shift/2*add | |||
1091 | ||||
1092 | { ISD::FNEG, MVT::v4f64, { 1, 1, 1, 2 } }, // vxorpd | |||
1093 | { ISD::FNEG, MVT::v8f32, { 1, 1, 1, 2 } }, // vxorps | |||
1094 | ||||
1095 | { ISD::FADD, MVT::f64, { 1, 4, 1, 1 } }, // vaddsd | |||
1096 | { ISD::FADD, MVT::f32, { 1, 4, 1, 1 } }, // vaddss | |||
1097 | { ISD::FADD, MVT::v2f64, { 1, 4, 1, 1 } }, // vaddpd | |||
1098 | { ISD::FADD, MVT::v4f32, { 1, 4, 1, 1 } }, // vaddps | |||
1099 | { ISD::FADD, MVT::v4f64, { 1, 4, 1, 2 } }, // vaddpd | |||
1100 | { ISD::FADD, MVT::v8f32, { 1, 4, 1, 2 } }, // vaddps | |||
1101 | ||||
1102 | { ISD::FSUB, MVT::f64, { 1, 4, 1, 1 } }, // vsubsd | |||
1103 | { ISD::FSUB, MVT::f32, { 1, 4, 1, 1 } }, // vsubss | |||
1104 | { ISD::FSUB, MVT::v2f64, { 1, 4, 1, 1 } }, // vsubpd | |||
1105 | { ISD::FSUB, MVT::v4f32, { 1, 4, 1, 1 } }, // vsubps | |||
1106 | { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 2 } }, // vsubpd | |||
1107 | { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 2 } }, // vsubps | |||
1108 | ||||
1109 | { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // vmulsd | |||
1110 | { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // vmulss | |||
1111 | { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // vmulpd | |||
1112 | { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // vmulps | |||
1113 | { ISD::FMUL, MVT::v4f64, { 1, 5, 1, 2 } }, // vmulpd | |||
1114 | { ISD::FMUL, MVT::v8f32, { 1, 5, 1, 2 } }, // vmulps | |||
1115 | ||||
1116 | { ISD::FDIV, MVT::f32, { 7, 13, 1, 1 } }, // vdivss | |||
1117 | { ISD::FDIV, MVT::v4f32, { 7, 13, 1, 1 } }, // vdivps | |||
1118 | { ISD::FDIV, MVT::v8f32, { 14, 21, 1, 3 } }, // vdivps | |||
1119 | { ISD::FDIV, MVT::f64, { 14, 20, 1, 1 } }, // vdivsd | |||
1120 | { ISD::FDIV, MVT::v2f64, { 14, 20, 1, 1 } }, // vdivpd | |||
1121 | { ISD::FDIV, MVT::v4f64, { 28, 35, 1, 3 } }, // vdivpd | |||
1122 | }; | |||
1123 | ||||
1124 | // Look for AVX2 lowering tricks for custom cases. | |||
1125 | if (ST->hasAVX2()) | |||
1126 | if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second)) | |||
1127 | if (auto KindCost = Entry->Cost[CostKind]) | |||
1128 | return LT.first * *KindCost; | |||
1129 | ||||
1130 | static const CostKindTblEntry AVX1CostTable[] = { | |||
1131 | // We don't have to scalarize unsupported ops. We can issue two half-sized | |||
1132 | // operations and we only need to extract the upper YMM half. | |||
1133 | // Two ops + 1 extract + 1 insert = 4. | |||
1134 | { ISD::MUL, MVT::v16i16, { 4, 8, 5, 6 } }, // pmullw + split | |||
1135 | { ISD::MUL, MVT::v8i32, { 5, 8, 5, 10 } }, // pmulld + split | |||
1136 | { ISD::MUL, MVT::v4i32, { 2, 5, 1, 3 } }, // pmulld | |||
1137 | { ISD::MUL, MVT::v4i64, { 12, 15, 19, 20 } }, | |||
1138 | ||||
1139 | { ISD::AND, MVT::v32i8, { 1, 1, 1, 2 } }, // vandps | |||
1140 | { ISD::AND, MVT::v16i16, { 1, 1, 1, 2 } }, // vandps | |||
1141 | { ISD::AND, MVT::v8i32, { 1, 1, 1, 2 } }, // vandps | |||
1142 | { ISD::AND, MVT::v4i64, { 1, 1, 1, 2 } }, // vandps | |||
1143 | ||||
1144 | { ISD::OR, MVT::v32i8, { 1, 1, 1, 2 } }, // vorps | |||
1145 | { ISD::OR, MVT::v16i16, { 1, 1, 1, 2 } }, // vorps | |||
1146 | { ISD::OR, MVT::v8i32, { 1, 1, 1, 2 } }, // vorps | |||
1147 | { ISD::OR, MVT::v4i64, { 1, 1, 1, 2 } }, // vorps | |||
1148 | ||||
1149 | { ISD::XOR, MVT::v32i8, { 1, 1, 1, 2 } }, // vxorps | |||
1150 | { ISD::XOR, MVT::v16i16, { 1, 1, 1, 2 } }, // vxorps | |||
1151 | { ISD::XOR, MVT::v8i32, { 1, 1, 1, 2 } }, // vxorps | |||
1152 | { ISD::XOR, MVT::v4i64, { 1, 1, 1, 2 } }, // vxorps | |||
1153 | ||||
1154 | { ISD::SUB, MVT::v32i8, { 4, 2, 5, 6 } }, // psubb + split | |||
1155 | { ISD::ADD, MVT::v32i8, { 4, 2, 5, 6 } }, // paddb + split | |||
1156 | { ISD::SUB, MVT::v16i16, { 4, 2, 5, 6 } }, // psubw + split | |||
1157 | { ISD::ADD, MVT::v16i16, { 4, 2, 5, 6 } }, // paddw + split | |||
1158 | { ISD::SUB, MVT::v8i32, { 4, 2, 5, 6 } }, // psubd + split | |||
1159 | { ISD::ADD, MVT::v8i32, { 4, 2, 5, 6 } }, // paddd + split | |||
1160 | { ISD::SUB, MVT::v4i64, { 4, 2, 5, 6 } }, // psubq + split | |||
1161 | { ISD::ADD, MVT::v4i64, { 4, 2, 5, 6 } }, // paddq + split | |||
1162 | { ISD::SUB, MVT::v2i64, { 1, 1, 1, 1 } }, // psubq | |||
1163 | { ISD::ADD, MVT::v2i64, { 1, 1, 1, 1 } }, // paddq | |||
1164 | ||||
1165 | { ISD::SHL, MVT::v16i8, { 10, 21,11,17 } }, // pblendvb sequence. | |||
1166 | { ISD::SHL, MVT::v32i8, { 22, 22,27,40 } }, // pblendvb sequence + split. | |||
1167 | { ISD::SHL, MVT::v8i16, { 6, 9,11,11 } }, // pblendvb sequence. | |||
1168 | { ISD::SHL, MVT::v16i16, { 13, 16,24,25 } }, // pblendvb sequence + split. | |||
1169 | { ISD::SHL, MVT::v4i32, { 3, 11, 4, 6 } }, // pslld/paddd/cvttps2dq/pmulld | |||
1170 | { ISD::SHL, MVT::v8i32, { 9, 11,12,17 } }, // pslld/paddd/cvttps2dq/pmulld + split | |||
1171 | { ISD::SHL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend. | |||
1172 | { ISD::SHL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split. | |||
1173 | ||||
1174 | { ISD::SRL, MVT::v16i8, { 11, 27,12,18 } }, // pblendvb sequence. | |||
1175 | { ISD::SRL, MVT::v32i8, { 23, 23,30,43 } }, // pblendvb sequence + split. | |||
1176 | { ISD::SRL, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence. | |||
1177 | { ISD::SRL, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split. | |||
1178 | { ISD::SRL, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend. | |||
1179 | { ISD::SRL, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split. | |||
1180 | { ISD::SRL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend. | |||
1181 | { ISD::SRL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split. | |||
1182 | ||||
1183 | { ISD::SRA, MVT::v16i8, { 21, 22,24,36 } }, // pblendvb sequence. | |||
1184 | { ISD::SRA, MVT::v32i8, { 44, 45,51,76 } }, // pblendvb sequence + split. | |||
1185 | { ISD::SRA, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence. | |||
1186 | { ISD::SRA, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split. | |||
1187 | { ISD::SRA, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend. | |||
1188 | { ISD::SRA, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split. | |||
1189 | { ISD::SRA, MVT::v2i64, { 5, 6,10,14 } }, // Shift each lane + blend. | |||
1190 | { ISD::SRA, MVT::v4i64, { 12, 12,22,30 } }, // Shift each lane + blend + split. | |||
1191 | ||||
1192 | { ISD::FNEG, MVT::v4f64, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/ | |||
1193 | { ISD::FNEG, MVT::v8f32, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/ | |||
1194 | ||||
1195 | { ISD::FADD, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ | |||
1196 | { ISD::FADD, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ | |||
1197 | { ISD::FADD, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ | |||
1198 | { ISD::FADD, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ | |||
1199 | { ISD::FADD, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/ | |||
1200 | { ISD::FADD, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/ | |||
1201 | ||||
1202 | { ISD::FSUB, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ | |||
1203 | { ISD::FSUB, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ | |||
1204 | { ISD::FSUB, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ | |||
1205 | { ISD::FSUB, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ | |||
1206 | { ISD::FSUB, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/ | |||
1207 | { ISD::FSUB, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/ | |||
1208 | ||||
1209 | { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/ | |||
1210 | { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/ | |||
1211 | { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/ | |||
1212 | { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/ | |||
1213 | { ISD::FMUL, MVT::v4f64, { 4, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/ | |||
1214 | { ISD::FMUL, MVT::v8f32, { 2, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/ | |||
1215 | ||||
1216 | { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/ | |||
1217 | { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/ | |||
1218 | { ISD::FDIV, MVT::v8f32, { 28, 29, 1, 3 } }, // SNB from http://www.agner.org/ | |||
1219 | { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/ | |||
1220 | { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/ | |||
1221 | { ISD::FDIV, MVT::v4f64, { 44, 45, 1, 3 } }, // SNB from http://www.agner.org/ | |||
1222 | }; | |||
1223 | ||||
1224 | if (ST->hasAVX()) | |||
1225 | if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second)) | |||
1226 | if (auto KindCost = Entry->Cost[CostKind]) | |||
1227 | return LT.first * *KindCost; | |||
1228 | ||||
1229 | static const CostKindTblEntry SSE42CostTable[] = { | |||
1230 | { ISD::FADD, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ | |||
1231 | { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ | |||
1232 | { ISD::FADD, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ | |||
1233 | { ISD::FADD, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ | |||
1234 | ||||
1235 | { ISD::FSUB, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ | |||
1236 | { ISD::FSUB, MVT::f32 , { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ | |||
1237 | { ISD::FSUB, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ | |||
1238 | { ISD::FSUB, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ | |||
1239 | ||||
1240 | { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/ | |||
1241 | { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/ | |||
1242 | { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/ | |||
1243 | { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/ | |||
1244 | ||||
1245 | { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/ | |||
1246 | { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/ | |||
1247 | { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/ | |||
1248 | { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/ | |||
1249 | ||||
1250 | { ISD::MUL, MVT::v2i64, { 6, 10,10,10 } } // 3*pmuludq/3*shift/2*add | |||
1251 | }; | |||
1252 | ||||
1253 | if (ST->hasSSE42()) | |||
1254 | if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second)) | |||
1255 | if (auto KindCost = Entry->Cost[CostKind]) | |||
1256 | return LT.first * *KindCost; | |||
1257 | ||||
1258 | static const CostKindTblEntry SSE41CostTable[] = { | |||
1259 | { ISD::SHL, MVT::v16i8, { 15, 24,17,22 } }, // pblendvb sequence. | |||
1260 | { ISD::SHL, MVT::v8i16, { 11, 14,11,11 } }, // pblendvb sequence. | |||
1261 | { ISD::SHL, MVT::v4i32, { 14, 20, 4,10 } }, // pslld/paddd/cvttps2dq/pmulld | |||
1262 | ||||
1263 | { ISD::SRL, MVT::v16i8, { 16, 27,18,24 } }, // pblendvb sequence. | |||
1264 | { ISD::SRL, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence. | |||
1265 | { ISD::SRL, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend. | |||
1266 | { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence. | |||
1267 | ||||
1268 | { ISD::SRA, MVT::v16i8, { 38, 41,30,36 } }, // pblendvb sequence. | |||
1269 | { ISD::SRA, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence. | |||
1270 | { ISD::SRA, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend. | |||
1271 | { ISD::SRA, MVT::v2i64, { 8, 17, 5, 7 } }, // splat+shuffle sequence. | |||
1272 | ||||
1273 | { ISD::MUL, MVT::v4i32, { 2, 11, 1, 1 } } // pmulld (Nehalem from agner.org) | |||
1274 | }; | |||
1275 | ||||
1276 | if (ST->hasSSE41()) | |||
1277 | if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second)) | |||
1278 | if (auto KindCost = Entry->Cost[CostKind]) | |||
1279 | return LT.first * *KindCost; | |||
1280 | ||||
1281 | static const CostKindTblEntry SSE2CostTable[] = { | |||
1282 | // We don't correctly identify costs of casts because they are marked as | |||
1283 | // custom. | |||
1284 | { ISD::SHL, MVT::v16i8, { 13, 21,26,28 } }, // cmpgtb sequence. | |||
1285 | { ISD::SHL, MVT::v8i16, { 24, 27,16,20 } }, // cmpgtw sequence. | |||
1286 | { ISD::SHL, MVT::v4i32, { 17, 19,10,12 } }, // pslld/paddd/cvttps2dq/pmuludq. | |||
1287 | { ISD::SHL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence. | |||
1288 | ||||
1289 | { ISD::SRL, MVT::v16i8, { 14, 28,27,30 } }, // cmpgtb sequence. | |||
1290 | { ISD::SRL, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence. | |||
1291 | { ISD::SRL, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend. | |||
1292 | { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence. | |||
1293 | ||||
1294 | { ISD::SRA, MVT::v16i8, { 27, 30,54,54 } }, // unpacked cmpgtb sequence. | |||
1295 | { ISD::SRA, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence. | |||
1296 | { ISD::SRA, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend. | |||
1297 | { ISD::SRA, MVT::v2i64, { 8, 11,12,16 } }, // srl/xor/sub splat+shuffle sequence. | |||
1298 | ||||
1299 | { ISD::AND, MVT::v16i8, { 1, 1, 1, 1 } }, // pand | |||
1300 | { ISD::AND, MVT::v8i16, { 1, 1, 1, 1 } }, // pand | |||
1301 | { ISD::AND, MVT::v4i32, { 1, 1, 1, 1 } }, // pand | |||
1302 | { ISD::AND, MVT::v2i64, { 1, 1, 1, 1 } }, // pand | |||
1303 | ||||
1304 | { ISD::OR, MVT::v16i8, { 1, 1, 1, 1 } }, // por | |||
1305 | { ISD::OR, MVT::v8i16, { 1, 1, 1, 1 } }, // por | |||
1306 | { ISD::OR, MVT::v4i32, { 1, 1, 1, 1 } }, // por | |||
1307 | { ISD::OR, MVT::v2i64, { 1, 1, 1, 1 } }, // por | |||
1308 | ||||
1309 | { ISD::XOR, MVT::v16i8, { 1, 1, 1, 1 } }, // pxor | |||
1310 | { ISD::XOR, MVT::v8i16, { 1, 1, 1, 1 } }, // pxor | |||
1311 | { ISD::XOR, MVT::v4i32, { 1, 1, 1, 1 } }, // pxor | |||
1312 | { ISD::XOR, MVT::v2i64, { 1, 1, 1, 1 } }, // pxor | |||
1313 | ||||
1314 | { ISD::ADD, MVT::v2i64, { 1, 2, 1, 2 } }, // paddq | |||
1315 | { ISD::SUB, MVT::v2i64, { 1, 2, 1, 2 } }, // psubq | |||
1316 | ||||
1317 | { ISD::MUL, MVT::v8i16, { 1, 5, 1, 1 } }, // pmullw | |||
1318 | { ISD::MUL, MVT::v4i32, { 6, 8, 7, 7 } }, // 3*pmuludq/4*shuffle | |||
1319 | { ISD::MUL, MVT::v2i64, { 8, 10, 8, 8 } }, // 3*pmuludq/3*shift/2*add | |||
1320 | ||||
1321 | { ISD::FDIV, MVT::f32, { 23, 23, 1, 1 } }, // Pentium IV from http://www.agner.org/ | |||
1322 | { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 1 } }, // Pentium IV from http://www.agner.org/ | |||
1323 | { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // Pentium IV from http://www.agner.org/ | |||
1324 | { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 1 } }, // Pentium IV from http://www.agner.org/ | |||
1325 | ||||
1326 | { ISD::FNEG, MVT::f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/ | |||
1327 | { ISD::FNEG, MVT::f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/ | |||
1328 | { ISD::FNEG, MVT::v4f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/ | |||
1329 | { ISD::FNEG, MVT::v2f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/ | |||
1330 | ||||
1331 | { ISD::FADD, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/ | |||
1332 | { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/ | |||
1333 | { ISD::FADD, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/ | |||
1334 | ||||
1335 | { ISD::FSUB, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/ | |||
1336 | { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/ | |||
1337 | { ISD::FSUB, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/ | |||
1338 | ||||
1339 | { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/ | |||
1340 | { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/ | |||
1341 | }; | |||
1342 | ||||
1343 | if (ST->hasSSE2()) | |||
1344 | if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second)) | |||
1345 | if (auto KindCost = Entry->Cost[CostKind]) | |||
1346 | return LT.first * *KindCost; | |||
1347 | ||||
1348 | static const CostKindTblEntry SSE1CostTable[] = { | |||
1349 | { ISD::FDIV, MVT::f32, { 17, 18, 1, 1 } }, // Pentium III from http://www.agner.org/ | |||
1350 | { ISD::FDIV, MVT::v4f32, { 34, 48, 1, 1 } }, // Pentium III from http://www.agner.org/ | |||
1351 | ||||
1352 | { ISD::FNEG, MVT::f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/ | |||
1353 | { ISD::FNEG, MVT::v4f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/ | |||
1354 | ||||
1355 | { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/ | |||
1356 | { ISD::FADD, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/ | |||
1357 | ||||
1358 | { ISD::FSUB, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/ | |||
1359 | { ISD::FSUB, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/ | |||
1360 | ||||
1361 | { ISD::FMUL, MVT::f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/ | |||
1362 | { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/ | |||
1363 | }; | |||
1364 | ||||
1365 | if (ST->hasSSE1()) | |||
1366 | if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second)) | |||
1367 | if (auto KindCost = Entry->Cost[CostKind]) | |||
1368 | return LT.first * *KindCost; | |||
1369 | ||||
1370 | static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets | |||
1371 | { ISD::ADD, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/ | |||
1372 | { ISD::SUB, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/ | |||
1373 | { ISD::MUL, MVT::i64, { 2 } }, // Nehalem from http://www.agner.org/ | |||
1374 | }; | |||
1375 | ||||
1376 | if (ST->is64Bit()) | |||
1377 | if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second)) | |||
1378 | if (auto KindCost = Entry->Cost[CostKind]) | |||
1379 | return LT.first * *KindCost; | |||
1380 | ||||
1381 | static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets | |||
1382 | { ISD::ADD, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/ | |||
1383 | { ISD::ADD, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/ | |||
1384 | { ISD::ADD, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/ | |||
1385 | ||||
1386 | { ISD::SUB, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/ | |||
1387 | { ISD::SUB, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/ | |||
1388 | { ISD::SUB, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/ | |||
1389 | ||||
1390 | { ISD::FNEG, MVT::f64, { 2, 2, 1, 3 } }, // (x87) | |||
1391 | { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // (x87) | |||
1392 | { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // (x87) | |||
1393 | { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // (x87) | |||
1394 | { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // (x87) | |||
1395 | }; | |||
1396 | ||||
1397 | if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second)) | |||
1398 | if (auto KindCost = Entry->Cost[CostKind]) | |||
1399 | return LT.first * *KindCost; | |||
1400 | ||||
1401 | // It is not a good idea to vectorize division. We have to scalarize it and | |||
1402 | // in the process we will often end up having to spilling regular | |||
1403 | // registers. The overhead of division is going to dominate most kernels | |||
1404 | // anyways so try hard to prevent vectorization of division - it is | |||
1405 | // generally a bad idea. Assume somewhat arbitrarily that we have to be able | |||
1406 | // to hide "20 cycles" for each lane. | |||
1407 | if (CostKind == TTI::TCK_RecipThroughput && LT.second.isVector() && | |||
1408 | (ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV || | |||
1409 | ISD == ISD::UREM)) { | |||
1410 | InstructionCost ScalarCost = | |||
1411 | getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind, | |||
1412 | Op1Info.getNoProps(), Op2Info.getNoProps()); | |||
1413 | return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost; | |||
1414 | } | |||
1415 | ||||
1416 | // Handle some basic single instruction code size cases. | |||
1417 | if (CostKind == TTI::TCK_CodeSize) { | |||
1418 | switch (ISD) { | |||
1419 | case ISD::FADD: | |||
1420 | case ISD::FSUB: | |||
1421 | case ISD::FMUL: | |||
1422 | case ISD::FDIV: | |||
1423 | case ISD::FNEG: | |||
1424 | case ISD::AND: | |||
1425 | case ISD::OR: | |||
1426 | case ISD::XOR: | |||
1427 | return LT.first; | |||
1428 | break; | |||
1429 | } | |||
1430 | } | |||
1431 | ||||
1432 | // Fallback to the default implementation. | |||
1433 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info, | |||
1434 | Args, CxtI); | |||
1435 | } | |||
1436 | ||||
1437 | InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, | |||
1438 | VectorType *BaseTp, | |||
1439 | ArrayRef<int> Mask, | |||
1440 | TTI::TargetCostKind CostKind, | |||
1441 | int Index, VectorType *SubTp, | |||
1442 | ArrayRef<const Value *> Args) { | |||
1443 | // 64-bit packed float vectors (v2f32) are widened to type v4f32. | |||
1444 | // 64-bit packed integer vectors (v2i32) are widened to type v4i32. | |||
1445 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(BaseTp); | |||
1446 | ||||
1447 | Kind = improveShuffleKindFromMask(Kind, Mask); | |||
1448 | ||||
1449 | // Treat Transpose as 2-op shuffles - there's no difference in lowering. | |||
1450 | if (Kind == TTI::SK_Transpose) | |||
1451 | Kind = TTI::SK_PermuteTwoSrc; | |||
1452 | ||||
1453 | // For Broadcasts we are splatting the first element from the first input | |||
1454 | // register, so only need to reference that input and all the output | |||
1455 | // registers are the same. | |||
1456 | if (Kind == TTI::SK_Broadcast) | |||
1457 | LT.first = 1; | |||
1458 | ||||
1459 | // Subvector extractions are free if they start at the beginning of a | |||
1460 | // vector and cheap if the subvectors are aligned. | |||
1461 | if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) { | |||
1462 | int NumElts = LT.second.getVectorNumElements(); | |||
1463 | if ((Index % NumElts) == 0) | |||
1464 | return 0; | |||
1465 | std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp); | |||
1466 | if (SubLT.second.isVector()) { | |||
1467 | int NumSubElts = SubLT.second.getVectorNumElements(); | |||
1468 | if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0) | |||
1469 | return SubLT.first; | |||
1470 | // Handle some cases for widening legalization. For now we only handle | |||
1471 | // cases where the original subvector was naturally aligned and evenly | |||
1472 | // fit in its legalized subvector type. | |||
1473 | // FIXME: Remove some of the alignment restrictions. | |||
1474 | // FIXME: We can use permq for 64-bit or larger extracts from 256-bit | |||
1475 | // vectors. | |||
1476 | int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements(); | |||
1477 | if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 && | |||
1478 | (NumSubElts % OrigSubElts) == 0 && | |||
1479 | LT.second.getVectorElementType() == | |||
1480 | SubLT.second.getVectorElementType() && | |||
1481 | LT.second.getVectorElementType().getSizeInBits() == | |||
1482 | BaseTp->getElementType()->getPrimitiveSizeInBits()) { | |||
1483 | assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&(static_cast <bool> (NumElts >= NumSubElts && NumElts > OrigSubElts && "Unexpected number of elements!" ) ? void (0) : __assert_fail ("NumElts >= NumSubElts && NumElts > OrigSubElts && \"Unexpected number of elements!\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 1484, __extension__ __PRETTY_FUNCTION__)) | |||
1484 | "Unexpected number of elements!")(static_cast <bool> (NumElts >= NumSubElts && NumElts > OrigSubElts && "Unexpected number of elements!" ) ? void (0) : __assert_fail ("NumElts >= NumSubElts && NumElts > OrigSubElts && \"Unexpected number of elements!\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 1484, __extension__ __PRETTY_FUNCTION__)); | |||
1485 | auto *VecTy = FixedVectorType::get(BaseTp->getElementType(), | |||
1486 | LT.second.getVectorNumElements()); | |||
1487 | auto *SubTy = FixedVectorType::get(BaseTp->getElementType(), | |||
1488 | SubLT.second.getVectorNumElements()); | |||
1489 | int ExtractIndex = alignDown((Index % NumElts), NumSubElts); | |||
1490 | InstructionCost ExtractCost = | |||
1491 | getShuffleCost(TTI::SK_ExtractSubvector, VecTy, std::nullopt, | |||
1492 | CostKind, ExtractIndex, SubTy); | |||
1493 | ||||
1494 | // If the original size is 32-bits or more, we can use pshufd. Otherwise | |||
1495 | // if we have SSSE3 we can use pshufb. | |||
1496 | if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3()) | |||
1497 | return ExtractCost + 1; // pshufd or pshufb | |||
1498 | ||||
1499 | assert(SubTp->getPrimitiveSizeInBits() == 16 &&(static_cast <bool> (SubTp->getPrimitiveSizeInBits() == 16 && "Unexpected vector size") ? void (0) : __assert_fail ("SubTp->getPrimitiveSizeInBits() == 16 && \"Unexpected vector size\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 1500, __extension__ __PRETTY_FUNCTION__)) | |||
1500 | "Unexpected vector size")(static_cast <bool> (SubTp->getPrimitiveSizeInBits() == 16 && "Unexpected vector size") ? void (0) : __assert_fail ("SubTp->getPrimitiveSizeInBits() == 16 && \"Unexpected vector size\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 1500, __extension__ __PRETTY_FUNCTION__)); | |||
1501 | ||||
1502 | return ExtractCost + 2; // worst case pshufhw + pshufd | |||
1503 | } | |||
1504 | } | |||
1505 | } | |||
1506 | ||||
1507 | // Subvector insertions are cheap if the subvectors are aligned. | |||
1508 | // Note that in general, the insertion starting at the beginning of a vector | |||
1509 | // isn't free, because we need to preserve the rest of the wide vector. | |||
1510 | if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) { | |||
1511 | int NumElts = LT.second.getVectorNumElements(); | |||
1512 | std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp); | |||
1513 | if (SubLT.second.isVector()) { | |||
1514 | int NumSubElts = SubLT.second.getVectorNumElements(); | |||
1515 | if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0) | |||
1516 | return SubLT.first; | |||
1517 | } | |||
1518 | ||||
1519 | // If the insertion isn't aligned, treat it like a 2-op shuffle. | |||
1520 | Kind = TTI::SK_PermuteTwoSrc; | |||
1521 | } | |||
1522 | ||||
1523 | // Handle some common (illegal) sub-vector types as they are often very cheap | |||
1524 | // to shuffle even on targets without PSHUFB. | |||
1525 | EVT VT = TLI->getValueType(DL, BaseTp); | |||
1526 | if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 && | |||
1527 | !ST->hasSSSE3()) { | |||
1528 | static const CostTblEntry SSE2SubVectorShuffleTbl[] = { | |||
1529 | {TTI::SK_Broadcast, MVT::v4i16, 1}, // pshuflw | |||
1530 | {TTI::SK_Broadcast, MVT::v2i16, 1}, // pshuflw | |||
1531 | {TTI::SK_Broadcast, MVT::v8i8, 2}, // punpck/pshuflw | |||
1532 | {TTI::SK_Broadcast, MVT::v4i8, 2}, // punpck/pshuflw | |||
1533 | {TTI::SK_Broadcast, MVT::v2i8, 1}, // punpck | |||
1534 | ||||
1535 | {TTI::SK_Reverse, MVT::v4i16, 1}, // pshuflw | |||
1536 | {TTI::SK_Reverse, MVT::v2i16, 1}, // pshuflw | |||
1537 | {TTI::SK_Reverse, MVT::v4i8, 3}, // punpck/pshuflw/packus | |||
1538 | {TTI::SK_Reverse, MVT::v2i8, 1}, // punpck | |||
1539 | ||||
1540 | {TTI::SK_Splice, MVT::v4i16, 2}, // punpck+psrldq | |||
1541 | {TTI::SK_Splice, MVT::v2i16, 2}, // punpck+psrldq | |||
1542 | {TTI::SK_Splice, MVT::v4i8, 2}, // punpck+psrldq | |||
1543 | {TTI::SK_Splice, MVT::v2i8, 2}, // punpck+psrldq | |||
1544 | ||||
1545 | {TTI::SK_PermuteTwoSrc, MVT::v4i16, 2}, // punpck/pshuflw | |||
1546 | {TTI::SK_PermuteTwoSrc, MVT::v2i16, 2}, // punpck/pshuflw | |||
1547 | {TTI::SK_PermuteTwoSrc, MVT::v8i8, 7}, // punpck/pshuflw | |||
1548 | {TTI::SK_PermuteTwoSrc, MVT::v4i8, 4}, // punpck/pshuflw | |||
1549 | {TTI::SK_PermuteTwoSrc, MVT::v2i8, 2}, // punpck | |||
1550 | ||||
1551 | {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw | |||
1552 | {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw | |||
1553 | {TTI::SK_PermuteSingleSrc, MVT::v8i8, 5}, // punpck/pshuflw | |||
1554 | {TTI::SK_PermuteSingleSrc, MVT::v4i8, 3}, // punpck/pshuflw | |||
1555 | {TTI::SK_PermuteSingleSrc, MVT::v2i8, 1}, // punpck | |||
1556 | }; | |||
1557 | ||||
1558 | if (ST->hasSSE2()) | |||
1559 | if (const auto *Entry = | |||
1560 | CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT())) | |||
1561 | return Entry->Cost; | |||
1562 | } | |||
1563 | ||||
1564 | // We are going to permute multiple sources and the result will be in multiple | |||
1565 | // destinations. Providing an accurate cost only for splits where the element | |||
1566 | // type remains the same. | |||
1567 | if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) { | |||
1568 | MVT LegalVT = LT.second; | |||
1569 | if (LegalVT.isVector() && | |||
1570 | LegalVT.getVectorElementType().getSizeInBits() == | |||
1571 | BaseTp->getElementType()->getPrimitiveSizeInBits() && | |||
1572 | LegalVT.getVectorNumElements() < | |||
1573 | cast<FixedVectorType>(BaseTp)->getNumElements()) { | |||
1574 | ||||
1575 | unsigned VecTySize = DL.getTypeStoreSize(BaseTp); | |||
1576 | unsigned LegalVTSize = LegalVT.getStoreSize(); | |||
1577 | // Number of source vectors after legalization: | |||
1578 | unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize; | |||
1579 | // Number of destination vectors after legalization: | |||
1580 | InstructionCost NumOfDests = LT.first; | |||
1581 | ||||
1582 | auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(), | |||
1583 | LegalVT.getVectorNumElements()); | |||
1584 | ||||
1585 | if (!Mask.empty() && NumOfDests.isValid()) { | |||
1586 | // Try to perform better estimation of the permutation. | |||
1587 | // 1. Split the source/destination vectors into real registers. | |||
1588 | // 2. Do the mask analysis to identify which real registers are | |||
1589 | // permuted. If more than 1 source registers are used for the | |||
1590 | // destination register building, the cost for this destination register | |||
1591 | // is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one | |||
1592 | // source register is used, build mask and calculate the cost as a cost | |||
1593 | // of PermuteSingleSrc. | |||
1594 | // Also, for the single register permute we try to identify if the | |||
1595 | // destination register is just a copy of the source register or the | |||
1596 | // copy of the previous destination register (the cost is | |||
1597 | // TTI::TCC_Basic). If the source register is just reused, the cost for | |||
1598 | // this operation is 0. | |||
1599 | unsigned E = *NumOfDests.getValue(); | |||
1600 | unsigned NormalizedVF = | |||
1601 | LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E); | |||
1602 | unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements(); | |||
1603 | unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements(); | |||
1604 | SmallVector<int> NormalizedMask(NormalizedVF, UndefMaskElem); | |||
1605 | copy(Mask, NormalizedMask.begin()); | |||
1606 | unsigned PrevSrcReg = 0; | |||
1607 | ArrayRef<int> PrevRegMask; | |||
1608 | InstructionCost Cost = 0; | |||
1609 | processShuffleMasks( | |||
1610 | NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {}, | |||
1611 | [this, SingleOpTy, CostKind, &PrevSrcReg, &PrevRegMask, | |||
1612 | &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) { | |||
1613 | if (!ShuffleVectorInst::isIdentityMask(RegMask)) { | |||
1614 | // Check if the previous register can be just copied to the next | |||
1615 | // one. | |||
1616 | if (PrevRegMask.empty() || PrevSrcReg != SrcReg || | |||
1617 | PrevRegMask != RegMask) | |||
1618 | Cost += getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy, | |||
1619 | RegMask, CostKind, 0, nullptr); | |||
1620 | else | |||
1621 | // Just a copy of previous destination register. | |||
1622 | Cost += TTI::TCC_Basic; | |||
1623 | return; | |||
1624 | } | |||
1625 | if (SrcReg != DestReg && | |||
1626 | any_of(RegMask, [](int I) { return I != UndefMaskElem; })) { | |||
1627 | // Just a copy of the source register. | |||
1628 | Cost += TTI::TCC_Basic; | |||
1629 | } | |||
1630 | PrevSrcReg = SrcReg; | |||
1631 | PrevRegMask = RegMask; | |||
1632 | }, | |||
1633 | [this, SingleOpTy, CostKind, &Cost](ArrayRef<int> RegMask, | |||
1634 | unsigned /*Unused*/, | |||
1635 | unsigned /*Unused*/) { | |||
1636 | Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask, | |||
1637 | CostKind, 0, nullptr); | |||
1638 | }); | |||
1639 | return Cost; | |||
1640 | } | |||
1641 | ||||
1642 | InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests; | |||
1643 | return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, | |||
1644 | std::nullopt, CostKind, 0, nullptr); | |||
1645 | } | |||
1646 | ||||
1647 | return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp); | |||
1648 | } | |||
1649 | ||||
1650 | // For 2-input shuffles, we must account for splitting the 2 inputs into many. | |||
1651 | if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) { | |||
1652 | // We assume that source and destination have the same vector type. | |||
1653 | InstructionCost NumOfDests = LT.first; | |||
1654 | InstructionCost NumOfShufflesPerDest = LT.first * 2 - 1; | |||
1655 | LT.first = NumOfDests * NumOfShufflesPerDest; | |||
1656 | } | |||
1657 | ||||
1658 | static const CostTblEntry AVX512VBMIShuffleTbl[] = { | |||
1659 | {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb | |||
1660 | {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb | |||
1661 | ||||
1662 | {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb | |||
1663 | {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb | |||
1664 | ||||
1665 | {TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b | |||
1666 | {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b | |||
1667 | {TTI::SK_PermuteTwoSrc, MVT::v16i8, 2} // vpermt2b | |||
1668 | }; | |||
1669 | ||||
1670 | if (ST->hasVBMI()) | |||
1671 | if (const auto *Entry = | |||
1672 | CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second)) | |||
1673 | return LT.first * Entry->Cost; | |||
1674 | ||||
1675 | static const CostTblEntry AVX512BWShuffleTbl[] = { | |||
1676 | {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw | |||
1677 | {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw | |||
1678 | {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb | |||
1679 | ||||
1680 | {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw | |||
1681 | {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw | |||
1682 | {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw | |||
1683 | {TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2 | |||
1684 | ||||
1685 | {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw | |||
1686 | {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw | |||
1687 | {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw | |||
1688 | {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw | |||
1689 | {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16 | |||
1690 | ||||
1691 | {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w | |||
1692 | {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w | |||
1693 | {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w | |||
1694 | {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w | |||
1695 | {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1 | |||
1696 | ||||
1697 | {TTI::SK_Select, MVT::v32i16, 1}, // vblendmw | |||
1698 | {TTI::SK_Select, MVT::v64i8, 1}, // vblendmb | |||
1699 | ||||
1700 | {TTI::SK_Splice, MVT::v32i16, 2}, // vshufi64x2 + palignr | |||
1701 | {TTI::SK_Splice, MVT::v32f16, 2}, // vshufi64x2 + palignr | |||
1702 | {TTI::SK_Splice, MVT::v64i8, 2}, // vshufi64x2 + palignr | |||
1703 | }; | |||
1704 | ||||
1705 | if (ST->hasBWI()) | |||
1706 | if (const auto *Entry = | |||
1707 | CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second)) | |||
1708 | return LT.first * Entry->Cost; | |||
1709 | ||||
1710 | static const CostKindTblEntry AVX512ShuffleTbl[] = { | |||
1711 | {TTI::SK_Broadcast, MVT::v8f64, { 1, 1, 1, 1 } }, // vbroadcastsd | |||
1712 | {TTI::SK_Broadcast, MVT::v16f32, { 1, 1, 1, 1 } }, // vbroadcastss | |||
1713 | {TTI::SK_Broadcast, MVT::v8i64, { 1, 1, 1, 1 } }, // vpbroadcastq | |||
1714 | {TTI::SK_Broadcast, MVT::v16i32, { 1, 1, 1, 1 } }, // vpbroadcastd | |||
1715 | {TTI::SK_Broadcast, MVT::v32i16, { 1, 1, 1, 1 } }, // vpbroadcastw | |||
1716 | {TTI::SK_Broadcast, MVT::v32f16, { 1, 1, 1, 1 } }, // vpbroadcastw | |||
1717 | {TTI::SK_Broadcast, MVT::v64i8, { 1, 1, 1, 1 } }, // vpbroadcastb | |||
1718 | ||||
1719 | {TTI::SK_Reverse, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd | |||
1720 | {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps | |||
1721 | {TTI::SK_Reverse, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq | |||
1722 | {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd | |||
1723 | {TTI::SK_Reverse, MVT::v32i16, { 7, 7, 7, 7 } }, // per mca | |||
1724 | {TTI::SK_Reverse, MVT::v32f16, { 7, 7, 7, 7 } }, // per mca | |||
1725 | {TTI::SK_Reverse, MVT::v64i8, { 7, 7, 7, 7 } }, // per mca | |||
1726 | ||||
1727 | {TTI::SK_Splice, MVT::v8f64, { 1, 1, 1, 1 } }, // vpalignd | |||
1728 | {TTI::SK_Splice, MVT::v4f64, { 1, 1, 1, 1 } }, // vpalignd | |||
1729 | {TTI::SK_Splice, MVT::v16f32, { 1, 1, 1, 1 } }, // vpalignd | |||
1730 | {TTI::SK_Splice, MVT::v8f32, { 1, 1, 1, 1 } }, // vpalignd | |||
1731 | {TTI::SK_Splice, MVT::v8i64, { 1, 1, 1, 1 } }, // vpalignd | |||
1732 | {TTI::SK_Splice, MVT::v4i64, { 1, 1, 1, 1 } }, // vpalignd | |||
1733 | {TTI::SK_Splice, MVT::v16i32, { 1, 1, 1, 1 } }, // vpalignd | |||
1734 | {TTI::SK_Splice, MVT::v8i32, { 1, 1, 1, 1 } }, // vpalignd | |||
1735 | {TTI::SK_Splice, MVT::v32i16, { 4, 4, 4, 4 } }, // split + palignr | |||
1736 | {TTI::SK_Splice, MVT::v32f16, { 4, 4, 4, 4 } }, // split + palignr | |||
1737 | {TTI::SK_Splice, MVT::v64i8, { 4, 4, 4, 4 } }, // split + palignr | |||
1738 | ||||
1739 | {TTI::SK_PermuteSingleSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd | |||
1740 | {TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermpd | |||
1741 | {TTI::SK_PermuteSingleSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermpd | |||
1742 | {TTI::SK_PermuteSingleSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps | |||
1743 | {TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermps | |||
1744 | {TTI::SK_PermuteSingleSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermps | |||
1745 | {TTI::SK_PermuteSingleSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq | |||
1746 | {TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermq | |||
1747 | {TTI::SK_PermuteSingleSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermq | |||
1748 | {TTI::SK_PermuteSingleSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd | |||
1749 | {TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermd | |||
1750 | {TTI::SK_PermuteSingleSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermd | |||
1751 | {TTI::SK_PermuteSingleSrc, MVT::v16i8, { 1, 3, 1, 1 } }, // pshufb | |||
1752 | ||||
1753 | {TTI::SK_PermuteTwoSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermt2pd | |||
1754 | {TTI::SK_PermuteTwoSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermt2ps | |||
1755 | {TTI::SK_PermuteTwoSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermt2q | |||
1756 | {TTI::SK_PermuteTwoSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermt2d | |||
1757 | {TTI::SK_PermuteTwoSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermt2pd | |||
1758 | {TTI::SK_PermuteTwoSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermt2ps | |||
1759 | {TTI::SK_PermuteTwoSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermt2q | |||
1760 | {TTI::SK_PermuteTwoSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermt2d | |||
1761 | {TTI::SK_PermuteTwoSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermt2pd | |||
1762 | {TTI::SK_PermuteTwoSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermt2ps | |||
1763 | {TTI::SK_PermuteTwoSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermt2q | |||
1764 | {TTI::SK_PermuteTwoSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermt2d | |||
1765 | ||||
1766 | // FIXME: This just applies the type legalization cost rules above | |||
1767 | // assuming these completely split. | |||
1768 | {TTI::SK_PermuteSingleSrc, MVT::v32i16, { 14, 14, 14, 14 } }, | |||
1769 | {TTI::SK_PermuteSingleSrc, MVT::v32f16, { 14, 14, 14, 14 } }, | |||
1770 | {TTI::SK_PermuteSingleSrc, MVT::v64i8, { 14, 14, 14, 14 } }, | |||
1771 | {TTI::SK_PermuteTwoSrc, MVT::v32i16, { 42, 42, 42, 42 } }, | |||
1772 | {TTI::SK_PermuteTwoSrc, MVT::v32f16, { 42, 42, 42, 42 } }, | |||
1773 | {TTI::SK_PermuteTwoSrc, MVT::v64i8, { 42, 42, 42, 42 } }, | |||
1774 | ||||
1775 | {TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vpternlogq | |||
1776 | {TTI::SK_Select, MVT::v32f16, { 1, 1, 1, 1 } }, // vpternlogq | |||
1777 | {TTI::SK_Select, MVT::v64i8, { 1, 1, 1, 1 } }, // vpternlogq | |||
1778 | {TTI::SK_Select, MVT::v8f64, { 1, 1, 1, 1 } }, // vblendmpd | |||
1779 | {TTI::SK_Select, MVT::v16f32, { 1, 1, 1, 1 } }, // vblendmps | |||
1780 | {TTI::SK_Select, MVT::v8i64, { 1, 1, 1, 1 } }, // vblendmq | |||
1781 | {TTI::SK_Select, MVT::v16i32, { 1, 1, 1, 1 } }, // vblendmd | |||
1782 | }; | |||
1783 | ||||
1784 | if (ST->hasAVX512()) | |||
1785 | if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second)) | |||
1786 | if (auto KindCost = Entry->Cost[CostKind]) | |||
1787 | return LT.first * *KindCost; | |||
1788 | ||||
1789 | static const CostTblEntry AVX2ShuffleTbl[] = { | |||
1790 | {TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastpd | |||
1791 | {TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastps | |||
1792 | {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq | |||
1793 | {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd | |||
1794 | {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw | |||
1795 | {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw | |||
1796 | {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb | |||
1797 | ||||
1798 | {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd | |||
1799 | {TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps | |||
1800 | {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq | |||
1801 | {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd | |||
1802 | {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb | |||
1803 | {TTI::SK_Reverse, MVT::v16f16, 2}, // vperm2i128 + pshufb | |||
1804 | {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb | |||
1805 | ||||
1806 | {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb | |||
1807 | {TTI::SK_Select, MVT::v16f16, 1}, // vpblendvb | |||
1808 | {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb | |||
1809 | ||||
1810 | {TTI::SK_Splice, MVT::v8i32, 2}, // vperm2i128 + vpalignr | |||
1811 | {TTI::SK_Splice, MVT::v8f32, 2}, // vperm2i128 + vpalignr | |||
1812 | {TTI::SK_Splice, MVT::v16i16, 2}, // vperm2i128 + vpalignr | |||
1813 | {TTI::SK_Splice, MVT::v16f16, 2}, // vperm2i128 + vpalignr | |||
1814 | {TTI::SK_Splice, MVT::v32i8, 2}, // vperm2i128 + vpalignr | |||
1815 | ||||
1816 | {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd | |||
1817 | {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps | |||
1818 | {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq | |||
1819 | {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd | |||
1820 | {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb | |||
1821 | // + vpblendvb | |||
1822 | {TTI::SK_PermuteSingleSrc, MVT::v16f16, 4}, // vperm2i128 + 2*vpshufb | |||
1823 | // + vpblendvb | |||
1824 | {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb | |||
1825 | // + vpblendvb | |||
1826 | ||||
1827 | {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd | |||
1828 | {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3}, // 2*vpermps + vblendps | |||
1829 | {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd | |||
1830 | {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd | |||
1831 | {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb | |||
1832 | // + vpblendvb | |||
1833 | {TTI::SK_PermuteTwoSrc, MVT::v16f16, 7}, // 2*vperm2i128 + 4*vpshufb | |||
1834 | // + vpblendvb | |||
1835 | {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb | |||
1836 | // + vpblendvb | |||
1837 | }; | |||
1838 | ||||
1839 | if (ST->hasAVX2()) | |||
1840 | if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second)) | |||
1841 | return LT.first * Entry->Cost; | |||
1842 | ||||
1843 | static const CostTblEntry XOPShuffleTbl[] = { | |||
1844 | {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd | |||
1845 | {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps | |||
1846 | {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd | |||
1847 | {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps | |||
1848 | {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm | |||
1849 | // + vinsertf128 | |||
1850 | {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm | |||
1851 | // + vinsertf128 | |||
1852 | ||||
1853 | {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm | |||
1854 | // + vinsertf128 | |||
1855 | {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm | |||
1856 | {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm | |||
1857 | // + vinsertf128 | |||
1858 | {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm | |||
1859 | }; | |||
1860 | ||||
1861 | if (ST->hasXOP()) | |||
1862 | if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second)) | |||
1863 | return LT.first * Entry->Cost; | |||
1864 | ||||
1865 | static const CostTblEntry AVX1ShuffleTbl[] = { | |||
1866 | {TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd | |||
1867 | {TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps | |||
1868 | {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd | |||
1869 | {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps | |||
1870 | {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128 | |||
1871 | {TTI::SK_Broadcast, MVT::v16f16, 3}, // vpshuflw + vpshufd + vinsertf128 | |||
1872 | {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128 | |||
1873 | ||||
1874 | {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd | |||
1875 | {TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps | |||
1876 | {TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd | |||
1877 | {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps | |||
1878 | {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb | |||
1879 | // + vinsertf128 | |||
1880 | {TTI::SK_Reverse, MVT::v16f16, 4}, // vextractf128 + 2*pshufb | |||
1881 | // + vinsertf128 | |||
1882 | {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb | |||
1883 | // + vinsertf128 | |||
1884 | ||||
1885 | {TTI::SK_Select, MVT::v4i64, 1}, // vblendpd | |||
1886 | {TTI::SK_Select, MVT::v4f64, 1}, // vblendpd | |||
1887 | {TTI::SK_Select, MVT::v8i32, 1}, // vblendps | |||
1888 | {TTI::SK_Select, MVT::v8f32, 1}, // vblendps | |||
1889 | {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor | |||
1890 | {TTI::SK_Select, MVT::v16f16, 3}, // vpand + vpandn + vpor | |||
1891 | {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor | |||
1892 | ||||
1893 | {TTI::SK_Splice, MVT::v4i64, 2}, // vperm2f128 + shufpd | |||
1894 | {TTI::SK_Splice, MVT::v4f64, 2}, // vperm2f128 + shufpd | |||
1895 | {TTI::SK_Splice, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps | |||
1896 | {TTI::SK_Splice, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps | |||
1897 | {TTI::SK_Splice, MVT::v16i16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128 | |||
1898 | {TTI::SK_Splice, MVT::v16f16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128 | |||
1899 | {TTI::SK_Splice, MVT::v32i8, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128 | |||
1900 | ||||
1901 | {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd | |||
1902 | {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd | |||
1903 | {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps | |||
1904 | {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps | |||
1905 | {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb | |||
1906 | // + 2*por + vinsertf128 | |||
1907 | {TTI::SK_PermuteSingleSrc, MVT::v16f16, 8}, // vextractf128 + 4*pshufb | |||
1908 | // + 2*por + vinsertf128 | |||
1909 | {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb | |||
1910 | // + 2*por + vinsertf128 | |||
1911 | ||||
1912 | {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd | |||
1913 | {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd | |||
1914 | {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps | |||
1915 | {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps | |||
1916 | {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb | |||
1917 | // + 4*por + vinsertf128 | |||
1918 | {TTI::SK_PermuteTwoSrc, MVT::v16f16, 15}, // 2*vextractf128 + 8*pshufb | |||
1919 | // + 4*por + vinsertf128 | |||
1920 | {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb | |||
1921 | // + 4*por + vinsertf128 | |||
1922 | }; | |||
1923 | ||||
1924 | if (ST->hasAVX()) | |||
1925 | if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second)) | |||
1926 | return LT.first * Entry->Cost; | |||
1927 | ||||
1928 | static const CostTblEntry SSE41ShuffleTbl[] = { | |||
1929 | {TTI::SK_Select, MVT::v2i64, 1}, // pblendw | |||
1930 | {TTI::SK_Select, MVT::v2f64, 1}, // movsd | |||
1931 | {TTI::SK_Select, MVT::v4i32, 1}, // pblendw | |||
1932 | {TTI::SK_Select, MVT::v4f32, 1}, // blendps | |||
1933 | {TTI::SK_Select, MVT::v8i16, 1}, // pblendw | |||
1934 | {TTI::SK_Select, MVT::v8f16, 1}, // pblendw | |||
1935 | {TTI::SK_Select, MVT::v16i8, 1} // pblendvb | |||
1936 | }; | |||
1937 | ||||
1938 | if (ST->hasSSE41()) | |||
1939 | if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second)) | |||
1940 | return LT.first * Entry->Cost; | |||
1941 | ||||
1942 | static const CostTblEntry SSSE3ShuffleTbl[] = { | |||
1943 | {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb | |||
1944 | {TTI::SK_Broadcast, MVT::v8f16, 1}, // pshufb | |||
1945 | {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb | |||
1946 | ||||
1947 | {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb | |||
1948 | {TTI::SK_Reverse, MVT::v8f16, 1}, // pshufb | |||
1949 | {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb | |||
1950 | ||||
1951 | {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por | |||
1952 | {TTI::SK_Select, MVT::v8f16, 3}, // 2*pshufb + por | |||
1953 | {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por | |||
1954 | ||||
1955 | {TTI::SK_Splice, MVT::v4i32, 1}, // palignr | |||
1956 | {TTI::SK_Splice, MVT::v4f32, 1}, // palignr | |||
1957 | {TTI::SK_Splice, MVT::v8i16, 1}, // palignr | |||
1958 | {TTI::SK_Splice, MVT::v8f16, 1}, // palignr | |||
1959 | {TTI::SK_Splice, MVT::v16i8, 1}, // palignr | |||
1960 | ||||
1961 | {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb | |||
1962 | {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // pshufb | |||
1963 | {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb | |||
1964 | ||||
1965 | {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por | |||
1966 | {TTI::SK_PermuteTwoSrc, MVT::v8f16, 3}, // 2*pshufb + por | |||
1967 | {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por | |||
1968 | }; | |||
1969 | ||||
1970 | if (ST->hasSSSE3()) | |||
1971 | if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second)) | |||
1972 | return LT.first * Entry->Cost; | |||
1973 | ||||
1974 | static const CostTblEntry SSE2ShuffleTbl[] = { | |||
1975 | {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd | |||
1976 | {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd | |||
1977 | {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd | |||
1978 | {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd | |||
1979 | {TTI::SK_Broadcast, MVT::v8f16, 2}, // pshuflw + pshufd | |||
1980 | {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd | |||
1981 | ||||
1982 | {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd | |||
1983 | {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd | |||
1984 | {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd | |||
1985 | {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd | |||
1986 | {TTI::SK_Reverse, MVT::v8f16, 3}, // pshuflw + pshufhw + pshufd | |||
1987 | {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw | |||
1988 | // + 2*pshufd + 2*unpck + packus | |||
1989 | ||||
1990 | {TTI::SK_Select, MVT::v2i64, 1}, // movsd | |||
1991 | {TTI::SK_Select, MVT::v2f64, 1}, // movsd | |||
1992 | {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps | |||
1993 | {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por | |||
1994 | {TTI::SK_Select, MVT::v8f16, 3}, // pand + pandn + por | |||
1995 | {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por | |||
1996 | ||||
1997 | {TTI::SK_Splice, MVT::v2i64, 1}, // shufpd | |||
1998 | {TTI::SK_Splice, MVT::v2f64, 1}, // shufpd | |||
1999 | {TTI::SK_Splice, MVT::v4i32, 2}, // 2*{unpck,movsd,pshufd} | |||
2000 | {TTI::SK_Splice, MVT::v8i16, 3}, // psrldq + psrlldq + por | |||
2001 | {TTI::SK_Splice, MVT::v8f16, 3}, // psrldq + psrlldq + por | |||
2002 | {TTI::SK_Splice, MVT::v16i8, 3}, // psrldq + psrlldq + por | |||
2003 | ||||
2004 | {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd | |||
2005 | {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd | |||
2006 | {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd | |||
2007 | {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw | |||
2008 | // + pshufd/unpck | |||
2009 | {TTI::SK_PermuteSingleSrc, MVT::v8f16, 5}, // 2*pshuflw + 2*pshufhw | |||
2010 | // + pshufd/unpck | |||
2011 | { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw | |||
2012 | // + 2*pshufd + 2*unpck + 2*packus | |||
2013 | ||||
2014 | { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd | |||
2015 | { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd | |||
2016 | { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd} | |||
2017 | { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute | |||
2018 | { TTI::SK_PermuteTwoSrc, MVT::v8f16, 8 }, // blend+permute | |||
2019 | { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute | |||
2020 | }; | |||
2021 | ||||
2022 | static const CostTblEntry SSE3BroadcastLoadTbl[] = { | |||
2023 | {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup | |||
2024 | }; | |||
2025 | ||||
2026 | if (ST->hasSSE2()) { | |||
2027 | bool IsLoad = | |||
2028 | llvm::any_of(Args, [](const auto &V) { return isa<LoadInst>(V); }); | |||
2029 | if (ST->hasSSE3() && IsLoad) | |||
2030 | if (const auto *Entry = | |||
2031 | CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) { | |||
2032 | assert(isLegalBroadcastLoad(BaseTp->getElementType(),(static_cast <bool> (isLegalBroadcastLoad(BaseTp->getElementType (), LT.second.getVectorElementCount()) && "Table entry missing from isLegalBroadcastLoad()" ) ? void (0) : __assert_fail ("isLegalBroadcastLoad(BaseTp->getElementType(), LT.second.getVectorElementCount()) && \"Table entry missing from isLegalBroadcastLoad()\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 2034, __extension__ __PRETTY_FUNCTION__)) | |||
2033 | LT.second.getVectorElementCount()) &&(static_cast <bool> (isLegalBroadcastLoad(BaseTp->getElementType (), LT.second.getVectorElementCount()) && "Table entry missing from isLegalBroadcastLoad()" ) ? void (0) : __assert_fail ("isLegalBroadcastLoad(BaseTp->getElementType(), LT.second.getVectorElementCount()) && \"Table entry missing from isLegalBroadcastLoad()\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 2034, __extension__ __PRETTY_FUNCTION__)) | |||
2034 | "Table entry missing from isLegalBroadcastLoad()")(static_cast <bool> (isLegalBroadcastLoad(BaseTp->getElementType (), LT.second.getVectorElementCount()) && "Table entry missing from isLegalBroadcastLoad()" ) ? void (0) : __assert_fail ("isLegalBroadcastLoad(BaseTp->getElementType(), LT.second.getVectorElementCount()) && \"Table entry missing from isLegalBroadcastLoad()\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 2034, __extension__ __PRETTY_FUNCTION__)); | |||
2035 | return LT.first * Entry->Cost; | |||
2036 | } | |||
2037 | ||||
2038 | if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second)) | |||
2039 | return LT.first * Entry->Cost; | |||
2040 | } | |||
2041 | ||||
2042 | static const CostTblEntry SSE1ShuffleTbl[] = { | |||
2043 | { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps | |||
2044 | { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps | |||
2045 | { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps | |||
2046 | { TTI::SK_Splice, MVT::v4f32, 2 }, // 2*shufps | |||
2047 | { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps | |||
2048 | { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps | |||
2049 | }; | |||
2050 | ||||
2051 | if (ST->hasSSE1()) | |||
2052 | if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second)) | |||
2053 | return LT.first * Entry->Cost; | |||
2054 | ||||
2055 | return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp); | |||
2056 | } | |||
2057 | ||||
2058 | InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, | |||
2059 | Type *Src, | |||
2060 | TTI::CastContextHint CCH, | |||
2061 | TTI::TargetCostKind CostKind, | |||
2062 | const Instruction *I) { | |||
2063 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | |||
2064 | assert(ISD && "Invalid opcode")(static_cast <bool> (ISD && "Invalid opcode") ? void (0) : __assert_fail ("ISD && \"Invalid opcode\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 2064, __extension__ __PRETTY_FUNCTION__)); | |||
2065 | ||||
2066 | // TODO: Allow non-throughput costs that aren't binary. | |||
2067 | auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost { | |||
2068 | if (CostKind != TTI::TCK_RecipThroughput) | |||
2069 | return Cost == 0 ? 0 : 1; | |||
2070 | return Cost; | |||
2071 | }; | |||
2072 | ||||
2073 | // The cost tables include both specific, custom (non-legal) src/dst type | |||
2074 | // conversions and generic, legalized types. We test for customs first, before | |||
2075 | // falling back to legalization. | |||
2076 | // FIXME: Need a better design of the cost table to handle non-simple types of | |||
2077 | // potential massive combinations (elem_num x src_type x dst_type). | |||
2078 | static const TypeConversionCostTblEntry AVX512BWConversionTbl[] { | |||
2079 | { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 1 }, | |||
2080 | { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 1 }, | |||
2081 | ||||
2082 | // Mask sign extend has an instruction. | |||
2083 | { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 }, | |||
2084 | { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, 1 }, | |||
2085 | { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 }, | |||
2086 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, 1 }, | |||
2087 | { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 }, | |||
2088 | { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, 1 }, | |||
2089 | { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 }, | |||
2090 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, 1 }, | |||
2091 | { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 }, | |||
2092 | { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, 1 }, | |||
2093 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 }, | |||
2094 | { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 }, | |||
2095 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 }, | |||
2096 | { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 }, | |||
2097 | { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, 1 }, | |||
2098 | { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, 1 }, | |||
2099 | { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, 1 }, | |||
2100 | ||||
2101 | // Mask zero extend is a sext + shift. | |||
2102 | { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 }, | |||
2103 | { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, 2 }, | |||
2104 | { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 }, | |||
2105 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, 2 }, | |||
2106 | { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 }, | |||
2107 | { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, 2 }, | |||
2108 | { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 }, | |||
2109 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, 2 }, | |||
2110 | { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 }, | |||
2111 | { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, 2 }, | |||
2112 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 }, | |||
2113 | { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 }, | |||
2114 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 }, | |||
2115 | { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 }, | |||
2116 | { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, 2 }, | |||
2117 | { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, 2 }, | |||
2118 | { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, 2 }, | |||
2119 | ||||
2120 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, | |||
2121 | { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, 2 }, | |||
2122 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, | |||
2123 | { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, 2 }, | |||
2124 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, | |||
2125 | { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, 2 }, | |||
2126 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, | |||
2127 | { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, 2 }, | |||
2128 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, | |||
2129 | { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 }, | |||
2130 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, | |||
2131 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, | |||
2132 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, | |||
2133 | { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, | |||
2134 | { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, 2 }, | |||
2135 | { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, 2 }, | |||
2136 | { ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, 2 }, | |||
2137 | ||||
2138 | { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 2 }, | |||
2139 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // widen to zmm | |||
2140 | { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // vpmovwb | |||
2141 | { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // vpmovwb | |||
2142 | { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // vpmovwb | |||
2143 | }; | |||
2144 | ||||
2145 | static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = { | |||
2146 | // Mask sign extend has an instruction. | |||
2147 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, | |||
2148 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, 1 }, | |||
2149 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, | |||
2150 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, | |||
2151 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, | |||
2152 | { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i1, 1 }, | |||
2153 | { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 }, | |||
2154 | { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, | |||
2155 | ||||
2156 | // Mask zero extend is a sext + shift. | |||
2157 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, | |||
2158 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, 2 }, | |||
2159 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, | |||
2160 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, | |||
2161 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, | |||
2162 | { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i1, 2 }, | |||
2163 | { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 }, | |||
2164 | { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, | |||
2165 | ||||
2166 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, | |||
2167 | { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, 2 }, | |||
2168 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, | |||
2169 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, | |||
2170 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, | |||
2171 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, | |||
2172 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 }, | |||
2173 | { ISD::TRUNCATE, MVT::v16i1, MVT::v8i64, 2 }, | |||
2174 | ||||
2175 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 }, | |||
2176 | { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 }, | |||
2177 | ||||
2178 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 }, | |||
2179 | { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 }, | |||
2180 | ||||
2181 | { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, 1 }, | |||
2182 | { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, 1 }, | |||
2183 | ||||
2184 | { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 }, | |||
2185 | { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 }, | |||
2186 | }; | |||
2187 | ||||
2188 | // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and | |||
2189 | // 256-bit wide vectors. | |||
2190 | ||||
2191 | static const TypeConversionCostTblEntry AVX512FConversionTbl[] = { | |||
2192 | { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 }, | |||
2193 | { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 }, | |||
2194 | { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 }, | |||
2195 | ||||
2196 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd | |||
2197 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd | |||
2198 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd | |||
2199 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 3 }, // sext+vpslld+vptestmd | |||
2200 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq | |||
2201 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq | |||
2202 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq | |||
2203 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 3 }, // sext+vpslld+vptestmd | |||
2204 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // zmm vpslld+vptestmd | |||
2205 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // zmm vpslld+vptestmd | |||
2206 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // zmm vpslld+vptestmd | |||
2207 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 }, // vpslld+vptestmd | |||
2208 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // zmm vpsllq+vptestmq | |||
2209 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // zmm vpsllq+vptestmq | |||
2210 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, // vpsllq+vptestmq | |||
2211 | { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 2 }, // vpmovdb | |||
2212 | { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 2 }, // vpmovdb | |||
2213 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 2 }, // vpmovdb | |||
2214 | { ISD::TRUNCATE, MVT::v32i8, MVT::v16i32, 2 }, // vpmovdb | |||
2215 | { ISD::TRUNCATE, MVT::v64i8, MVT::v16i32, 2 }, // vpmovdb | |||
2216 | { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2 }, // vpmovdw | |||
2217 | { ISD::TRUNCATE, MVT::v32i16, MVT::v16i32, 2 }, // vpmovdw | |||
2218 | { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 2 }, // vpmovqb | |||
2219 | { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1 }, // vpshufb | |||
2220 | { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 2 }, // vpmovqb | |||
2221 | { ISD::TRUNCATE, MVT::v16i8, MVT::v8i64, 2 }, // vpmovqb | |||
2222 | { ISD::TRUNCATE, MVT::v32i8, MVT::v8i64, 2 }, // vpmovqb | |||
2223 | { ISD::TRUNCATE, MVT::v64i8, MVT::v8i64, 2 }, // vpmovqb | |||
2224 | { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 2 }, // vpmovqw | |||
2225 | { ISD::TRUNCATE, MVT::v16i16, MVT::v8i64, 2 }, // vpmovqw | |||
2226 | { ISD::TRUNCATE, MVT::v32i16, MVT::v8i64, 2 }, // vpmovqw | |||
2227 | { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 }, // vpmovqd | |||
2228 | { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // zmm vpmovqd | |||
2229 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 5 },// 2*vpmovqd+concat+vpmovdb | |||
2230 | ||||
2231 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, // extend to v16i32 | |||
2232 | { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 8 }, | |||
2233 | { ISD::TRUNCATE, MVT::v64i8, MVT::v32i16, 8 }, | |||
2234 | ||||
2235 | // Sign extend is zmm vpternlogd+vptruncdb. | |||
2236 | // Zero extend is zmm broadcast load+vptruncdw. | |||
2237 | { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 3 }, | |||
2238 | { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 4 }, | |||
2239 | { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 3 }, | |||
2240 | { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 4 }, | |||
2241 | { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 3 }, | |||
2242 | { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 4 }, | |||
2243 | { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 3 }, | |||
2244 | { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 4 }, | |||
2245 | ||||
2246 | // Sign extend is zmm vpternlogd+vptruncdw. | |||
2247 | // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw. | |||
2248 | { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 3 }, | |||
2249 | { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 4 }, | |||
2250 | { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 3 }, | |||
2251 | { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 4 }, | |||
2252 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 3 }, | |||
2253 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 4 }, | |||
2254 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 3 }, | |||
2255 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 }, | |||
2256 | ||||
2257 | { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // zmm vpternlogd | |||
2258 | { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // zmm vpternlogd+psrld | |||
2259 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // zmm vpternlogd | |||
2260 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // zmm vpternlogd+psrld | |||
2261 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // zmm vpternlogd | |||
2262 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // zmm vpternlogd+psrld | |||
2263 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // zmm vpternlogq | |||
2264 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // zmm vpternlogq+psrlq | |||
2265 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // zmm vpternlogq | |||
2266 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // zmm vpternlogq+psrlq | |||
2267 | ||||
2268 | { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, // vpternlogd | |||
2269 | { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, // vpternlogd+psrld | |||
2270 | { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 }, // vpternlogq | |||
2271 | { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 }, // vpternlogq+psrlq | |||
2272 | ||||
2273 | { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 1 }, | |||
2274 | { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 }, | |||
2275 | { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, | |||
2276 | { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, | |||
2277 | { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 1 }, | |||
2278 | { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 1 }, | |||
2279 | { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, | |||
2280 | { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, | |||
2281 | { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, | |||
2282 | { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, | |||
2283 | ||||
2284 | { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right | |||
2285 | { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right | |||
2286 | ||||
2287 | { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 }, | |||
2288 | { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 }, | |||
2289 | { ISD::SINT_TO_FP, MVT::v8f64, MVT::v16i8, 2 }, | |||
2290 | { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 1 }, | |||
2291 | { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 }, | |||
2292 | { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 1 }, | |||
2293 | { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 }, | |||
2294 | { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 }, | |||
2295 | ||||
2296 | { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 }, | |||
2297 | { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 }, | |||
2298 | { ISD::UINT_TO_FP, MVT::v8f64, MVT::v16i8, 2 }, | |||
2299 | { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 1 }, | |||
2300 | { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 }, | |||
2301 | { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 1 }, | |||
2302 | { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 }, | |||
2303 | { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 }, | |||
2304 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 }, | |||
2305 | { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 5 }, | |||
2306 | ||||
2307 | { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 2 }, | |||
2308 | { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f64, 7 }, | |||
2309 | { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f64,15 }, | |||
2310 | { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f32,11 }, | |||
2311 | { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f64,31 }, | |||
2312 | { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, 3 }, | |||
2313 | { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f64, 7 }, | |||
2314 | { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f32, 5 }, | |||
2315 | { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f64,15 }, | |||
2316 | { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 1 }, | |||
2317 | { ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f64, 3 }, | |||
2318 | ||||
2319 | { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 }, | |||
2320 | { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, 3 }, | |||
2321 | { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, 3 }, | |||
2322 | { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 }, | |||
2323 | { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 3 }, | |||
2324 | { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, 3 }, | |||
2325 | }; | |||
2326 | ||||
2327 | static const TypeConversionCostTblEntry AVX512BWVLConversionTbl[] { | |||
2328 | // Mask sign extend has an instruction. | |||
2329 | { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 }, | |||
2330 | { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, 1 }, | |||
2331 | { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 }, | |||
2332 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, 1 }, | |||
2333 | { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 }, | |||
2334 | { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, 1 }, | |||
2335 | { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 }, | |||
2336 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, 1 }, | |||
2337 | { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 }, | |||
2338 | { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, 1 }, | |||
2339 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 }, | |||
2340 | { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 }, | |||
2341 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 }, | |||
2342 | { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 }, | |||
2343 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1, 1 }, | |||
2344 | { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v64i1, 1 }, | |||
2345 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1, 1 }, | |||
2346 | ||||
2347 | // Mask zero extend is a sext + shift. | |||
2348 | { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 }, | |||
2349 | { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, 2 }, | |||
2350 | { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 }, | |||
2351 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, 2 }, | |||
2352 | { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 }, | |||
2353 | { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, 2 }, | |||
2354 | { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 }, | |||
2355 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, 2 }, | |||
2356 | { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 }, | |||
2357 | { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, 2 }, | |||
2358 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 }, | |||
2359 | { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 }, | |||
2360 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 }, | |||
2361 | { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 }, | |||
2362 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1, 2 }, | |||
2363 | { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v64i1, 2 }, | |||
2364 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1, 2 }, | |||
2365 | ||||
2366 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, | |||
2367 | { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, 2 }, | |||
2368 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, | |||
2369 | { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, 2 }, | |||
2370 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, | |||
2371 | { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, 2 }, | |||
2372 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, | |||
2373 | { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, 2 }, | |||
2374 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, | |||
2375 | { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 }, | |||
2376 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, | |||
2377 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, | |||
2378 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, | |||
2379 | { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, | |||
2380 | { ISD::TRUNCATE, MVT::v32i1, MVT::v16i16, 2 }, | |||
2381 | { ISD::TRUNCATE, MVT::v64i1, MVT::v32i8, 2 }, | |||
2382 | { ISD::TRUNCATE, MVT::v64i1, MVT::v16i16, 2 }, | |||
2383 | ||||
2384 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, | |||
2385 | }; | |||
2386 | ||||
// Conversion costs when AVX-512DQ is available together with VL
// (128/256-bit vectors): i1-mask extends/truncates for i32/i64 element
// types, plus the direct i64 <-> f32/f64 conversions DQ introduces.
static const TypeConversionCostTblEntry AVX512DQVLConversionTbl[] = {
  // Mask sign extend has an instruction.
  { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 },
  { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, 1 },
  { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 },
  { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i1, 1 },
  { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 },
  { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i1, 1 },
  { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, 1 },
  { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 },

  // Mask zero extend is a sext + shift.
  { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 },
  { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, 2 },
  { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 },
  { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i1, 2 },
  { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 },
  { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i1, 2 },
  { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, 2 },
  { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 },

  // Truncation of i32/i64 vectors down to an i1 mask.
  { ISD::TRUNCATE, MVT::v16i1, MVT::v4i64, 2 },
  { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, 2 },
  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 },
  { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, 2 },
  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 },
  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 },
  { ISD::TRUNCATE, MVT::v8i1, MVT::v4i64, 2 },
  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },

  // i64 <-> fp conversions are single instructions (cost 1) with DQ.
  { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
  { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
  { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
  { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },

  { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
  { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
  { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
  { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },

  { ISD::FP_TO_SINT, MVT::v2i64, MVT::v4f32, 1 },
  { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, 1 },
  { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
  { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, 1 },

  { ISD::FP_TO_UINT, MVT::v2i64, MVT::v4f32, 1 },
  { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 },
  { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
  { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 },
};
2437 | ||||
// Conversion costs for base AVX-512F with VL (128/256-bit vectors) when
// BW/DQ are NOT available: i1-mask handling for i8/i16 elements goes via
// vector compares/ternlog sequences (see per-entry comments), so costs are
// higher than in the BWVL/DQVL tables above.
static const TypeConversionCostTblEntry AVX512VLConversionTbl[] = {
  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 8 }, // split+2*v8i8
  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 8 }, // split+2*v8i16
  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // vpslld+vptestmd
  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // vpslld+vptestmd
  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // vpslld+vptestmd
  { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, 2 }, // vpslld+vptestmd
  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // vpsllq+vptestmq
  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // vpsllq+vptestmq
  { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // vpmovqd
  { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 }, // vpmovqb
  { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 }, // vpmovqw
  { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 }, // vpmovwb

  // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
  // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
  { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 5 },
  { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 6 },
  { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 5 },
  { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 6 },
  { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 5 },
  { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 6 },
  { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 10 },
  { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 12 },

  // sign extend is vpcmpeq+maskedmove+vpmovdw
  // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
  { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 4 },
  { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 5 },
  { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 4 },
  { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 5 },
  { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 4 },
  { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 5 },
  { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 10 },
  { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 12 },

  { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // vpternlogd
  { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // vpternlogd+psrld
  { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // vpternlogd
  { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // vpternlogd+psrld
  { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // vpternlogd
  { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // vpternlogd+psrld
  { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, 1 }, // vpternlogd
  { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, 2 }, // vpternlogd+psrld

  { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // vpternlogq
  { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // vpternlogq+psrlq
  { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // vpternlogq
  { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // vpternlogq+psrlq

  // Single-instruction integer widenings (i8/i16/i32 sources).
  { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 1 },
  { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 1 },
  { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 1 },
  { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 1 },
  { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
  { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
  { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 1 },
  { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 1 },
  { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
  { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
  { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
  { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },

  { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
  { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 1 },
  { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
  { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 1 },

  // Scalar/vector unsigned conversions; i64 sources are costly (5)
  // because there is no direct uint64->fp instruction without DQ.
  { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 1 },
  { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 1 },
  { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
  { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 1 },
  { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
  { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 1 },
  { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
  { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
  { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
  { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
  { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 5 },
  { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
  { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 5 },

  { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, 2 },
  { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 2 },
  { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f32, 5 },

  { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 1 },
  { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 1 },
  { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
  { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 1 },
  { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 1 },
  { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 },
  { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 },
};
2538 | ||||
// Conversion costs for AVX2 (native 256-bit integer operations).
// Compare with AVXConversionTbl below, where the same conversions are
// generally costed higher.
static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
  // Extensions from an i1 vector (compare-result style sources).
  { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
  { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
  { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
  { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
  { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
  { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 1 },

  // Integer widenings.
  { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 2 },
  { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 2 },
  { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 2 },
  { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 2 },
  { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
  { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
  { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 2 },
  { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 2 },
  { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
  { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
  { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
  { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
  { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
  { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },

  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },

  // Integer narrowings.
  { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 4 },
  { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 4 },
  { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 1 },
  { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 1 },
  { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 1 },
  { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, 4 },
  { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, 4 },
  { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 1 },
  { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 1 },
  { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 5 },
  { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 },
  { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2 },

  { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 3 },
  { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 3 },

  { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, 1 },
  { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, 1 },
  { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, 1 },
  { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 3 },

  { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 3 },
  { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 3 },
  { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, 1 },
  { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 3 },
  { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
  { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4 },
  { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 3 },
  { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, 4 },

  { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 2 },
  { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 2 },
  { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 2 },
  { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
  { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
  { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
  { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 3 },

  { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 2 },
  { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 2 },
  { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 2 },
  { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
  { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 },
  { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
  { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 2 },
  { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
  { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
  { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 4 },
};
2613 | ||||
// Conversion costs for AVX1. Integer conversions are generally more
// expensive than in AVX2ConversionTbl above (e.g. sign/zero extends cost
// 3 here vs 2 there) — NOTE(review): presumably because AVX1 lacks
// 256-bit integer instructions and must split into 128-bit halves.
static const TypeConversionCostTblEntry AVXConversionTbl[] = {
  // Extensions from an i1 vector.
  { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 6 },
  { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
  { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 },
  { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
  { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
  { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },

  // Integer widenings.
  { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 3 },
  { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 3 },
  { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 3 },
  { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 3 },
  { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
  { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
  { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 3 },
  { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 3 },
  { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
  { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
  { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 3 },
  { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 3 },

  // Truncation to i1 masks.
  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 4 },
  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 5 },
  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 4 },
  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 9 },
  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, 11 },

  // Integer narrowings.
  { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 },
  { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
  { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // and+extract+packuswb
  { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, 5 },
  { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
  { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, 5 },
  { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 3 }, // and+extract+2*packusdw
  { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 },

  { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
  { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 },
  { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 },
  { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 4 },
  { ISD::SINT_TO_FP, MVT::v4f64, MVT::v16i8, 2 },
  { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
  { ISD::SINT_TO_FP, MVT::v4f64, MVT::v8i16, 2 },
  { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
  { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
  { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 4 },
  { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 5 },
  { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 8 },

  { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 7 },
  { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, 7 },
  { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, 6 },
  { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 4 },
  { ISD::UINT_TO_FP, MVT::v4f64, MVT::v16i8, 2 },
  { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
  { ISD::UINT_TO_FP, MVT::v4f64, MVT::v8i16, 2 },
  { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 4 },
  { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 4 },
  { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
  { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 },
  { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 },
  { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 10 },
  { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 10 },
  { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 18 },
  { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
  { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 10 },

  { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, 2 },
  { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f64, 2 },
  { ISD::FP_TO_SINT, MVT::v32i8, MVT::v8f32, 2 },
  { ISD::FP_TO_SINT, MVT::v32i8, MVT::v4f64, 2 },
  { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 2 },
  { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f64, 2 },
  { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, 2 },
  { ISD::FP_TO_SINT, MVT::v16i16, MVT::v4f64, 2 },
  { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, 2 },
  { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, 2 },
  { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 5 },

  { ISD::FP_TO_UINT, MVT::v16i8, MVT::v8f32, 2 },
  { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f64, 2 },
  { ISD::FP_TO_UINT, MVT::v32i8, MVT::v8f32, 2 },
  { ISD::FP_TO_UINT, MVT::v32i8, MVT::v4f64, 2 },
  { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 2 },
  { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f64, 2 },
  { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, 2 },
  { ISD::FP_TO_UINT, MVT::v16i16, MVT::v4f64, 2 },
  { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 3 },
  { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
  { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 6 },
  { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 7 },
  { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, 7 },

  { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 1 },
  { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 1 },
};
2710 | ||||
// Conversion costs for SSE4.1 (Penryn-era baseline; see the file header
// comment on how feature levels map to CPUs).
static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
  // Single-instruction integer widenings (pmovsx/pmovzx families —
  // NOTE(review): inferred from the SSE4.1 feature level; confirm).
  { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, 1 },
  { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, 1 },
  { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, 1 },
  { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, 1 },
  { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, 1 },
  { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, 1 },
  { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, 1 },
  { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, 1 },
  { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, 1 },
  { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, 1 },
  { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, 1 },
  { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, 1 },

  // These truncates end up widening elements.
  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 1 }, // PMOVXZBQ
  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 1 }, // PMOVXZWQ
  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 1 }, // PMOVXZBD

  { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 2 },
  { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 2 },
  { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 2 },

  { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 1 },
  { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 1 },
  { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 1 },
  { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 1 },
  { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 1 },
  { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
  { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 1 },
  { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
  { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
  { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 1 },
  { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },

  // Unsigned conversions: i64 sources are markedly more expensive than
  // i32 and smaller.
  { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 1 },
  { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 1 },
  { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 4 },
  { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 4 },
  { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 1 },
  { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
  { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 1 },
  { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
  { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 3 },
  { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 3 },
  { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 2 },
  { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 12 },
  { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 22 },
  { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 4 },

  { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 1 },
  { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 1 },
  { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 1 },
  { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 1 },
  { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, 2 },
  { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, 2 },
  { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, 1 },
  { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, 1 },
  { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
  { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, 1 },

  { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 1 },
  { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 },
  { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 1 },
  { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 4 },
  { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, 2 },
  { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, 2 },
  { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, 1 },
  { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, 1 },
  { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 4 },
  { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
};
2783 | ||||
2784 | static const TypeConversionCostTblEntry SSE2ConversionTbl[] = { | |||
2785 | // These are somewhat magic numbers justified by comparing the | |||
2786 | // output of llvm-mca for our various supported scheduler models | |||
2787 | // and basing it off the worst case scenario. | |||
2788 | { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 3 }, | |||
2789 | { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 3 }, | |||
2790 | { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 3 }, | |||
2791 | { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 3 }, | |||
2792 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 3 }, | |||
2793 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 4 }, | |||
2794 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 3 }, | |||
2795 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 4 }, | |||
2796 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 3 }, | |||
2797 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4 }, | |||
2798 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 8 }, | |||
2799 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 8 }, | |||
2800 | ||||
2801 | { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 3 }, | |||
2802 | { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 3 }, | |||
2803 | { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 8 }, | |||
2804 | { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 9 }, | |||
2805 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 4 }, | |||
2806 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 4 }, | |||
2807 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 4 }, | |||
2808 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 4 }, | |||
2809 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 7 }, | |||
2810 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 7 }, | |||
2811 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 }, | |||
2812 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 15 }, | |||
2813 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 18 }, | |||
2814 | ||||
2815 | { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 4 }, | |||
2816 | { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 4 }, | |||
2817 | { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 4 }, | |||
2818 | { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 4 }, | |||
2819 | { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, 6 }, | |||
2820 | { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, 6 }, | |||
2821 | { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, 5 }, | |||
2822 | { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, 5 }, | |||
2823 | { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 4 }, | |||
2824 | { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, 4 }, | |||
2825 | ||||
2826 | { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 4 }, | |||
2827 | { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 }, | |||
2828 | { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 4 }, | |||
2829 | { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 15 }, | |||
2830 | { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, 6 }, | |||
2831 | { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, 6 }, | |||
2832 | { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, 5 }, | |||
2833 | { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, 5 }, | |||
2834 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 8 }, | |||
2835 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 8 }, | |||
2836 | ||||
2837 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, 4 }, | |||
2838 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, 4 }, | |||
2839 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, 2 }, | |||
2840 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, 3 }, | |||
2841 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, 1 }, | |||
2842 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, 2 }, | |||
2843 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, 2 }, | |||
2844 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, 3 }, | |||
2845 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, 1 }, | |||
2846 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, 2 }, | |||
2847 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, 1 }, | |||
2848 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, 2 }, | |||
2849 | ||||
2850 | // These truncates are really widening elements. | |||
2851 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 1 }, // PSHUFD | |||
2852 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // PUNPCKLWD+DQ | |||
2853 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // PUNPCKLBW+WD+PSHUFD | |||
2854 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 1 }, // PUNPCKLWD | |||
2855 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // PUNPCKLBW+WD | |||
2856 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 1 }, // PUNPCKLBW | |||
2857 | ||||
2858 | { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 2 }, // PAND+PACKUSWB | |||
2859 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, | |||
2860 | { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 3 }, // PAND+2*PACKUSWB | |||
2861 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 }, | |||
2862 | { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 1 }, | |||
2863 | { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 3 }, | |||
2864 | { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 }, | |||
2865 | { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32,10 }, | |||
2866 | { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 4 }, // PAND+3*PACKUSWB | |||
2867 | { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 2 }, // PSHUFD+PSHUFLW | |||
2868 | { ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, 1 }, // PSHUFD | |||
2869 | }; | |||
2870 | ||||
2871 | // Attempt to map directly to (simple) MVT types to let us match custom entries. | |||
2872 | EVT SrcTy = TLI->getValueType(DL, Src); | |||
2873 | EVT DstTy = TLI->getValueType(DL, Dst); | |||
2874 | ||||
2875 | // The function getSimpleVT only handles simple value types. | |||
2876 | if (SrcTy.isSimple() && DstTy.isSimple()) { | |||
2877 | MVT SimpleSrcTy = SrcTy.getSimpleVT(); | |||
2878 | MVT SimpleDstTy = DstTy.getSimpleVT(); | |||
2879 | ||||
2880 | if (ST->useAVX512Regs()) { | |||
2881 | if (ST->hasBWI()) | |||
2882 | if (const auto *Entry = ConvertCostTableLookup( | |||
2883 | AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) | |||
2884 | return AdjustCost(Entry->Cost); | |||
2885 | ||||
2886 | if (ST->hasDQI()) | |||
2887 | if (const auto *Entry = ConvertCostTableLookup( | |||
2888 | AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) | |||
2889 | return AdjustCost(Entry->Cost); | |||
2890 | ||||
2891 | if (ST->hasAVX512()) | |||
2892 | if (const auto *Entry = ConvertCostTableLookup( | |||
2893 | AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) | |||
2894 | return AdjustCost(Entry->Cost); | |||
2895 | } | |||
2896 | ||||
2897 | if (ST->hasBWI()) | |||
2898 | if (const auto *Entry = ConvertCostTableLookup( | |||
2899 | AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) | |||
2900 | return AdjustCost(Entry->Cost); | |||
2901 | ||||
2902 | if (ST->hasDQI()) | |||
2903 | if (const auto *Entry = ConvertCostTableLookup( | |||
2904 | AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) | |||
2905 | return AdjustCost(Entry->Cost); | |||
2906 | ||||
2907 | if (ST->hasAVX512()) | |||
2908 | if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD, | |||
2909 | SimpleDstTy, SimpleSrcTy)) | |||
2910 | return AdjustCost(Entry->Cost); | |||
2911 | ||||
2912 | if (ST->hasAVX2()) { | |||
2913 | if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD, | |||
2914 | SimpleDstTy, SimpleSrcTy)) | |||
2915 | return AdjustCost(Entry->Cost); | |||
2916 | } | |||
2917 | ||||
2918 | if (ST->hasAVX()) { | |||
2919 | if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD, | |||
2920 | SimpleDstTy, SimpleSrcTy)) | |||
2921 | return AdjustCost(Entry->Cost); | |||
2922 | } | |||
2923 | ||||
2924 | if (ST->hasSSE41()) { | |||
2925 | if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD, | |||
2926 | SimpleDstTy, SimpleSrcTy)) | |||
2927 | return AdjustCost(Entry->Cost); | |||
2928 | } | |||
2929 | ||||
2930 | if (ST->hasSSE2()) { | |||
2931 | if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, | |||
2932 | SimpleDstTy, SimpleSrcTy)) | |||
2933 | return AdjustCost(Entry->Cost); | |||
2934 | } | |||
2935 | } | |||
2936 | ||||
2937 | // Fall back to legalized types. | |||
2938 | std::pair<InstructionCost, MVT> LTSrc = getTypeLegalizationCost(Src); | |||
2939 | std::pair<InstructionCost, MVT> LTDest = getTypeLegalizationCost(Dst); | |||
2940 | ||||
2941 | // If we're truncating to the same legalized type - just assume its free. | |||
2942 | if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second) | |||
2943 | return TTI::TCC_Free; | |||
2944 | ||||
2945 | if (ST->useAVX512Regs()) { | |||
2946 | if (ST->hasBWI()) | |||
2947 | if (const auto *Entry = ConvertCostTableLookup( | |||
2948 | AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second)) | |||
2949 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | |||
2950 | ||||
2951 | if (ST->hasDQI()) | |||
2952 | if (const auto *Entry = ConvertCostTableLookup( | |||
2953 | AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second)) | |||
2954 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | |||
2955 | ||||
2956 | if (ST->hasAVX512()) | |||
2957 | if (const auto *Entry = ConvertCostTableLookup( | |||
2958 | AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second)) | |||
2959 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | |||
2960 | } | |||
2961 | ||||
2962 | if (ST->hasBWI()) | |||
2963 | if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD, | |||
2964 | LTDest.second, LTSrc.second)) | |||
2965 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | |||
2966 | ||||
2967 | if (ST->hasDQI()) | |||
2968 | if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD, | |||
2969 | LTDest.second, LTSrc.second)) | |||
2970 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | |||
2971 | ||||
2972 | if (ST->hasAVX512()) | |||
2973 | if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD, | |||
2974 | LTDest.second, LTSrc.second)) | |||
2975 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | |||
2976 | ||||
2977 | if (ST->hasAVX2()) | |||
2978 | if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD, | |||
2979 | LTDest.second, LTSrc.second)) | |||
2980 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | |||
2981 | ||||
2982 | if (ST->hasAVX()) | |||
2983 | if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD, | |||
2984 | LTDest.second, LTSrc.second)) | |||
2985 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | |||
2986 | ||||
2987 | if (ST->hasSSE41()) | |||
2988 | if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD, | |||
2989 | LTDest.second, LTSrc.second)) | |||
2990 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | |||
2991 | ||||
2992 | if (ST->hasSSE2()) | |||
2993 | if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, | |||
2994 | LTDest.second, LTSrc.second)) | |||
2995 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | |||
2996 | ||||
2997 | // Fallback, for i8/i16 sitofp/uitofp cases we need to extend to i32 for | |||
2998 | // sitofp. | |||
2999 | if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) && | |||
3000 | 1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) { | |||
3001 | Type *ExtSrc = Src->getWithNewBitWidth(32); | |||
3002 | unsigned ExtOpc = | |||
3003 | (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt; | |||
3004 | ||||
3005 | // For scalar loads the extend would be free. | |||
3006 | InstructionCost ExtCost = 0; | |||
3007 | if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0)))) | |||
3008 | ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind); | |||
3009 | ||||
3010 | return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc, | |||
3011 | TTI::CastContextHint::None, CostKind); | |||
3012 | } | |||
3013 | ||||
3014 | // Fallback for fptosi/fptoui i8/i16 cases we need to truncate from fptosi | |||
3015 | // i32. | |||
3016 | if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) && | |||
3017 | 1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) { | |||
3018 | Type *TruncDst = Dst->getWithNewBitWidth(32); | |||
3019 | return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) + | |||
3020 | getCastInstrCost(Instruction::Trunc, Dst, TruncDst, | |||
3021 | TTI::CastContextHint::None, CostKind); | |||
3022 | } | |||
3023 | ||||
3024 | return AdjustCost( | |||
3025 | BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); | |||
3026 | } | |||
3027 | ||||
// Return the cost of a compare (icmp/fcmp) or select instruction for the
// given value type, condition type, predicate and cost kind. Costs are
// looked up in per-ISA tables (most capable feature level first), falling
// back to the base implementation when no table entry matches.
InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                               Type *CondTy,
                                               CmpInst::Predicate VecPred,
                                               TTI::TargetCostKind CostKind,
                                               const Instruction *I) {
  // Early out if this type isn't scalar/vector integer/float.
  if (!(ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy()))
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                     I);

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);

  MVT MTy = LT.second;

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // Extra per-element cost for predicates that don't map directly onto a
  // single pcmpeq/pcmpgt/cmpps-style instruction on the active feature set.
  InstructionCost ExtraCost = 0;
  if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
    // Some vector comparison predicates cost extra instructions.
    // TODO: Should we invert this and assume worst case cmp costs
    // and reduce for particular predicates?
    if (MTy.isVector() &&
        !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
          (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
          ST->hasBWI())) {
      // Fallback to I if a specific predicate wasn't specified.
      CmpInst::Predicate Pred = VecPred;
      if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE ||
                Pred == CmpInst::BAD_FCMP_PREDICATE))
        Pred = cast<CmpInst>(I)->getPredicate();

      switch (Pred) {
      case CmpInst::Predicate::ICMP_NE:
        // xor(cmpeq(x,y),-1)
        ExtraCost = 1;
        break;
      case CmpInst::Predicate::ICMP_SGE:
      case CmpInst::Predicate::ICMP_SLE:
        // xor(cmpgt(x,y),-1)
        ExtraCost = 1;
        break;
      case CmpInst::Predicate::ICMP_ULT:
      case CmpInst::Predicate::ICMP_UGT:
        // cmpgt(xor(x,signbit),xor(y,signbit))
        // xor(cmpeq(pmaxu(x,y),x),-1)
        ExtraCost = 2;
        break;
      case CmpInst::Predicate::ICMP_ULE:
      case CmpInst::Predicate::ICMP_UGE:
        if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
            (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
          // cmpeq(psubus(x,y),0)
          // cmpeq(pminu(x,y),x)
          ExtraCost = 1;
        } else {
          // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
          ExtraCost = 3;
        }
        break;
      case CmpInst::Predicate::FCMP_ONE:
      case CmpInst::Predicate::FCMP_UEQ:
        // Without AVX we need to expand FCMP_ONE/FCMP_UEQ cases.
        // Use FCMP_UEQ expansion - FCMP_ONE should be the same.
        if (CondTy && !ST->hasAVX())
          return getCmpSelInstrCost(Opcode, ValTy, CondTy,
                                    CmpInst::Predicate::FCMP_UNO, CostKind) +
                 getCmpSelInstrCost(Opcode, ValTy, CondTy,
                                    CmpInst::Predicate::FCMP_OEQ, CostKind) +
                 getArithmeticInstrCost(Instruction::Or, CondTy, CostKind);

        break;
      case CmpInst::Predicate::BAD_ICMP_PREDICATE:
      case CmpInst::Predicate::BAD_FCMP_PREDICATE:
        // Assume worst case scenario and add the maximum extra cost.
        ExtraCost = 3;
        break;
      default:
        break;
      }
    }
  }

  // Cost tables below are indexed by Entry->Cost[CostKind]; the four values
  // presumably follow the TTI::TargetCostKind enum order (recip-throughput,
  // latency, code-size, size+latency) — confirm against CostKindTblEntry.
  static const CostKindTblEntry SLMCostTbl[] = {
    // slm pcmpeq/pcmpgt throughput is 2
    { ISD::SETCC,   MVT::v2i64,   {   2,   5,   1,   2 } },
    // slm pblendvb/blendvpd/blendvps throughput is 4
    { ISD::SELECT,  MVT::v2f64,   {   4,   4,   1,   3 } }, // vblendvpd
    { ISD::SELECT,  MVT::v4f32,   {   4,   4,   1,   3 } }, // vblendvps
    { ISD::SELECT,  MVT::v2i64,   {   4,   4,   1,   3 } }, // pblendvb
    { ISD::SELECT,  MVT::v8i32,   {   4,   4,   1,   3 } }, // pblendvb
    { ISD::SELECT,  MVT::v8i16,   {   4,   4,   1,   3 } }, // pblendvb
    { ISD::SELECT,  MVT::v16i8,   {   4,   4,   1,   3 } }, // pblendvb
  };

  static const CostKindTblEntry AVX512BWCostTbl[] = {
    { ISD::SETCC,   MVT::v32i16,  {   1,   1,   1,   1 } },
    { ISD::SETCC,   MVT::v16i16,  {   1,   1,   1,   1 } },
    { ISD::SETCC,   MVT::v64i8,   {   1,   1,   1,   1 } },
    { ISD::SETCC,   MVT::v32i8,   {   1,   1,   1,   1 } },

    { ISD::SELECT,  MVT::v32i16,  {   1,   1,   1,   1 } },
    { ISD::SELECT,  MVT::v64i8,   {   1,   1,   1,   1 } },
  };

  static const CostKindTblEntry AVX512CostTbl[] = {
    { ISD::SETCC,   MVT::v8f64,   {   1,   4,   1,   1 } },
    { ISD::SETCC,   MVT::v4f64,   {   1,   4,   1,   1 } },
    { ISD::SETCC,   MVT::v16f32,  {   1,   4,   1,   1 } },
    { ISD::SETCC,   MVT::v8f32,   {   1,   4,   1,   1 } },

    { ISD::SETCC,   MVT::v8i64,   {   1,   1,   1,   1 } },
    { ISD::SETCC,   MVT::v4i64,   {   1,   1,   1,   1 } },
    { ISD::SETCC,   MVT::v2i64,   {   1,   1,   1,   1 } },
    { ISD::SETCC,   MVT::v16i32,  {   1,   1,   1,   1 } },
    { ISD::SETCC,   MVT::v8i32,   {   1,   1,   1,   1 } },
    { ISD::SETCC,   MVT::v32i16,  {   3,   7,   5,   5 } },
    { ISD::SETCC,   MVT::v64i8,   {   3,   7,   5,   5 } },

    { ISD::SELECT,  MVT::v8i64,   {   1,   1,   1,   1 } },
    { ISD::SELECT,  MVT::v4i64,   {   1,   1,   1,   1 } },
    { ISD::SELECT,  MVT::v2i64,   {   1,   1,   1,   1 } },
    { ISD::SELECT,  MVT::v16i32,  {   1,   1,   1,   1 } },
    { ISD::SELECT,  MVT::v8i32,   {   1,   1,   1,   1 } },
    { ISD::SELECT,  MVT::v4i32,   {   1,   1,   1,   1 } },
    { ISD::SELECT,  MVT::v8f64,   {   1,   1,   1,   1 } },
    { ISD::SELECT,  MVT::v4f64,   {   1,   1,   1,   1 } },
    { ISD::SELECT,  MVT::v2f64,   {   1,   1,   1,   1 } },
    { ISD::SELECT,  MVT::f64,     {   1,   1,   1,   1 } },
    { ISD::SELECT,  MVT::v16f32,  {   1,   1,   1,   1 } },
    { ISD::SELECT,  MVT::v8f32 ,  {   1,   1,   1,   1 } },
    { ISD::SELECT,  MVT::v4f32,   {   1,   1,   1,   1 } },
    { ISD::SELECT,  MVT::f32  ,   {   1,   1,   1,   1 } },

    { ISD::SELECT,  MVT::v32i16,  {   2,   2,   4,   4 } },
    { ISD::SELECT,  MVT::v16i16,  {   1,   1,   1,   1 } },
    { ISD::SELECT,  MVT::v8i16,   {   1,   1,   1,   1 } },
    { ISD::SELECT,  MVT::v64i8,   {   2,   2,   4,   4 } },
    { ISD::SELECT,  MVT::v32i8,   {   1,   1,   1,   1 } },
    { ISD::SELECT,  MVT::v16i8,   {   1,   1,   1,   1 } },
  };

  static const CostKindTblEntry AVX2CostTbl[] = {
    { ISD::SETCC,   MVT::v4f64,   {   1,   4,   1,   2 } },
    { ISD::SETCC,   MVT::v2f64,   {   1,   4,   1,   1 } },
    { ISD::SETCC,   MVT::f64,     {   1,   4,   1,   1 } },
    { ISD::SETCC,   MVT::v8f32,   {   1,   4,   1,   2 } },
    { ISD::SETCC,   MVT::v4f32,   {   1,   4,   1,   1 } },
    { ISD::SETCC,   MVT::f32,     {   1,   4,   1,   1 } },

    { ISD::SETCC,   MVT::v4i64,   {   1,   1,   1,   2 } },
    { ISD::SETCC,   MVT::v8i32,   {   1,   1,   1,   2 } },
    { ISD::SETCC,   MVT::v16i16,  {   1,   1,   1,   2 } },
    { ISD::SETCC,   MVT::v32i8,   {   1,   1,   1,   2 } },

    { ISD::SELECT,  MVT::v4f64,   {   2,   2,   1,   2 } }, // vblendvpd
    { ISD::SELECT,  MVT::v8f32,   {   2,   2,   1,   2 } }, // vblendvps
    { ISD::SELECT,  MVT::v4i64,   {   2,   2,   1,   2 } }, // pblendvb
    { ISD::SELECT,  MVT::v8i32,   {   2,   2,   1,   2 } }, // pblendvb
    { ISD::SELECT,  MVT::v16i16,  {   2,   2,   1,   2 } }, // pblendvb
    { ISD::SELECT,  MVT::v32i8,   {   2,   2,   1,   2 } }, // pblendvb
  };

  static const CostKindTblEntry XOPCostTbl[] = {
    { ISD::SETCC,   MVT::v4i64,   {   4,   2,   5,   6 } },
    { ISD::SETCC,   MVT::v2i64,   {   1,   1,   1,   1 } },
  };

  static const CostKindTblEntry AVX1CostTbl[] = {
    { ISD::SETCC,   MVT::v4f64,   {   2,   3,   1,   2 } },
    { ISD::SETCC,   MVT::v2f64,   {   1,   3,   1,   1 } },
    { ISD::SETCC,   MVT::f64,     {   1,   3,   1,   1 } },
    { ISD::SETCC,   MVT::v8f32,   {   2,   3,   1,   2 } },
    { ISD::SETCC,   MVT::v4f32,   {   1,   3,   1,   1 } },
    { ISD::SETCC,   MVT::f32,     {   1,   3,   1,   1 } },

    // AVX1 does not support 8-wide integer compare.
    { ISD::SETCC,   MVT::v4i64,   {   4,   2,   5,   6 } },
    { ISD::SETCC,   MVT::v8i32,   {   4,   2,   5,   6 } },
    { ISD::SETCC,   MVT::v16i16,  {   4,   2,   5,   6 } },
    { ISD::SETCC,   MVT::v32i8,   {   4,   2,   5,   6 } },

    { ISD::SELECT,  MVT::v4f64,   {   3,   3,   1,   2 } }, // vblendvpd
    { ISD::SELECT,  MVT::v8f32,   {   3,   3,   1,   2 } }, // vblendvps
    { ISD::SELECT,  MVT::v4i64,   {   3,   3,   1,   2 } }, // vblendvpd
    { ISD::SELECT,  MVT::v8i32,   {   3,   3,   1,   2 } }, // vblendvps
    { ISD::SELECT,  MVT::v16i16,  {   3,   3,   3,   3 } }, // vandps + vandnps + vorps
    { ISD::SELECT,  MVT::v32i8,   {   3,   3,   3,   3 } }, // vandps + vandnps + vorps
  };

  static const CostKindTblEntry SSE42CostTbl[] = {
    { ISD::SETCC,   MVT::v2i64,   {   1,   2,   1,   2 } },
  };

  static const CostKindTblEntry SSE41CostTbl[] = {
    { ISD::SETCC,   MVT::v2f64,   {   1,   5,   1,   1 } },
    { ISD::SETCC,   MVT::v4f32,   {   1,   5,   1,   1 } },

    { ISD::SELECT,  MVT::v2f64,   {   2,   2,   1,   2 } }, // blendvpd
    { ISD::SELECT,  MVT::f64,     {   2,   2,   1,   2 } }, // blendvpd
    { ISD::SELECT,  MVT::v4f32,   {   2,   2,   1,   2 } }, // blendvps
    { ISD::SELECT,  MVT::f32  ,   {   2,   2,   1,   2 } }, // blendvps
    { ISD::SELECT,  MVT::v2i64,   {   2,   2,   1,   2 } }, // pblendvb
    { ISD::SELECT,  MVT::v4i32,   {   2,   2,   1,   2 } }, // pblendvb
    { ISD::SELECT,  MVT::v8i16,   {   2,   2,   1,   2 } }, // pblendvb
    { ISD::SELECT,  MVT::v16i8,   {   2,   2,   1,   2 } }, // pblendvb
  };

  static const CostKindTblEntry SSE2CostTbl[] = {
    { ISD::SETCC,   MVT::v2f64,   {   2,   5,   1,   1 } },
    { ISD::SETCC,   MVT::f64,     {   1,   5,   1,   1 } },

    { ISD::SETCC,   MVT::v2i64,   {   5,   4,   5,   5 } }, // pcmpeqd/pcmpgtd expansion
    { ISD::SETCC,   MVT::v4i32,   {   1,   1,   1,   1 } },
    { ISD::SETCC,   MVT::v8i16,   {   1,   1,   1,   1 } },
    { ISD::SETCC,   MVT::v16i8,   {   1,   1,   1,   1 } },

    { ISD::SELECT,  MVT::v2f64,   {   2,   2,   3,   3 } }, // andpd + andnpd + orpd
    { ISD::SELECT,  MVT::f64,     {   2,   2,   3,   3 } }, // andpd + andnpd + orpd
    { ISD::SELECT,  MVT::v2i64,   {   2,   2,   3,   3 } }, // pand + pandn + por
    { ISD::SELECT,  MVT::v4i32,   {   2,   2,   3,   3 } }, // pand + pandn + por
    { ISD::SELECT,  MVT::v8i16,   {   2,   2,   3,   3 } }, // pand + pandn + por
    { ISD::SELECT,  MVT::v16i8,   {   2,   2,   3,   3 } }, // pand + pandn + por
  };

  static const CostKindTblEntry SSE1CostTbl[] = {
    { ISD::SETCC,   MVT::v4f32,   {   2,   5,   1,   1 } },
    { ISD::SETCC,   MVT::f32,     {   1,   5,   1,   1 } },

    { ISD::SELECT,  MVT::v4f32,   {   2,   2,   3,   3 } }, // andps + andnps + orps
    { ISD::SELECT,  MVT::f32,     {   2,   2,   3,   3 } }, // andps + andnps + orps
  };

  // Try the most specific / most capable table first; the first matching
  // entry with a cost recorded for this CostKind wins. The total is the
  // legalization split count (LT.first) times per-op cost plus predicate
  // expansion overhead.
  if (ST->useSLMArithCosts())
    if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * (ExtraCost + *KindCost);

  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * (ExtraCost + *KindCost);

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * (ExtraCost + *KindCost);

  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * (ExtraCost + *KindCost);

  if (ST->hasXOP())
    if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * (ExtraCost + *KindCost);

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * (ExtraCost + *KindCost);

  if (ST->hasSSE42())
    if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * (ExtraCost + *KindCost);

  if (ST->hasSSE41())
    if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * (ExtraCost + *KindCost);

  if (ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * (ExtraCost + *KindCost);

  if (ST->hasSSE1())
    if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * (ExtraCost + *KindCost);

  // Assume a 3cy latency for fp select ops.
  if (CostKind == TTI::TCK_Latency && Opcode == Instruction::Select)
    if (ValTy->getScalarType()->isFloatingPointTy())
      return 3;

  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
}
3319 | ||||
3320 | unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; } | |||
3321 | ||||
3322 | InstructionCost | |||
3323 | X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, | |||
3324 | TTI::TargetCostKind CostKind) { | |||
3325 | // Costs should match the codegen from: | |||
3326 | // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll | |||
3327 | // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll | |||
3328 | // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll | |||
3329 | // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll | |||
3330 | // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll | |||
3331 | ||||
3332 | // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not | |||
3333 | // specialized in these tables yet. | |||
3334 | static const CostKindTblEntry AVX512VBMI2CostTbl[] = { | |||
3335 | { ISD::FSHL, MVT::v8i64, { 1, 1, 1, 1 } }, | |||
3336 | { ISD::FSHL, MVT::v4i64, { 1, 1, 1, 1 } }, | |||
3337 | { ISD::FSHL, MVT::v2i64, { 1, 1, 1, 1 } }, | |||
3338 | { ISD::FSHL, MVT::v16i32, { 1, 1, 1, 1 } }, | |||
3339 | { ISD::FSHL, MVT::v8i32, { 1, 1, 1, 1 } }, | |||
3340 | { ISD::FSHL, MVT::v4i32, { 1, 1, 1, 1 } }, | |||
3341 | { ISD::FSHL, MVT::v32i16, { 1, 1, 1, 1 } }, | |||
3342 | { ISD::FSHL, MVT::v16i16, { 1, 1, 1, 1 } }, | |||
3343 | { ISD::FSHL, MVT::v8i16, { 1, 1, 1, 1 } }, | |||
3344 | { ISD::ROTL, MVT::v32i16, { 1, 1, 1, 1 } }, | |||
3345 | { ISD::ROTL, MVT::v16i16, { 1, 1, 1, 1 } }, | |||
3346 | { ISD::ROTL, MVT::v8i16, { 1, 1, 1, 1 } }, | |||
3347 | { ISD::ROTR, MVT::v32i16, { 1, 1, 1, 1 } }, | |||
3348 | { ISD::ROTR, MVT::v16i16, { 1, 1, 1, 1 } }, | |||
3349 | { ISD::ROTR, MVT::v8i16, { 1, 1, 1, 1 } }, | |||
3350 | }; | |||
3351 | static const CostKindTblEntry AVX512BITALGCostTbl[] = { | |||
3352 | { ISD::CTPOP, MVT::v32i16, { 1, 1, 1, 1 } }, | |||
3353 | { ISD::CTPOP, MVT::v64i8, { 1, 1, 1, 1 } }, | |||
3354 | { ISD::CTPOP, MVT::v16i16, { 1, 1, 1, 1 } }, | |||
3355 | { ISD::CTPOP, MVT::v32i8, { 1, 1, 1, 1 } }, | |||
3356 | { ISD::CTPOP, MVT::v8i16, { 1, 1, 1, 1 } }, | |||
3357 | { ISD::CTPOP, MVT::v16i8, { 1, 1, 1, 1 } }, | |||
3358 | }; | |||
3359 | static const CostKindTblEntry AVX512VPOPCNTDQCostTbl[] = { | |||
3360 | { ISD::CTPOP, MVT::v8i64, { 1, 1, 1, 1 } }, | |||
3361 | { ISD::CTPOP, MVT::v16i32, { 1, 1, 1, 1 } }, | |||
3362 | { ISD::CTPOP, MVT::v4i64, { 1, 1, 1, 1 } }, | |||
3363 | { ISD::CTPOP, MVT::v8i32, { 1, 1, 1, 1 } }, | |||
3364 | { ISD::CTPOP, MVT::v2i64, { 1, 1, 1, 1 } }, | |||
3365 | { ISD::CTPOP, MVT::v4i32, { 1, 1, 1, 1 } }, | |||
3366 | }; | |||
3367 | static const CostKindTblEntry AVX512CDCostTbl[] = { | |||
3368 | { ISD::CTLZ, MVT::v8i64, { 1, 5, 1, 1 } }, | |||
3369 | { ISD::CTLZ, MVT::v16i32, { 1, 5, 1, 1 } }, | |||
3370 | { ISD::CTLZ, MVT::v32i16, { 18, 27, 23, 27 } }, | |||
3371 | { ISD::CTLZ, MVT::v64i8, { 3, 16, 9, 11 } }, | |||
3372 | { ISD::CTLZ, MVT::v4i64, { 1, 5, 1, 1 } }, | |||
3373 | { ISD::CTLZ, MVT::v8i32, { 1, 5, 1, 1 } }, | |||
3374 | { ISD::CTLZ, MVT::v16i16, { 8, 19, 11, 13 } }, | |||
3375 | { ISD::CTLZ, MVT::v32i8, { 2, 11, 9, 10 } }, | |||
3376 | { ISD::CTLZ, MVT::v2i64, { 1, 5, 1, 1 } }, | |||
3377 | { ISD::CTLZ, MVT::v4i32, { 1, 5, 1, 1 } }, | |||
3378 | { ISD::CTLZ, MVT::v8i16, { 3, 15, 4, 6 } }, | |||
3379 | { ISD::CTLZ, MVT::v16i8, { 2, 10, 9, 10 } }, | |||
3380 | ||||
3381 | { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } }, | |||
3382 | { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } }, | |||
3383 | { ISD::CTTZ, MVT::v4i64, { 1, 8, 6, 6 } }, | |||
3384 | { ISD::CTTZ, MVT::v8i32, { 1, 8, 6, 6 } }, | |||
3385 | { ISD::CTTZ, MVT::v2i64, { 1, 8, 6, 6 } }, | |||
3386 | { ISD::CTTZ, MVT::v4i32, { 1, 8, 6, 6 } }, | |||
3387 | }; | |||
3388 | static const CostKindTblEntry AVX512BWCostTbl[] = { | |||
3389 | { ISD::ABS, MVT::v32i16, { 1, 1, 1, 1 } }, | |||
3390 | { ISD::ABS, MVT::v64i8, { 1, 1, 1, 1 } }, | |||
3391 | { ISD::BITREVERSE, MVT::v8i64, { 3 } }, | |||
3392 | { ISD::BITREVERSE, MVT::v16i32, { 3 } }, | |||
3393 | { ISD::BITREVERSE, MVT::v32i16, { 3 } }, | |||
3394 | { ISD::BITREVERSE, MVT::v64i8, { 2 } }, | |||
3395 | { ISD::BSWAP, MVT::v8i64, { 1 } }, | |||
3396 | { ISD::BSWAP, MVT::v16i32, { 1 } }, | |||
3397 | { ISD::BSWAP, MVT::v32i16, { 1 } }, | |||
3398 | { ISD::CTLZ, MVT::v8i64, { 8, 22, 23, 23 } }, | |||
3399 | { ISD::CTLZ, MVT::v16i32, { 8, 23, 25, 25 } }, | |||
3400 | { ISD::CTLZ, MVT::v32i16, { 4, 15, 15, 16 } }, | |||
3401 | { ISD::CTLZ, MVT::v64i8, { 3, 12, 10, 9 } }, | |||
3402 | { ISD::CTPOP, MVT::v2i64, { 3, 7, 10, 10 } }, | |||
3403 | { ISD::CTPOP, MVT::v4i64, { 3, 7, 10, 10 } }, | |||
3404 | { ISD::CTPOP, MVT::v8i64, { 3, 8, 10, 12 } }, | |||
3405 | { ISD::CTPOP, MVT::v4i32, { 7, 11, 14, 14 } }, | |||
3406 | { ISD::CTPOP, MVT::v8i32, { 7, 11, 14, 14 } }, | |||
3407 | { ISD::CTPOP, MVT::v16i32, { 7, 12, 14, 16 } }, | |||
3408 | { ISD::CTPOP, MVT::v8i16, { 2, 7, 11, 11 } }, | |||
3409 | { ISD::CTPOP, MVT::v16i16, { 2, 7, 11, 11 } }, | |||
3410 | { ISD::CTPOP, MVT::v32i16, { 3, 7, 11, 13 } }, | |||
3411 | { ISD::CTPOP, MVT::v16i8, { 2, 4, 8, 8 } }, | |||
3412 | { ISD::CTPOP, MVT::v32i8, { 2, 4, 8, 8 } }, | |||
3413 | { ISD::CTPOP, MVT::v64i8, { 2, 5, 8, 10 } }, | |||
3414 | { ISD::CTTZ, MVT::v8i16, { 3, 9, 14, 14 } }, | |||
3415 | { ISD::CTTZ, MVT::v16i16, { 3, 9, 14, 14 } }, | |||
3416 | { ISD::CTTZ, MVT::v32i16, { 3, 10, 14, 16 } }, | |||
3417 | { ISD::CTTZ, MVT::v16i8, { 2, 6, 11, 11 } }, | |||
3418 | { ISD::CTTZ, MVT::v32i8, { 2, 6, 11, 11 } }, | |||
3419 | { ISD::CTTZ, MVT::v64i8, { 3, 7, 11, 13 } }, | |||
3420 | { ISD::ROTL, MVT::v32i16, { 2, 8, 6, 8 } }, | |||
3421 | { ISD::ROTL, MVT::v16i16, { 2, 8, 6, 7 } }, | |||
3422 | { ISD::ROTL, MVT::v8i16, { 2, 7, 6, 7 } }, | |||
3423 | { ISD::ROTL, MVT::v64i8, { 5, 6, 11, 12 } }, | |||
3424 | { ISD::ROTL, MVT::v32i8, { 5, 15, 7, 10 } }, | |||
3425 | { ISD::ROTL, MVT::v16i8, { 5, 15, 7, 10 } }, | |||
3426 | { ISD::ROTR, MVT::v32i16, { 2, 8, 6, 8 } }, | |||
3427 | { ISD::ROTR, MVT::v16i16, { 2, 8, 6, 7 } }, | |||
3428 | { ISD::ROTR, MVT::v8i16, { 2, 7, 6, 7 } }, | |||
3429 | { ISD::ROTR, MVT::v64i8, { 5, 6, 12, 14 } }, | |||
3430 | { ISD::ROTR, MVT::v32i8, { 5, 14, 6, 9 } }, | |||
3431 | { ISD::ROTR, MVT::v16i8, { 5, 14, 6, 9 } }, | |||
3432 | { ISD::SADDSAT, MVT::v32i16, { 1 } }, | |||
3433 | { ISD::SADDSAT, MVT::v64i8, { 1 } }, | |||
3434 | { ISD::SMAX, MVT::v32i16, { 1, 1, 1, 1 } }, | |||
3435 | { ISD::SMAX, MVT::v64i8, { 1, 1, 1, 1 } }, | |||
3436 | { ISD::SMIN, MVT::v32i16, { 1, 1, 1, 1 } }, | |||
3437 | { ISD::SMIN, MVT::v64i8, { 1, 1, 1, 1 } }, | |||
3438 | { ISD::SSUBSAT, MVT::v32i16, { 1 } }, | |||
3439 | { ISD::SSUBSAT, MVT::v64i8, { 1 } }, | |||
3440 | { ISD::UADDSAT, MVT::v32i16, { 1 } }, | |||
3441 | { ISD::UADDSAT, MVT::v64i8, { 1 } }, | |||
3442 | { ISD::UMAX, MVT::v32i16, { 1, 1, 1, 1 } }, | |||
3443 | { ISD::UMAX, MVT::v64i8, { 1, 1, 1, 1 } }, | |||
3444 | { ISD::UMIN, MVT::v32i16, { 1, 1, 1, 1 } }, | |||
3445 | { ISD::UMIN, MVT::v64i8, { 1, 1, 1, 1 } }, | |||
3446 | { ISD::USUBSAT, MVT::v32i16, { 1 } }, | |||
3447 | { ISD::USUBSAT, MVT::v64i8, { 1 } }, | |||
3448 | }; | |||
// Costs when the AVX-512 foundation (F/CD/DQ-level) feature set is available.
// Per the file header, numbers roughly track the first CPUs with the feature
// (Xeon Phi / Skylake-AVX512).  Cost tuples are assumed to be
// { throughput, latency, code-size, size+latency } per CostKindTblEntry, and
// one-element initializers appear to provide only the first kind —
// TODO(review): confirm against the CostKindTblEntry declaration.
// NOTE(review): the 512-bit i16/i8 rows (v32i16/v64i8) are consistently more
// expensive than the 32/64-bit-element rows — presumably emulation without
// AVX512BW; confirm.
static const CostKindTblEntry AVX512CostTbl[] = {
  { ISD::ABS,        MVT::v8i64,  {  1,  1,  1,  1 } },
  { ISD::ABS,        MVT::v4i64,  {  1,  1,  1,  1 } },
  { ISD::ABS,        MVT::v2i64,  {  1,  1,  1,  1 } },
  { ISD::ABS,        MVT::v16i32, {  1,  1,  1,  1 } },
  { ISD::ABS,        MVT::v8i32,  {  1,  1,  1,  1 } },
  { ISD::ABS,        MVT::v32i16, {  2,  7,  4,  4 } },
  { ISD::ABS,        MVT::v16i16, {  1,  1,  1,  1 } },
  { ISD::ABS,        MVT::v64i8,  {  2,  7,  4,  4 } },
  { ISD::ABS,        MVT::v32i8,  {  1,  1,  1,  1 } },
  { ISD::BITREVERSE, MVT::v8i64,  { 36 } },
  { ISD::BITREVERSE, MVT::v16i32, { 24 } },
  { ISD::BITREVERSE, MVT::v32i16, { 10 } },
  { ISD::BITREVERSE, MVT::v64i8,  { 10 } },
  { ISD::BSWAP,      MVT::v8i64,  {  4 } },
  { ISD::BSWAP,      MVT::v16i32, {  4 } },
  { ISD::BSWAP,      MVT::v32i16, {  4 } },
  { ISD::CTLZ,       MVT::v8i64,  { 10, 28, 32, 32 } },
  { ISD::CTLZ,       MVT::v16i32, { 12, 30, 38, 38 } },
  { ISD::CTLZ,       MVT::v32i16, {  8, 15, 29, 29 } },
  { ISD::CTLZ,       MVT::v64i8,  {  6, 11, 19, 19 } },
  { ISD::CTPOP,      MVT::v8i64,  { 16, 16, 19, 19 } },
  { ISD::CTPOP,      MVT::v16i32, { 24, 19, 27, 27 } },
  { ISD::CTPOP,      MVT::v32i16, { 18, 15, 22, 22 } },
  { ISD::CTPOP,      MVT::v64i8,  { 12, 11, 16, 16 } },
  { ISD::CTTZ,       MVT::v8i64,  {  2,  8,  6,  7 } },
  { ISD::CTTZ,       MVT::v16i32, {  2,  8,  6,  7 } },
  { ISD::CTTZ,       MVT::v32i16, {  7, 17, 27, 27 } },
  { ISD::CTTZ,       MVT::v64i8,  {  6, 13, 21, 21 } },
  // 32/64-bit element rotates have native VPROLV/VPRORV-style support here,
  // hence the uniform unit costs below — TODO(review): confirm lowering.
  { ISD::ROTL,       MVT::v8i64,  {  1,  1,  1,  1 } },
  { ISD::ROTL,       MVT::v4i64,  {  1,  1,  1,  1 } },
  { ISD::ROTL,       MVT::v2i64,  {  1,  1,  1,  1 } },
  { ISD::ROTL,       MVT::v16i32, {  1,  1,  1,  1 } },
  { ISD::ROTL,       MVT::v8i32,  {  1,  1,  1,  1 } },
  { ISD::ROTL,       MVT::v4i32,  {  1,  1,  1,  1 } },
  { ISD::ROTR,       MVT::v8i64,  {  1,  1,  1,  1 } },
  { ISD::ROTR,       MVT::v4i64,  {  1,  1,  1,  1 } },
  { ISD::ROTR,       MVT::v2i64,  {  1,  1,  1,  1 } },
  { ISD::ROTR,       MVT::v16i32, {  1,  1,  1,  1 } },
  { ISD::ROTR,       MVT::v8i32,  {  1,  1,  1,  1 } },
  { ISD::ROTR,       MVT::v4i32,  {  1,  1,  1,  1 } },
  { ISD::SMAX,       MVT::v8i64,  {  1,  3,  1,  1 } },
  { ISD::SMAX,       MVT::v16i32, {  1,  1,  1,  1 } },
  { ISD::SMAX,       MVT::v32i16, {  3,  7,  5,  5 } },
  { ISD::SMAX,       MVT::v64i8,  {  3,  7,  5,  5 } },
  { ISD::SMAX,       MVT::v4i64,  {  1,  3,  1,  1 } },
  { ISD::SMAX,       MVT::v2i64,  {  1,  3,  1,  1 } },
  { ISD::SMIN,       MVT::v8i64,  {  1,  3,  1,  1 } },
  { ISD::SMIN,       MVT::v16i32, {  1,  1,  1,  1 } },
  { ISD::SMIN,       MVT::v32i16, {  3,  7,  5,  5 } },
  { ISD::SMIN,       MVT::v64i8,  {  3,  7,  5,  5 } },
  { ISD::SMIN,       MVT::v4i64,  {  1,  3,  1,  1 } },
  { ISD::SMIN,       MVT::v2i64,  {  1,  3,  1,  1 } },
  { ISD::UMAX,       MVT::v8i64,  {  1,  3,  1,  1 } },
  { ISD::UMAX,       MVT::v16i32, {  1,  1,  1,  1 } },
  { ISD::UMAX,       MVT::v32i16, {  3,  7,  5,  5 } },
  { ISD::UMAX,       MVT::v64i8,  {  3,  7,  5,  5 } },
  { ISD::UMAX,       MVT::v4i64,  {  1,  3,  1,  1 } },
  { ISD::UMAX,       MVT::v2i64,  {  1,  3,  1,  1 } },
  { ISD::UMIN,       MVT::v8i64,  {  1,  3,  1,  1 } },
  { ISD::UMIN,       MVT::v16i32, {  1,  1,  1,  1 } },
  { ISD::UMIN,       MVT::v32i16, {  3,  7,  5,  5 } },
  { ISD::UMIN,       MVT::v64i8,  {  3,  7,  5,  5 } },
  { ISD::UMIN,       MVT::v4i64,  {  1,  3,  1,  1 } },
  { ISD::UMIN,       MVT::v2i64,  {  1,  3,  1,  1 } },
  { ISD::USUBSAT,    MVT::v16i32, {  2 } }, // pmaxud + psubd
  { ISD::USUBSAT,    MVT::v2i64,  {  2 } }, // pmaxuq + psubq
  { ISD::USUBSAT,    MVT::v4i64,  {  2 } }, // pmaxuq + psubq
  { ISD::USUBSAT,    MVT::v8i64,  {  2 } }, // pmaxuq + psubq
  { ISD::UADDSAT,    MVT::v16i32, {  3 } }, // not + pminud + paddd
  { ISD::UADDSAT,    MVT::v2i64,  {  3 } }, // not + pminuq + paddq
  { ISD::UADDSAT,    MVT::v4i64,  {  3 } }, // not + pminuq + paddq
  { ISD::UADDSAT,    MVT::v8i64,  {  3 } }, // not + pminuq + paddq
  { ISD::SADDSAT,    MVT::v32i16, {  2 } },
  { ISD::SADDSAT,    MVT::v64i8,  {  2 } },
  { ISD::SSUBSAT,    MVT::v32i16, {  2 } },
  { ISD::SSUBSAT,    MVT::v64i8,  {  2 } },
  { ISD::UADDSAT,    MVT::v32i16, {  2 } },
  { ISD::UADDSAT,    MVT::v64i8,  {  2 } },
  { ISD::USUBSAT,    MVT::v32i16, {  2 } },
  { ISD::USUBSAT,    MVT::v64i8,  {  2 } },
  // FMAXNUM also covers FMINNUM (the IID switch maps both to ISD::FMAXNUM).
  { ISD::FMAXNUM,    MVT::f32,    {  2 } },
  { ISD::FMAXNUM,    MVT::v4f32,  {  2 } },
  { ISD::FMAXNUM,    MVT::v8f32,  {  2 } },
  { ISD::FMAXNUM,    MVT::v16f32, {  2 } },
  { ISD::FMAXNUM,    MVT::f64,    {  2 } },
  { ISD::FMAXNUM,    MVT::v2f64,  {  2 } },
  { ISD::FMAXNUM,    MVT::v4f64,  {  2 } },
  { ISD::FMAXNUM,    MVT::v8f64,  {  2 } },
  { ISD::FSQRT,      MVT::f32,    {  3, 12,  1,  1 } }, // Skylake from http://www.agner.org/
  { ISD::FSQRT,      MVT::v4f32,  {  3, 12,  1,  1 } }, // Skylake from http://www.agner.org/
  { ISD::FSQRT,      MVT::v8f32,  {  6, 12,  1,  1 } }, // Skylake from http://www.agner.org/
  { ISD::FSQRT,      MVT::v16f32, { 12, 20,  1,  3 } }, // Skylake from http://www.agner.org/
  { ISD::FSQRT,      MVT::f64,    {  6, 18,  1,  1 } }, // Skylake from http://www.agner.org/
  { ISD::FSQRT,      MVT::v2f64,  {  6, 18,  1,  1 } }, // Skylake from http://www.agner.org/
  { ISD::FSQRT,      MVT::v4f64,  { 12, 18,  1,  1 } }, // Skylake from http://www.agner.org/
  { ISD::FSQRT,      MVT::v8f64,  { 24, 32,  1,  3 } }, // Skylake from http://www.agner.org/
};
// Costs when AMD XOP is available (Bulldozer-family; see file header).
// Cost tuples are assumed to be { throughput, latency, code-size,
// size+latency }; one-element initializers appear to provide only the first
// kind — TODO(review): confirm against CostKindTblEntry.
static const CostKindTblEntry XOPCostTbl[] = {
  // VPPERM-based bit reversal: cheap on 128-bit, split for 256-bit —
  // TODO(review): confirm lowering.
  { ISD::BITREVERSE, MVT::v4i64,  { 4 } },
  { ISD::BITREVERSE, MVT::v8i32,  { 4 } },
  { ISD::BITREVERSE, MVT::v16i16, { 4 } },
  { ISD::BITREVERSE, MVT::v32i8,  { 4 } },
  { ISD::BITREVERSE, MVT::v2i64,  { 1 } },
  { ISD::BITREVERSE, MVT::v4i32,  { 1 } },
  { ISD::BITREVERSE, MVT::v8i16,  { 1 } },
  { ISD::BITREVERSE, MVT::v16i8,  { 1 } },
  { ISD::BITREVERSE, MVT::i64,    { 3 } },
  { ISD::BITREVERSE, MVT::i32,    { 3 } },
  { ISD::BITREVERSE, MVT::i16,    { 3 } },
  { ISD::BITREVERSE, MVT::i8,     { 3 } },
  // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y)) — the extra negate is
  // why ROTR rows cost more than the matching ROTL rows.
  { ISD::ROTL,       MVT::v4i64,  { 4, 7, 5, 6 } },
  { ISD::ROTL,       MVT::v8i32,  { 4, 7, 5, 6 } },
  { ISD::ROTL,       MVT::v16i16, { 4, 7, 5, 6 } },
  { ISD::ROTL,       MVT::v32i8,  { 4, 7, 5, 6 } },
  { ISD::ROTL,       MVT::v2i64,  { 1, 3, 1, 1 } },
  { ISD::ROTL,       MVT::v4i32,  { 1, 3, 1, 1 } },
  { ISD::ROTL,       MVT::v8i16,  { 1, 3, 1, 1 } },
  { ISD::ROTL,       MVT::v16i8,  { 1, 3, 1, 1 } },
  { ISD::ROTR,       MVT::v4i64,  { 4, 7, 8, 9 } },
  { ISD::ROTR,       MVT::v8i32,  { 4, 7, 8, 9 } },
  { ISD::ROTR,       MVT::v16i16, { 4, 7, 8, 9 } },
  { ISD::ROTR,       MVT::v32i8,  { 4, 7, 8, 9 } },
  { ISD::ROTR,       MVT::v2i64,  { 1, 3, 3, 3 } },
  { ISD::ROTR,       MVT::v4i32,  { 1, 3, 3, 3 } },
  { ISD::ROTR,       MVT::v8i16,  { 1, 3, 3, 3 } },
  { ISD::ROTR,       MVT::v16i8,  { 1, 3, 3, 3 } }
};
// Costs for AVX2 targets (Haswell / Ryzen per the file header).  Cost tuples
// are assumed to be { throughput, latency, code-size, size+latency };
// one-element initializers appear to provide only the first kind —
// TODO(review): confirm against CostKindTblEntry.
static const CostKindTblEntry AVX2CostTbl[] = {
  { ISD::ABS,        MVT::v2i64,  {  2,  4,  3,  5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
  { ISD::ABS,        MVT::v4i64,  {  2,  4,  3,  5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
  { ISD::ABS,        MVT::v4i32,  {  1,  1,  1,  1 } },
  { ISD::ABS,        MVT::v8i32,  {  1,  1,  1,  2 } },
  { ISD::ABS,        MVT::v8i16,  {  1,  1,  1,  1 } },
  { ISD::ABS,        MVT::v16i16, {  1,  1,  1,  2 } },
  { ISD::ABS,        MVT::v16i8,  {  1,  1,  1,  1 } },
  { ISD::ABS,        MVT::v32i8,  {  1,  1,  1,  2 } },
  { ISD::BITREVERSE, MVT::v2i64,  {  3 } },
  { ISD::BITREVERSE, MVT::v4i64,  {  3 } },
  { ISD::BITREVERSE, MVT::v4i32,  {  3 } },
  { ISD::BITREVERSE, MVT::v8i32,  {  3 } },
  { ISD::BITREVERSE, MVT::v8i16,  {  3 } },
  { ISD::BITREVERSE, MVT::v16i16, {  3 } },
  { ISD::BITREVERSE, MVT::v16i8,  {  3 } },
  { ISD::BITREVERSE, MVT::v32i8,  {  3 } },
  { ISD::BSWAP,      MVT::v4i64,  {  1 } },
  { ISD::BSWAP,      MVT::v8i32,  {  1 } },
  { ISD::BSWAP,      MVT::v16i16, {  1 } },
  { ISD::CTLZ,       MVT::v2i64,  {  7, 18, 24, 25 } },
  { ISD::CTLZ,       MVT::v4i64,  { 14, 18, 24, 44 } },
  { ISD::CTLZ,       MVT::v4i32,  {  5, 16, 19, 20 } },
  { ISD::CTLZ,       MVT::v8i32,  { 10, 16, 19, 34 } },
  { ISD::CTLZ,       MVT::v8i16,  {  4, 13, 14, 15 } },
  { ISD::CTLZ,       MVT::v16i16, {  6, 14, 14, 24 } },
  { ISD::CTLZ,       MVT::v16i8,  {  3, 12,  9, 10 } },
  { ISD::CTLZ,       MVT::v32i8,  {  4, 12,  9, 14 } },
  { ISD::CTPOP,      MVT::v2i64,  {  3,  9, 10, 10 } },
  { ISD::CTPOP,      MVT::v4i64,  {  4,  9, 10, 14 } },
  { ISD::CTPOP,      MVT::v4i32,  {  7, 12, 14, 14 } },
  { ISD::CTPOP,      MVT::v8i32,  {  7, 12, 14, 18 } },
  { ISD::CTPOP,      MVT::v8i16,  {  3,  7, 11, 11 } },
  { ISD::CTPOP,      MVT::v16i16, {  6,  8, 11, 18 } },
  { ISD::CTPOP,      MVT::v16i8,  {  2,  5,  8,  8 } },
  { ISD::CTPOP,      MVT::v32i8,  {  3,  5,  8, 12 } },
  { ISD::CTTZ,       MVT::v2i64,  {  4, 11, 13, 13 } },
  { ISD::CTTZ,       MVT::v4i64,  {  5, 11, 13, 20 } },
  { ISD::CTTZ,       MVT::v4i32,  {  7, 14, 17, 17 } },
  { ISD::CTTZ,       MVT::v8i32,  {  7, 15, 17, 24 } },
  { ISD::CTTZ,       MVT::v8i16,  {  4,  9, 14, 14 } },
  { ISD::CTTZ,       MVT::v16i16, {  6,  9, 14, 24 } },
  { ISD::CTTZ,       MVT::v16i8,  {  3,  7, 11, 11 } },
  { ISD::CTTZ,       MVT::v32i8,  {  5,  7, 11, 18 } },
  { ISD::SADDSAT,    MVT::v16i16, {  1 } },
  { ISD::SADDSAT,    MVT::v32i8,  {  1 } },
  { ISD::SMAX,       MVT::v2i64,  {  2,  7,  2,  3 } },
  { ISD::SMAX,       MVT::v4i64,  {  2,  7,  2,  3 } },
  { ISD::SMAX,       MVT::v8i32,  {  1,  1,  1,  2 } },
  { ISD::SMAX,       MVT::v16i16, {  1,  1,  1,  2 } },
  { ISD::SMAX,       MVT::v32i8,  {  1,  1,  1,  2 } },
  { ISD::SMIN,       MVT::v2i64,  {  2,  7,  2,  3 } },
  { ISD::SMIN,       MVT::v4i64,  {  2,  7,  2,  3 } },
  { ISD::SMIN,       MVT::v8i32,  {  1,  1,  1,  2 } },
  { ISD::SMIN,       MVT::v16i16, {  1,  1,  1,  2 } },
  { ISD::SMIN,       MVT::v32i8,  {  1,  1,  1,  2 } },
  { ISD::SSUBSAT,    MVT::v16i16, {  1 } },
  { ISD::SSUBSAT,    MVT::v32i8,  {  1 } },
  { ISD::UADDSAT,    MVT::v16i16, {  1 } },
  { ISD::UADDSAT,    MVT::v32i8,  {  1 } },
  { ISD::UADDSAT,    MVT::v8i32,  {  3 } }, // not + pminud + paddd
  { ISD::UMAX,       MVT::v2i64,  {  2,  8,  5,  6 } },
  { ISD::UMAX,       MVT::v4i64,  {  2,  8,  5,  8 } },
  { ISD::UMAX,       MVT::v8i32,  {  1,  1,  1,  2 } },
  { ISD::UMAX,       MVT::v16i16, {  1,  1,  1,  2 } },
  { ISD::UMAX,       MVT::v32i8,  {  1,  1,  1,  2 } },
  { ISD::UMIN,       MVT::v2i64,  {  2,  8,  5,  6 } },
  { ISD::UMIN,       MVT::v4i64,  {  2,  8,  5,  8 } },
  { ISD::UMIN,       MVT::v8i32,  {  1,  1,  1,  2 } },
  { ISD::UMIN,       MVT::v16i16, {  1,  1,  1,  2 } },
  { ISD::UMIN,       MVT::v32i8,  {  1,  1,  1,  2 } },
  { ISD::USUBSAT,    MVT::v16i16, {  1 } },
  { ISD::USUBSAT,    MVT::v32i8,  {  1 } },
  { ISD::USUBSAT,    MVT::v8i32,  {  2 } }, // pmaxud + psubd
  { ISD::FMAXNUM,    MVT::v8f32,  {  3 } }, // MAXPS + CMPUNORDPS + BLENDVPS
  { ISD::FMAXNUM,    MVT::v4f64,  {  3 } }, // MAXPD + CMPUNORDPD + BLENDVPD
  { ISD::FSQRT,      MVT::f32,    {  7, 15,  1,  1 } }, // vsqrtss
  { ISD::FSQRT,      MVT::v4f32,  {  7, 15,  1,  1 } }, // vsqrtps
  { ISD::FSQRT,      MVT::v8f32,  { 14, 21,  1,  3 } }, // vsqrtps
  { ISD::FSQRT,      MVT::f64,    { 14, 21,  1,  1 } }, // vsqrtsd
  { ISD::FSQRT,      MVT::v2f64,  { 14, 21,  1,  1 } }, // vsqrtpd
  { ISD::FSQRT,      MVT::v4f64,  { 28, 35,  1,  3 } }, // vsqrtpd
};
// Costs for AVX1-only targets (Sandy Bridge / Jaguar / Bulldozer per the file
// header).  256-bit integer ops have no native support here, so most 256-bit
// rows are costed as two 128-bit ops plus extract/insert.  Cost tuples are
// assumed to be { throughput, latency, code-size, size+latency };
// one-element initializers appear to provide only the first kind —
// TODO(review): confirm against CostKindTblEntry.
static const CostKindTblEntry AVX1CostTbl[] = {
  { ISD::ABS,        MVT::v4i64,  {  6,  8,  6, 12 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
  { ISD::ABS,        MVT::v8i32,  {  3,  6,  4,  5 } },
  { ISD::ABS,        MVT::v16i16, {  3,  6,  4,  5 } },
  { ISD::ABS,        MVT::v32i8,  {  3,  6,  4,  5 } },
  { ISD::BITREVERSE, MVT::v4i64,  { 12 } }, // 2 x 128-bit Op + extract/insert
  { ISD::BITREVERSE, MVT::v8i32,  { 12 } }, // 2 x 128-bit Op + extract/insert
  { ISD::BITREVERSE, MVT::v16i16, { 12 } }, // 2 x 128-bit Op + extract/insert
  { ISD::BITREVERSE, MVT::v32i8,  { 12 } }, // 2 x 128-bit Op + extract/insert
  { ISD::BSWAP,      MVT::v4i64,  {  4 } },
  { ISD::BSWAP,      MVT::v8i32,  {  4 } },
  { ISD::BSWAP,      MVT::v16i16, {  4 } },
  { ISD::CTLZ,       MVT::v4i64,  { 29, 33, 49, 58 } }, // 2 x 128-bit Op + extract/insert
  { ISD::CTLZ,       MVT::v2i64,  { 14, 24, 24, 28 } },
  { ISD::CTLZ,       MVT::v8i32,  { 24, 28, 39, 48 } }, // 2 x 128-bit Op + extract/insert
  { ISD::CTLZ,       MVT::v4i32,  { 12, 20, 19, 23 } },
  { ISD::CTLZ,       MVT::v16i16, { 19, 22, 29, 38 } }, // 2 x 128-bit Op + extract/insert
  { ISD::CTLZ,       MVT::v8i16,  {  9, 16, 14, 18 } },
  { ISD::CTLZ,       MVT::v32i8,  { 14, 15, 19, 28 } }, // 2 x 128-bit Op + extract/insert
  { ISD::CTLZ,       MVT::v16i8,  {  7, 12,  9, 13 } },
  { ISD::CTPOP,      MVT::v4i64,  { 14, 18, 19, 28 } }, // 2 x 128-bit Op + extract/insert
  { ISD::CTPOP,      MVT::v2i64,  {  7, 14, 10, 14 } },
  { ISD::CTPOP,      MVT::v8i32,  { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
  { ISD::CTPOP,      MVT::v4i32,  {  9, 20, 14, 18 } },
  { ISD::CTPOP,      MVT::v16i16, { 16, 21, 22, 31 } }, // 2 x 128-bit Op + extract/insert
  { ISD::CTPOP,      MVT::v8i16,  {  8, 18, 11, 15 } },
  { ISD::CTPOP,      MVT::v32i8,  { 13, 15, 16, 25 } }, // 2 x 128-bit Op + extract/insert
  { ISD::CTPOP,      MVT::v16i8,  {  6, 12,  8, 12 } },
  { ISD::CTTZ,       MVT::v4i64,  { 17, 22, 24, 33 } }, // 2 x 128-bit Op + extract/insert
  { ISD::CTTZ,       MVT::v2i64,  {  9, 19, 13, 17 } },
  { ISD::CTTZ,       MVT::v8i32,  { 21, 27, 32, 41 } }, // 2 x 128-bit Op + extract/insert
  { ISD::CTTZ,       MVT::v4i32,  { 11, 24, 17, 21 } },
  { ISD::CTTZ,       MVT::v16i16, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
  { ISD::CTTZ,       MVT::v8i16,  {  9, 21, 14, 18 } },
  { ISD::CTTZ,       MVT::v32i8,  { 15, 18, 21, 30 } }, // 2 x 128-bit Op + extract/insert
  { ISD::CTTZ,       MVT::v16i8,  {  8, 16, 11, 15 } },
  { ISD::SADDSAT,    MVT::v16i16, {  4 } }, // 2 x 128-bit Op + extract/insert
  { ISD::SADDSAT,    MVT::v32i8,  {  4 } }, // 2 x 128-bit Op + extract/insert
  { ISD::SMAX,       MVT::v4i64,  {  6,  9,  6, 12 } }, // 2 x 128-bit Op + extract/insert
  { ISD::SMAX,       MVT::v2i64,  {  3,  7,  2,  4 } },
  { ISD::SMAX,       MVT::v8i32,  {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
  { ISD::SMAX,       MVT::v16i16, {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
  { ISD::SMAX,       MVT::v32i8,  {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
  { ISD::SMIN,       MVT::v4i64,  {  6,  9,  6, 12 } }, // 2 x 128-bit Op + extract/insert
  { ISD::SMIN,       MVT::v2i64,  {  3,  7,  2,  3 } },
  { ISD::SMIN,       MVT::v8i32,  {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
  { ISD::SMIN,       MVT::v16i16, {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
  { ISD::SMIN,       MVT::v32i8,  {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
  { ISD::SSUBSAT,    MVT::v16i16, {  4 } }, // 2 x 128-bit Op + extract/insert
  { ISD::SSUBSAT,    MVT::v32i8,  {  4 } }, // 2 x 128-bit Op + extract/insert
  { ISD::UADDSAT,    MVT::v16i16, {  4 } }, // 2 x 128-bit Op + extract/insert
  { ISD::UADDSAT,    MVT::v32i8,  {  4 } }, // 2 x 128-bit Op + extract/insert
  { ISD::UADDSAT,    MVT::v8i32,  {  8 } }, // 2 x 128-bit Op + extract/insert
  { ISD::UMAX,       MVT::v4i64,  {  9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
  { ISD::UMAX,       MVT::v2i64,  {  4,  8,  5,  7 } },
  { ISD::UMAX,       MVT::v8i32,  {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
  { ISD::UMAX,       MVT::v16i16, {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
  { ISD::UMAX,       MVT::v32i8,  {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
  { ISD::UMIN,       MVT::v4i64,  {  9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
  { ISD::UMIN,       MVT::v2i64,  {  4,  8,  5,  7 } },
  { ISD::UMIN,       MVT::v8i32,  {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
  { ISD::UMIN,       MVT::v16i16, {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
  { ISD::UMIN,       MVT::v32i8,  {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
  { ISD::USUBSAT,    MVT::v16i16, {  4 } }, // 2 x 128-bit Op + extract/insert
  { ISD::USUBSAT,    MVT::v32i8,  {  4 } }, // 2 x 128-bit Op + extract/insert
  { ISD::USUBSAT,    MVT::v8i32,  {  6 } }, // 2 x 128-bit Op + extract/insert
  { ISD::FMAXNUM,    MVT::f32,    {  3 } }, // MAXSS + CMPUNORDSS + BLENDVPS
  { ISD::FMAXNUM,    MVT::v4f32,  {  3 } }, // MAXPS + CMPUNORDPS + BLENDVPS
  { ISD::FMAXNUM,    MVT::v8f32,  {  5 } }, // MAXPS + CMPUNORDPS + BLENDVPS + ?
  { ISD::FMAXNUM,    MVT::f64,    {  3 } }, // MAXSD + CMPUNORDSD + BLENDVPD
  { ISD::FMAXNUM,    MVT::v2f64,  {  3 } }, // MAXPD + CMPUNORDPD + BLENDVPD
  { ISD::FMAXNUM,    MVT::v4f64,  {  5 } }, // MAXPD + CMPUNORDPD + BLENDVPD + ?
  { ISD::FSQRT,      MVT::f32,    { 21, 21,  1,  1 } }, // vsqrtss
  { ISD::FSQRT,      MVT::v4f32,  { 21, 21,  1,  1 } }, // vsqrtps
  { ISD::FSQRT,      MVT::v8f32,  { 42, 42,  1,  3 } }, // vsqrtps
  { ISD::FSQRT,      MVT::f64,    { 27, 27,  1,  1 } }, // vsqrtsd
  { ISD::FSQRT,      MVT::v2f64,  { 27, 27,  1,  1 } }, // vsqrtpd
  { ISD::FSQRT,      MVT::v4f64,  { 54, 54,  1,  3 } }, // vsqrtpd
};
// Goldmont-specific overrides: its sqrt unit is much slower than the generic
// SSE numbers.  Cost tuple assumed { throughput, latency, code-size,
// size+latency } — TODO(review): confirm against CostKindTblEntry.
static const CostKindTblEntry GLMCostTbl[] = {
  { ISD::FSQRT, MVT::f32,   { 19, 20, 1, 1 } }, // sqrtss
  { ISD::FSQRT, MVT::v4f32, { 37, 41, 1, 5 } }, // sqrtps
  { ISD::FSQRT, MVT::f64,   { 34, 35, 1, 1 } }, // sqrtsd
  { ISD::FSQRT, MVT::v2f64, { 67, 71, 1, 5 } }, // sqrtpd
};
// Silvermont-specific overrides: like Goldmont, sqrt is far slower here than
// the generic SSE numbers.  Cost tuple assumed { throughput, latency,
// code-size, size+latency } — TODO(review): confirm against CostKindTblEntry.
static const CostKindTblEntry SLMCostTbl[] = {
  { ISD::FSQRT, MVT::f32,   { 20, 20, 1, 1 } }, // sqrtss
  { ISD::FSQRT, MVT::v4f32, { 40, 41, 1, 5 } }, // sqrtps
  { ISD::FSQRT, MVT::f64,   { 35, 35, 1, 1 } }, // sqrtsd
  { ISD::FSQRT, MVT::v2f64, { 70, 71, 1, 5 } }, // sqrtpd
};
// SSE4.2-level costs (Nehalem / Silvermont per the file header).  Cost tuple
// assumed { throughput, latency, code-size, size+latency }; one-element
// entries set only the first kind — TODO(review): confirm.
static const CostKindTblEntry SSE42CostTbl[] = {
  { ISD::USUBSAT, MVT::v4i32, {  2 } }, // pmaxud + psubd
  { ISD::UADDSAT, MVT::v4i32, {  3 } }, // not + pminud + paddd
  { ISD::FSQRT,   MVT::f32,   { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
  { ISD::FSQRT,   MVT::v4f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
};
// SSE4.1-level costs (Penryn per the file header): native pmaxs*/pmins*/
// pmaxu*/pminu* make most min/max rows unit cost, while v2i64 still needs a
// compare+blend sequence.  Cost tuple assumed { throughput, latency,
// code-size, size+latency } — TODO(review): confirm against CostKindTblEntry.
static const CostKindTblEntry SSE41CostTbl[] = {
  { ISD::ABS,  MVT::v2i64, { 3,  4, 3, 5 } }, // BLENDVPD(X,PSUBQ(0,X),X)
  { ISD::SMAX, MVT::v2i64, { 3,  7, 2, 3 } },
  { ISD::SMAX, MVT::v4i32, { 1,  1, 1, 1 } },
  { ISD::SMAX, MVT::v16i8, { 1,  1, 1, 1 } },
  { ISD::SMIN, MVT::v2i64, { 3,  7, 2, 3 } },
  { ISD::SMIN, MVT::v4i32, { 1,  1, 1, 1 } },
  { ISD::SMIN, MVT::v16i8, { 1,  1, 1, 1 } },
  { ISD::UMAX, MVT::v2i64, { 2, 11, 6, 7 } },
  { ISD::UMAX, MVT::v4i32, { 1,  1, 1, 1 } },
  { ISD::UMAX, MVT::v8i16, { 1,  1, 1, 1 } },
  { ISD::UMIN, MVT::v2i64, { 2, 11, 6, 7 } },
  { ISD::UMIN, MVT::v4i32, { 1,  1, 1, 1 } },
  { ISD::UMIN, MVT::v8i16, { 1,  1, 1, 1 } },
};
// SSSE3-level costs: PSHUFB enables cheap byte-swizzle-based BSWAP/
// BITREVERSE and table-lookup bit-count sequences — TODO(review): confirm
// lowering.  Cost tuple assumed { throughput, latency, code-size,
// size+latency }; one-element entries set only the first kind — confirm
// against CostKindTblEntry.
static const CostKindTblEntry SSSE3CostTbl[] = {
  { ISD::ABS,        MVT::v4i32, {  1,  2,  1,  1 } },
  { ISD::ABS,        MVT::v8i16, {  1,  2,  1,  1 } },
  { ISD::ABS,        MVT::v16i8, {  1,  2,  1,  1 } },
  { ISD::BITREVERSE, MVT::v2i64, {  5 } },
  { ISD::BITREVERSE, MVT::v4i32, {  5 } },
  { ISD::BITREVERSE, MVT::v8i16, {  5 } },
  { ISD::BITREVERSE, MVT::v16i8, {  5 } },
  { ISD::BSWAP,      MVT::v2i64, {  1 } },
  { ISD::BSWAP,      MVT::v4i32, {  1 } },
  { ISD::BSWAP,      MVT::v8i16, {  1 } },
  { ISD::CTLZ,       MVT::v2i64, { 18, 28, 28, 35 } },
  { ISD::CTLZ,       MVT::v4i32, { 15, 20, 22, 28 } },
  { ISD::CTLZ,       MVT::v8i16, { 13, 17, 16, 22 } },
  { ISD::CTLZ,       MVT::v16i8, { 11, 15, 10, 16 } },
  { ISD::CTPOP,      MVT::v2i64, { 13, 19, 12, 18 } },
  { ISD::CTPOP,      MVT::v4i32, { 18, 24, 16, 22 } },
  { ISD::CTPOP,      MVT::v8i16, { 13, 18, 14, 20 } },
  { ISD::CTPOP,      MVT::v16i8, { 11, 12, 10, 16 } },
  { ISD::CTTZ,       MVT::v2i64, { 13, 25, 15, 22 } },
  { ISD::CTTZ,       MVT::v4i32, { 18, 26, 19, 25 } },
  { ISD::CTTZ,       MVT::v8i16, { 13, 20, 17, 23 } },
  { ISD::CTTZ,       MVT::v16i8, { 11, 16, 13, 19 } }
};
// SSE2 baseline costs — the fallback vector table when no later feature level
// matches.  Cost tuple assumed { throughput, latency, code-size,
// size+latency }; one-element entries set only the first kind —
// TODO(review): confirm against CostKindTblEntry.
static const CostKindTblEntry SSE2CostTbl[] = {
  { ISD::ABS,        MVT::v2i64, {  3,  6,  5,  5 } },
  { ISD::ABS,        MVT::v4i32, {  1,  4,  4,  4 } },
  { ISD::ABS,        MVT::v8i16, {  1,  2,  3,  3 } },
  { ISD::ABS,        MVT::v16i8, {  1,  2,  3,  3 } },
  { ISD::BITREVERSE, MVT::v2i64, { 29 } },
  { ISD::BITREVERSE, MVT::v4i32, { 27 } },
  { ISD::BITREVERSE, MVT::v8i16, { 27 } },
  { ISD::BITREVERSE, MVT::v16i8, { 20 } },
  { ISD::BSWAP,      MVT::v2i64, {  7 } },
  { ISD::BSWAP,      MVT::v4i32, {  7 } },
  { ISD::BSWAP,      MVT::v8i16, {  7 } },
  { ISD::CTLZ,       MVT::v2i64, { 10, 45, 36, 38 } },
  { ISD::CTLZ,       MVT::v4i32, { 10, 45, 38, 40 } },
  { ISD::CTLZ,       MVT::v8i16, {  9, 38, 32, 34 } },
  { ISD::CTLZ,       MVT::v16i8, {  8, 39, 29, 32 } },
  { ISD::CTPOP,      MVT::v2i64, { 12, 26, 16, 18 } },
  { ISD::CTPOP,      MVT::v4i32, { 15, 29, 21, 23 } },
  { ISD::CTPOP,      MVT::v8i16, { 13, 25, 18, 20 } },
  { ISD::CTPOP,      MVT::v16i8, { 10, 21, 14, 16 } },
  { ISD::CTTZ,       MVT::v2i64, { 14, 28, 19, 21 } },
  { ISD::CTTZ,       MVT::v4i32, { 18, 31, 24, 26 } },
  { ISD::CTTZ,       MVT::v8i16, { 16, 27, 21, 23 } },
  { ISD::CTTZ,       MVT::v16i8, { 13, 23, 17, 19 } },
  { ISD::SADDSAT,    MVT::v8i16, {  1 } },
  { ISD::SADDSAT,    MVT::v16i8, {  1 } },
  // Only pmaxsw (i16 signed) and pmaxub (i8 unsigned) exist natively in
  // SSE2; the other min/max rows are emulated sequences — TODO(review):
  // confirm lowering.
  { ISD::SMAX,       MVT::v2i64, {  4,  8, 15, 15 } },
  { ISD::SMAX,       MVT::v4i32, {  2,  4,  5,  5 } },
  { ISD::SMAX,       MVT::v8i16, {  1,  1,  1,  1 } },
  { ISD::SMAX,       MVT::v16i8, {  2,  4,  5,  5 } },
  { ISD::SMIN,       MVT::v2i64, {  4,  8, 15, 15 } },
  { ISD::SMIN,       MVT::v4i32, {  2,  4,  5,  5 } },
  { ISD::SMIN,       MVT::v8i16, {  1,  1,  1,  1 } },
  { ISD::SMIN,       MVT::v16i8, {  2,  4,  5,  5 } },
  { ISD::SSUBSAT,    MVT::v8i16, {  1 } },
  { ISD::SSUBSAT,    MVT::v16i8, {  1 } },
  { ISD::UADDSAT,    MVT::v8i16, {  1 } },
  { ISD::UADDSAT,    MVT::v16i8, {  1 } },
  { ISD::UMAX,       MVT::v2i64, {  4,  8, 15, 15 } },
  { ISD::UMAX,       MVT::v4i32, {  2,  5,  8,  8 } },
  { ISD::UMAX,       MVT::v8i16, {  1,  3,  3,  3 } },
  { ISD::UMAX,       MVT::v16i8, {  1,  1,  1,  1 } },
  { ISD::UMIN,       MVT::v2i64, {  4,  8, 15, 15 } },
  { ISD::UMIN,       MVT::v4i32, {  2,  5,  8,  8 } },
  { ISD::UMIN,       MVT::v8i16, {  1,  3,  3,  3 } },
  { ISD::UMIN,       MVT::v16i8, {  1,  1,  1,  1 } },
  { ISD::USUBSAT,    MVT::v8i16, {  1 } },
  { ISD::USUBSAT,    MVT::v16i8, {  1 } },
  { ISD::FMAXNUM,    MVT::f64,   {  4 } },
  { ISD::FMAXNUM,    MVT::v2f64, {  4 } },
  { ISD::FSQRT,      MVT::f64,   { 32, 32,  1,  1 } }, // Nehalem from http://www.agner.org/
  { ISD::FSQRT,      MVT::v2f64, { 32, 32,  1,  1 } }, // Nehalem from http://www.agner.org/
};
// SSE1 costs — f32-only, since SSE1 has no double-precision or integer
// vector support.  Cost tuple assumed { throughput, latency, code-size,
// size+latency }; one-element entries set only the first kind —
// TODO(review): confirm against CostKindTblEntry.
static const CostKindTblEntry SSE1CostTbl[] = {
  { ISD::FMAXNUM, MVT::f32,   {  4 } },
  { ISD::FMAXNUM, MVT::v4f32, {  4 } },
  { ISD::FSQRT,   MVT::f32,   { 28, 30, 1, 2 } }, // Pentium III from http://www.agner.org/
  { ISD::FSQRT,   MVT::v4f32, { 56, 56, 1, 2 } }, // Pentium III from http://www.agner.org/
};
// BMI gives single-instruction TZCNT, making 64-bit CTTZ unit cost.
static const CostKindTblEntry BMI64CostTbl[] = { // 64-bit targets
  { ISD::CTTZ, MVT::i64, { 1 } },
};
// BMI gives single-instruction TZCNT, making scalar CTTZ unit cost.
static const CostKindTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
  { ISD::CTTZ, MVT::i32, { 1 } },
  { ISD::CTTZ, MVT::i16, { 1 } },
  { ISD::CTTZ, MVT::i8,  { 1 } },
};
// LZCNT gives single-instruction 64-bit CTLZ.
static const CostKindTblEntry LZCNT64CostTbl[] = { // 64-bit targets
  { ISD::CTLZ, MVT::i64, { 1 } },
};
// LZCNT gives single-instruction CTLZ; the sub-i32 rows cost extra,
// presumably for the zero-extend/adjustment — TODO(review): confirm.
static const CostKindTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
  { ISD::CTLZ, MVT::i32, { 1 } },
  { ISD::CTLZ, MVT::i16, { 2 } },
  { ISD::CTLZ, MVT::i8,  { 2 } },
};
// Native POPCNT instruction: 64-bit CTPOP is unit cost in every cost kind.
static const CostKindTblEntry POPCNT64CostTbl[] = { // 64-bit targets
  { ISD::CTPOP, MVT::i64, { 1, 1, 1, 1 } }, // popcnt
};
// Native POPCNT instruction; sub-i32 types pay a small code-size cost for the
// zero-extend before the count.
static const CostKindTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
  { ISD::CTPOP, MVT::i32, { 1, 1, 1, 1 } }, // popcnt
  { ISD::CTPOP, MVT::i16, { 1, 1, 2, 2 } }, // popcnt(zext())
  { ISD::CTPOP, MVT::i8,  { 1, 1, 2, 2 } }, // popcnt(zext())
};
// Baseline scalar costs available only on 64-bit targets (i64 operations).
// Cost tuple assumed { throughput, latency, code-size, size+latency };
// one-element entries set only the first kind — TODO(review): confirm
// against CostKindTblEntry.
static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
  { ISD::ABS,             MVT::i64, {  1,  2,  3,  4 } }, // SUB+CMOV
  { ISD::BITREVERSE,      MVT::i64, { 14 } },
  { ISD::BSWAP,           MVT::i64, {  1 } },
  { ISD::CTLZ,            MVT::i64, {  4 } }, // BSR+XOR or BSR+XOR+CMOV
  { ISD::CTLZ_ZERO_UNDEF, MVT::i64, {  1,  1,  1,  1 } }, // BSR+XOR
  { ISD::CTTZ,            MVT::i64, {  3 } }, // TEST+BSF+CMOV/BRANCH
  // Was annotated "BSR"; CTTZ lowers via BSF (matches the i32 row in
  // X86CostTbl below).
  { ISD::CTTZ_ZERO_UNDEF, MVT::i64, {  1,  1,  1,  1 } }, // BSF
  { ISD::CTPOP,           MVT::i64, { 10,  6, 19, 19 } },
  { ISD::ROTL,            MVT::i64, {  2,  3,  1,  3 } },
  { ISD::ROTR,            MVT::i64, {  2,  3,  1,  3 } },
  { ISD::FSHL,            MVT::i64, {  4,  4,  1,  4 } },
  { ISD::SMAX,            MVT::i64, {  1,  3,  2,  3 } },
  { ISD::SMIN,            MVT::i64, {  1,  3,  2,  3 } },
  { ISD::UMAX,            MVT::i64, {  1,  3,  2,  3 } },
  { ISD::UMIN,            MVT::i64, {  1,  3,  2,  3 } },
  { ISD::SADDO,           MVT::i64, {  1 } },
  { ISD::UADDO,           MVT::i64, {  1 } },
  { ISD::UMULO,           MVT::i64, {  2 } }, // mulq + seto
};
// Baseline scalar costs common to all x86 targets (i8/i16/i32) — the final
// fallback table.  Cost tuple assumed { throughput, latency, code-size,
// size+latency }; one-element entries set only the first kind —
// TODO(review): confirm against CostKindTblEntry.
static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
  { ISD::ABS,             MVT::i32, { 1,  2,  3,  4 } }, // SUB+XOR+SRA or SUB+CMOV
  { ISD::ABS,             MVT::i16, { 2,  2,  3,  4 } }, // SUB+XOR+SRA or SUB+CMOV
  { ISD::ABS,             MVT::i8,  { 2,  4,  4,  4 } }, // SUB+XOR+SRA
  { ISD::BITREVERSE,      MVT::i32, { 14 } },
  { ISD::BITREVERSE,      MVT::i16, { 14 } },
  { ISD::BITREVERSE,      MVT::i8,  { 11 } },
  { ISD::BSWAP,           MVT::i32, { 1 } },
  { ISD::BSWAP,           MVT::i16, { 1 } }, // ROL
  // Plain CTLZ/CTTZ must handle a zero input, hence the extra TEST/CMOV
  // versus the *_ZERO_UNDEF rows.
  { ISD::CTLZ,            MVT::i32, { 4 } }, // BSR+XOR or BSR+XOR+CMOV
  { ISD::CTLZ,            MVT::i16, { 4 } }, // BSR+XOR or BSR+XOR+CMOV
  { ISD::CTLZ,            MVT::i8,  { 4 } }, // BSR+XOR or BSR+XOR+CMOV
  { ISD::CTLZ_ZERO_UNDEF, MVT::i32, { 1,  1,  1,  1 } }, // BSR+XOR
  { ISD::CTLZ_ZERO_UNDEF, MVT::i16, { 2,  2,  3,  3 } }, // BSR+XOR
  { ISD::CTLZ_ZERO_UNDEF, MVT::i8,  { 2,  2,  3,  3 } }, // BSR+XOR
  { ISD::CTTZ,            MVT::i32, { 3 } }, // TEST+BSF+CMOV/BRANCH
  { ISD::CTTZ,            MVT::i16, { 3 } }, // TEST+BSF+CMOV/BRANCH
  { ISD::CTTZ,            MVT::i8,  { 3 } }, // TEST+BSF+CMOV/BRANCH
  { ISD::CTTZ_ZERO_UNDEF, MVT::i32, { 1,  1,  1,  1 } }, // BSF
  { ISD::CTTZ_ZERO_UNDEF, MVT::i16, { 2,  2,  1,  1 } }, // BSF
  { ISD::CTTZ_ZERO_UNDEF, MVT::i8,  { 2,  2,  1,  1 } }, // BSF
  { ISD::CTPOP,           MVT::i32, { 8,  7, 15, 15 } },
  { ISD::CTPOP,           MVT::i16, { 9,  8, 17, 17 } },
  { ISD::CTPOP,           MVT::i8,  { 7,  6, 13, 13 } },
  { ISD::ROTL,            MVT::i32, { 2,  3,  1,  3 } },
  { ISD::ROTL,            MVT::i16, { 2,  3,  1,  3 } },
  { ISD::ROTL,            MVT::i8,  { 2,  3,  1,  3 } },
  { ISD::ROTR,            MVT::i32, { 2,  3,  1,  3 } },
  { ISD::ROTR,            MVT::i16, { 2,  3,  1,  3 } },
  { ISD::ROTR,            MVT::i8,  { 2,  3,  1,  3 } },
  { ISD::FSHL,            MVT::i32, { 4,  4,  1,  4 } },
  { ISD::FSHL,            MVT::i16, { 4,  4,  2,  5 } },
  { ISD::FSHL,            MVT::i8,  { 4,  4,  2,  5 } },
  { ISD::SMAX,            MVT::i32, { 1,  2,  2,  3 } },
  { ISD::SMAX,            MVT::i16, { 1,  4,  2,  4 } },
  { ISD::SMAX,            MVT::i8,  { 1,  4,  2,  4 } },
  { ISD::SMIN,            MVT::i32, { 1,  2,  2,  3 } },
  { ISD::SMIN,            MVT::i16, { 1,  4,  2,  4 } },
  { ISD::SMIN,            MVT::i8,  { 1,  4,  2,  4 } },
  { ISD::UMAX,            MVT::i32, { 1,  2,  2,  3 } },
  { ISD::UMAX,            MVT::i16, { 1,  4,  2,  4 } },
  { ISD::UMAX,            MVT::i8,  { 1,  4,  2,  4 } },
  { ISD::UMIN,            MVT::i32, { 1,  2,  2,  3 } },
  { ISD::UMIN,            MVT::i16, { 1,  4,  2,  4 } },
  { ISD::UMIN,            MVT::i8,  { 1,  4,  2,  4 } },
  { ISD::SADDO,           MVT::i32, { 1 } },
  { ISD::SADDO,           MVT::i16, { 1 } },
  { ISD::SADDO,           MVT::i8,  { 1 } },
  { ISD::UADDO,           MVT::i32, { 1 } },
  { ISD::UADDO,           MVT::i16, { 1 } },
  { ISD::UADDO,           MVT::i8,  { 1 } },
  { ISD::UMULO,           MVT::i32, { 2 } }, // mul + seto
  { ISD::UMULO,           MVT::i16, { 2 } },
  { ISD::UMULO,           MVT::i8,  { 2 } },
};
3955 | ||||
3956 | Type *RetTy = ICA.getReturnType(); | |||
3957 | Type *OpTy = RetTy; | |||
3958 | Intrinsic::ID IID = ICA.getID(); | |||
3959 | unsigned ISD = ISD::DELETED_NODE; | |||
3960 | switch (IID) { | |||
3961 | default: | |||
3962 | break; | |||
3963 | case Intrinsic::abs: | |||
3964 | ISD = ISD::ABS; | |||
3965 | break; | |||
3966 | case Intrinsic::bitreverse: | |||
3967 | ISD = ISD::BITREVERSE; | |||
3968 | break; | |||
3969 | case Intrinsic::bswap: | |||
3970 | ISD = ISD::BSWAP; | |||
3971 | break; | |||
3972 | case Intrinsic::ctlz: | |||
3973 | ISD = ISD::CTLZ; | |||
3974 | break; | |||
3975 | case Intrinsic::ctpop: | |||
3976 | ISD = ISD::CTPOP; | |||
3977 | break; | |||
3978 | case Intrinsic::cttz: | |||
3979 | ISD = ISD::CTTZ; | |||
3980 | break; | |||
3981 | case Intrinsic::fshl: | |||
3982 | ISD = ISD::FSHL; | |||
3983 | if (!ICA.isTypeBasedOnly()) { | |||
3984 | const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); | |||
3985 | if (Args[0] == Args[1]) | |||
3986 | ISD = ISD::ROTL; | |||
3987 | } | |||
3988 | break; | |||
3989 | case Intrinsic::fshr: | |||
3990 | // FSHR has same costs so don't duplicate. | |||
3991 | ISD = ISD::FSHL; | |||
3992 | if (!ICA.isTypeBasedOnly()) { | |||
3993 | const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); | |||
3994 | if (Args[0] == Args[1]) | |||
3995 | ISD = ISD::ROTR; | |||
3996 | } | |||
3997 | break; | |||
3998 | case Intrinsic::maxnum: | |||
3999 | case Intrinsic::minnum: | |||
4000 | // FMINNUM has same costs so don't duplicate. | |||
4001 | ISD = ISD::FMAXNUM; | |||
4002 | break; | |||
4003 | case Intrinsic::sadd_sat: | |||
4004 | ISD = ISD::SADDSAT; | |||
4005 | break; | |||
4006 | case Intrinsic::smax: | |||
4007 | ISD = ISD::SMAX; | |||
4008 | break; | |||
4009 | case Intrinsic::smin: | |||
4010 | ISD = ISD::SMIN; | |||
4011 | break; | |||
4012 | case Intrinsic::ssub_sat: | |||
4013 | ISD = ISD::SSUBSAT; | |||
4014 | break; | |||
4015 | case Intrinsic::uadd_sat: | |||
4016 | ISD = ISD::UADDSAT; | |||
4017 | break; | |||
4018 | case Intrinsic::umax: | |||
4019 | ISD = ISD::UMAX; | |||
4020 | break; | |||
4021 | case Intrinsic::umin: | |||
4022 | ISD = ISD::UMIN; | |||
4023 | break; | |||
4024 | case Intrinsic::usub_sat: | |||
4025 | ISD = ISD::USUBSAT; | |||
4026 | break; | |||
4027 | case Intrinsic::sqrt: | |||
4028 | ISD = ISD::FSQRT; | |||
4029 | break; | |||
4030 | case Intrinsic::sadd_with_overflow: | |||
4031 | case Intrinsic::ssub_with_overflow: | |||
4032 | // SSUBO has same costs so don't duplicate. | |||
4033 | ISD = ISD::SADDO; | |||
4034 | OpTy = RetTy->getContainedType(0); | |||
4035 | break; | |||
4036 | case Intrinsic::uadd_with_overflow: | |||
4037 | case Intrinsic::usub_with_overflow: | |||
4038 | // USUBO has same costs so don't duplicate. | |||
4039 | ISD = ISD::UADDO; | |||
4040 | OpTy = RetTy->getContainedType(0); | |||
4041 | break; | |||
4042 | case Intrinsic::umul_with_overflow: | |||
4043 | case Intrinsic::smul_with_overflow: | |||
4044 | // SMULO has same costs so don't duplicate. | |||
4045 | ISD = ISD::UMULO; | |||
4046 | OpTy = RetTy->getContainedType(0); | |||
4047 | break; | |||
4048 | } | |||
4049 | ||||
4050 | if (ISD != ISD::DELETED_NODE) { | |||
4051 | // Legalize the type. | |||
4052 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(OpTy); | |||
4053 | MVT MTy = LT.second; | |||
4054 | ||||
4055 | // Attempt to lookup cost. | |||
4056 | if (ISD == ISD::BITREVERSE && ST->hasGFNI() && ST->hasSSSE3() && | |||
4057 | MTy.isVector()) { | |||
4058 | // With PSHUFB the code is very similar for all types. If we have integer | |||
4059 | // byte operations, we just need a GF2P8AFFINEQB for vXi8. For other types | |||
4060 | // we also need a PSHUFB. | |||
4061 | unsigned Cost = MTy.getVectorElementType() == MVT::i8 ? 1 : 2; | |||
4062 | ||||
4063 | // Without byte operations, we need twice as many GF2P8AFFINEQB and PSHUFB | |||
4064 | // instructions. We also need an extract and an insert. | |||
4065 | if (!(MTy.is128BitVector() || (ST->hasAVX2() && MTy.is256BitVector()) || | |||
4066 | (ST->hasBWI() && MTy.is512BitVector()))) | |||
4067 | Cost = Cost * 2 + 2; | |||
4068 | ||||
4069 | return LT.first * Cost; | |||
4070 | } | |||
4071 | ||||
4072 | // Without BMI/LZCNT see if we're only looking for a *_ZERO_UNDEF cost. | |||
4073 | if (((ISD == ISD::CTTZ && !ST->hasBMI()) || | |||
4074 | (ISD == ISD::CTLZ && !ST->hasLZCNT())) && | |||
4075 | !MTy.isVector() && !ICA.isTypeBasedOnly()) { | |||
4076 | const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); | |||
4077 | if (auto *Cst = dyn_cast<ConstantInt>(Args[1])) | |||
4078 | if (Cst->isAllOnesValue()) | |||
4079 | ISD = ISD == ISD::CTTZ ? ISD::CTTZ_ZERO_UNDEF : ISD::CTLZ_ZERO_UNDEF; | |||
4080 | } | |||
4081 | ||||
4082 | // FSQRT is a single instruction. | |||
4083 | if (ISD == ISD::FSQRT && CostKind == TTI::TCK_CodeSize) | |||
4084 | return LT.first; | |||
4085 | ||||
4086 | auto adjustTableCost = [](int ISD, unsigned Cost, | |||
4087 | InstructionCost LegalizationCost, | |||
4088 | FastMathFlags FMF) { | |||
4089 | // If there are no NANs to deal with, then these are reduced to a | |||
4090 | // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we | |||
4091 | // assume is used in the non-fast case. | |||
4092 | if (ISD == ISD::FMAXNUM || ISD == ISD::FMINNUM) { | |||
4093 | if (FMF.noNaNs()) | |||
4094 | return LegalizationCost * 1; | |||
4095 | } | |||
4096 | return LegalizationCost * (int)Cost; | |||
4097 | }; | |||
4098 | ||||
4099 | if (ST->useGLMDivSqrtCosts()) | |||
4100 | if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy)) | |||
4101 | if (auto KindCost = Entry->Cost[CostKind]) | |||
4102 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | |||
4103 | ICA.getFlags()); | |||
4104 | ||||
4105 | if (ST->useSLMArithCosts()) | |||
4106 | if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy)) | |||
4107 | if (auto KindCost = Entry->Cost[CostKind]) | |||
4108 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | |||
4109 | ICA.getFlags()); | |||
4110 | ||||
4111 | if (ST->hasVBMI2()) | |||
4112 | if (const auto *Entry = CostTableLookup(AVX512VBMI2CostTbl, ISD, MTy)) | |||
4113 | if (auto KindCost = Entry->Cost[CostKind]) | |||
4114 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | |||
4115 | ICA.getFlags()); | |||
4116 | ||||
4117 | if (ST->hasBITALG()) | |||
4118 | if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy)) | |||
4119 | if (auto KindCost = Entry->Cost[CostKind]) | |||
4120 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | |||
4121 | ICA.getFlags()); | |||
4122 | ||||
4123 | if (ST->hasVPOPCNTDQ()) | |||
4124 | if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy)) | |||
4125 | if (auto KindCost = Entry->Cost[CostKind]) | |||
4126 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | |||
4127 | ICA.getFlags()); | |||
4128 | ||||
4129 | if (ST->hasCDI()) | |||
4130 | if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy)) | |||
4131 | if (auto KindCost = Entry->Cost[CostKind]) | |||
4132 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | |||
4133 | ICA.getFlags()); | |||
4134 | ||||
4135 | if (ST->hasBWI()) | |||
4136 | if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy)) | |||
4137 | if (auto KindCost = Entry->Cost[CostKind]) | |||
4138 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | |||
4139 | ICA.getFlags()); | |||
4140 | ||||
4141 | if (ST->hasAVX512()) | |||
4142 | if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) | |||
4143 | if (auto KindCost = Entry->Cost[CostKind]) | |||
4144 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | |||
4145 | ICA.getFlags()); | |||
4146 | ||||
4147 | if (ST->hasXOP()) | |||
4148 | if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy)) | |||
4149 | if (auto KindCost = Entry->Cost[CostKind]) | |||
4150 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | |||
4151 | ICA.getFlags()); | |||
4152 | ||||
4153 | if (ST->hasAVX2()) | |||
4154 | if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy)) | |||
4155 | if (auto KindCost = Entry->Cost[CostKind]) | |||
4156 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | |||
4157 | ICA.getFlags()); | |||
4158 | ||||
4159 | if (ST->hasAVX()) | |||
4160 | if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) | |||
4161 | if (auto KindCost = Entry->Cost[CostKind]) | |||
4162 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | |||
4163 | ICA.getFlags()); | |||
4164 | ||||
4165 | if (ST->hasSSE42()) | |||
4166 | if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy)) | |||
4167 | if (auto KindCost = Entry->Cost[CostKind]) | |||
4168 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | |||
4169 | ICA.getFlags()); | |||
4170 | ||||
4171 | if (ST->hasSSE41()) | |||
4172 | if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy)) | |||
4173 | if (auto KindCost = Entry->Cost[CostKind]) | |||
4174 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | |||
4175 | ICA.getFlags()); | |||
4176 | ||||
4177 | if (ST->hasSSSE3()) | |||
4178 | if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy)) | |||
4179 | if (auto KindCost = Entry->Cost[CostKind]) | |||
4180 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | |||
4181 | ICA.getFlags()); | |||
4182 | ||||
4183 | if (ST->hasSSE2()) | |||
4184 | if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy)) | |||
4185 | if (auto KindCost = Entry->Cost[CostKind]) | |||
4186 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | |||
4187 | ICA.getFlags()); | |||
4188 | ||||
4189 | if (ST->hasSSE1()) | |||
4190 | if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy)) | |||
4191 | if (auto KindCost = Entry->Cost[CostKind]) | |||
4192 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | |||
4193 | ICA.getFlags()); | |||
4194 | ||||
4195 | if (ST->hasBMI()) { | |||
4196 | if (ST->is64Bit()) | |||
4197 | if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy)) | |||
4198 | if (auto KindCost = Entry->Cost[CostKind]) | |||
4199 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | |||
4200 | ICA.getFlags()); | |||
4201 | ||||
4202 | if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy)) | |||
4203 | if (auto KindCost = Entry->Cost[CostKind]) | |||
4204 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | |||
4205 | ICA.getFlags()); | |||
4206 | } | |||
4207 | ||||
4208 | if (ST->hasLZCNT()) { | |||
4209 | if (ST->is64Bit()) | |||
4210 | if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy)) | |||
4211 | if (auto KindCost = Entry->Cost[CostKind]) | |||
4212 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | |||
4213 | ICA.getFlags()); | |||
4214 | ||||
4215 | if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy)) | |||
4216 | if (auto KindCost = Entry->Cost[CostKind]) | |||
4217 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | |||
4218 | ICA.getFlags()); | |||
4219 | } | |||
4220 | ||||
4221 | if (ST->hasPOPCNT()) { | |||
4222 | if (ST->is64Bit()) | |||
4223 | if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy)) | |||
4224 | if (auto KindCost = Entry->Cost[CostKind]) | |||
4225 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | |||
4226 | ICA.getFlags()); | |||
4227 | ||||
4228 | if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy)) | |||
4229 | if (auto KindCost = Entry->Cost[CostKind]) | |||
4230 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | |||
4231 | ICA.getFlags()); | |||
4232 | } | |||
4233 | ||||
4234 | if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) { | |||
4235 | if (const Instruction *II = ICA.getInst()) { | |||
4236 | if (II->hasOneUse() && isa<StoreInst>(II->user_back())) | |||
4237 | return TTI::TCC_Free; | |||
4238 | if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) { | |||
4239 | if (LI->hasOneUse()) | |||
4240 | return TTI::TCC_Free; | |||
4241 | } | |||
4242 | } | |||
4243 | } | |||
4244 | ||||
4245 | if (ST->is64Bit()) | |||
4246 | if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy)) | |||
4247 | if (auto KindCost = Entry->Cost[CostKind]) | |||
4248 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | |||
4249 | ICA.getFlags()); | |||
4250 | ||||
4251 | if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy)) | |||
4252 | if (auto KindCost = Entry->Cost[CostKind]) | |||
4253 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, ICA.getFlags()); | |||
4254 | } | |||
4255 | ||||
4256 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); | |||
4257 | } | |||
4258 | ||||
4259 | InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, | |||
4260 | unsigned Index) { | |||
4261 | static const CostTblEntry SLMCostTbl[] = { | |||
4262 | { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 }, | |||
4263 | { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 }, | |||
4264 | { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 }, | |||
4265 | { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 7 } | |||
4266 | }; | |||
4267 | ||||
4268 | assert(Val->isVectorTy() && "This must be a vector type")(static_cast <bool> (Val->isVectorTy() && "This must be a vector type" ) ? void (0) : __assert_fail ("Val->isVectorTy() && \"This must be a vector type\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4268, __extension__ __PRETTY_FUNCTION__)); | |||
4269 | Type *ScalarType = Val->getScalarType(); | |||
4270 | InstructionCost RegisterFileMoveCost = 0; | |||
4271 | TTI::TargetCostKind CostKind = TTI::TargetCostKind::TCK_RecipThroughput; | |||
4272 | ||||
4273 | // Non-immediate extraction/insertion can be handled as a sequence of | |||
4274 | // aliased loads+stores via the stack. | |||
4275 | if (Index == -1U && (Opcode == Instruction::ExtractElement || | |||
4276 | Opcode == Instruction::InsertElement)) { | |||
4277 | // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns: | |||
4278 | // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0. | |||
4279 | ||||
4280 | // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling. | |||
4281 | assert(isa<FixedVectorType>(Val) && "Fixed vector type expected")(static_cast <bool> (isa<FixedVectorType>(Val) && "Fixed vector type expected") ? void (0) : __assert_fail ("isa<FixedVectorType>(Val) && \"Fixed vector type expected\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4281, __extension__ __PRETTY_FUNCTION__)); | |||
4282 | Align VecAlign = DL.getPrefTypeAlign(Val); | |||
4283 | Align SclAlign = DL.getPrefTypeAlign(ScalarType); | |||
4284 | ||||
4285 | // Extract - store vector to stack, load scalar. | |||
4286 | if (Opcode == Instruction::ExtractElement) { | |||
4287 | return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) + | |||
4288 | getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0, | |||
4289 | CostKind); | |||
4290 | } | |||
4291 | // Insert - store vector to stack, store scalar, load vector. | |||
4292 | if (Opcode == Instruction::InsertElement) { | |||
4293 | return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) + | |||
4294 | getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0, | |||
4295 | CostKind) + | |||
4296 | getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind); | |||
4297 | } | |||
4298 | } | |||
4299 | ||||
4300 | if (Index != -1U && (Opcode
| |||
4301 | Opcode == Instruction::InsertElement)) { | |||
4302 | // Extraction of vXi1 elements are now efficiently handled by MOVMSK. | |||
4303 | if (Opcode
| |||
4304 | ScalarType->getScalarSizeInBits() == 1 && | |||
4305 | cast<FixedVectorType>(Val)->getNumElements() > 1) | |||
4306 | return 1; | |||
4307 | ||||
4308 | // Legalize the type. | |||
4309 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val); | |||
4310 | ||||
4311 | // This type is legalized to a scalar type. | |||
4312 | if (!LT.second.isVector()) | |||
4313 | return 0; | |||
4314 | ||||
4315 | // The type may be split. Normalize the index to the new type. | |||
4316 | unsigned SizeInBits = LT.second.getSizeInBits(); | |||
4317 | unsigned NumElts = LT.second.getVectorNumElements(); | |||
4318 | unsigned SubNumElts = NumElts; | |||
4319 | Index = Index % NumElts; | |||
4320 | ||||
4321 | // For >128-bit vectors, we need to extract higher 128-bit subvectors. | |||
4322 | // For inserts, we also need to insert the subvector back. | |||
4323 | if (SizeInBits > 128) { | |||
4324 | assert((SizeInBits % 128) == 0 && "Illegal vector")(static_cast <bool> ((SizeInBits % 128) == 0 && "Illegal vector") ? void (0) : __assert_fail ("(SizeInBits % 128) == 0 && \"Illegal vector\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4324, __extension__ __PRETTY_FUNCTION__)); | |||
4325 | unsigned NumSubVecs = SizeInBits / 128; | |||
4326 | SubNumElts = NumElts / NumSubVecs; | |||
4327 | if (SubNumElts <= Index) { | |||
4328 | RegisterFileMoveCost += (Opcode
| |||
4329 | Index %= SubNumElts; | |||
| ||||
4330 | } | |||
4331 | } | |||
4332 | ||||
4333 | if (Index == 0) { | |||
4334 | // Floating point scalars are already located in index #0. | |||
4335 | // Many insertions to #0 can fold away for scalar fp-ops, so let's assume | |||
4336 | // true for all. | |||
4337 | if (ScalarType->isFloatingPointTy()) | |||
4338 | return RegisterFileMoveCost; | |||
4339 | ||||
4340 | // Assume movd/movq XMM -> GPR is relatively cheap on all targets. | |||
4341 | if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement) | |||
4342 | return 1 + RegisterFileMoveCost; | |||
4343 | } | |||
4344 | ||||
4345 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | |||
4346 | assert(ISD && "Unexpected vector opcode")(static_cast <bool> (ISD && "Unexpected vector opcode" ) ? void (0) : __assert_fail ("ISD && \"Unexpected vector opcode\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4346, __extension__ __PRETTY_FUNCTION__)); | |||
4347 | MVT MScalarTy = LT.second.getScalarType(); | |||
4348 | if (ST->useSLMArithCosts()) | |||
4349 | if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy)) | |||
4350 | return Entry->Cost + RegisterFileMoveCost; | |||
4351 | ||||
4352 | // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets. | |||
4353 | if ((MScalarTy == MVT::i16 && ST->hasSSE2()) || | |||
4354 | (MScalarTy.isInteger() && ST->hasSSE41())) | |||
4355 | return 1 + RegisterFileMoveCost; | |||
4356 | ||||
4357 | // Assume insertps is relatively cheap on all targets. | |||
4358 | if (MScalarTy == MVT::f32 && ST->hasSSE41() && | |||
4359 | Opcode == Instruction::InsertElement) | |||
4360 | return 1 + RegisterFileMoveCost; | |||
4361 | ||||
4362 | // For extractions we just need to shuffle the element to index 0, which | |||
4363 | // should be very cheap (assume cost = 1). For insertions we need to shuffle | |||
4364 | // the elements to its destination. In both cases we must handle the | |||
4365 | // subvector move(s). | |||
4366 | // If the vector type is already less than 128-bits then don't reduce it. | |||
4367 | // TODO: Under what circumstances should we shuffle using the full width? | |||
4368 | InstructionCost ShuffleCost = 1; | |||
4369 | if (Opcode == Instruction::InsertElement) { | |||
4370 | auto *SubTy = cast<VectorType>(Val); | |||
4371 | EVT VT = TLI->getValueType(DL, Val); | |||
4372 | if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128) | |||
4373 | SubTy = FixedVectorType::get(ScalarType, SubNumElts); | |||
4374 | ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, std::nullopt, | |||
4375 | CostKind, 0, SubTy); | |||
4376 | } | |||
4377 | int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1; | |||
4378 | return ShuffleCost + IntOrFpCost + RegisterFileMoveCost; | |||
4379 | } | |||
4380 | ||||
4381 | // Add to the base cost if we know that the extracted element of a vector is | |||
4382 | // destined to be moved to and used in the integer register file. | |||
4383 | if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy()) | |||
4384 | RegisterFileMoveCost += 1; | |||
4385 | ||||
4386 | return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost; | |||
4387 | } | |||
4388 | ||||
4389 | InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty, | |||
4390 | const APInt &DemandedElts, | |||
4391 | bool Insert, | |||
4392 | bool Extract) { | |||
4393 | assert(DemandedElts.getBitWidth() ==(static_cast <bool> (DemandedElts.getBitWidth() == cast <FixedVectorType>(Ty)->getNumElements() && "Vector size mismatch" ) ? void (0) : __assert_fail ("DemandedElts.getBitWidth() == cast<FixedVectorType>(Ty)->getNumElements() && \"Vector size mismatch\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4395, __extension__ __PRETTY_FUNCTION__)) | |||
4394 | cast<FixedVectorType>(Ty)->getNumElements() &&(static_cast <bool> (DemandedElts.getBitWidth() == cast <FixedVectorType>(Ty)->getNumElements() && "Vector size mismatch" ) ? void (0) : __assert_fail ("DemandedElts.getBitWidth() == cast<FixedVectorType>(Ty)->getNumElements() && \"Vector size mismatch\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4395, __extension__ __PRETTY_FUNCTION__)) | |||
4395 | "Vector size mismatch")(static_cast <bool> (DemandedElts.getBitWidth() == cast <FixedVectorType>(Ty)->getNumElements() && "Vector size mismatch" ) ? void (0) : __assert_fail ("DemandedElts.getBitWidth() == cast<FixedVectorType>(Ty)->getNumElements() && \"Vector size mismatch\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4395, __extension__ __PRETTY_FUNCTION__)); | |||
4396 | ||||
4397 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); | |||
4398 | MVT MScalarTy = LT.second.getScalarType(); | |||
4399 | unsigned LegalVectorBitWidth = LT.second.getSizeInBits(); | |||
4400 | TTI::TargetCostKind CostKind = TTI::TargetCostKind::TCK_RecipThroughput; | |||
4401 | InstructionCost Cost = 0; | |||
4402 | ||||
4403 | constexpr unsigned LaneBitWidth = 128; | |||
4404 | assert((LegalVectorBitWidth < LaneBitWidth ||(static_cast <bool> ((LegalVectorBitWidth < LaneBitWidth || (LegalVectorBitWidth % LaneBitWidth) == 0) && "Illegal vector" ) ? void (0) : __assert_fail ("(LegalVectorBitWidth < LaneBitWidth || (LegalVectorBitWidth % LaneBitWidth) == 0) && \"Illegal vector\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4406, __extension__ __PRETTY_FUNCTION__)) | |||
4405 | (LegalVectorBitWidth % LaneBitWidth) == 0) &&(static_cast <bool> ((LegalVectorBitWidth < LaneBitWidth || (LegalVectorBitWidth % LaneBitWidth) == 0) && "Illegal vector" ) ? void (0) : __assert_fail ("(LegalVectorBitWidth < LaneBitWidth || (LegalVectorBitWidth % LaneBitWidth) == 0) && \"Illegal vector\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4406, __extension__ __PRETTY_FUNCTION__)) | |||
4406 | "Illegal vector")(static_cast <bool> ((LegalVectorBitWidth < LaneBitWidth || (LegalVectorBitWidth % LaneBitWidth) == 0) && "Illegal vector" ) ? void (0) : __assert_fail ("(LegalVectorBitWidth < LaneBitWidth || (LegalVectorBitWidth % LaneBitWidth) == 0) && \"Illegal vector\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4406, __extension__ __PRETTY_FUNCTION__)); | |||
4407 | ||||
4408 | const int NumLegalVectors = *LT.first.getValue(); | |||
4409 | assert(NumLegalVectors >= 0 && "Negative cost!")(static_cast <bool> (NumLegalVectors >= 0 && "Negative cost!") ? void (0) : __assert_fail ("NumLegalVectors >= 0 && \"Negative cost!\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4409, __extension__ __PRETTY_FUNCTION__)); | |||
4410 | ||||
4411 | // For insertions, a ISD::BUILD_VECTOR style vector initialization can be much | |||
4412 | // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT. | |||
4413 | if (Insert) { | |||
4414 | if ((MScalarTy == MVT::i16 && ST->hasSSE2()) || | |||
4415 | (MScalarTy.isInteger() && ST->hasSSE41()) || | |||
4416 | (MScalarTy == MVT::f32 && ST->hasSSE41())) { | |||
4417 | // For types we can insert directly, insertion into 128-bit sub vectors is | |||
4418 | // cheap, followed by a cheap chain of concatenations. | |||
4419 | if (LegalVectorBitWidth <= LaneBitWidth) { | |||
4420 | Cost += | |||
4421 | BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, false); | |||
4422 | } else { | |||
4423 | // In each 128-lane, if at least one index is demanded but not all | |||
4424 | // indices are demanded and this 128-lane is not the first 128-lane of | |||
4425 | // the legalized-vector, then this 128-lane needs a extracti128; If in | |||
4426 | // each 128-lane, there is at least one demanded index, this 128-lane | |||
4427 | // needs a inserti128. | |||
4428 | ||||
4429 | // The following cases will help you build a better understanding: | |||
4430 | // Assume we insert several elements into a v8i32 vector in avx2, | |||
4431 | // Case#1: inserting into 1th index needs vpinsrd + inserti128. | |||
4432 | // Case#2: inserting into 5th index needs extracti128 + vpinsrd + | |||
4433 | // inserti128. | |||
4434 | // Case#3: inserting into 4,5,6,7 index needs 4*vpinsrd + inserti128. | |||
4435 | assert((LegalVectorBitWidth % LaneBitWidth) == 0 && "Illegal vector")(static_cast <bool> ((LegalVectorBitWidth % LaneBitWidth ) == 0 && "Illegal vector") ? void (0) : __assert_fail ("(LegalVectorBitWidth % LaneBitWidth) == 0 && \"Illegal vector\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4435, __extension__ __PRETTY_FUNCTION__)); | |||
4436 | unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth; | |||
4437 | unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors; | |||
4438 | unsigned NumLegalElts = | |||
4439 | LT.second.getVectorNumElements() * NumLegalVectors; | |||
4440 | assert(NumLegalElts >= DemandedElts.getBitWidth() &&(static_cast <bool> (NumLegalElts >= DemandedElts.getBitWidth () && "Vector has been legalized to smaller element count" ) ? void (0) : __assert_fail ("NumLegalElts >= DemandedElts.getBitWidth() && \"Vector has been legalized to smaller element count\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4441, __extension__ __PRETTY_FUNCTION__)) | |||
4441 | "Vector has been legalized to smaller element count")(static_cast <bool> (NumLegalElts >= DemandedElts.getBitWidth () && "Vector has been legalized to smaller element count" ) ? void (0) : __assert_fail ("NumLegalElts >= DemandedElts.getBitWidth() && \"Vector has been legalized to smaller element count\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4441, __extension__ __PRETTY_FUNCTION__)); | |||
4442 | assert((NumLegalElts % NumLanesTotal) == 0 &&(static_cast <bool> ((NumLegalElts % NumLanesTotal) == 0 && "Unexpected elts per lane") ? void (0) : __assert_fail ("(NumLegalElts % NumLanesTotal) == 0 && \"Unexpected elts per lane\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4443, __extension__ __PRETTY_FUNCTION__)) | |||
4443 | "Unexpected elts per lane")(static_cast <bool> ((NumLegalElts % NumLanesTotal) == 0 && "Unexpected elts per lane") ? void (0) : __assert_fail ("(NumLegalElts % NumLanesTotal) == 0 && \"Unexpected elts per lane\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4443, __extension__ __PRETTY_FUNCTION__)); | |||
4444 | unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal; | |||
4445 | ||||
4446 | APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts); | |||
4447 | auto *LaneTy = | |||
4448 | FixedVectorType::get(Ty->getElementType(), NumEltsPerLane); | |||
4449 | ||||
4450 | for (unsigned I = 0; I != NumLanesTotal; ++I) { | |||
4451 | APInt LaneEltMask = WidenedDemandedElts.extractBits( | |||
4452 | NumEltsPerLane, NumEltsPerLane * I); | |||
4453 | if (LaneEltMask.isNullValue()) | |||
4454 | continue; | |||
4455 | // FIXME: we don't need to extract if all non-demanded elements | |||
4456 | // are legalization-inserted padding. | |||
4457 | if (!LaneEltMask.isAllOnes()) | |||
4458 | Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt, | |||
4459 | CostKind, I * NumEltsPerLane, LaneTy); | |||
4460 | Cost += BaseT::getScalarizationOverhead(LaneTy, LaneEltMask, Insert, | |||
4461 | false); | |||
4462 | } | |||
4463 | ||||
4464 | APInt AffectedLanes = | |||
4465 | APIntOps::ScaleBitMask(WidenedDemandedElts, NumLanesTotal); | |||
4466 | APInt FullyAffectedLegalVectors = APIntOps::ScaleBitMask( | |||
4467 | AffectedLanes, NumLegalVectors, /*MatchAllBits=*/true); | |||
4468 | for (int LegalVec = 0; LegalVec != NumLegalVectors; ++LegalVec) { | |||
4469 | for (unsigned Lane = 0; Lane != NumLegalLanes; ++Lane) { | |||
4470 | unsigned I = NumLegalLanes * LegalVec + Lane; | |||
4471 | // No need to insert unaffected lane; or lane 0 of each legal vector | |||
4472 | // iff ALL lanes of that vector were affected and will be inserted. | |||
4473 | if (!AffectedLanes[I] || | |||
4474 | (Lane == 0 && FullyAffectedLegalVectors[LegalVec])) | |||
4475 | continue; | |||
4476 | Cost += getShuffleCost(TTI::SK_InsertSubvector, Ty, std::nullopt, | |||
4477 | CostKind, I * NumEltsPerLane, LaneTy); | |||
4478 | } | |||
4479 | } | |||
4480 | } | |||
4481 | } else if (LT.second.isVector()) { | |||
4482 | // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded | |||
4483 | // integer element as a SCALAR_TO_VECTOR, then we build the vector as a | |||
4484 | // series of UNPCK followed by CONCAT_VECTORS - all of these can be | |||
4485 | // considered cheap. | |||
4486 | if (Ty->isIntOrIntVectorTy()) | |||
4487 | Cost += DemandedElts.countPopulation(); | |||
4488 | ||||
4489 | // Get the smaller of the legalized or original pow2-extended number of | |||
4490 | // vector elements, which represents the number of unpacks we'll end up | |||
4491 | // performing. | |||
4492 | unsigned NumElts = LT.second.getVectorNumElements(); | |||
4493 | unsigned Pow2Elts = | |||
4494 | PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements()); | |||
4495 | Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first; | |||
4496 | } | |||
4497 | } | |||
4498 | ||||
4499 | if (Extract) { | |||
4500 | // vXi1 can be efficiently extracted with MOVMSK. | |||
4501 | // TODO: AVX512 predicate mask handling. | |||
4502 | // NOTE: This doesn't work well for roundtrip scalarization. | |||
4503 | if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) { | |||
4504 | unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements(); | |||
4505 | unsigned MaxElts = ST->hasAVX2() ? 32 : 16; | |||
4506 | unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts; | |||
4507 | return MOVMSKCost; | |||
4508 | } | |||
4509 | ||||
4510 | if (LT.second.isVector()) { | |||
4511 | unsigned NumLegalElts = | |||
4512 | LT.second.getVectorNumElements() * NumLegalVectors; | |||
4513 | assert(NumLegalElts >= DemandedElts.getBitWidth() &&(static_cast <bool> (NumLegalElts >= DemandedElts.getBitWidth () && "Vector has been legalized to smaller element count" ) ? void (0) : __assert_fail ("NumLegalElts >= DemandedElts.getBitWidth() && \"Vector has been legalized to smaller element count\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4514, __extension__ __PRETTY_FUNCTION__)) | |||
4514 | "Vector has been legalized to smaller element count")(static_cast <bool> (NumLegalElts >= DemandedElts.getBitWidth () && "Vector has been legalized to smaller element count" ) ? void (0) : __assert_fail ("NumLegalElts >= DemandedElts.getBitWidth() && \"Vector has been legalized to smaller element count\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4514, __extension__ __PRETTY_FUNCTION__)); | |||
4515 | ||||
4516 | // If we're extracting elements from a 128-bit subvector lane, | |||
4517 | // we only need to extract each lane once, not for every element. | |||
4518 | if (LegalVectorBitWidth > LaneBitWidth) { | |||
4519 | unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth; | |||
4520 | unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors; | |||
4521 | assert((NumLegalElts % NumLanesTotal) == 0 &&(static_cast <bool> ((NumLegalElts % NumLanesTotal) == 0 && "Unexpected elts per lane") ? void (0) : __assert_fail ("(NumLegalElts % NumLanesTotal) == 0 && \"Unexpected elts per lane\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4522, __extension__ __PRETTY_FUNCTION__)) | |||
4522 | "Unexpected elts per lane")(static_cast <bool> ((NumLegalElts % NumLanesTotal) == 0 && "Unexpected elts per lane") ? void (0) : __assert_fail ("(NumLegalElts % NumLanesTotal) == 0 && \"Unexpected elts per lane\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4522, __extension__ __PRETTY_FUNCTION__)); | |||
4523 | unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal; | |||
4524 | ||||
4525 | // Add cost for each demanded 128-bit subvector extraction. | |||
4526 | // Luckily this is a lot easier than for insertion. | |||
4527 | APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts); | |||
4528 | auto *LaneTy = | |||
4529 | FixedVectorType::get(Ty->getElementType(), NumEltsPerLane); | |||
4530 | ||||
4531 | for (unsigned I = 0; I != NumLanesTotal; ++I) { | |||
4532 | APInt LaneEltMask = WidenedDemandedElts.extractBits( | |||
4533 | NumEltsPerLane, I * NumEltsPerLane); | |||
4534 | if (LaneEltMask.isNullValue()) | |||
4535 | continue; | |||
4536 | Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt, | |||
4537 | CostKind, I * NumEltsPerLane, LaneTy); | |||
4538 | Cost += BaseT::getScalarizationOverhead(LaneTy, LaneEltMask, false, | |||
4539 | Extract); | |||
4540 | } | |||
4541 | ||||
4542 | return Cost; | |||
4543 | } | |||
4544 | } | |||
4545 | ||||
4546 | // Fallback to default extraction. | |||
4547 | Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, false, Extract); | |||
4548 | } | |||
4549 | ||||
4550 | return Cost; | |||
4551 | } | |||
4552 | ||||
/// Compute the cost of a "replication" shuffle: each of the \p VF source
/// elements is repeated \p ReplicationFactor times in the destination,
/// e.g. <a,b> with factor 2 -> <a,a,b,b>. \p DemandedDstElts marks which
/// destination elements are actually used, letting us skip whole shuffles.
InstructionCost
X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
                                      int VF, const APInt &DemandedDstElts,
                                      TTI::TargetCostKind CostKind) {
  const unsigned EltTyBits = DL.getTypeSizeInBits(EltTy);
  // We don't differentiate element types here, only element bit width.
  EltTy = IntegerType::getIntNTy(EltTy->getContext(), EltTyBits);

  // Fall back to the target-independent (scalarization-based) cost model.
  auto bailout = [&]() {
    return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF,
                                            DemandedDstElts, CostKind);
  };

  // For now, only deal with AVX512 cases.
  if (!ST->hasAVX512())
    return bailout();

  // Do we have a native shuffle for this element type, or should we promote?
  // Which cross-lane variable shuffle is available depends on the subtarget:
  // i32/i64 need only AVX512F, i16 needs BWI, i8 needs VBMI, and i1 has no
  // shuffle at all and must always be promoted.
  unsigned PromEltTyBits = EltTyBits;
  switch (EltTyBits) {
  case 32:
  case 64:
    break; // AVX512F.
  case 16:
    if (!ST->hasBWI())
      PromEltTyBits = 32; // promote to i32, AVX512F.
    break;                // AVX512BW
  case 8:
    if (!ST->hasVBMI())
      PromEltTyBits = 32; // promote to i32, AVX512F.
    break;                // AVX512VBMI
  case 1:
    // There is no support for shuffling i1 elements. We *must* promote.
    if (ST->hasBWI()) {
      if (ST->hasVBMI())
        PromEltTyBits = 8; // promote to i8, AVX512VBMI.
      else
        PromEltTyBits = 16; // promote to i16, AVX512BW.
      break;
    }
    PromEltTyBits = 32; // promote to i32, AVX512F.
    break;
  default:
    return bailout();
  }
  auto *PromEltTy = IntegerType::getIntNTy(EltTy->getContext(), PromEltTyBits);

  auto *SrcVecTy = FixedVectorType::get(EltTy, VF);
  auto *PromSrcVecTy = FixedVectorType::get(PromEltTy, VF);

  int NumDstElements = VF * ReplicationFactor;
  auto *PromDstVecTy = FixedVectorType::get(PromEltTy, NumDstElements);
  auto *DstVecTy = FixedVectorType::get(EltTy, NumDstElements);

  // Legalize the types.
  MVT LegalSrcVecTy = getTypeLegalizationCost(SrcVecTy).second;
  MVT LegalPromSrcVecTy = getTypeLegalizationCost(PromSrcVecTy).second;
  MVT LegalPromDstVecTy = getTypeLegalizationCost(PromDstVecTy).second;
  MVT LegalDstVecTy = getTypeLegalizationCost(DstVecTy).second;
  // They should have legalized into vector types.
  if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() ||
      !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector())
    return bailout();

  if (PromEltTyBits != EltTyBits) {
    // If we have to perform the shuffle with wider elt type than our data type,
    // then we will first need to anyext (we don't care about the new bits)
    // the source elements, and then truncate Dst elements.
    // NOTE(review): the extend is modeled with Instruction::SExt even though
    // the comment says "anyext"; presumably a stand-in cost-wise — confirm.
    InstructionCost PromotionCost;
    PromotionCost += getCastInstrCost(
        Instruction::SExt, /*Dst=*/PromSrcVecTy, /*Src=*/SrcVecTy,
        TargetTransformInfo::CastContextHint::None, CostKind);
    PromotionCost +=
        getCastInstrCost(Instruction::Trunc, /*Dst=*/DstVecTy,
                         /*Src=*/PromDstVecTy,
                         TargetTransformInfo::CastContextHint::None, CostKind);
    // Recurse with the promoted element type; bit widths now match, so the
    // recursive call takes the non-promoting path below.
    return PromotionCost + getReplicationShuffleCost(PromEltTy,
                                                     ReplicationFactor, VF,
                                                     DemandedDstElts, CostKind);
  }

  assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits &&
         LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() &&
         "We expect that the legalization doesn't affect the element width, "
         "doesn't coalesce/split elements.");

  unsigned NumEltsPerDstVec = LegalDstVecTy.getVectorNumElements();
  unsigned NumDstVectors =
      divideCeil(DstVecTy->getNumElements(), NumEltsPerDstVec);

  auto *SingleDstVecTy = FixedVectorType::get(EltTy, NumEltsPerDstVec);

  // Not all the produced Dst elements may be demanded. In our case,
  // given that a single Dst vector is formed by a single shuffle,
  // if all elements that will form a single Dst vector aren't demanded,
  // then we won't need to do that shuffle, so adjust the cost accordingly.
  APInt DemandedDstVectors = APIntOps::ScaleBitMask(
      DemandedDstElts.zext(NumDstVectors * NumEltsPerDstVec), NumDstVectors);
  unsigned NumDstVectorsDemanded = DemandedDstVectors.countPopulation();

  // One variable cross-lane shuffle per demanded legalized Dst vector.
  InstructionCost SingleShuffleCost = getShuffleCost(
      TTI::SK_PermuteSingleSrc, SingleDstVecTy, /*Mask=*/std::nullopt, CostKind,
      /*Index=*/0, /*SubTp=*/nullptr);
  return NumDstVectorsDemanded * SingleShuffleCost;
}
4658 | ||||
/// Compute the cost of a plain (unmasked) load or store of type \p Src.
/// For vectors the cost is built up by walking the legalized vector in
/// progressively halved operation widths, charging for the memory ops plus
/// any subvector insert/extract glue needed to stitch the pieces together.
InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                            MaybeAlign Alignment,
                                            unsigned AddressSpace,
                                            TTI::TargetCostKind CostKind,
                                            TTI::OperandValueInfo OpInfo,
                                            const Instruction *I) {
  // TODO: Handle other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput) {
    if (auto *SI = dyn_cast_or_null<StoreInst>(I)) {
      // Store instruction with index and scale costs 2 Uops.
      // Check the preceding GEP to identify non-const indices.
      if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) {
        if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
          return TTI::TCC_Basic * 2;
      }
    }
    return TTI::TCC_Basic;
  }

  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
         "Invalid Opcode");
  // Type legalization can't handle structs
  if (TLI->getValueType(DL, Src, true) == MVT::Other)
    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                  CostKind);

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);

  auto *VTy = dyn_cast<FixedVectorType>(Src);

  InstructionCost Cost = 0;

  // Add a cost for constant load to vector.
  if (Opcode == Instruction::Store && OpInfo.isConstant())
    Cost += getMemoryOpCost(Instruction::Load, Src, DL.getABITypeAlign(Src),
                            /*AddressSpace=*/0, CostKind);

  // Handle the simple case of non-vectors.
  // NOTE: this assumes that legalization never creates vector from scalars!
  if (!VTy || !LT.second.isVector()) {
    // Each load/store unit costs 1.
    return (LT.second.isFloatingPoint() ? Cost : 0) + LT.first * 1;
  }

  bool IsLoad = Opcode == Instruction::Load;

  Type *EltTy = VTy->getElementType();

  const int EltTyBits = DL.getTypeSizeInBits(EltTy);

  // Source of truth: how many elements were there in the original IR vector?
  const unsigned SrcNumElt = VTy->getNumElements();

  // How far have we gotten?
  int NumEltRemaining = SrcNumElt;
  // Note that we intentionally capture by-reference, NumEltRemaining changes.
  auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; };

  const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8);

  // Note that even if we can store 64 bits of an XMM, we still operate on XMM.
  const unsigned XMMBits = 128;
  if (XMMBits % EltTyBits != 0)
    // Vector size must be a multiple of the element size. I.e. no padding.
    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                  CostKind);
  const int NumEltPerXMM = XMMBits / EltTyBits;

  auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM);

  // Outer loop halves the operation width; inner loop issues as many ops of
  // the current width as the remaining element count (and alignment) allows.
  // SubVecEltsLeft tracks how many elements of the current legalized subreg
  // remain to be covered before a new insert/extract is needed; it carries
  // over between widths, which is why it is declared in the for-init.
  for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0;
       NumEltRemaining > 0; CurrOpSizeBytes /= 2) {
    // How many elements would a single op deal with at once?
    if ((8 * CurrOpSizeBytes) % EltTyBits != 0)
      // Vector size must be a multiple of the element size. I.e. no padding.
      return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                    CostKind);
    int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits;

    assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?");
    assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||
            (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&
           "Unless we haven't halved the op size yet, "
           "we have less than two op's sized units of work left.");

    auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM
                          ? FixedVectorType::get(EltTy, CurrNumEltPerOp)
                          : XMMVecTy;

    assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&
           "After halving sizes, the vector elt count is no longer a multiple "
           "of number of elements per operation?");
    // For sub-element-sized ops, model the op as one wide integer element
    // so insertion/extraction cost is charged per op, not per IR element.
    auto *CoalescedVecTy =
        CurrNumEltPerOp == 1
            ? CurrVecTy
            : FixedVectorType::get(
                  IntegerType::get(Src->getContext(),
                                   EltTyBits * CurrNumEltPerOp),
                  CurrVecTy->getNumElements() / CurrNumEltPerOp);
    assert(DL.getTypeSizeInBits(CoalescedVecTy) ==
               DL.getTypeSizeInBits(CurrVecTy) &&
           "coalesciing elements doesn't change vector width.");

    while (NumEltRemaining > 0) {
      assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumtion?");

      // Can we use this vector size, as per the remaining element count?
      // Iff the vector is naturally aligned, we can do a wide load regardless.
      if (NumEltRemaining < CurrNumEltPerOp &&
          (!IsLoad || Alignment.valueOrOne() < CurrOpSizeBytes) &&
          CurrOpSizeBytes != 1)
        break; // Try smalled vector size.

      bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;

      // If we have fully processed the previous reg, we need to replenish it.
      if (SubVecEltsLeft == 0) {
        SubVecEltsLeft += CurrVecTy->getNumElements();
        // And that's free only for the 0'th subvector of a legalized vector.
        if (!Is0thSubVec)
          Cost += getShuffleCost(IsLoad ? TTI::ShuffleKind::SK_InsertSubvector
                                        : TTI::ShuffleKind::SK_ExtractSubvector,
                                 VTy, std::nullopt, CostKind, NumEltDone(),
                                 CurrVecTy);
      }

      // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
      // for smaller widths (32/16/8) we have to insert/extract them separately.
      // Again, it's free for the 0'th subreg (if op is 32/64 bit wide,
      // but let's pretend that it is also true for 16/8 bit wide ops...)
      if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) {
        int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM;
        assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "");
        int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp;
        APInt DemandedElts =
            APInt::getBitsSet(CoalescedVecTy->getNumElements(),
                              CoalescedVecEltIdx, CoalescedVecEltIdx + 1);
        assert(DemandedElts.countPopulation() == 1 && "Inserting single value");
        Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad,
                                         !IsLoad);
      }

      // This isn't exactly right. We're using slow unaligned 32-byte accesses
      // as a proxy for a double-pumped AVX memory interface such as on
      // Sandybridge.
      if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
        Cost += 2;
      else
        Cost += 1;

      SubVecEltsLeft -= CurrNumEltPerOp;
      NumEltRemaining -= CurrNumEltPerOp;
      // The tail of the access is at least this aligned from here on.
      Alignment = commonAlignment(Alignment.valueOrOne(), CurrOpSizeBytes);
    }
  }

  assert(NumEltRemaining <= 0 && "Should have processed all the elements.");

  return Cost;
}
4820 | ||||
4821 | InstructionCost | |||
4822 | X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment, | |||
4823 | unsigned AddressSpace, | |||
4824 | TTI::TargetCostKind CostKind) { | |||
4825 | bool IsLoad = (Instruction::Load == Opcode); | |||
4826 | bool IsStore = (Instruction::Store == Opcode); | |||
4827 | ||||
4828 | auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy); | |||
4829 | if (!SrcVTy) | |||
4830 | // To calculate scalar take the regular cost, without mask | |||
4831 | return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind); | |||
4832 | ||||
4833 | unsigned NumElem = SrcVTy->getNumElements(); | |||
4834 | auto *MaskTy = | |||
4835 | FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem); | |||
4836 | if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) || | |||
4837 | (IsStore && !isLegalMaskedStore(SrcVTy, Alignment))) { | |||
4838 | // Scalarization | |||
4839 | APInt DemandedElts = APInt::getAllOnes(NumElem); | |||
4840 | InstructionCost MaskSplitCost = | |||
4841 | getScalarizationOverhead(MaskTy, DemandedElts, false, true); | |||
4842 | InstructionCost ScalarCompareCost = getCmpSelInstrCost( | |||
4843 | Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr, | |||
4844 | CmpInst::BAD_ICMP_PREDICATE, CostKind); | |||
4845 | InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind); | |||
4846 | InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost); | |||
4847 | InstructionCost ValueSplitCost = | |||
4848 | getScalarizationOverhead(SrcVTy, DemandedElts, IsLoad, IsStore); | |||
4849 | InstructionCost MemopCost = | |||
4850 | NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(), | |||
4851 | Alignment, AddressSpace, CostKind); | |||
4852 | return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost; | |||
4853 | } | |||
4854 | ||||
4855 | // Legalize the type. | |||
4856 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcVTy); | |||
4857 | auto VT = TLI->getValueType(DL, SrcVTy); | |||
4858 | InstructionCost Cost = 0; | |||
4859 | if (VT.isSimple() && LT.second != VT.getSimpleVT() && | |||
4860 | LT.second.getVectorNumElements() == NumElem) | |||
4861 | // Promotion requires extend/truncate for data and a shuffle for mask. | |||
4862 | Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, std::nullopt, | |||
4863 | CostKind, 0, nullptr) + | |||
4864 | getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, std::nullopt, | |||
4865 | CostKind, 0, nullptr); | |||
4866 | ||||
4867 | else if (LT.first * LT.second.getVectorNumElements() > NumElem) { | |||
4868 | auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(), | |||
4869 | LT.second.getVectorNumElements()); | |||
4870 | // Expanding requires fill mask with zeroes | |||
4871 | Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, std::nullopt, | |||
4872 | CostKind, 0, MaskTy); | |||
4873 | } | |||
4874 | ||||
4875 | // Pre-AVX512 - each maskmov load costs 2 + store costs ~8. | |||
4876 | if (!ST->hasAVX512()) | |||
4877 | return Cost + LT.first * (IsLoad ? 2 : 8); | |||
4878 | ||||
4879 | // AVX-512 masked load/store is cheaper | |||
4880 | return Cost + LT.first; | |||
4881 | } | |||
4882 | ||||
4883 | InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty, | |||
4884 | ScalarEvolution *SE, | |||
4885 | const SCEV *Ptr) { | |||
4886 | // Address computations in vectorized code with non-consecutive addresses will | |||
4887 | // likely result in more instructions compared to scalar code where the | |||
4888 | // computation can more often be merged into the index mode. The resulting | |||
4889 | // extra micro-ops can significantly decrease throughput. | |||
4890 | const unsigned NumVectorInstToHideOverhead = 10; | |||
4891 | ||||
4892 | // Cost modeling of Strided Access Computation is hidden by the indexing | |||
4893 | // modes of X86 regardless of the stride value. We dont believe that there | |||
4894 | // is a difference between constant strided access in gerenal and constant | |||
4895 | // strided value which is less than or equal to 64. | |||
4896 | // Even in the case of (loop invariant) stride whose value is not known at | |||
4897 | // compile time, the address computation will not incur more than one extra | |||
4898 | // ADD instruction. | |||
4899 | if (Ty->isVectorTy() && SE && !ST->hasAVX2()) { | |||
4900 | // TODO: AVX2 is the current cut-off because we don't have correct | |||
4901 | // interleaving costs for prior ISA's. | |||
4902 | if (!BaseT::isStridedAccess(Ptr)) | |||
4903 | return NumVectorInstToHideOverhead; | |||
4904 | if (!BaseT::getConstantStrideStep(SE, Ptr)) | |||
4905 | return 1; | |||
4906 | } | |||
4907 | ||||
4908 | return BaseT::getAddressComputationCost(Ty, SE, Ptr); | |||
4909 | } | |||
4910 | ||||
4911 | InstructionCost | |||
4912 | X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, | |||
4913 | std::optional<FastMathFlags> FMF, | |||
4914 | TTI::TargetCostKind CostKind) { | |||
4915 | if (TTI::requiresOrderedReduction(FMF)) | |||
4916 | return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); | |||
4917 | ||||
4918 | // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput | |||
4919 | // and make it as the cost. | |||
4920 | ||||
4921 | static const CostTblEntry SLMCostTblNoPairWise[] = { | |||
4922 | { ISD::FADD, MVT::v2f64, 3 }, | |||
4923 | { ISD::ADD, MVT::v2i64, 5 }, | |||
4924 | }; | |||
4925 | ||||
4926 | static const CostTblEntry SSE2CostTblNoPairWise[] = { | |||
4927 | { ISD::FADD, MVT::v2f64, 2 }, | |||
4928 | { ISD::FADD, MVT::v2f32, 2 }, | |||
4929 | { ISD::FADD, MVT::v4f32, 4 }, | |||
4930 | { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6". | |||
4931 | { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32 | |||
4932 | { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3". | |||
4933 | { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3". | |||
4934 | { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3". | |||
4935 | { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3". | |||
4936 | { ISD::ADD, MVT::v2i8, 2 }, | |||
4937 | { ISD::ADD, MVT::v4i8, 2 }, | |||
4938 | { ISD::ADD, MVT::v8i8, 2 }, | |||
4939 | { ISD::ADD, MVT::v16i8, 3 }, | |||
4940 | }; | |||
4941 | ||||
4942 | static const CostTblEntry AVX1CostTblNoPairWise[] = { | |||
4943 | { ISD::FADD, MVT::v4f64, 3 }, | |||
4944 | { ISD::FADD, MVT::v4f32, 3 }, | |||
4945 | { ISD::FADD, MVT::v8f32, 4 }, | |||
4946 | { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5". | |||
4947 | { ISD::ADD, MVT::v4i64, 3 }, | |||
4948 | { ISD::ADD, MVT::v8i32, 5 }, | |||
4949 | { ISD::ADD, MVT::v16i16, 5 }, | |||
4950 | { ISD::ADD, MVT::v32i8, 4 }, | |||
4951 | }; | |||
4952 | ||||
4953 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | |||
4954 | assert(ISD && "Invalid opcode")(static_cast <bool> (ISD && "Invalid opcode") ? void (0) : __assert_fail ("ISD && \"Invalid opcode\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4954, __extension__ __PRETTY_FUNCTION__)); | |||
4955 | ||||
4956 | // Before legalizing the type, give a chance to look up illegal narrow types | |||
4957 | // in the table. | |||
4958 | // FIXME: Is there a better way to do this? | |||
4959 | EVT VT = TLI->getValueType(DL, ValTy); | |||
4960 | if (VT.isSimple()) { | |||
4961 | MVT MTy = VT.getSimpleVT(); | |||
4962 | if (ST->useSLMArithCosts()) | |||
4963 | if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy)) | |||
4964 | return Entry->Cost; | |||
4965 | ||||
4966 | if (ST->hasAVX()) | |||
4967 | if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) | |||
4968 | return Entry->Cost; | |||
4969 | ||||
4970 | if (ST->hasSSE2()) | |||
4971 | if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy)) | |||
4972 | return Entry->Cost; | |||
4973 | } | |||
4974 | ||||
4975 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy); | |||
4976 | ||||
4977 | MVT MTy = LT.second; | |||
4978 | ||||
4979 | auto *ValVTy = cast<FixedVectorType>(ValTy); | |||
4980 | ||||
4981 | // Special case: vXi8 mul reductions are performed as vXi16. | |||
4982 | if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) { | |||
4983 | auto *WideSclTy = IntegerType::get(ValVTy->getContext(), 16); | |||
4984 | auto *WideVecTy = FixedVectorType::get(WideSclTy, ValVTy->getNumElements()); | |||
4985 | return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy, | |||
4986 | TargetTransformInfo::CastContextHint::None, | |||
4987 | CostKind) + | |||
4988 | getArithmeticReductionCost(Opcode, WideVecTy, FMF, CostKind); | |||
4989 | } | |||
4990 | ||||
4991 | InstructionCost ArithmeticCost = 0; | |||
4992 | if (LT.first != 1 && MTy.isVector() && | |||
4993 | MTy.getVectorNumElements() < ValVTy->getNumElements()) { | |||
4994 | // Type needs to be split. We need LT.first - 1 arithmetic ops. | |||
4995 | auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(), | |||
4996 | MTy.getVectorNumElements()); | |||
4997 | ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind); | |||
4998 | ArithmeticCost *= LT.first - 1; | |||
4999 | } | |||
5000 | ||||
5001 | if (ST->useSLMArithCosts()) | |||
5002 | if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy)) | |||
5003 | return ArithmeticCost + Entry->Cost; | |||
5004 | ||||
5005 | if (ST->hasAVX()) | |||
5006 | if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) | |||
5007 | return ArithmeticCost + Entry->Cost; | |||
5008 | ||||
5009 | if (ST->hasSSE2()) | |||
5010 | if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy)) | |||
5011 | return ArithmeticCost + Entry->Cost; | |||
5012 | ||||
5013 | // FIXME: These assume a naive kshift+binop lowering, which is probably | |||
5014 | // conservative in most cases. | |||
5015 | static const CostTblEntry AVX512BoolReduction[] = { | |||
5016 | { ISD::AND, MVT::v2i1, 3 }, | |||
5017 | { ISD::AND, MVT::v4i1, 5 }, | |||
5018 | { ISD::AND, MVT::v8i1, 7 }, | |||
5019 | { ISD::AND, MVT::v16i1, 9 }, | |||
5020 | { ISD::AND, MVT::v32i1, 11 }, | |||
5021 | { ISD::AND, MVT::v64i1, 13 }, | |||
5022 | { ISD::OR, MVT::v2i1, 3 }, | |||
5023 | { ISD::OR, MVT::v4i1, 5 }, | |||
5024 | { ISD::OR, MVT::v8i1, 7 }, | |||
5025 | { ISD::OR, MVT::v16i1, 9 }, | |||
5026 | { ISD::OR, MVT::v32i1, 11 }, | |||
5027 | { ISD::OR, MVT::v64i1, 13 }, | |||
5028 | }; | |||
5029 | ||||
5030 | static const CostTblEntry AVX2BoolReduction[] = { | |||
5031 | { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp | |||
5032 | { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp | |||
5033 | { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp | |||
5034 | { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp | |||
5035 | }; | |||
5036 | ||||
5037 | static const CostTblEntry AVX1BoolReduction[] = { | |||
5038 | { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp | |||
5039 | { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp | |||
5040 | { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp | |||
5041 | { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp | |||
5042 | { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp | |||
5043 | { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp | |||
5044 | { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp | |||
5045 | { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp | |||
5046 | }; | |||
5047 | ||||
5048 | static const CostTblEntry SSE2BoolReduction[] = { | |||
5049 | { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp | |||
5050 | { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp | |||
5051 | { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp | |||
5052 | { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp | |||
5053 | { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp | |||
5054 | { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp | |||
5055 | { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp | |||
5056 | { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp | |||
5057 | }; | |||
5058 | ||||
5059 | // Handle bool allof/anyof patterns. | |||
5060 | if (ValVTy->getElementType()->isIntegerTy(1)) { | |||
5061 | InstructionCost ArithmeticCost = 0; | |||
5062 | if (LT.first != 1 && MTy.isVector() && | |||
5063 | MTy.getVectorNumElements() < ValVTy->getNumElements()) { | |||
5064 | // Type needs to be split. We need LT.first - 1 arithmetic ops. | |||
5065 | auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(), | |||
5066 | MTy.getVectorNumElements()); | |||
5067 | ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind); | |||
5068 | ArithmeticCost *= LT.first - 1; | |||
5069 | } | |||
5070 | ||||
5071 | if (ST->hasAVX512()) | |||
5072 | if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy)) | |||
5073 | return ArithmeticCost + Entry->Cost; | |||
5074 | if (ST->hasAVX2()) | |||
5075 | if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy)) | |||
5076 | return ArithmeticCost + Entry->Cost; | |||
5077 | if (ST->hasAVX()) | |||
5078 | if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy)) | |||
5079 | return ArithmeticCost + Entry->Cost; | |||
5080 | if (ST->hasSSE2()) | |||
5081 | if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy)) | |||
5082 | return ArithmeticCost + Entry->Cost; | |||
5083 | ||||
5084 | return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind); | |||
5085 | } | |||
5086 | ||||
5087 | unsigned NumVecElts = ValVTy->getNumElements(); | |||
5088 | unsigned ScalarSize = ValVTy->getScalarSizeInBits(); | |||
5089 | ||||
5090 | // Special case power of 2 reductions where the scalar type isn't changed | |||
5091 | // by type legalization. | |||
5092 | if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits()) | |||
5093 | return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind); | |||
5094 | ||||
5095 | InstructionCost ReductionCost = 0; | |||
5096 | ||||
5097 | auto *Ty = ValVTy; | |||
5098 | if (LT.first != 1 && MTy.isVector() && | |||
5099 | MTy.getVectorNumElements() < ValVTy->getNumElements()) { | |||
5100 | // Type needs to be split. We need LT.first - 1 arithmetic ops. | |||
5101 | Ty = FixedVectorType::get(ValVTy->getElementType(), | |||
5102 | MTy.getVectorNumElements()); | |||
5103 | ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind); | |||
5104 | ReductionCost *= LT.first - 1; | |||
5105 | NumVecElts = MTy.getVectorNumElements(); | |||
5106 | } | |||
5107 | ||||
5108 | // Now handle reduction with the legal type, taking into account size changes | |||
5109 | // at each level. | |||
5110 | while (NumVecElts > 1) { | |||
5111 | // Determine the size of the remaining vector we need to reduce. | |||
5112 | unsigned Size = NumVecElts * ScalarSize; | |||
5113 | NumVecElts /= 2; | |||
5114 | // If we're reducing from 256/512 bits, use an extract_subvector. | |||
5115 | if (Size > 128) { | |||
5116 | auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts); | |||
5117 | ReductionCost += | |||
5118 | getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt, CostKind, | |||
5119 | NumVecElts, SubTy); | |||
5120 | Ty = SubTy; | |||
5121 | } else if (Size == 128) { | |||
5122 | // Reducing from 128 bits is a permute of v2f64/v2i64. | |||
5123 | FixedVectorType *ShufTy; | |||
5124 | if (ValVTy->isFloatingPointTy()) | |||
5125 | ShufTy = | |||
5126 | FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2); | |||
5127 | else | |||
5128 | ShufTy = | |||
5129 | FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2); | |||
5130 | ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, | |||
5131 | std::nullopt, CostKind, 0, nullptr); | |||
5132 | } else if (Size == 64) { | |||
5133 | // Reducing from 64 bits is a shuffle of v4f32/v4i32. | |||
5134 | FixedVectorType *ShufTy; | |||
5135 | if (ValVTy->isFloatingPointTy()) | |||
5136 | ShufTy = | |||
5137 | FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4); | |||
5138 | else | |||
5139 | ShufTy = | |||
5140 | FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4); | |||
5141 | ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, | |||
5142 | std::nullopt, CostKind, 0, nullptr); | |||
5143 | } else { | |||
5144 | // Reducing from smaller size is a shift by immediate. | |||
5145 | auto *ShiftTy = FixedVectorType::get( | |||
5146 | Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size); | |||
5147 | ReductionCost += getArithmeticInstrCost( | |||
5148 | Instruction::LShr, ShiftTy, CostKind, | |||
5149 | {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, | |||
5150 | {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None}); | |||
5151 | } | |||
5152 | ||||
5153 | // Add the arithmetic op for this level. | |||
5154 | ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind); | |||
5155 | } | |||
5156 | ||||
5157 | // Add the final extract element to the cost. | |||
5158 | return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0); | |||
5159 | } | |||
5160 | ||||
/// Return the cost of a single (non-reduction) min/max operation on \p Ty.
/// For integer types \p IsUnsigned selects UMIN vs SMIN; FP types use
/// FMINNUM. Only MIN entries are tabulated and the lookup key is always a
/// MIN opcode — presumably the matching MAX instruction costs the same on
/// x86 (min/max instructions come in pairs); confirm if a caller ever needs
/// an asymmetric max cost.
/// \p CondTy is only used by the cmp+select fallback at the bottom.
InstructionCost X86TTIImpl::getMinMaxCost(Type *Ty, Type *CondTy,
                                          bool IsUnsigned) {
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

  MVT MTy = LT.second;

  // Map the request onto a MIN ISD opcode used as the cost-table key.
  int ISD;
  if (Ty->isIntOrIntVectorTy()) {
    ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN;
  } else {
    assert(Ty->isFPOrFPVectorTy() &&(static_cast <bool> (Ty->isFPOrFPVectorTy() && "Expected float point or integer vector type.") ? void (0) : __assert_fail ("Ty->isFPOrFPVectorTy() && \"Expected float point or integer vector type.\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 5172, __extension__ __PRETTY_FUNCTION__))
           "Expected float point or integer vector type.")(static_cast <bool> (Ty->isFPOrFPVectorTy() && "Expected float point or integer vector type.") ? void (0) : __assert_fail ("Ty->isFPOrFPVectorTy() && \"Expected float point or integer vector type.\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 5172, __extension__ __PRETTY_FUNCTION__));
    ISD = ISD::FMINNUM;
  }

  // Per-feature-level tables of native min/max costs. Per the file header,
  // each cost models the first CPU generation that provided the feature.
  static const CostTblEntry SSE1CostTbl[] = {
      {ISD::FMINNUM, MVT::v4f32, 1},
  };

  static const CostTblEntry SSE2CostTbl[] = {
      {ISD::FMINNUM, MVT::v2f64, 1},
      {ISD::SMIN, MVT::v8i16, 1},
      {ISD::UMIN, MVT::v16i8, 1},
  };

  static const CostTblEntry SSE41CostTbl[] = {
      {ISD::SMIN, MVT::v4i32, 1},
      {ISD::UMIN, MVT::v4i32, 1},
      {ISD::UMIN, MVT::v8i16, 1},
      {ISD::SMIN, MVT::v16i8, 1},
  };

  static const CostTblEntry SSE42CostTbl[] = {
      {ISD::UMIN, MVT::v2i64, 3}, // xor+pcmpgtq+blendvpd
  };

  static const CostTblEntry AVX1CostTbl[] = {
      {ISD::FMINNUM, MVT::v8f32, 1},
      {ISD::FMINNUM, MVT::v4f64, 1},
      {ISD::SMIN, MVT::v8i32, 3},
      {ISD::UMIN, MVT::v8i32, 3},
      {ISD::SMIN, MVT::v16i16, 3},
      {ISD::UMIN, MVT::v16i16, 3},
      {ISD::SMIN, MVT::v32i8, 3},
      {ISD::UMIN, MVT::v32i8, 3},
  };

  static const CostTblEntry AVX2CostTbl[] = {
      {ISD::SMIN, MVT::v8i32, 1},
      {ISD::UMIN, MVT::v8i32, 1},
      {ISD::SMIN, MVT::v16i16, 1},
      {ISD::UMIN, MVT::v16i16, 1},
      {ISD::SMIN, MVT::v32i8, 1},
      {ISD::UMIN, MVT::v32i8, 1},
  };

  static const CostTblEntry AVX512CostTbl[] = {
      {ISD::FMINNUM, MVT::v16f32, 1},
      {ISD::FMINNUM, MVT::v8f64, 1},
      {ISD::SMIN, MVT::v2i64, 1},
      {ISD::UMIN, MVT::v2i64, 1},
      {ISD::SMIN, MVT::v4i64, 1},
      {ISD::UMIN, MVT::v4i64, 1},
      {ISD::SMIN, MVT::v8i64, 1},
      {ISD::UMIN, MVT::v8i64, 1},
      {ISD::SMIN, MVT::v16i32, 1},
      {ISD::UMIN, MVT::v16i32, 1},
  };

  static const CostTblEntry AVX512BWCostTbl[] = {
      {ISD::SMIN, MVT::v32i16, 1},
      {ISD::UMIN, MVT::v32i16, 1},
      {ISD::SMIN, MVT::v64i8, 1},
      {ISD::UMIN, MVT::v64i8, 1},
  };

  // If we have a native MIN/MAX instruction for this type, use it.
  // Probe from the most capable feature level downward; the first table hit
  // wins. LT.first scales the cost by the number of legalized pieces.
  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasSSE42())
    if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasSSE41())
    if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasSSE1())
    if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  // No native instruction: pick the compare opcode matching the type class.
  unsigned CmpOpcode;
  if (Ty->isFPOrFPVectorTy()) {
    CmpOpcode = Instruction::FCmp;
  } else {
    assert(Ty->isIntOrIntVectorTy() &&(static_cast <bool> (Ty->isIntOrIntVectorTy() && "expecting floating point or integer type for min/max reduction" ) ? void (0) : __assert_fail ("Ty->isIntOrIntVectorTy() && \"expecting floating point or integer type for min/max reduction\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 5275, __extension__ __PRETTY_FUNCTION__))
        "expecting floating point or integer type for min/max reduction")(static_cast <bool> (Ty->isIntOrIntVectorTy() && "expecting floating point or integer type for min/max reduction" ) ? void (0) : __assert_fail ("Ty->isIntOrIntVectorTy() && \"expecting floating point or integer type for min/max reduction\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 5275, __extension__ __PRETTY_FUNCTION__));
    CmpOpcode = Instruction::ICmp;
  }

  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  // Otherwise fall back to cmp+select.
  // BAD_ICMP_PREDICATE requests a generic (predicate-agnostic) compare cost.
  InstructionCost Result =
      getCmpSelInstrCost(CmpOpcode, Ty, CondTy, CmpInst::BAD_ICMP_PREDICATE,
                         CostKind) +
      getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
                         CmpInst::BAD_ICMP_PREDICATE, CostKind);
  return Result;
}
5288 | ||||
/// Cost of a horizontal min/max reduction of \p ValTy down to one scalar.
/// Modelled as: optional splits of an illegal wide type (LT.first - 1 extra
/// min/max ops), then a log2(NumElts) ladder of shuffle + min/max steps on
/// the legal type, then a final extractelement.
/// \p CondTy supplies the select-condition element type for getMinMaxCost.
InstructionCost
X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy,
                                   bool IsUnsigned,
                                   TTI::TargetCostKind CostKind) {
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);

  MVT MTy = LT.second;

  // Table key: only MIN opcodes are tabulated below.
  int ISD;
  if (ValTy->isIntOrIntVectorTy()) {
    ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN;

  } else {
    assert(ValTy->isFPOrFPVectorTy() &&(static_cast <bool> (ValTy->isFPOrFPVectorTy() && "Expected float point or integer vector type.") ? void (0) : __assert_fail ("ValTy->isFPOrFPVectorTy() && \"Expected float point or integer vector type.\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 5302, __extension__ __PRETTY_FUNCTION__))
           "Expected float point or integer vector type.")(static_cast <bool> (ValTy->isFPOrFPVectorTy() && "Expected float point or integer vector type.") ? void (0) : __assert_fail ("ValTy->isFPOrFPVectorTy() && \"Expected float point or integer vector type.\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 5302, __extension__ __PRETTY_FUNCTION__));
    ISD = ISD::FMINNUM;
  }

  // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
  // and make it as the cost.

  static const CostTblEntry SSE2CostTblNoPairWise[] = {
      {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw
      {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw
      {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw
  };

  static const CostTblEntry SSE41CostTblNoPairWise[] = {
      {ISD::SMIN, MVT::v2i16, 3}, // same as sse2
      {ISD::SMIN, MVT::v4i16, 5}, // same as sse2
      {ISD::UMIN, MVT::v2i16, 5}, // same as sse2
      {ISD::UMIN, MVT::v4i16, 7}, // same as sse2
      {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor
      {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax
      {ISD::SMIN, MVT::v2i8, 3},  // pminsb
      {ISD::SMIN, MVT::v4i8, 5},  // pminsb
      {ISD::SMIN, MVT::v8i8, 7},  // pminsb
      {ISD::SMIN, MVT::v16i8, 6},
      {ISD::UMIN, MVT::v2i8, 3},  // same as sse2
      {ISD::UMIN, MVT::v4i8, 5},  // same as sse2
      {ISD::UMIN, MVT::v8i8, 7},  // same as sse2
      {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax
  };

  static const CostTblEntry AVX1CostTblNoPairWise[] = {
      {ISD::SMIN, MVT::v16i16, 6},
      {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax
      {ISD::SMIN, MVT::v32i8, 8},
      {ISD::UMIN, MVT::v32i8, 8},
  };

  static const CostTblEntry AVX512BWCostTblNoPairWise[] = {
      {ISD::SMIN, MVT::v32i16, 8},
      {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax
      {ISD::SMIN, MVT::v64i8, 10},
      {ISD::UMIN, MVT::v64i8, 10},
  };

  // Before legalizing the type, give a chance to look up illegal narrow types
  // in the table.
  // FIXME: Is there a better way to do this?
  EVT VT = TLI->getValueType(DL, ValTy);
  if (VT.isSimple()) {
    MVT MTy = VT.getSimpleVT(); // NB: shadows the outer (legalized) MTy.
    if (ST->hasBWI())
      if (const auto *Entry = CostTableLookup(AVX512BWCostTblNoPairWise, ISD, MTy))
        return Entry->Cost;

    if (ST->hasAVX())
      if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
        return Entry->Cost;

    if (ST->hasSSE41())
      if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy))
        return Entry->Cost;

    if (ST->hasSSE2())
      if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
        return Entry->Cost;
  }

  auto *ValVTy = cast<FixedVectorType>(ValTy);
  unsigned NumVecElts = ValVTy->getNumElements();

  auto *Ty = ValVTy;
  InstructionCost MinMaxCost = 0;
  if (LT.first != 1 && MTy.isVector() &&
      MTy.getVectorNumElements() < ValVTy->getNumElements()) {
    // Type needs to be split. We need LT.first - 1 operations ops.
    Ty = FixedVectorType::get(ValVTy->getElementType(),
                              MTy.getVectorNumElements());
    auto *SubCondTy = FixedVectorType::get(CondTy->getElementType(),
                                           MTy.getVectorNumElements());
    MinMaxCost = getMinMaxCost(Ty, SubCondTy, IsUnsigned);
    MinMaxCost *= LT.first - 1;
    NumVecElts = MTy.getVectorNumElements();
  }

  // Post-legalization table lookups: any split cost computed above is added
  // on top of the tabulated cost for the legal type.
  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTblNoPairWise, ISD, MTy))
      return MinMaxCost + Entry->Cost;

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
      return MinMaxCost + Entry->Cost;

  if (ST->hasSSE41())
    if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy))
      return MinMaxCost + Entry->Cost;

  if (ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
      return MinMaxCost + Entry->Cost;

  unsigned ScalarSize = ValTy->getScalarSizeInBits();

  // Special case power of 2 reductions where the scalar type isn't changed
  // by type legalization.
  if (!isPowerOf2_32(ValVTy->getNumElements()) ||
      ScalarSize != MTy.getScalarSizeInBits())
    return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsUnsigned, CostKind);

  // Now handle reduction with the legal type, taking into account size changes
  // at each level. Each iteration halves NumVecElts (power of 2 guaranteed
  // above), pairing a shuffle/extract with one min/max.
  while (NumVecElts > 1) {
    // Determine the size of the remaining vector we need to reduce.
    unsigned Size = NumVecElts * ScalarSize;
    NumVecElts /= 2;
    // If we're reducing from 256/512 bits, use an extract_subvector.
    if (Size > 128) {
      auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
      MinMaxCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
                                   CostKind, NumVecElts, SubTy);
      Ty = SubTy;
    } else if (Size == 128) {
      // Reducing from 128 bits is a permute of v2f64/v2i64.
      // NOTE(review): ValTy is a VectorType, so isFloatingPointTy() is always
      // false here — FP reductions take the integer ShufTy path. Likely meant
      // isFPOrFPVectorTy(); cost impact is probably nil, but confirm upstream.
      VectorType *ShufTy;
      if (ValTy->isFloatingPointTy())
        ShufTy =
            FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2);
      else
        ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
      MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
                                   std::nullopt, CostKind, 0, nullptr);
    } else if (Size == 64) {
      // Reducing from 64 bits is a shuffle of v4f32/v4i32.
      FixedVectorType *ShufTy;
      if (ValTy->isFloatingPointTy())
        ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4);
      else
        ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
      MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
                                   std::nullopt, CostKind, 0, nullptr);
    } else {
      // Reducing from smaller size is a shift by immediate.
      auto *ShiftTy = FixedVectorType::get(
          Type::getIntNTy(ValTy->getContext(), Size), 128 / Size);
      MinMaxCost += getArithmeticInstrCost(
          Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput,
          {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
          {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None});
    }

    // Add the arithmetic op for this level.
    auto *SubCondTy =
        FixedVectorType::get(CondTy->getElementType(), Ty->getNumElements());
    MinMaxCost += getMinMaxCost(Ty, SubCondTy, IsUnsigned);
  }

  // Add the final extract element to the cost.
  return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
}
5460 | ||||
5461 | /// Calculate the cost of materializing a 64-bit value. This helper | |||
5462 | /// method might only calculate a fraction of a larger immediate. Therefore it | |||
5463 | /// is valid to return a cost of ZERO. | |||
5464 | InstructionCost X86TTIImpl::getIntImmCost(int64_t Val) { | |||
5465 | if (Val == 0) | |||
5466 | return TTI::TCC_Free; | |||
5467 | ||||
5468 | if (isInt<32>(Val)) | |||
5469 | return TTI::TCC_Basic; | |||
5470 | ||||
5471 | return 2 * TTI::TCC_Basic; | |||
5472 | } | |||
5473 | ||||
5474 | InstructionCost X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, | |||
5475 | TTI::TargetCostKind CostKind) { | |||
5476 | assert(Ty->isIntegerTy())(static_cast <bool> (Ty->isIntegerTy()) ? void (0) : __assert_fail ("Ty->isIntegerTy()", "llvm/lib/Target/X86/X86TargetTransformInfo.cpp" , 5476, __extension__ __PRETTY_FUNCTION__)); | |||
5477 | ||||
5478 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); | |||
5479 | if (BitSize == 0) | |||
5480 | return ~0U; | |||
5481 | ||||
5482 | // Never hoist constants larger than 128bit, because this might lead to | |||
5483 | // incorrect code generation or assertions in codegen. | |||
5484 | // Fixme: Create a cost model for types larger than i128 once the codegen | |||
5485 | // issues have been fixed. | |||
5486 | if (BitSize > 128) | |||
5487 | return TTI::TCC_Free; | |||
5488 | ||||
5489 | if (Imm == 0) | |||
5490 | return TTI::TCC_Free; | |||
5491 | ||||
5492 | // Sign-extend all constants to a multiple of 64-bit. | |||
5493 | APInt ImmVal = Imm; | |||
5494 | if (BitSize % 64 != 0) | |||
5495 | ImmVal = Imm.sext(alignTo(BitSize, 64)); | |||
5496 | ||||
5497 | // Split the constant into 64-bit chunks and calculate the cost for each | |||
5498 | // chunk. | |||
5499 | InstructionCost Cost = 0; | |||
5500 | for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) { | |||
5501 | APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64); | |||
5502 | int64_t Val = Tmp.getSExtValue(); | |||
5503 | Cost += getIntImmCost(Val); | |||
5504 | } | |||
5505 | // We need at least one instruction to materialize the constant. | |||
5506 | return std::max<InstructionCost>(1, Cost); | |||
5507 | } | |||
5508 | ||||
/// Cost of immediate \p Imm appearing as operand \p Idx of an instruction
/// with opcode \p Opcode. Returns TCC_Free when the backend can encode the
/// immediate directly (so constant hoisting should leave it in place), and
/// the full materialization cost otherwise.
InstructionCost X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                              const APInt &Imm, Type *Ty,
                                              TTI::TargetCostKind CostKind,
                                              Instruction *Inst) {
  assert(Ty->isIntegerTy())(static_cast <bool> (Ty->isIntegerTy()) ? void (0) : __assert_fail ("Ty->isIntegerTy()", "llvm/lib/Target/X86/X86TargetTransformInfo.cpp" , 5513, __extension__ __PRETTY_FUNCTION__));

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  // Index of the operand that x86 can encode as an inline immediate;
  // ~0U means no operand of this opcode takes an encodable immediate.
  unsigned ImmIdx = ~0U;
  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr. This prevents the
    // creation of new constants for every base constant that gets constant
    // folded with the offset.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::Store:
    ImmIdx = 0;
    break;
  case Instruction::ICmp:
    // This is an imperfect hack to prevent constant hoisting of
    // compares that might be trying to check if a 64-bit value fits in
    // 32-bits. The backend can optimize these cases using a right shift by 32.
    // Ideally we would check the compare predicate here. There also other
    // similar immediates the backend can use shifts for.
    if (Idx == 1 && Imm.getBitWidth() == 64) {
      uint64_t ImmVal = Imm.getZExtValue();
      if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
        return TTI::TCC_Free;
    }
    ImmIdx = 1;
    break;
  case Instruction::And:
    // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
    // by using a 32-bit operation with implicit zero extension. Detect such
    // immediates here as the normal path expects bit 31 to be sign extended.
    if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.isIntN(32))
      return TTI::TCC_Free;
    ImmIdx = 1;
    break;
  case Instruction::Add:
  case Instruction::Sub:
    // For add/sub, we can use the opposite instruction for INT32_MIN.
    if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.getZExtValue() == 0x80000000)
      return TTI::TCC_Free;
    ImmIdx = 1;
    break;
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
    // Division by constant is typically expanded later into a different
    // instruction sequence. This completely changes the constants.
    // Report them as "free" to stop ConstantHoist from marking them as opaque.
    return TTI::TCC_Free;
  case Instruction::Mul:
  case Instruction::Or:
  case Instruction::Xor:
    ImmIdx = 1;
    break;
  // Always return TCC_Free for the shift value of a shift instruction.
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    if (Idx == 1)
      return TTI::TCC_Free;
    break;
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    break;
  }

  if (Idx == ImmIdx) {
    // An immediate cheaper than (or equal to) re-materializing it chunk by
    // chunk should stay inline, hence TCC_Free.
    int NumConstants = divideCeil(BitSize, 64);
    InstructionCost Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
    return (Cost <= NumConstants * TTI::TCC_Basic)
               ? static_cast<int>(TTI::TCC_Free)
               : Cost;
  }

  return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
5607 | ||||
5608 | InstructionCost X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, | |||
5609 | const APInt &Imm, Type *Ty, | |||
5610 | TTI::TargetCostKind CostKind) { | |||
5611 | assert(Ty->isIntegerTy())(static_cast <bool> (Ty->isIntegerTy()) ? void (0) : __assert_fail ("Ty->isIntegerTy()", "llvm/lib/Target/X86/X86TargetTransformInfo.cpp" , 5611, __extension__ __PRETTY_FUNCTION__)); | |||
5612 | ||||
5613 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); | |||
5614 | // There is no cost model for constants with a bit size of 0. Return TCC_Free | |||
5615 | // here, so that constant hoisting will ignore this constant. | |||
5616 | if (BitSize == 0) | |||
5617 | return TTI::TCC_Free; | |||
5618 | ||||
5619 | switch (IID) { | |||
5620 | default: | |||
5621 | return TTI::TCC_Free; | |||
5622 | case Intrinsic::sadd_with_overflow: | |||
5623 | case Intrinsic::uadd_with_overflow: | |||
5624 | case Intrinsic::ssub_with_overflow: | |||
5625 | case Intrinsic::usub_with_overflow: | |||
5626 | case Intrinsic::smul_with_overflow: | |||
5627 | case Intrinsic::umul_with_overflow: | |||
5628 | if ((Idx == 1) && Imm.getBitWidth() <= 64 && Imm.isSignedIntN(32)) | |||
5629 | return TTI::TCC_Free; | |||
5630 | break; | |||
5631 | case Intrinsic::experimental_stackmap: | |||
5632 | if ((Idx < 2) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64))) | |||
5633 |