File: | build/llvm-toolchain-snapshot-16~++20220819100721+9e51cbac9ef9/llvm/lib/Target/X86/X86TargetTransformInfo.cpp |
Warning: | line 3741, column 15 Division by zero |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
1 | //===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===// | |||
2 | // | |||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | |||
4 | // See https://llvm.org/LICENSE.txt for license information. | |||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | |||
6 | // | |||
7 | //===----------------------------------------------------------------------===// | |||
8 | /// \file | |||
9 | /// This file implements a TargetTransformInfo analysis pass specific to the | |||
10 | /// X86 target machine. It uses the target's detailed information to provide | |||
11 | /// more precise answers to certain TTI queries, while letting the target | |||
12 | /// independent and default TTI implementations handle the rest. | |||
13 | /// | |||
14 | //===----------------------------------------------------------------------===// | |||
15 | /// About Cost Model numbers used below it's necessary to say the following: | |||
16 | /// the numbers correspond to some "generic" X86 CPU instead of usage of | |||
17 | /// concrete CPU model. Usually the numbers correspond to CPU where the feature | |||
18 | /// apeared at the first time. For example, if we do Subtarget.hasSSE42() in | |||
19 | /// the lookups below the cost is based on Nehalem as that was the first CPU | |||
20 | /// to support that feature level and thus has most likely the worst case cost. | |||
21 | /// Some examples of other technologies/CPUs: | |||
22 | /// SSE 3 - Pentium4 / Athlon64 | |||
23 | /// SSE 4.1 - Penryn | |||
24 | /// SSE 4.2 - Nehalem | |||
25 | /// AVX - Sandy Bridge | |||
26 | /// AVX2 - Haswell | |||
27 | /// AVX-512 - Xeon Phi / Skylake | |||
28 | /// And some examples of instruction target dependent costs (latency) | |||
29 | /// divss sqrtss rsqrtss | |||
30 | /// AMD K7 11-16 19 3 | |||
31 | /// Piledriver 9-24 13-15 5 | |||
32 | /// Jaguar 14 16 2 | |||
33 | /// Pentium II,III 18 30 2 | |||
34 | /// Nehalem 7-14 7-18 3 | |||
35 | /// Haswell 10-13 11 5 | |||
36 | /// TODO: Develop and implement the target dependent cost model and | |||
37 | /// specialize cost numbers for different Cost Model Targets such as throughput, | |||
38 | /// code size, latency and uop count. | |||
39 | //===----------------------------------------------------------------------===// | |||
40 | ||||
41 | #include "X86TargetTransformInfo.h" | |||
42 | #include "llvm/Analysis/TargetTransformInfo.h" | |||
43 | #include "llvm/CodeGen/BasicTTIImpl.h" | |||
44 | #include "llvm/CodeGen/CostTable.h" | |||
45 | #include "llvm/CodeGen/TargetLowering.h" | |||
46 | #include "llvm/IR/InstIterator.h" | |||
47 | #include "llvm/IR/IntrinsicInst.h" | |||
48 | #include "llvm/Support/Debug.h" | |||
49 | ||||
50 | using namespace llvm; | |||
51 | ||||
52 | #define DEBUG_TYPE"x86tti" "x86tti" | |||
53 | ||||
54 | //===----------------------------------------------------------------------===// | |||
55 | // | |||
56 | // X86 cost model. | |||
57 | // | |||
58 | //===----------------------------------------------------------------------===// | |||
59 | ||||
60 | TargetTransformInfo::PopcntSupportKind | |||
61 | X86TTIImpl::getPopcntSupport(unsigned TyWidth) { | |||
62 | assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2")(static_cast <bool> (isPowerOf2_32(TyWidth) && "Ty width must be power of 2" ) ? void (0) : __assert_fail ("isPowerOf2_32(TyWidth) && \"Ty width must be power of 2\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 62, __extension__ __PRETTY_FUNCTION__)); | |||
63 | // TODO: Currently the __builtin_popcount() implementation using SSE3 | |||
64 | // instructions is inefficient. Once the problem is fixed, we should | |||
65 | // call ST->hasSSE3() instead of ST->hasPOPCNT(). | |||
66 | return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software; | |||
67 | } | |||
68 | ||||
69 | llvm::Optional<unsigned> X86TTIImpl::getCacheSize( | |||
70 | TargetTransformInfo::CacheLevel Level) const { | |||
71 | switch (Level) { | |||
72 | case TargetTransformInfo::CacheLevel::L1D: | |||
73 | // - Penryn | |||
74 | // - Nehalem | |||
75 | // - Westmere | |||
76 | // - Sandy Bridge | |||
77 | // - Ivy Bridge | |||
78 | // - Haswell | |||
79 | // - Broadwell | |||
80 | // - Skylake | |||
81 | // - Kabylake | |||
82 | return 32 * 1024; // 32 KByte | |||
83 | case TargetTransformInfo::CacheLevel::L2D: | |||
84 | // - Penryn | |||
85 | // - Nehalem | |||
86 | // - Westmere | |||
87 | // - Sandy Bridge | |||
88 | // - Ivy Bridge | |||
89 | // - Haswell | |||
90 | // - Broadwell | |||
91 | // - Skylake | |||
92 | // - Kabylake | |||
93 | return 256 * 1024; // 256 KByte | |||
94 | } | |||
95 | ||||
96 | llvm_unreachable("Unknown TargetTransformInfo::CacheLevel")::llvm::llvm_unreachable_internal("Unknown TargetTransformInfo::CacheLevel" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 96); | |||
97 | } | |||
98 | ||||
99 | llvm::Optional<unsigned> X86TTIImpl::getCacheAssociativity( | |||
100 | TargetTransformInfo::CacheLevel Level) const { | |||
101 | // - Penryn | |||
102 | // - Nehalem | |||
103 | // - Westmere | |||
104 | // - Sandy Bridge | |||
105 | // - Ivy Bridge | |||
106 | // - Haswell | |||
107 | // - Broadwell | |||
108 | // - Skylake | |||
109 | // - Kabylake | |||
110 | switch (Level) { | |||
111 | case TargetTransformInfo::CacheLevel::L1D: | |||
112 | [[fallthrough]]; | |||
113 | case TargetTransformInfo::CacheLevel::L2D: | |||
114 | return 8; | |||
115 | } | |||
116 | ||||
117 | llvm_unreachable("Unknown TargetTransformInfo::CacheLevel")::llvm::llvm_unreachable_internal("Unknown TargetTransformInfo::CacheLevel" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 117); | |||
118 | } | |||
119 | ||||
120 | unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const { | |||
121 | bool Vector = (ClassID == 1); | |||
122 | if (Vector && !ST->hasSSE1()) | |||
123 | return 0; | |||
124 | ||||
125 | if (ST->is64Bit()) { | |||
126 | if (Vector && ST->hasAVX512()) | |||
127 | return 32; | |||
128 | return 16; | |||
129 | } | |||
130 | return 8; | |||
131 | } | |||
132 | ||||
133 | TypeSize | |||
134 | X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { | |||
135 | unsigned PreferVectorWidth = ST->getPreferVectorWidth(); | |||
136 | switch (K) { | |||
137 | case TargetTransformInfo::RGK_Scalar: | |||
138 | return TypeSize::getFixed(ST->is64Bit() ? 64 : 32); | |||
139 | case TargetTransformInfo::RGK_FixedWidthVector: | |||
140 | if (ST->hasAVX512() && PreferVectorWidth >= 512) | |||
141 | return TypeSize::getFixed(512); | |||
142 | if (ST->hasAVX() && PreferVectorWidth >= 256) | |||
143 | return TypeSize::getFixed(256); | |||
144 | if (ST->hasSSE1() && PreferVectorWidth >= 128) | |||
145 | return TypeSize::getFixed(128); | |||
146 | return TypeSize::getFixed(0); | |||
147 | case TargetTransformInfo::RGK_ScalableVector: | |||
148 | return TypeSize::getScalable(0); | |||
149 | } | |||
150 | ||||
151 | llvm_unreachable("Unsupported register kind")::llvm::llvm_unreachable_internal("Unsupported register kind" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 151); | |||
152 | } | |||
153 | ||||
154 | unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const { | |||
155 | return getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) | |||
156 | .getFixedSize(); | |||
157 | } | |||
158 | ||||
159 | unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) { | |||
160 | // If the loop will not be vectorized, don't interleave the loop. | |||
161 | // Let regular unroll to unroll the loop, which saves the overflow | |||
162 | // check and memory check cost. | |||
163 | if (VF == 1) | |||
164 | return 1; | |||
165 | ||||
166 | if (ST->isAtom()) | |||
167 | return 1; | |||
168 | ||||
169 | // Sandybridge and Haswell have multiple execution ports and pipelined | |||
170 | // vector units. | |||
171 | if (ST->hasAVX()) | |||
172 | return 4; | |||
173 | ||||
174 | return 2; | |||
175 | } | |||
176 | ||||
177 | InstructionCost X86TTIImpl::getArithmeticInstrCost( | |||
178 | unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, | |||
179 | TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, | |||
180 | TTI::OperandValueProperties Opd1PropInfo, | |||
181 | TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args, | |||
182 | const Instruction *CxtI) { | |||
183 | // vXi8 multiplications are always promoted to vXi16. | |||
184 | if (Opcode == Instruction::Mul && Ty->isVectorTy() && | |||
185 | Ty->getScalarSizeInBits() == 8) { | |||
186 | Type *WideVecTy = | |||
187 | VectorType::getExtendedElementVectorType(cast<VectorType>(Ty)); | |||
188 | return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty, | |||
189 | TargetTransformInfo::CastContextHint::None, | |||
190 | CostKind) + | |||
191 | getCastInstrCost(Instruction::Trunc, Ty, WideVecTy, | |||
192 | TargetTransformInfo::CastContextHint::None, | |||
193 | CostKind) + | |||
194 | getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info, | |||
195 | Opd1PropInfo, Opd2PropInfo); | |||
196 | } | |||
197 | ||||
198 | // Legalize the type. | |||
199 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); | |||
200 | ||||
201 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | |||
202 | assert(ISD && "Invalid opcode")(static_cast <bool> (ISD && "Invalid opcode") ? void (0) : __assert_fail ("ISD && \"Invalid opcode\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 202, __extension__ __PRETTY_FUNCTION__)); | |||
203 | ||||
204 | if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() && | |||
205 | LT.second.getScalarType() == MVT::i32) { | |||
206 | // Check if the operands can be represented as a smaller datatype. | |||
207 | bool Op1Signed = false, Op2Signed = false; | |||
208 | unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed); | |||
209 | unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed); | |||
210 | unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize); | |||
211 | ||||
212 | // If both are representable as i15 and at least one is constant, | |||
213 | // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we | |||
214 | // can treat this as PMADDWD which has the same costs as a vXi16 multiply. | |||
215 | if (OpMinSize <= 15 && !ST->isPMADDWDSlow()) { | |||
216 | bool Op1Constant = | |||
217 | isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]); | |||
218 | bool Op2Constant = | |||
219 | isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]); | |||
220 | bool Op1Sext = isa<SExtInst>(Args[0]) && | |||
221 | (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41())); | |||
222 | bool Op2Sext = isa<SExtInst>(Args[1]) && | |||
223 | (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41())); | |||
224 | ||||
225 | bool IsZeroExtended = !Op1Signed || !Op2Signed; | |||
226 | bool IsConstant = Op1Constant || Op2Constant; | |||
227 | bool IsSext = Op1Sext || Op2Sext; | |||
228 | if (IsConstant || IsZeroExtended || IsSext) | |||
229 | LT.second = | |||
230 | MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements()); | |||
231 | } | |||
232 | } | |||
233 | ||||
234 | // Vector multiply by pow2 will be simplified to shifts. | |||
235 | if (ISD == ISD::MUL && | |||
236 | (Op2Info == TargetTransformInfo::OK_UniformConstantValue || | |||
237 | Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) && | |||
238 | Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) | |||
239 | return getArithmeticInstrCost(Instruction::Shl, Ty, CostKind, Op1Info, | |||
240 | Op2Info, TargetTransformInfo::OP_None, | |||
241 | TargetTransformInfo::OP_None); | |||
242 | ||||
243 | // On X86, vector signed division by constants power-of-two are | |||
244 | // normally expanded to the sequence SRA + SRL + ADD + SRA. | |||
245 | // The OperandValue properties may not be the same as that of the previous | |||
246 | // operation; conservatively assume OP_None. | |||
247 | if ((ISD == ISD::SDIV || ISD == ISD::SREM) && | |||
248 | (Op2Info == TargetTransformInfo::OK_UniformConstantValue || | |||
249 | Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) && | |||
250 | Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) { | |||
251 | InstructionCost Cost = | |||
252 | 2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, Op1Info, | |||
253 | Op2Info, TargetTransformInfo::OP_None, | |||
254 | TargetTransformInfo::OP_None); | |||
255 | Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info, | |||
256 | Op2Info, TargetTransformInfo::OP_None, | |||
257 | TargetTransformInfo::OP_None); | |||
258 | Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind, Op1Info, | |||
259 | Op2Info, TargetTransformInfo::OP_None, | |||
260 | TargetTransformInfo::OP_None); | |||
261 | ||||
262 | if (ISD == ISD::SREM) { | |||
263 | // For SREM: (X % C) is the equivalent of (X - (X/C)*C) | |||
264 | Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info, | |||
265 | Op2Info); | |||
266 | Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info, | |||
267 | Op2Info); | |||
268 | } | |||
269 | ||||
270 | return Cost; | |||
271 | } | |||
272 | ||||
273 | // Vector unsigned division/remainder will be simplified to shifts/masks. | |||
274 | if ((ISD == ISD::UDIV || ISD == ISD::UREM) && | |||
275 | (Op2Info == TargetTransformInfo::OK_UniformConstantValue || | |||
276 | Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) && | |||
277 | Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) { | |||
278 | if (ISD == ISD::UDIV) | |||
279 | return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info, | |||
280 | Op2Info, TargetTransformInfo::OP_None, | |||
281 | TargetTransformInfo::OP_None); | |||
282 | // UREM | |||
283 | return getArithmeticInstrCost(Instruction::And, Ty, CostKind, Op1Info, | |||
284 | Op2Info, TargetTransformInfo::OP_None, | |||
285 | TargetTransformInfo::OP_None); | |||
286 | } | |||
287 | ||||
288 | // TODO: Handle more cost kinds. | |||
289 | if (CostKind != TTI::TCK_RecipThroughput) | |||
290 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info, | |||
291 | Opd1PropInfo, Opd2PropInfo, Args, | |||
292 | CxtI); | |||
293 | ||||
294 | static const CostTblEntry GLMCostTable[] = { | |||
295 | { ISD::FDIV, MVT::f32, 18 }, // divss | |||
296 | { ISD::FDIV, MVT::v4f32, 35 }, // divps | |||
297 | { ISD::FDIV, MVT::f64, 33 }, // divsd | |||
298 | { ISD::FDIV, MVT::v2f64, 65 }, // divpd | |||
299 | }; | |||
300 | ||||
301 | if (ST->useGLMDivSqrtCosts()) | |||
302 | if (const auto *Entry = CostTableLookup(GLMCostTable, ISD, | |||
303 | LT.second)) | |||
304 | return LT.first * Entry->Cost; | |||
305 | ||||
306 | static const CostTblEntry SLMCostTable[] = { | |||
307 | { ISD::MUL, MVT::v4i32, 11 }, // pmulld | |||
308 | { ISD::MUL, MVT::v8i16, 2 }, // pmullw | |||
309 | { ISD::FMUL, MVT::f64, 2 }, // mulsd | |||
310 | { ISD::FMUL, MVT::v2f64, 4 }, // mulpd | |||
311 | { ISD::FMUL, MVT::v4f32, 2 }, // mulps | |||
312 | { ISD::FDIV, MVT::f32, 17 }, // divss | |||
313 | { ISD::FDIV, MVT::v4f32, 39 }, // divps | |||
314 | { ISD::FDIV, MVT::f64, 32 }, // divsd | |||
315 | { ISD::FDIV, MVT::v2f64, 69 }, // divpd | |||
316 | { ISD::FADD, MVT::v2f64, 2 }, // addpd | |||
317 | { ISD::FSUB, MVT::v2f64, 2 }, // subpd | |||
318 | // v2i64/v4i64 mul is custom lowered as a series of long: | |||
319 | // multiplies(3), shifts(3) and adds(2) | |||
320 | // slm muldq version throughput is 2 and addq throughput 4 | |||
321 | // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) + | |||
322 | // 3X4 (addq throughput) = 17 | |||
323 | { ISD::MUL, MVT::v2i64, 17 }, | |||
324 | // slm addq\subq throughput is 4 | |||
325 | { ISD::ADD, MVT::v2i64, 4 }, | |||
326 | { ISD::SUB, MVT::v2i64, 4 }, | |||
327 | }; | |||
328 | ||||
329 | if (ST->useSLMArithCosts()) { | |||
330 | if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) { | |||
331 | // Check if the operands can be shrinked into a smaller datatype. | |||
332 | // TODO: Merge this into generiic vXi32 MUL patterns above. | |||
333 | bool Op1Signed = false; | |||
334 | unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed); | |||
335 | bool Op2Signed = false; | |||
336 | unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed); | |||
337 | ||||
338 | bool SignedMode = Op1Signed || Op2Signed; | |||
339 | unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize); | |||
340 | ||||
341 | if (OpMinSize <= 7) | |||
342 | return LT.first * 3; // pmullw/sext | |||
343 | if (!SignedMode && OpMinSize <= 8) | |||
344 | return LT.first * 3; // pmullw/zext | |||
345 | if (OpMinSize <= 15) | |||
346 | return LT.first * 5; // pmullw/pmulhw/pshuf | |||
347 | if (!SignedMode && OpMinSize <= 16) | |||
348 | return LT.first * 5; // pmullw/pmulhw/pshuf | |||
349 | } | |||
350 | ||||
351 | if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, | |||
352 | LT.second)) { | |||
353 | return LT.first * Entry->Cost; | |||
354 | } | |||
355 | } | |||
356 | ||||
357 | static const CostTblEntry AVX512BWUniformConstCostTable[] = { | |||
358 | { ISD::SHL, MVT::v64i8, 2 }, // psllw + pand. | |||
359 | { ISD::SRL, MVT::v64i8, 2 }, // psrlw + pand. | |||
360 | { ISD::SRA, MVT::v64i8, 4 }, // psrlw, pand, pxor, psubb. | |||
361 | }; | |||
362 | ||||
363 | if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && | |||
364 | ST->hasBWI()) { | |||
365 | if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD, | |||
366 | LT.second)) | |||
367 | return LT.first * Entry->Cost; | |||
368 | } | |||
369 | ||||
370 | static const CostTblEntry AVX512UniformConstCostTable[] = { | |||
371 | { ISD::SRA, MVT::v2i64, 1 }, | |||
372 | { ISD::SRA, MVT::v4i64, 1 }, | |||
373 | { ISD::SRA, MVT::v8i64, 1 }, | |||
374 | ||||
375 | { ISD::SHL, MVT::v64i8, 4 }, // psllw + pand. | |||
376 | { ISD::SRL, MVT::v64i8, 4 }, // psrlw + pand. | |||
377 | { ISD::SRA, MVT::v64i8, 8 }, // psrlw, pand, pxor, psubb. | |||
378 | ||||
379 | { ISD::SDIV, MVT::v16i32, 6 }, // pmuludq sequence | |||
380 | { ISD::SREM, MVT::v16i32, 8 }, // pmuludq+mul+sub sequence | |||
381 | { ISD::UDIV, MVT::v16i32, 5 }, // pmuludq sequence | |||
382 | { ISD::UREM, MVT::v16i32, 7 }, // pmuludq+mul+sub sequence | |||
383 | }; | |||
384 | ||||
385 | if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && | |||
386 | ST->hasAVX512()) { | |||
387 | if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD, | |||
388 | LT.second)) | |||
389 | return LT.first * Entry->Cost; | |||
390 | } | |||
391 | ||||
392 | static const CostTblEntry AVX2UniformConstCostTable[] = { | |||
393 | { ISD::SHL, MVT::v32i8, 2 }, // psllw + pand. | |||
394 | { ISD::SRL, MVT::v32i8, 2 }, // psrlw + pand. | |||
395 | { ISD::SRA, MVT::v32i8, 4 }, // psrlw, pand, pxor, psubb. | |||
396 | ||||
397 | { ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle. | |||
398 | ||||
399 | { ISD::SDIV, MVT::v8i32, 6 }, // pmuludq sequence | |||
400 | { ISD::SREM, MVT::v8i32, 8 }, // pmuludq+mul+sub sequence | |||
401 | { ISD::UDIV, MVT::v8i32, 5 }, // pmuludq sequence | |||
402 | { ISD::UREM, MVT::v8i32, 7 }, // pmuludq+mul+sub sequence | |||
403 | }; | |||
404 | ||||
405 | if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && | |||
406 | ST->hasAVX2()) { | |||
407 | if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD, | |||
408 | LT.second)) | |||
409 | return LT.first * Entry->Cost; | |||
410 | } | |||
411 | ||||
412 | static const CostTblEntry SSE2UniformConstCostTable[] = { | |||
413 | { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand. | |||
414 | { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand. | |||
415 | { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb. | |||
416 | ||||
417 | { ISD::SHL, MVT::v32i8, 4+2 }, // 2*(psllw + pand) + split. | |||
418 | { ISD::SRL, MVT::v32i8, 4+2 }, // 2*(psrlw + pand) + split. | |||
419 | { ISD::SRA, MVT::v32i8, 8+2 }, // 2*(psrlw, pand, pxor, psubb) + split. | |||
420 | ||||
421 | { ISD::SDIV, MVT::v8i32, 12+2 }, // 2*pmuludq sequence + split. | |||
422 | { ISD::SREM, MVT::v8i32, 16+2 }, // 2*pmuludq+mul+sub sequence + split. | |||
423 | { ISD::SDIV, MVT::v4i32, 6 }, // pmuludq sequence | |||
424 | { ISD::SREM, MVT::v4i32, 8 }, // pmuludq+mul+sub sequence | |||
425 | { ISD::UDIV, MVT::v8i32, 10+2 }, // 2*pmuludq sequence + split. | |||
426 | { ISD::UREM, MVT::v8i32, 14+2 }, // 2*pmuludq+mul+sub sequence + split. | |||
427 | { ISD::UDIV, MVT::v4i32, 5 }, // pmuludq sequence | |||
428 | { ISD::UREM, MVT::v4i32, 7 }, // pmuludq+mul+sub sequence | |||
429 | }; | |||
430 | ||||
431 | // XOP has faster vXi8 shifts. | |||
432 | if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && | |||
433 | ST->hasSSE2() && !ST->hasXOP()) { | |||
434 | if (const auto *Entry = | |||
435 | CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second)) | |||
436 | return LT.first * Entry->Cost; | |||
437 | } | |||
438 | ||||
439 | static const CostTblEntry AVX512BWConstCostTable[] = { | |||
440 | { ISD::SDIV, MVT::v64i8, 14 }, // 2*ext+2*pmulhw sequence | |||
441 | { ISD::SREM, MVT::v64i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence | |||
442 | { ISD::UDIV, MVT::v64i8, 14 }, // 2*ext+2*pmulhw sequence | |||
443 | { ISD::UREM, MVT::v64i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence | |||
444 | { ISD::SDIV, MVT::v32i16, 6 }, // vpmulhw sequence | |||
445 | { ISD::SREM, MVT::v32i16, 8 }, // vpmulhw+mul+sub sequence | |||
446 | { ISD::UDIV, MVT::v32i16, 6 }, // vpmulhuw sequence | |||
447 | { ISD::UREM, MVT::v32i16, 8 }, // vpmulhuw+mul+sub sequence | |||
448 | }; | |||
449 | ||||
450 | if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue || | |||
451 | Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) && | |||
452 | ST->hasBWI()) { | |||
453 | if (const auto *Entry = | |||
454 | CostTableLookup(AVX512BWConstCostTable, ISD, LT.second)) | |||
455 | return LT.first * Entry->Cost; | |||
456 | } | |||
457 | ||||
458 | static const CostTblEntry AVX512ConstCostTable[] = { | |||
459 | { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence | |||
460 | { ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence | |||
461 | { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence | |||
462 | { ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence | |||
463 | { ISD::SDIV, MVT::v64i8, 28 }, // 4*ext+4*pmulhw sequence | |||
464 | { ISD::SREM, MVT::v64i8, 32 }, // 4*ext+4*pmulhw+mul+sub sequence | |||
465 | { ISD::UDIV, MVT::v64i8, 28 }, // 4*ext+4*pmulhw sequence | |||
466 | { ISD::UREM, MVT::v64i8, 32 }, // 4*ext+4*pmulhw+mul+sub sequence | |||
467 | { ISD::SDIV, MVT::v32i16, 12 }, // 2*vpmulhw sequence | |||
468 | { ISD::SREM, MVT::v32i16, 16 }, // 2*vpmulhw+mul+sub sequence | |||
469 | { ISD::UDIV, MVT::v32i16, 12 }, // 2*vpmulhuw sequence | |||
470 | { ISD::UREM, MVT::v32i16, 16 }, // 2*vpmulhuw+mul+sub sequence | |||
471 | }; | |||
472 | ||||
473 | if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue || | |||
474 | Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) && | |||
475 | ST->hasAVX512()) { | |||
476 | if (const auto *Entry = | |||
477 | CostTableLookup(AVX512ConstCostTable, ISD, LT.second)) | |||
478 | return LT.first * Entry->Cost; | |||
479 | } | |||
480 | ||||
481 | static const CostTblEntry AVX2ConstCostTable[] = { | |||
482 | { ISD::SDIV, MVT::v32i8, 14 }, // 2*ext+2*pmulhw sequence | |||
483 | { ISD::SREM, MVT::v32i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence | |||
484 | { ISD::UDIV, MVT::v32i8, 14 }, // 2*ext+2*pmulhw sequence | |||
485 | { ISD::UREM, MVT::v32i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence | |||
486 | { ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence | |||
487 | { ISD::SREM, MVT::v16i16, 8 }, // vpmulhw+mul+sub sequence | |||
488 | { ISD::UDIV, MVT::v16i16, 6 }, // vpmulhuw sequence | |||
489 | { ISD::UREM, MVT::v16i16, 8 }, // vpmulhuw+mul+sub sequence | |||
490 | { ISD::SDIV, MVT::v8i32, 15 }, // vpmuldq sequence | |||
491 | { ISD::SREM, MVT::v8i32, 19 }, // vpmuldq+mul+sub sequence | |||
492 | { ISD::UDIV, MVT::v8i32, 15 }, // vpmuludq sequence | |||
493 | { ISD::UREM, MVT::v8i32, 19 }, // vpmuludq+mul+sub sequence | |||
494 | }; | |||
495 | ||||
496 | if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue || | |||
497 | Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) && | |||
498 | ST->hasAVX2()) { | |||
499 | if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second)) | |||
500 | return LT.first * Entry->Cost; | |||
501 | } | |||
502 | ||||
503 | static const CostTblEntry SSE2ConstCostTable[] = { | |||
504 | { ISD::SDIV, MVT::v32i8, 28+2 }, // 4*ext+4*pmulhw sequence + split. | |||
505 | { ISD::SREM, MVT::v32i8, 32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split. | |||
506 | { ISD::SDIV, MVT::v16i8, 14 }, // 2*ext+2*pmulhw sequence | |||
507 | { ISD::SREM, MVT::v16i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence | |||
508 | { ISD::UDIV, MVT::v32i8, 28+2 }, // 4*ext+4*pmulhw sequence + split. | |||
509 | { ISD::UREM, MVT::v32i8, 32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split. | |||
510 | { ISD::UDIV, MVT::v16i8, 14 }, // 2*ext+2*pmulhw sequence | |||
511 | { ISD::UREM, MVT::v16i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence | |||
512 | { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split. | |||
513 | { ISD::SREM, MVT::v16i16, 16+2 }, // 2*pmulhw+mul+sub sequence + split. | |||
514 | { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence | |||
515 | { ISD::SREM, MVT::v8i16, 8 }, // pmulhw+mul+sub sequence | |||
516 | { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split. | |||
517 | { ISD::UREM, MVT::v16i16, 16+2 }, // 2*pmulhuw+mul+sub sequence + split. | |||
518 | { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence | |||
519 | { ISD::UREM, MVT::v8i16, 8 }, // pmulhuw+mul+sub sequence | |||
520 | { ISD::SDIV, MVT::v8i32, 38+2 }, // 2*pmuludq sequence + split. | |||
521 | { ISD::SREM, MVT::v8i32, 48+2 }, // 2*pmuludq+mul+sub sequence + split. | |||
522 | { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence | |||
523 | { ISD::SREM, MVT::v4i32, 24 }, // pmuludq+mul+sub sequence | |||
524 | { ISD::UDIV, MVT::v8i32, 30+2 }, // 2*pmuludq sequence + split. | |||
525 | { ISD::UREM, MVT::v8i32, 40+2 }, // 2*pmuludq+mul+sub sequence + split. | |||
526 | { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence | |||
527 | { ISD::UREM, MVT::v4i32, 20 }, // pmuludq+mul+sub sequence | |||
528 | }; | |||
529 | ||||
530 | if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue || | |||
531 | Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) && | |||
532 | ST->hasSSE2()) { | |||
533 | // pmuldq sequence. | |||
534 | if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX()) | |||
535 | return LT.first * 32; | |||
536 | if (ISD == ISD::SREM && LT.second == MVT::v8i32 && ST->hasAVX()) | |||
537 | return LT.first * 38; | |||
538 | if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41()) | |||
539 | return LT.first * 15; | |||
540 | if (ISD == ISD::SREM && LT.second == MVT::v4i32 && ST->hasSSE41()) | |||
541 | return LT.first * 20; | |||
542 | ||||
543 | if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second)) | |||
544 | return LT.first * Entry->Cost; | |||
545 | } | |||
546 | ||||
547 | static const CostTblEntry AVX512BWShiftCostTable[] = { | |||
548 | { ISD::SHL, MVT::v16i8, 4 }, // extend/vpsllvw/pack sequence. | |||
549 | { ISD::SRL, MVT::v16i8, 4 }, // extend/vpsrlvw/pack sequence. | |||
550 | { ISD::SRA, MVT::v16i8, 4 }, // extend/vpsravw/pack sequence. | |||
551 | { ISD::SHL, MVT::v32i8, 4 }, // extend/vpsllvw/pack sequence. | |||
552 | { ISD::SRL, MVT::v32i8, 4 }, // extend/vpsrlvw/pack sequence. | |||
553 | { ISD::SRA, MVT::v32i8, 6 }, // extend/vpsravw/pack sequence. | |||
554 | { ISD::SHL, MVT::v64i8, 6 }, // extend/vpsllvw/pack sequence. | |||
555 | { ISD::SRL, MVT::v64i8, 7 }, // extend/vpsrlvw/pack sequence. | |||
556 | { ISD::SRA, MVT::v64i8, 15 }, // extend/vpsravw/pack sequence. | |||
557 | ||||
558 | { ISD::SHL, MVT::v8i16, 1 }, // vpsllvw | |||
559 | { ISD::SRL, MVT::v8i16, 1 }, // vpsrlvw | |||
560 | { ISD::SRA, MVT::v8i16, 1 }, // vpsravw | |||
561 | { ISD::SHL, MVT::v16i16, 1 }, // vpsllvw | |||
562 | { ISD::SRL, MVT::v16i16, 1 }, // vpsrlvw | |||
563 | { ISD::SRA, MVT::v16i16, 1 }, // vpsravw | |||
564 | { ISD::SHL, MVT::v32i16, 1 }, // vpsllvw | |||
565 | { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw | |||
566 | { ISD::SRA, MVT::v32i16, 1 }, // vpsravw | |||
567 | }; | |||
568 | ||||
569 | if (ST->hasBWI()) | |||
570 | if (const auto *Entry = CostTableLookup(AVX512BWShiftCostTable, ISD, LT.second)) | |||
571 | return LT.first * Entry->Cost; | |||
572 | ||||
573 | static const CostTblEntry AVX2UniformCostTable[] = { | |||
574 | // Uniform splats are cheaper for the following instructions. | |||
575 | { ISD::SHL, MVT::v16i16, 1 }, // psllw. | |||
576 | { ISD::SRL, MVT::v16i16, 1 }, // psrlw. | |||
577 | { ISD::SRA, MVT::v16i16, 1 }, // psraw. | |||
578 | { ISD::SHL, MVT::v32i16, 2 }, // 2*psllw. | |||
579 | { ISD::SRL, MVT::v32i16, 2 }, // 2*psrlw. | |||
580 | { ISD::SRA, MVT::v32i16, 2 }, // 2*psraw. | |||
581 | ||||
582 | { ISD::SHL, MVT::v8i32, 1 }, // pslld | |||
583 | { ISD::SRL, MVT::v8i32, 1 }, // psrld | |||
584 | { ISD::SRA, MVT::v8i32, 1 }, // psrad | |||
585 | { ISD::SHL, MVT::v4i64, 1 }, // psllq | |||
586 | { ISD::SRL, MVT::v4i64, 1 }, // psrlq | |||
587 | }; | |||
588 | ||||
589 | if (ST->hasAVX2() && | |||
590 | ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) || | |||
591 | (Op2Info == TargetTransformInfo::OK_UniformValue))) { | |||
592 | if (const auto *Entry = | |||
593 | CostTableLookup(AVX2UniformCostTable, ISD, LT.second)) | |||
594 | return LT.first * Entry->Cost; | |||
595 | } | |||
596 | ||||
597 | static const CostTblEntry SSE2UniformCostTable[] = { | |||
598 | // Uniform splats are cheaper for the following instructions. | |||
599 | { ISD::SHL, MVT::v8i16, 1 }, // psllw. | |||
600 | { ISD::SHL, MVT::v4i32, 1 }, // pslld | |||
601 | { ISD::SHL, MVT::v2i64, 1 }, // psllq. | |||
602 | ||||
603 | { ISD::SRL, MVT::v8i16, 1 }, // psrlw. | |||
604 | { ISD::SRL, MVT::v4i32, 1 }, // psrld. | |||
605 | { ISD::SRL, MVT::v2i64, 1 }, // psrlq. | |||
606 | ||||
607 | { ISD::SRA, MVT::v8i16, 1 }, // psraw. | |||
608 | { ISD::SRA, MVT::v4i32, 1 }, // psrad. | |||
609 | }; | |||
610 | ||||
611 | if (ST->hasSSE2() && | |||
612 | ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) || | |||
613 | (Op2Info == TargetTransformInfo::OK_UniformValue))) { | |||
614 | if (const auto *Entry = | |||
615 | CostTableLookup(SSE2UniformCostTable, ISD, LT.second)) | |||
616 | return LT.first * Entry->Cost; | |||
617 | } | |||
618 | ||||
619 | static const CostTblEntry AVX512DQCostTable[] = { | |||
620 | { ISD::MUL, MVT::v2i64, 2 }, // pmullq | |||
621 | { ISD::MUL, MVT::v4i64, 2 }, // pmullq | |||
622 | { ISD::MUL, MVT::v8i64, 2 } // pmullq | |||
623 | }; | |||
624 | ||||
625 | // Look for AVX512DQ lowering tricks for custom cases. | |||
626 | if (ST->hasDQI()) | |||
627 | if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second)) | |||
628 | return LT.first * Entry->Cost; | |||
629 | ||||
630 | static const CostTblEntry AVX512BWCostTable[] = { | |||
631 | { ISD::SHL, MVT::v64i8, 11 }, // vpblendvb sequence. | |||
632 | { ISD::SRL, MVT::v64i8, 11 }, // vpblendvb sequence. | |||
633 | { ISD::SRA, MVT::v64i8, 24 }, // vpblendvb sequence. | |||
634 | }; | |||
635 | ||||
636 | // Look for AVX512BW lowering tricks for custom cases. | |||
637 | if (ST->hasBWI()) | |||
638 | if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second)) | |||
639 | return LT.first * Entry->Cost; | |||
640 | ||||
641 | static const CostTblEntry AVX512CostTable[] = { | |||
642 | { ISD::SHL, MVT::v4i32, 1 }, | |||
643 | { ISD::SRL, MVT::v4i32, 1 }, | |||
644 | { ISD::SRA, MVT::v4i32, 1 }, | |||
645 | { ISD::SHL, MVT::v8i32, 1 }, | |||
646 | { ISD::SRL, MVT::v8i32, 1 }, | |||
647 | { ISD::SRA, MVT::v8i32, 1 }, | |||
648 | { ISD::SHL, MVT::v16i32, 1 }, | |||
649 | { ISD::SRL, MVT::v16i32, 1 }, | |||
650 | { ISD::SRA, MVT::v16i32, 1 }, | |||
651 | ||||
652 | { ISD::SHL, MVT::v2i64, 1 }, | |||
653 | { ISD::SRL, MVT::v2i64, 1 }, | |||
654 | { ISD::SHL, MVT::v4i64, 1 }, | |||
655 | { ISD::SRL, MVT::v4i64, 1 }, | |||
656 | { ISD::SHL, MVT::v8i64, 1 }, | |||
657 | { ISD::SRL, MVT::v8i64, 1 }, | |||
658 | ||||
659 | { ISD::SRA, MVT::v2i64, 1 }, | |||
660 | { ISD::SRA, MVT::v4i64, 1 }, | |||
661 | { ISD::SRA, MVT::v8i64, 1 }, | |||
662 | ||||
663 | { ISD::MUL, MVT::v16i32, 1 }, // pmulld (Skylake from agner.org) | |||
664 | { ISD::MUL, MVT::v8i32, 1 }, // pmulld (Skylake from agner.org) | |||
665 | { ISD::MUL, MVT::v4i32, 1 }, // pmulld (Skylake from agner.org) | |||
666 | { ISD::MUL, MVT::v8i64, 6 }, // 3*pmuludq/3*shift/2*add | |||
667 | { ISD::MUL, MVT::i64, 1 }, // Skylake from http://www.agner.org/ | |||
668 | ||||
669 | { ISD::FNEG, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/ | |||
670 | { ISD::FADD, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/ | |||
671 | { ISD::FSUB, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/ | |||
672 | { ISD::FMUL, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/ | |||
673 | { ISD::FDIV, MVT::f64, 4 }, // Skylake from http://www.agner.org/ | |||
674 | { ISD::FDIV, MVT::v2f64, 4 }, // Skylake from http://www.agner.org/ | |||
675 | { ISD::FDIV, MVT::v4f64, 8 }, // Skylake from http://www.agner.org/ | |||
676 | { ISD::FDIV, MVT::v8f64, 16 }, // Skylake from http://www.agner.org/ | |||
677 | ||||
678 | { ISD::FNEG, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/ | |||
679 | { ISD::FADD, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/ | |||
680 | { ISD::FSUB, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/ | |||
681 | { ISD::FMUL, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/ | |||
682 | { ISD::FDIV, MVT::f32, 3 }, // Skylake from http://www.agner.org/ | |||
683 | { ISD::FDIV, MVT::v4f32, 3 }, // Skylake from http://www.agner.org/ | |||
684 | { ISD::FDIV, MVT::v8f32, 5 }, // Skylake from http://www.agner.org/ | |||
685 | { ISD::FDIV, MVT::v16f32, 10 }, // Skylake from http://www.agner.org/ | |||
686 | }; | |||
687 | ||||
688 | if (ST->hasAVX512()) | |||
689 | if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second)) | |||
690 | return LT.first * Entry->Cost; | |||
691 | ||||
692 | static const CostTblEntry AVX2ShiftCostTable[] = { | |||
    // Shifts on vXi64/vXi32 on AVX2 are legal even though we declare to
694 | // customize them to detect the cases where shift amount is a scalar one. | |||
695 | { ISD::SHL, MVT::v4i32, 2 }, // vpsllvd (Haswell from agner.org) | |||
696 | { ISD::SRL, MVT::v4i32, 2 }, // vpsrlvd (Haswell from agner.org) | |||
697 | { ISD::SRA, MVT::v4i32, 2 }, // vpsravd (Haswell from agner.org) | |||
698 | { ISD::SHL, MVT::v8i32, 2 }, // vpsllvd (Haswell from agner.org) | |||
699 | { ISD::SRL, MVT::v8i32, 2 }, // vpsrlvd (Haswell from agner.org) | |||
700 | { ISD::SRA, MVT::v8i32, 2 }, // vpsravd (Haswell from agner.org) | |||
701 | { ISD::SHL, MVT::v2i64, 1 }, // vpsllvq (Haswell from agner.org) | |||
702 | { ISD::SRL, MVT::v2i64, 1 }, // vpsrlvq (Haswell from agner.org) | |||
703 | { ISD::SHL, MVT::v4i64, 1 }, // vpsllvq (Haswell from agner.org) | |||
704 | { ISD::SRL, MVT::v4i64, 1 }, // vpsrlvq (Haswell from agner.org) | |||
705 | }; | |||
706 | ||||
707 | if (ST->hasAVX512()) { | |||
708 | if (ISD == ISD::SHL && LT.second == MVT::v32i16 && | |||
709 | (Op2Info == TargetTransformInfo::OK_UniformConstantValue || | |||
710 | Op2Info == TargetTransformInfo::OK_NonUniformConstantValue)) | |||
711 | // On AVX512, a packed v32i16 shift left by a constant build_vector | |||
712 | // is lowered into a vector multiply (vpmullw). | |||
713 | return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, | |||
714 | Op1Info, Op2Info, | |||
715 | TargetTransformInfo::OP_None, | |||
716 | TargetTransformInfo::OP_None); | |||
717 | } | |||
718 | ||||
719 | // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts). | |||
720 | if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) { | |||
721 | if (ISD == ISD::SHL && LT.second == MVT::v16i16 && | |||
722 | (Op2Info == TargetTransformInfo::OK_UniformConstantValue || | |||
723 | Op2Info == TargetTransformInfo::OK_NonUniformConstantValue)) | |||
724 | // On AVX2, a packed v16i16 shift left by a constant build_vector | |||
725 | // is lowered into a vector multiply (vpmullw). | |||
726 | return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, | |||
727 | Op1Info, Op2Info, | |||
728 | TargetTransformInfo::OP_None, | |||
729 | TargetTransformInfo::OP_None); | |||
730 | ||||
731 | if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second)) | |||
732 | return LT.first * Entry->Cost; | |||
733 | } | |||
734 | ||||
735 | static const CostTblEntry XOPShiftCostTable[] = { | |||
736 | // 128bit shifts take 1cy, but right shifts require negation beforehand. | |||
737 | { ISD::SHL, MVT::v16i8, 1 }, | |||
738 | { ISD::SRL, MVT::v16i8, 2 }, | |||
739 | { ISD::SRA, MVT::v16i8, 2 }, | |||
740 | { ISD::SHL, MVT::v8i16, 1 }, | |||
741 | { ISD::SRL, MVT::v8i16, 2 }, | |||
742 | { ISD::SRA, MVT::v8i16, 2 }, | |||
743 | { ISD::SHL, MVT::v4i32, 1 }, | |||
744 | { ISD::SRL, MVT::v4i32, 2 }, | |||
745 | { ISD::SRA, MVT::v4i32, 2 }, | |||
746 | { ISD::SHL, MVT::v2i64, 1 }, | |||
747 | { ISD::SRL, MVT::v2i64, 2 }, | |||
748 | { ISD::SRA, MVT::v2i64, 2 }, | |||
749 | // 256bit shifts require splitting if AVX2 didn't catch them above. | |||
750 | { ISD::SHL, MVT::v32i8, 2+2 }, | |||
751 | { ISD::SRL, MVT::v32i8, 4+2 }, | |||
752 | { ISD::SRA, MVT::v32i8, 4+2 }, | |||
753 | { ISD::SHL, MVT::v16i16, 2+2 }, | |||
754 | { ISD::SRL, MVT::v16i16, 4+2 }, | |||
755 | { ISD::SRA, MVT::v16i16, 4+2 }, | |||
756 | { ISD::SHL, MVT::v8i32, 2+2 }, | |||
757 | { ISD::SRL, MVT::v8i32, 4+2 }, | |||
758 | { ISD::SRA, MVT::v8i32, 4+2 }, | |||
759 | { ISD::SHL, MVT::v4i64, 2+2 }, | |||
760 | { ISD::SRL, MVT::v4i64, 4+2 }, | |||
761 | { ISD::SRA, MVT::v4i64, 4+2 }, | |||
762 | }; | |||
763 | ||||
764 | // Look for XOP lowering tricks. | |||
765 | if (ST->hasXOP()) { | |||
766 | // If the right shift is constant then we'll fold the negation so | |||
767 | // it's as cheap as a left shift. | |||
768 | int ShiftISD = ISD; | |||
769 | if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) && | |||
770 | (Op2Info == TargetTransformInfo::OK_UniformConstantValue || | |||
771 | Op2Info == TargetTransformInfo::OK_NonUniformConstantValue)) | |||
772 | ShiftISD = ISD::SHL; | |||
773 | if (const auto *Entry = | |||
774 | CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second)) | |||
775 | return LT.first * Entry->Cost; | |||
776 | } | |||
777 | ||||
778 | static const CostTblEntry SSE2UniformShiftCostTable[] = { | |||
779 | // Uniform splats are cheaper for the following instructions. | |||
780 | { ISD::SHL, MVT::v16i16, 2+2 }, // 2*psllw + split. | |||
781 | { ISD::SHL, MVT::v8i32, 2+2 }, // 2*pslld + split. | |||
782 | { ISD::SHL, MVT::v4i64, 2+2 }, // 2*psllq + split. | |||
783 | ||||
784 | { ISD::SRL, MVT::v16i16, 2+2 }, // 2*psrlw + split. | |||
785 | { ISD::SRL, MVT::v8i32, 2+2 }, // 2*psrld + split. | |||
786 | { ISD::SRL, MVT::v4i64, 2+2 }, // 2*psrlq + split. | |||
787 | ||||
788 | { ISD::SRA, MVT::v16i16, 2+2 }, // 2*psraw + split. | |||
789 | { ISD::SRA, MVT::v8i32, 2+2 }, // 2*psrad + split. | |||
790 | { ISD::SRA, MVT::v2i64, 4 }, // 2*psrad + shuffle. | |||
791 | { ISD::SRA, MVT::v4i64, 8+2 }, // 2*(2*psrad + shuffle) + split. | |||
792 | }; | |||
793 | ||||
794 | if (ST->hasSSE2() && | |||
795 | ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) || | |||
796 | (Op2Info == TargetTransformInfo::OK_UniformValue))) { | |||
797 | ||||
798 | // Handle AVX2 uniform v4i64 ISD::SRA, it's not worth a table. | |||
799 | if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2()) | |||
800 | return LT.first * 4; // 2*psrad + shuffle. | |||
801 | ||||
802 | if (const auto *Entry = | |||
803 | CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second)) | |||
804 | return LT.first * Entry->Cost; | |||
805 | } | |||
806 | ||||
807 | if (ISD == ISD::SHL && | |||
808 | Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) { | |||
809 | MVT VT = LT.second; | |||
810 | // Vector shift left by non uniform constant can be lowered | |||
811 | // into vector multiply. | |||
812 | if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) || | |||
813 | ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX())) | |||
814 | ISD = ISD::MUL; | |||
815 | } | |||
816 | ||||
817 | static const CostTblEntry AVX2CostTable[] = { | |||
818 | { ISD::SHL, MVT::v16i8, 6 }, // vpblendvb sequence. | |||
819 | { ISD::SHL, MVT::v32i8, 6 }, // vpblendvb sequence. | |||
820 | { ISD::SHL, MVT::v64i8, 12 }, // 2*vpblendvb sequence. | |||
821 | { ISD::SHL, MVT::v8i16, 5 }, // extend/vpsrlvd/pack sequence. | |||
822 | { ISD::SHL, MVT::v16i16, 7 }, // extend/vpsrlvd/pack sequence. | |||
823 | { ISD::SHL, MVT::v32i16, 14 }, // 2*extend/vpsrlvd/pack sequence. | |||
824 | ||||
825 | { ISD::SRL, MVT::v16i8, 6 }, // vpblendvb sequence. | |||
826 | { ISD::SRL, MVT::v32i8, 6 }, // vpblendvb sequence. | |||
827 | { ISD::SRL, MVT::v64i8, 12 }, // 2*vpblendvb sequence. | |||
828 | { ISD::SRL, MVT::v8i16, 5 }, // extend/vpsrlvd/pack sequence. | |||
829 | { ISD::SRL, MVT::v16i16, 7 }, // extend/vpsrlvd/pack sequence. | |||
830 | { ISD::SRL, MVT::v32i16, 14 }, // 2*extend/vpsrlvd/pack sequence. | |||
831 | ||||
832 | { ISD::SRA, MVT::v16i8, 17 }, // vpblendvb sequence. | |||
833 | { ISD::SRA, MVT::v32i8, 17 }, // vpblendvb sequence. | |||
834 | { ISD::SRA, MVT::v64i8, 34 }, // 2*vpblendvb sequence. | |||
835 | { ISD::SRA, MVT::v8i16, 5 }, // extend/vpsravd/pack sequence. | |||
836 | { ISD::SRA, MVT::v16i16, 7 }, // extend/vpsravd/pack sequence. | |||
837 | { ISD::SRA, MVT::v32i16, 14 }, // 2*extend/vpsravd/pack sequence. | |||
838 | { ISD::SRA, MVT::v2i64, 2 }, // srl/xor/sub sequence. | |||
839 | { ISD::SRA, MVT::v4i64, 2 }, // srl/xor/sub sequence. | |||
840 | ||||
841 | { ISD::SUB, MVT::v32i8, 1 }, // psubb | |||
842 | { ISD::ADD, MVT::v32i8, 1 }, // paddb | |||
843 | { ISD::SUB, MVT::v16i16, 1 }, // psubw | |||
844 | { ISD::ADD, MVT::v16i16, 1 }, // paddw | |||
845 | { ISD::SUB, MVT::v8i32, 1 }, // psubd | |||
846 | { ISD::ADD, MVT::v8i32, 1 }, // paddd | |||
847 | { ISD::SUB, MVT::v4i64, 1 }, // psubq | |||
848 | { ISD::ADD, MVT::v4i64, 1 }, // paddq | |||
849 | ||||
850 | { ISD::MUL, MVT::v16i16, 1 }, // pmullw | |||
851 | { ISD::MUL, MVT::v8i32, 2 }, // pmulld (Haswell from agner.org) | |||
852 | { ISD::MUL, MVT::v4i64, 6 }, // 3*pmuludq/3*shift/2*add | |||
853 | ||||
854 | { ISD::FNEG, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/ | |||
855 | { ISD::FNEG, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/ | |||
856 | { ISD::FADD, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/ | |||
857 | { ISD::FADD, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/ | |||
858 | { ISD::FSUB, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/ | |||
859 | { ISD::FSUB, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/ | |||
860 | { ISD::FMUL, MVT::f64, 1 }, // Haswell from http://www.agner.org/ | |||
861 | { ISD::FMUL, MVT::v2f64, 1 }, // Haswell from http://www.agner.org/ | |||
862 | { ISD::FMUL, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/ | |||
863 | { ISD::FMUL, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/ | |||
864 | ||||
865 | { ISD::FDIV, MVT::f32, 7 }, // Haswell from http://www.agner.org/ | |||
866 | { ISD::FDIV, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/ | |||
867 | { ISD::FDIV, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/ | |||
868 | { ISD::FDIV, MVT::f64, 14 }, // Haswell from http://www.agner.org/ | |||
869 | { ISD::FDIV, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/ | |||
870 | { ISD::FDIV, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/ | |||
871 | }; | |||
872 | ||||
873 | // Look for AVX2 lowering tricks for custom cases. | |||
874 | if (ST->hasAVX2()) | |||
875 | if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second)) | |||
876 | return LT.first * Entry->Cost; | |||
877 | ||||
878 | static const CostTblEntry AVX1CostTable[] = { | |||
879 | // We don't have to scalarize unsupported ops. We can issue two half-sized | |||
880 | // operations and we only need to extract the upper YMM half. | |||
881 | // Two ops + 1 extract + 1 insert = 4. | |||
882 | { ISD::MUL, MVT::v16i16, 4 }, | |||
883 | { ISD::MUL, MVT::v8i32, 5 }, // BTVER2 from http://www.agner.org/ | |||
884 | { ISD::MUL, MVT::v4i64, 12 }, | |||
885 | ||||
886 | { ISD::SUB, MVT::v32i8, 4 }, | |||
887 | { ISD::ADD, MVT::v32i8, 4 }, | |||
888 | { ISD::SUB, MVT::v16i16, 4 }, | |||
889 | { ISD::ADD, MVT::v16i16, 4 }, | |||
890 | { ISD::SUB, MVT::v8i32, 4 }, | |||
891 | { ISD::ADD, MVT::v8i32, 4 }, | |||
892 | { ISD::SUB, MVT::v4i64, 4 }, | |||
893 | { ISD::ADD, MVT::v4i64, 4 }, | |||
894 | ||||
895 | { ISD::SHL, MVT::v32i8, 22 }, // pblendvb sequence + split. | |||
896 | { ISD::SHL, MVT::v8i16, 6 }, // pblendvb sequence. | |||
897 | { ISD::SHL, MVT::v16i16, 13 }, // pblendvb sequence + split. | |||
898 | { ISD::SHL, MVT::v4i32, 3 }, // pslld/paddd/cvttps2dq/pmulld | |||
899 | { ISD::SHL, MVT::v8i32, 9 }, // pslld/paddd/cvttps2dq/pmulld + split | |||
900 | { ISD::SHL, MVT::v2i64, 2 }, // Shift each lane + blend. | |||
901 | { ISD::SHL, MVT::v4i64, 6 }, // Shift each lane + blend + split. | |||
902 | ||||
903 | { ISD::SRL, MVT::v32i8, 23 }, // pblendvb sequence + split. | |||
904 | { ISD::SRL, MVT::v16i16, 28 }, // pblendvb sequence + split. | |||
905 | { ISD::SRL, MVT::v4i32, 6 }, // Shift each lane + blend. | |||
906 | { ISD::SRL, MVT::v8i32, 14 }, // Shift each lane + blend + split. | |||
907 | { ISD::SRL, MVT::v2i64, 2 }, // Shift each lane + blend. | |||
908 | { ISD::SRL, MVT::v4i64, 6 }, // Shift each lane + blend + split. | |||
909 | ||||
910 | { ISD::SRA, MVT::v32i8, 44 }, // pblendvb sequence + split. | |||
911 | { ISD::SRA, MVT::v16i16, 28 }, // pblendvb sequence + split. | |||
912 | { ISD::SRA, MVT::v4i32, 6 }, // Shift each lane + blend. | |||
913 | { ISD::SRA, MVT::v8i32, 14 }, // Shift each lane + blend + split. | |||
914 | { ISD::SRA, MVT::v2i64, 5 }, // Shift each lane + blend. | |||
915 | { ISD::SRA, MVT::v4i64, 12 }, // Shift each lane + blend + split. | |||
916 | ||||
917 | { ISD::FNEG, MVT::v4f64, 2 }, // BTVER2 from http://www.agner.org/ | |||
918 | { ISD::FNEG, MVT::v8f32, 2 }, // BTVER2 from http://www.agner.org/ | |||
919 | ||||
920 | { ISD::FMUL, MVT::f64, 2 }, // BTVER2 from http://www.agner.org/ | |||
921 | { ISD::FMUL, MVT::v2f64, 2 }, // BTVER2 from http://www.agner.org/ | |||
922 | { ISD::FMUL, MVT::v4f64, 4 }, // BTVER2 from http://www.agner.org/ | |||
923 | ||||
924 | { ISD::FDIV, MVT::f32, 14 }, // SNB from http://www.agner.org/ | |||
925 | { ISD::FDIV, MVT::v4f32, 14 }, // SNB from http://www.agner.org/ | |||
926 | { ISD::FDIV, MVT::v8f32, 28 }, // SNB from http://www.agner.org/ | |||
927 | { ISD::FDIV, MVT::f64, 22 }, // SNB from http://www.agner.org/ | |||
928 | { ISD::FDIV, MVT::v2f64, 22 }, // SNB from http://www.agner.org/ | |||
929 | { ISD::FDIV, MVT::v4f64, 44 }, // SNB from http://www.agner.org/ | |||
930 | }; | |||
931 | ||||
932 | if (ST->hasAVX()) | |||
933 | if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second)) | |||
934 | return LT.first * Entry->Cost; | |||
935 | ||||
936 | static const CostTblEntry SSE42CostTable[] = { | |||
937 | { ISD::FADD, MVT::f64, 1 }, // Nehalem from http://www.agner.org/ | |||
938 | { ISD::FADD, MVT::f32, 1 }, // Nehalem from http://www.agner.org/ | |||
939 | { ISD::FADD, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/ | |||
940 | { ISD::FADD, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/ | |||
941 | ||||
942 | { ISD::FSUB, MVT::f64, 1 }, // Nehalem from http://www.agner.org/ | |||
943 | { ISD::FSUB, MVT::f32 , 1 }, // Nehalem from http://www.agner.org/ | |||
944 | { ISD::FSUB, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/ | |||
945 | { ISD::FSUB, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/ | |||
946 | ||||
947 | { ISD::FMUL, MVT::f64, 1 }, // Nehalem from http://www.agner.org/ | |||
948 | { ISD::FMUL, MVT::f32, 1 }, // Nehalem from http://www.agner.org/ | |||
949 | { ISD::FMUL, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/ | |||
950 | { ISD::FMUL, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/ | |||
951 | ||||
952 | { ISD::FDIV, MVT::f32, 14 }, // Nehalem from http://www.agner.org/ | |||
953 | { ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/ | |||
954 | { ISD::FDIV, MVT::f64, 22 }, // Nehalem from http://www.agner.org/ | |||
955 | { ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/ | |||
956 | ||||
957 | { ISD::MUL, MVT::v2i64, 6 } // 3*pmuludq/3*shift/2*add | |||
958 | }; | |||
959 | ||||
960 | if (ST->hasSSE42()) | |||
961 | if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second)) | |||
962 | return LT.first * Entry->Cost; | |||
963 | ||||
964 | static const CostTblEntry SSE41CostTable[] = { | |||
965 | { ISD::SHL, MVT::v16i8, 10 }, // pblendvb sequence. | |||
966 | { ISD::SHL, MVT::v8i16, 11 }, // pblendvb sequence. | |||
967 | { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld | |||
968 | ||||
969 | { ISD::SRL, MVT::v16i8, 11 }, // pblendvb sequence. | |||
970 | { ISD::SRL, MVT::v8i16, 13 }, // pblendvb sequence. | |||
971 | { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend. | |||
972 | ||||
973 | { ISD::SRA, MVT::v16i8, 21 }, // pblendvb sequence. | |||
974 | { ISD::SRA, MVT::v8i16, 13 }, // pblendvb sequence. | |||
975 | ||||
976 | { ISD::MUL, MVT::v4i32, 2 } // pmulld (Nehalem from agner.org) | |||
977 | }; | |||
978 | ||||
979 | if (ST->hasSSE41()) | |||
980 | if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second)) | |||
981 | return LT.first * Entry->Cost; | |||
982 | ||||
983 | static const CostTblEntry SSE2CostTable[] = { | |||
984 | // We don't correctly identify costs of casts because they are marked as | |||
985 | // custom. | |||
986 | { ISD::SHL, MVT::v16i8, 13 }, // cmpgtb sequence. | |||
987 | { ISD::SHL, MVT::v8i16, 25 }, // cmpgtw sequence. | |||
988 | { ISD::SHL, MVT::v4i32, 16 }, // pslld/paddd/cvttps2dq/pmuludq. | |||
989 | { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence. | |||
990 | ||||
991 | { ISD::SRL, MVT::v16i8, 14 }, // cmpgtb sequence. | |||
992 | { ISD::SRL, MVT::v8i16, 16 }, // cmpgtw sequence. | |||
993 | { ISD::SRL, MVT::v4i32, 12 }, // Shift each lane + blend. | |||
994 | { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence. | |||
995 | ||||
996 | { ISD::SRA, MVT::v16i8, 27 }, // unpacked cmpgtb sequence. | |||
997 | { ISD::SRA, MVT::v8i16, 16 }, // cmpgtw sequence. | |||
998 | { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend. | |||
999 | { ISD::SRA, MVT::v2i64, 8 }, // srl/xor/sub splat+shuffle sequence. | |||
1000 | ||||
1001 | { ISD::MUL, MVT::v8i16, 1 }, // pmullw | |||
1002 | { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle | |||
1003 | { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add | |||
1004 | ||||
1005 | { ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/ | |||
1006 | { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/ | |||
1007 | { ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/ | |||
1008 | { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/ | |||
1009 | ||||
1010 | { ISD::FNEG, MVT::f32, 1 }, // Pentium IV from http://www.agner.org/ | |||
1011 | { ISD::FNEG, MVT::f64, 1 }, // Pentium IV from http://www.agner.org/ | |||
1012 | { ISD::FNEG, MVT::v4f32, 1 }, // Pentium IV from http://www.agner.org/ | |||
1013 | { ISD::FNEG, MVT::v2f64, 1 }, // Pentium IV from http://www.agner.org/ | |||
1014 | ||||
1015 | { ISD::FADD, MVT::f32, 2 }, // Pentium IV from http://www.agner.org/ | |||
1016 | { ISD::FADD, MVT::f64, 2 }, // Pentium IV from http://www.agner.org/ | |||
1017 | ||||
1018 | { ISD::FSUB, MVT::f32, 2 }, // Pentium IV from http://www.agner.org/ | |||
1019 | { ISD::FSUB, MVT::f64, 2 }, // Pentium IV from http://www.agner.org/ | |||
1020 | }; | |||
1021 | ||||
1022 | if (ST->hasSSE2()) | |||
1023 | if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second)) | |||
1024 | return LT.first * Entry->Cost; | |||
1025 | ||||
1026 | static const CostTblEntry SSE1CostTable[] = { | |||
1027 | { ISD::FDIV, MVT::f32, 17 }, // Pentium III from http://www.agner.org/ | |||
1028 | { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/ | |||
1029 | ||||
1030 | { ISD::FNEG, MVT::f32, 2 }, // Pentium III from http://www.agner.org/ | |||
1031 | { ISD::FNEG, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/ | |||
1032 | ||||
1033 | { ISD::FADD, MVT::f32, 1 }, // Pentium III from http://www.agner.org/ | |||
1034 | { ISD::FADD, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/ | |||
1035 | ||||
1036 | { ISD::FSUB, MVT::f32, 1 }, // Pentium III from http://www.agner.org/ | |||
1037 | { ISD::FSUB, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/ | |||
1038 | }; | |||
1039 | ||||
1040 | if (ST->hasSSE1()) | |||
1041 | if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second)) | |||
1042 | return LT.first * Entry->Cost; | |||
1043 | ||||
1044 | static const CostTblEntry X64CostTbl[] = { // 64-bit targets | |||
1045 | { ISD::ADD, MVT::i64, 1 }, // Core (Merom) from http://www.agner.org/ | |||
1046 | { ISD::SUB, MVT::i64, 1 }, // Core (Merom) from http://www.agner.org/ | |||
1047 | { ISD::MUL, MVT::i64, 2 }, // Nehalem from http://www.agner.org/ | |||
1048 | }; | |||
1049 | ||||
1050 | if (ST->is64Bit()) | |||
1051 | if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second)) | |||
1052 | return LT.first * Entry->Cost; | |||
1053 | ||||
1054 | static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets | |||
1055 | { ISD::ADD, MVT::i8, 1 }, // Pentium III from http://www.agner.org/ | |||
1056 | { ISD::ADD, MVT::i16, 1 }, // Pentium III from http://www.agner.org/ | |||
1057 | { ISD::ADD, MVT::i32, 1 }, // Pentium III from http://www.agner.org/ | |||
1058 | ||||
1059 | { ISD::SUB, MVT::i8, 1 }, // Pentium III from http://www.agner.org/ | |||
1060 | { ISD::SUB, MVT::i16, 1 }, // Pentium III from http://www.agner.org/ | |||
1061 | { ISD::SUB, MVT::i32, 1 }, // Pentium III from http://www.agner.org/ | |||
1062 | }; | |||
1063 | ||||
1064 | if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second)) | |||
1065 | return LT.first * Entry->Cost; | |||
1066 | ||||
1067 | // It is not a good idea to vectorize division. We have to scalarize it and | |||
  // in the process we will often end up having to spill regular
1069 | // registers. The overhead of division is going to dominate most kernels | |||
1070 | // anyways so try hard to prevent vectorization of division - it is | |||
1071 | // generally a bad idea. Assume somewhat arbitrarily that we have to be able | |||
1072 | // to hide "20 cycles" for each lane. | |||
1073 | if (LT.second.isVector() && (ISD == ISD::SDIV || ISD == ISD::SREM || | |||
1074 | ISD == ISD::UDIV || ISD == ISD::UREM)) { | |||
1075 | InstructionCost ScalarCost = getArithmeticInstrCost( | |||
1076 | Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info, | |||
1077 | TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); | |||
1078 | return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost; | |||
1079 | } | |||
1080 | ||||
1081 | // Fallback to the default implementation. | |||
1082 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info, | |||
1083 | Opd1PropInfo, Opd2PropInfo, Args, CxtI); | |||
1084 | } | |||
1085 | ||||
1086 | InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, | |||
1087 | VectorType *BaseTp, | |||
1088 | ArrayRef<int> Mask, int Index, | |||
1089 | VectorType *SubTp, | |||
1090 | ArrayRef<const Value *> Args) { | |||
1091 | // 64-bit packed float vectors (v2f32) are widened to type v4f32. | |||
1092 | // 64-bit packed integer vectors (v2i32) are widened to type v4i32. | |||
1093 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(BaseTp); | |||
1094 | ||||
1095 | Kind = improveShuffleKindFromMask(Kind, Mask); | |||
1096 | // Treat Transpose as 2-op shuffles - there's no difference in lowering. | |||
1097 | if (Kind == TTI::SK_Transpose) | |||
1098 | Kind = TTI::SK_PermuteTwoSrc; | |||
1099 | ||||
1100 | // For Broadcasts we are splatting the first element from the first input | |||
1101 | // register, so only need to reference that input and all the output | |||
1102 | // registers are the same. | |||
1103 | if (Kind == TTI::SK_Broadcast) | |||
1104 | LT.first = 1; | |||
1105 | ||||
1106 | // Subvector extractions are free if they start at the beginning of a | |||
1107 | // vector and cheap if the subvectors are aligned. | |||
1108 | if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) { | |||
1109 | int NumElts = LT.second.getVectorNumElements(); | |||
1110 | if ((Index % NumElts) == 0) | |||
1111 | return 0; | |||
1112 | std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp); | |||
1113 | if (SubLT.second.isVector()) { | |||
1114 | int NumSubElts = SubLT.second.getVectorNumElements(); | |||
1115 | if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0) | |||
1116 | return SubLT.first; | |||
1117 | // Handle some cases for widening legalization. For now we only handle | |||
1118 | // cases where the original subvector was naturally aligned and evenly | |||
1119 | // fit in its legalized subvector type. | |||
1120 | // FIXME: Remove some of the alignment restrictions. | |||
1121 | // FIXME: We can use permq for 64-bit or larger extracts from 256-bit | |||
1122 | // vectors. | |||
1123 | int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements(); | |||
1124 | if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 && | |||
1125 | (NumSubElts % OrigSubElts) == 0 && | |||
1126 | LT.second.getVectorElementType() == | |||
1127 | SubLT.second.getVectorElementType() && | |||
1128 | LT.second.getVectorElementType().getSizeInBits() == | |||
1129 | BaseTp->getElementType()->getPrimitiveSizeInBits()) { | |||
1130 | assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&(static_cast <bool> (NumElts >= NumSubElts && NumElts > OrigSubElts && "Unexpected number of elements!" ) ? void (0) : __assert_fail ("NumElts >= NumSubElts && NumElts > OrigSubElts && \"Unexpected number of elements!\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 1131, __extension__ __PRETTY_FUNCTION__)) | |||
1131 | "Unexpected number of elements!")(static_cast <bool> (NumElts >= NumSubElts && NumElts > OrigSubElts && "Unexpected number of elements!" ) ? void (0) : __assert_fail ("NumElts >= NumSubElts && NumElts > OrigSubElts && \"Unexpected number of elements!\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 1131, __extension__ __PRETTY_FUNCTION__)); | |||
1132 | auto *VecTy = FixedVectorType::get(BaseTp->getElementType(), | |||
1133 | LT.second.getVectorNumElements()); | |||
1134 | auto *SubTy = FixedVectorType::get(BaseTp->getElementType(), | |||
1135 | SubLT.second.getVectorNumElements()); | |||
1136 | int ExtractIndex = alignDown((Index % NumElts), NumSubElts); | |||
1137 | InstructionCost ExtractCost = getShuffleCost( | |||
1138 | TTI::SK_ExtractSubvector, VecTy, None, ExtractIndex, SubTy); | |||
1139 | ||||
1140 | // If the original size is 32-bits or more, we can use pshufd. Otherwise | |||
1141 | // if we have SSSE3 we can use pshufb. | |||
1142 | if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3()) | |||
1143 | return ExtractCost + 1; // pshufd or pshufb | |||
1144 | ||||
1145 | assert(SubTp->getPrimitiveSizeInBits() == 16 &&(static_cast <bool> (SubTp->getPrimitiveSizeInBits() == 16 && "Unexpected vector size") ? void (0) : __assert_fail ("SubTp->getPrimitiveSizeInBits() == 16 && \"Unexpected vector size\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 1146, __extension__ __PRETTY_FUNCTION__)) | |||
1146 | "Unexpected vector size")(static_cast <bool> (SubTp->getPrimitiveSizeInBits() == 16 && "Unexpected vector size") ? void (0) : __assert_fail ("SubTp->getPrimitiveSizeInBits() == 16 && \"Unexpected vector size\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 1146, __extension__ __PRETTY_FUNCTION__)); | |||
1147 | ||||
1148 | return ExtractCost + 2; // worst case pshufhw + pshufd | |||
1149 | } | |||
1150 | } | |||
1151 | } | |||
1152 | ||||
1153 | // Subvector insertions are cheap if the subvectors are aligned. | |||
1154 | // Note that in general, the insertion starting at the beginning of a vector | |||
1155 | // isn't free, because we need to preserve the rest of the wide vector. | |||
1156 | if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) { | |||
1157 | int NumElts = LT.second.getVectorNumElements(); | |||
1158 | std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp); | |||
1159 | if (SubLT.second.isVector()) { | |||
1160 | int NumSubElts = SubLT.second.getVectorNumElements(); | |||
1161 | if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0) | |||
1162 | return SubLT.first; | |||
1163 | } | |||
1164 | ||||
1165 | // If the insertion isn't aligned, treat it like a 2-op shuffle. | |||
1166 | Kind = TTI::SK_PermuteTwoSrc; | |||
1167 | } | |||
1168 | ||||
1169 | // Handle some common (illegal) sub-vector types as they are often very cheap | |||
1170 | // to shuffle even on targets without PSHUFB. | |||
1171 | EVT VT = TLI->getValueType(DL, BaseTp); | |||
1172 | if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 && | |||
1173 | !ST->hasSSSE3()) { | |||
1174 | static const CostTblEntry SSE2SubVectorShuffleTbl[] = { | |||
1175 | {TTI::SK_Broadcast, MVT::v4i16, 1}, // pshuflw | |||
1176 | {TTI::SK_Broadcast, MVT::v2i16, 1}, // pshuflw | |||
1177 | {TTI::SK_Broadcast, MVT::v8i8, 2}, // punpck/pshuflw | |||
1178 | {TTI::SK_Broadcast, MVT::v4i8, 2}, // punpck/pshuflw | |||
1179 | {TTI::SK_Broadcast, MVT::v2i8, 1}, // punpck | |||
1180 | ||||
1181 | {TTI::SK_Reverse, MVT::v4i16, 1}, // pshuflw | |||
1182 | {TTI::SK_Reverse, MVT::v2i16, 1}, // pshuflw | |||
1183 | {TTI::SK_Reverse, MVT::v4i8, 3}, // punpck/pshuflw/packus | |||
1184 | {TTI::SK_Reverse, MVT::v2i8, 1}, // punpck | |||
1185 | ||||
1186 | {TTI::SK_PermuteTwoSrc, MVT::v4i16, 2}, // punpck/pshuflw | |||
1187 | {TTI::SK_PermuteTwoSrc, MVT::v2i16, 2}, // punpck/pshuflw | |||
1188 | {TTI::SK_PermuteTwoSrc, MVT::v8i8, 7}, // punpck/pshuflw | |||
1189 | {TTI::SK_PermuteTwoSrc, MVT::v4i8, 4}, // punpck/pshuflw | |||
1190 | {TTI::SK_PermuteTwoSrc, MVT::v2i8, 2}, // punpck | |||
1191 | ||||
1192 | {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw | |||
1193 | {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw | |||
1194 | {TTI::SK_PermuteSingleSrc, MVT::v8i8, 5}, // punpck/pshuflw | |||
1195 | {TTI::SK_PermuteSingleSrc, MVT::v4i8, 3}, // punpck/pshuflw | |||
1196 | {TTI::SK_PermuteSingleSrc, MVT::v2i8, 1}, // punpck | |||
1197 | }; | |||
1198 | ||||
1199 | if (ST->hasSSE2()) | |||
1200 | if (const auto *Entry = | |||
1201 | CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT())) | |||
1202 | return Entry->Cost; | |||
1203 | } | |||
1204 | ||||
1205 | // We are going to permute multiple sources and the result will be in multiple | |||
1206 | // destinations. Providing an accurate cost only for splits where the element | |||
1207 | // type remains the same. | |||
1208 | if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) { | |||
1209 | MVT LegalVT = LT.second; | |||
1210 | if (LegalVT.isVector() && | |||
1211 | LegalVT.getVectorElementType().getSizeInBits() == | |||
1212 | BaseTp->getElementType()->getPrimitiveSizeInBits() && | |||
1213 | LegalVT.getVectorNumElements() < | |||
1214 | cast<FixedVectorType>(BaseTp)->getNumElements()) { | |||
1215 | ||||
1216 | unsigned VecTySize = DL.getTypeStoreSize(BaseTp); | |||
1217 | unsigned LegalVTSize = LegalVT.getStoreSize(); | |||
1218 | // Number of source vectors after legalization: | |||
1219 | unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize; | |||
1220 | // Number of destination vectors after legalization: | |||
1221 | InstructionCost NumOfDests = LT.first; | |||
1222 | ||||
1223 | auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(), | |||
1224 | LegalVT.getVectorNumElements()); | |||
1225 | ||||
1226 | if (!Mask.empty() && NumOfDests.isValid()) { | |||
1227 | // Try to perform better estimation of the permutation. | |||
1228 | // 1. Split the source/destination vectors into real registers. | |||
1229 | // 2. Do the mask analysis to identify which real registers are | |||
1230 | // permuted. If more than 1 source registers are used for the | |||
1231 | // destination register building, the cost for this destination register | |||
1232 | // is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one | |||
1233 | // source register is used, build mask and calculate the cost as a cost | |||
1234 | // of PermuteSingleSrc. | |||
1235 | // Also, for the single register permute we try to identify if the | |||
1236 | // destination register is just a copy of the source register or the | |||
1237 | // copy of the previous destination register (the cost is | |||
1238 | // TTI::TCC_Basic). If the source register is just reused, the cost for | |||
1239 | // this operation is 0. | |||
1240 | unsigned E = *NumOfDests.getValue(); | |||
1241 | unsigned NormalizedVF = | |||
1242 | LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E); | |||
1243 | unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements(); | |||
1244 | unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements(); | |||
1245 | SmallVector<int> NormalizedMask(NormalizedVF, UndefMaskElem); | |||
1246 | copy(Mask, NormalizedMask.begin()); | |||
1247 | unsigned PrevSrcReg = 0; | |||
1248 | ArrayRef<int> PrevRegMask; | |||
1249 | InstructionCost Cost = 0; | |||
1250 | processShuffleMasks( | |||
1251 | NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {}, | |||
1252 | [this, SingleOpTy, &PrevSrcReg, &PrevRegMask, | |||
1253 | &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) { | |||
1254 | if (!ShuffleVectorInst::isIdentityMask(RegMask)) { | |||
1255 | // Check if the previous register can be just copied to the next | |||
1256 | // one. | |||
1257 | if (PrevRegMask.empty() || PrevSrcReg != SrcReg || | |||
1258 | PrevRegMask != RegMask) | |||
1259 | Cost += getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy, | |||
1260 | RegMask, 0, nullptr); | |||
1261 | else | |||
1262 | // Just a copy of previous destination register. | |||
1263 | Cost += TTI::TCC_Basic; | |||
1264 | return; | |||
1265 | } | |||
1266 | if (SrcReg != DestReg && | |||
1267 | any_of(RegMask, [](int I) { return I != UndefMaskElem; })) { | |||
1268 | // Just a copy of the source register. | |||
1269 | Cost += TTI::TCC_Basic; | |||
1270 | } | |||
1271 | PrevSrcReg = SrcReg; | |||
1272 | PrevRegMask = RegMask; | |||
1273 | }, | |||
1274 | [this, SingleOpTy, &Cost](ArrayRef<int> RegMask, | |||
1275 | unsigned /*Unused*/, | |||
1276 | unsigned /*Unused*/) { | |||
1277 | Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask, | |||
1278 | 0, nullptr); | |||
1279 | }); | |||
1280 | return Cost; | |||
1281 | } | |||
1282 | ||||
1283 | InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests; | |||
1284 | return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, | |||
1285 | None, 0, nullptr); | |||
1286 | } | |||
1287 | ||||
1288 | return BaseT::getShuffleCost(Kind, BaseTp, Mask, Index, SubTp); | |||
1289 | } | |||
1290 | ||||
1291 | // For 2-input shuffles, we must account for splitting the 2 inputs into many. | |||
1292 | if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) { | |||
1293 | // We assume that source and destination have the same vector type. | |||
1294 | InstructionCost NumOfDests = LT.first; | |||
1295 | InstructionCost NumOfShufflesPerDest = LT.first * 2 - 1; | |||
1296 | LT.first = NumOfDests * NumOfShufflesPerDest; | |||
1297 | } | |||
1298 | ||||
1299 | static const CostTblEntry AVX512VBMIShuffleTbl[] = { | |||
1300 | {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb | |||
1301 | {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb | |||
1302 | ||||
1303 | {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb | |||
1304 | {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb | |||
1305 | ||||
1306 | {TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b | |||
1307 | {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b | |||
1308 | {TTI::SK_PermuteTwoSrc, MVT::v16i8, 2} // vpermt2b | |||
1309 | }; | |||
1310 | ||||
1311 | if (ST->hasVBMI()) | |||
1312 | if (const auto *Entry = | |||
1313 | CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second)) | |||
1314 | return LT.first * Entry->Cost; | |||
1315 | ||||
1316 | static const CostTblEntry AVX512BWShuffleTbl[] = { | |||
1317 | {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw | |||
1318 | {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw | |||
1319 | {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb | |||
1320 | ||||
1321 | {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw | |||
1322 | {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw | |||
1323 | {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw | |||
1324 | {TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2 | |||
1325 | ||||
1326 | {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw | |||
1327 | {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw | |||
1328 | {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw | |||
1329 | {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw | |||
1330 | {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16 | |||
1331 | ||||
1332 | {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w | |||
1333 | {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w | |||
1334 | {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w | |||
1335 | {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w | |||
1336 | {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1 | |||
1337 | ||||
1338 | {TTI::SK_Select, MVT::v32i16, 1}, // vblendmw | |||
1339 | {TTI::SK_Select, MVT::v64i8, 1}, // vblendmb | |||
1340 | }; | |||
1341 | ||||
1342 | if (ST->hasBWI()) | |||
1343 | if (const auto *Entry = | |||
1344 | CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second)) | |||
1345 | return LT.first * Entry->Cost; | |||
1346 | ||||
1347 | static const CostTblEntry AVX512ShuffleTbl[] = { | |||
1348 | {TTI::SK_Broadcast, MVT::v8f64, 1}, // vbroadcastpd | |||
1349 | {TTI::SK_Broadcast, MVT::v16f32, 1}, // vbroadcastps | |||
1350 | {TTI::SK_Broadcast, MVT::v8i64, 1}, // vpbroadcastq | |||
1351 | {TTI::SK_Broadcast, MVT::v16i32, 1}, // vpbroadcastd | |||
1352 | {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw | |||
1353 | {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw | |||
1354 | {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb | |||
1355 | ||||
1356 | {TTI::SK_Reverse, MVT::v8f64, 1}, // vpermpd | |||
1357 | {TTI::SK_Reverse, MVT::v16f32, 1}, // vpermps | |||
1358 | {TTI::SK_Reverse, MVT::v8i64, 1}, // vpermq | |||
1359 | {TTI::SK_Reverse, MVT::v16i32, 1}, // vpermd | |||
1360 | {TTI::SK_Reverse, MVT::v32i16, 7}, // per mca | |||
1361 | {TTI::SK_Reverse, MVT::v32f16, 7}, // per mca | |||
1362 | {TTI::SK_Reverse, MVT::v64i8, 7}, // per mca | |||
1363 | ||||
1364 | {TTI::SK_PermuteSingleSrc, MVT::v8f64, 1}, // vpermpd | |||
1365 | {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd | |||
1366 | {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // vpermpd | |||
1367 | {TTI::SK_PermuteSingleSrc, MVT::v16f32, 1}, // vpermps | |||
1368 | {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps | |||
1369 | {TTI::SK_PermuteSingleSrc, MVT::v4f32, 1}, // vpermps | |||
1370 | {TTI::SK_PermuteSingleSrc, MVT::v8i64, 1}, // vpermq | |||
1371 | {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq | |||
1372 | {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // vpermq | |||
1373 | {TTI::SK_PermuteSingleSrc, MVT::v16i32, 1}, // vpermd | |||
1374 | {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd | |||
1375 | {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // vpermd | |||
1376 | {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb | |||
1377 | ||||
1378 | {TTI::SK_PermuteTwoSrc, MVT::v8f64, 1}, // vpermt2pd | |||
1379 | {TTI::SK_PermuteTwoSrc, MVT::v16f32, 1}, // vpermt2ps | |||
1380 | {TTI::SK_PermuteTwoSrc, MVT::v8i64, 1}, // vpermt2q | |||
1381 | {TTI::SK_PermuteTwoSrc, MVT::v16i32, 1}, // vpermt2d | |||
1382 | {TTI::SK_PermuteTwoSrc, MVT::v4f64, 1}, // vpermt2pd | |||
1383 | {TTI::SK_PermuteTwoSrc, MVT::v8f32, 1}, // vpermt2ps | |||
1384 | {TTI::SK_PermuteTwoSrc, MVT::v4i64, 1}, // vpermt2q | |||
1385 | {TTI::SK_PermuteTwoSrc, MVT::v8i32, 1}, // vpermt2d | |||
1386 | {TTI::SK_PermuteTwoSrc, MVT::v2f64, 1}, // vpermt2pd | |||
1387 | {TTI::SK_PermuteTwoSrc, MVT::v4f32, 1}, // vpermt2ps | |||
1388 | {TTI::SK_PermuteTwoSrc, MVT::v2i64, 1}, // vpermt2q | |||
1389 | {TTI::SK_PermuteTwoSrc, MVT::v4i32, 1}, // vpermt2d | |||
1390 | ||||
1391 | // FIXME: This just applies the type legalization cost rules above | |||
1392 | // assuming these completely split. | |||
1393 | {TTI::SK_PermuteSingleSrc, MVT::v32i16, 14}, | |||
1394 | {TTI::SK_PermuteSingleSrc, MVT::v32f16, 14}, | |||
1395 | {TTI::SK_PermuteSingleSrc, MVT::v64i8, 14}, | |||
1396 | {TTI::SK_PermuteTwoSrc, MVT::v32i16, 42}, | |||
1397 | {TTI::SK_PermuteTwoSrc, MVT::v32f16, 42}, | |||
1398 | {TTI::SK_PermuteTwoSrc, MVT::v64i8, 42}, | |||
1399 | ||||
1400 | {TTI::SK_Select, MVT::v32i16, 1}, // vpternlogq | |||
1401 | {TTI::SK_Select, MVT::v32f16, 1}, // vpternlogq | |||
1402 | {TTI::SK_Select, MVT::v64i8, 1}, // vpternlogq | |||
1403 | {TTI::SK_Select, MVT::v8f64, 1}, // vblendmpd | |||
1404 | {TTI::SK_Select, MVT::v16f32, 1}, // vblendmps | |||
1405 | {TTI::SK_Select, MVT::v8i64, 1}, // vblendmq | |||
1406 | {TTI::SK_Select, MVT::v16i32, 1}, // vblendmd | |||
1407 | }; | |||
1408 | ||||
1409 | if (ST->hasAVX512()) | |||
1410 | if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second)) | |||
1411 | return LT.first * Entry->Cost; | |||
1412 | ||||
1413 | static const CostTblEntry AVX2ShuffleTbl[] = { | |||
1414 | {TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastpd | |||
1415 | {TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastps | |||
1416 | {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq | |||
1417 | {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd | |||
1418 | {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw | |||
1419 | {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw | |||
1420 | {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb | |||
1421 | ||||
1422 | {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd | |||
1423 | {TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps | |||
1424 | {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq | |||
1425 | {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd | |||
1426 | {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb | |||
1427 | {TTI::SK_Reverse, MVT::v16f16, 2}, // vperm2i128 + pshufb | |||
1428 | {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb | |||
1429 | ||||
1430 | {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb | |||
1431 | {TTI::SK_Select, MVT::v16f16, 1}, // vpblendvb | |||
1432 | {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb | |||
1433 | ||||
1434 | {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd | |||
1435 | {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps | |||
1436 | {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq | |||
1437 | {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd | |||
1438 | {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb | |||
1439 | // + vpblendvb | |||
1440 | {TTI::SK_PermuteSingleSrc, MVT::v16f16, 4}, // vperm2i128 + 2*vpshufb | |||
1441 | // + vpblendvb | |||
1442 | {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb | |||
1443 | // + vpblendvb | |||
1444 | ||||
1445 | {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd | |||
1446 | {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3}, // 2*vpermps + vblendps | |||
1447 | {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd | |||
1448 | {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd | |||
1449 | {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb | |||
1450 | // + vpblendvb | |||
1451 | {TTI::SK_PermuteTwoSrc, MVT::v16f16, 7}, // 2*vperm2i128 + 4*vpshufb | |||
1452 | // + vpblendvb | |||
1453 | {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb | |||
1454 | // + vpblendvb | |||
1455 | }; | |||
1456 | ||||
1457 | if (ST->hasAVX2()) | |||
1458 | if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second)) | |||
1459 | return LT.first * Entry->Cost; | |||
1460 | ||||
1461 | static const CostTblEntry XOPShuffleTbl[] = { | |||
1462 | {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd | |||
1463 | {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps | |||
1464 | {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd | |||
1465 | {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps | |||
1466 | {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm | |||
1467 | // + vinsertf128 | |||
1468 | {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm | |||
1469 | // + vinsertf128 | |||
1470 | ||||
1471 | {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm | |||
1472 | // + vinsertf128 | |||
1473 | {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm | |||
1474 | {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm | |||
1475 | // + vinsertf128 | |||
1476 | {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm | |||
1477 | }; | |||
1478 | ||||
1479 | if (ST->hasXOP()) | |||
1480 | if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second)) | |||
1481 | return LT.first * Entry->Cost; | |||
1482 | ||||
1483 | static const CostTblEntry AVX1ShuffleTbl[] = { | |||
1484 | {TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd | |||
1485 | {TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps | |||
1486 | {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd | |||
1487 | {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps | |||
1488 | {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128 | |||
1489 | {TTI::SK_Broadcast, MVT::v16f16, 3}, // vpshuflw + vpshufd + vinsertf128 | |||
1490 | {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128 | |||
1491 | ||||
1492 | {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd | |||
1493 | {TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps | |||
1494 | {TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd | |||
1495 | {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps | |||
1496 | {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb | |||
1497 | // + vinsertf128 | |||
1498 | {TTI::SK_Reverse, MVT::v16f16, 4}, // vextractf128 + 2*pshufb | |||
1499 | // + vinsertf128 | |||
1500 | {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb | |||
1501 | // + vinsertf128 | |||
1502 | ||||
1503 | {TTI::SK_Select, MVT::v4i64, 1}, // vblendpd | |||
1504 | {TTI::SK_Select, MVT::v4f64, 1}, // vblendpd | |||
1505 | {TTI::SK_Select, MVT::v8i32, 1}, // vblendps | |||
1506 | {TTI::SK_Select, MVT::v8f32, 1}, // vblendps | |||
1507 | {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor | |||
1508 | {TTI::SK_Select, MVT::v16f16, 3}, // vpand + vpandn + vpor | |||
1509 | {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor | |||
1510 | ||||
1511 | {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd | |||
1512 | {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd | |||
1513 | {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps | |||
1514 | {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps | |||
1515 | {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb | |||
1516 | // + 2*por + vinsertf128 | |||
1517 | {TTI::SK_PermuteSingleSrc, MVT::v16f16, 8}, // vextractf128 + 4*pshufb | |||
1518 | // + 2*por + vinsertf128 | |||
1519 | {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb | |||
1520 | // + 2*por + vinsertf128 | |||
1521 | ||||
1522 | {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd | |||
1523 | {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd | |||
1524 | {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps | |||
1525 | {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps | |||
1526 | {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb | |||
1527 | // + 4*por + vinsertf128 | |||
1528 | {TTI::SK_PermuteTwoSrc, MVT::v16f16, 15}, // 2*vextractf128 + 8*pshufb | |||
1529 | // + 4*por + vinsertf128 | |||
1530 | {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb | |||
1531 | // + 4*por + vinsertf128 | |||
1532 | }; | |||
1533 | ||||
1534 | if (ST->hasAVX()) | |||
1535 | if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second)) | |||
1536 | return LT.first * Entry->Cost; | |||
1537 | ||||
1538 | static const CostTblEntry SSE41ShuffleTbl[] = { | |||
1539 | {TTI::SK_Select, MVT::v2i64, 1}, // pblendw | |||
1540 | {TTI::SK_Select, MVT::v2f64, 1}, // movsd | |||
1541 | {TTI::SK_Select, MVT::v4i32, 1}, // pblendw | |||
1542 | {TTI::SK_Select, MVT::v4f32, 1}, // blendps | |||
1543 | {TTI::SK_Select, MVT::v8i16, 1}, // pblendw | |||
1544 | {TTI::SK_Select, MVT::v8f16, 1}, // pblendw | |||
1545 | {TTI::SK_Select, MVT::v16i8, 1} // pblendvb | |||
1546 | }; | |||
1547 | ||||
1548 | if (ST->hasSSE41()) | |||
1549 | if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second)) | |||
1550 | return LT.first * Entry->Cost; | |||
1551 | ||||
1552 | static const CostTblEntry SSSE3ShuffleTbl[] = { | |||
1553 | {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb | |||
1554 | {TTI::SK_Broadcast, MVT::v8f16, 1}, // pshufb | |||
1555 | {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb | |||
1556 | ||||
1557 | {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb | |||
1558 | {TTI::SK_Reverse, MVT::v8f16, 1}, // pshufb | |||
1559 | {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb | |||
1560 | ||||
1561 | {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por | |||
1562 | {TTI::SK_Select, MVT::v8f16, 3}, // 2*pshufb + por | |||
1563 | {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por | |||
1564 | ||||
1565 | {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb | |||
1566 | {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // pshufb | |||
1567 | {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb | |||
1568 | ||||
1569 | {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por | |||
1570 | {TTI::SK_PermuteTwoSrc, MVT::v8f16, 3}, // 2*pshufb + por | |||
1571 | {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por | |||
1572 | }; | |||
1573 | ||||
1574 | if (ST->hasSSSE3()) | |||
1575 | if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second)) | |||
1576 | return LT.first * Entry->Cost; | |||
1577 | ||||
1578 | static const CostTblEntry SSE2ShuffleTbl[] = { | |||
1579 | {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd | |||
1580 | {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd | |||
1581 | {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd | |||
1582 | {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd | |||
1583 | {TTI::SK_Broadcast, MVT::v8f16, 2}, // pshuflw + pshufd | |||
1584 | {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd | |||
1585 | ||||
1586 | {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd | |||
1587 | {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd | |||
1588 | {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd | |||
1589 | {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd | |||
1590 | {TTI::SK_Reverse, MVT::v8f16, 3}, // pshuflw + pshufhw + pshufd | |||
1591 | {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw | |||
1592 | // + 2*pshufd + 2*unpck + packus | |||
1593 | ||||
1594 | {TTI::SK_Select, MVT::v2i64, 1}, // movsd | |||
1595 | {TTI::SK_Select, MVT::v2f64, 1}, // movsd | |||
1596 | {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps | |||
1597 | {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por | |||
1598 | {TTI::SK_Select, MVT::v8f16, 3}, // pand + pandn + por | |||
1599 | {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por | |||
1600 | ||||
1601 | {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd | |||
1602 | {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd | |||
1603 | {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd | |||
1604 | {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw | |||
1605 | // + pshufd/unpck | |||
1606 | {TTI::SK_PermuteSingleSrc, MVT::v8f16, 5}, // 2*pshuflw + 2*pshufhw | |||
1607 | // + pshufd/unpck | |||
1608 | { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw | |||
1609 | // + 2*pshufd + 2*unpck + 2*packus | |||
1610 | ||||
1611 | { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd | |||
1612 | { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd | |||
1613 | { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd} | |||
1614 | { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute | |||
1615 | { TTI::SK_PermuteTwoSrc, MVT::v8f16, 8 }, // blend+permute | |||
1616 | { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute | |||
1617 | }; | |||
1618 | ||||
1619 | static const CostTblEntry SSE3BroadcastLoadTbl[] = { | |||
1620 | {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup | |||
1621 | }; | |||
1622 | ||||
1623 | if (ST->hasSSE2()) { | |||
1624 | bool IsLoad = | |||
1625 | llvm::any_of(Args, [](const auto &V) { return isa<LoadInst>(V); }); | |||
1626 | if (ST->hasSSE3() && IsLoad) | |||
1627 | if (const auto *Entry = | |||
1628 | CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) { | |||
1629 | assert(isLegalBroadcastLoad(BaseTp->getElementType(),(static_cast <bool> (isLegalBroadcastLoad(BaseTp->getElementType (), LT.second.getVectorElementCount()) && "Table entry missing from isLegalBroadcastLoad()" ) ? void (0) : __assert_fail ("isLegalBroadcastLoad(BaseTp->getElementType(), LT.second.getVectorElementCount()) && \"Table entry missing from isLegalBroadcastLoad()\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 1631, __extension__ __PRETTY_FUNCTION__)) | |||
1630 | LT.second.getVectorElementCount()) &&(static_cast <bool> (isLegalBroadcastLoad(BaseTp->getElementType (), LT.second.getVectorElementCount()) && "Table entry missing from isLegalBroadcastLoad()" ) ? void (0) : __assert_fail ("isLegalBroadcastLoad(BaseTp->getElementType(), LT.second.getVectorElementCount()) && \"Table entry missing from isLegalBroadcastLoad()\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 1631, __extension__ __PRETTY_FUNCTION__)) | |||
1631 | "Table entry missing from isLegalBroadcastLoad()")(static_cast <bool> (isLegalBroadcastLoad(BaseTp->getElementType (), LT.second.getVectorElementCount()) && "Table entry missing from isLegalBroadcastLoad()" ) ? void (0) : __assert_fail ("isLegalBroadcastLoad(BaseTp->getElementType(), LT.second.getVectorElementCount()) && \"Table entry missing from isLegalBroadcastLoad()\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 1631, __extension__ __PRETTY_FUNCTION__)); | |||
1632 | return LT.first * Entry->Cost; | |||
1633 | } | |||
1634 | ||||
1635 | if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second)) | |||
1636 | return LT.first * Entry->Cost; | |||
1637 | } | |||
1638 | ||||
1639 | static const CostTblEntry SSE1ShuffleTbl[] = { | |||
1640 | { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps | |||
1641 | { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps | |||
1642 | { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps | |||
1643 | { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps | |||
1644 | { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps | |||
1645 | }; | |||
1646 | ||||
1647 | if (ST->hasSSE1()) | |||
1648 | if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second)) | |||
1649 | return LT.first * Entry->Cost; | |||
1650 | ||||
1651 | return BaseT::getShuffleCost(Kind, BaseTp, Mask, Index, SubTp); | |||
1652 | } | |||
1653 | ||||
1654 | InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, | |||
1655 | Type *Src, | |||
1656 | TTI::CastContextHint CCH, | |||
1657 | TTI::TargetCostKind CostKind, | |||
1658 | const Instruction *I) { | |||
1659 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | |||
1660 | assert(ISD && "Invalid opcode")(static_cast <bool> (ISD && "Invalid opcode") ? void (0) : __assert_fail ("ISD && \"Invalid opcode\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 1660, __extension__ __PRETTY_FUNCTION__)); | |||
1661 | ||||
1662 | // TODO: Allow non-throughput costs that aren't binary. | |||
1663 | auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost { | |||
1664 | if (CostKind != TTI::TCK_RecipThroughput) | |||
1665 | return Cost == 0 ? 0 : 1; | |||
1666 | return Cost; | |||
1667 | }; | |||
1668 | ||||
1669 | // The cost tables include both specific, custom (non-legal) src/dst type | |||
1670 | // conversions and generic, legalized types. We test for customs first, before | |||
1671 | // falling back to legalization. | |||
1672 | // FIXME: Need a better design of the cost table to handle non-simple types of | |||
1673 | // potential massive combinations (elem_num x src_type x dst_type). | |||
1674 | static const TypeConversionCostTblEntry AVX512BWConversionTbl[] { | |||
1675 | { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 1 }, | |||
1676 | { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 1 }, | |||
1677 | ||||
1678 | // Mask sign extend has an instruction. | |||
1679 | { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 }, | |||
1680 | { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, 1 }, | |||
1681 | { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 }, | |||
1682 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, 1 }, | |||
1683 | { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 }, | |||
1684 | { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, 1 }, | |||
1685 | { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 }, | |||
1686 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, 1 }, | |||
1687 | { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 }, | |||
1688 | { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, 1 }, | |||
1689 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 }, | |||
1690 | { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 }, | |||
1691 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 }, | |||
1692 | { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 }, | |||
1693 | { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, 1 }, | |||
1694 | { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, 1 }, | |||
1695 | { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, 1 }, | |||
1696 | ||||
1697 | // Mask zero extend is a sext + shift. | |||
1698 | { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 }, | |||
1699 | { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, 2 }, | |||
1700 | { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 }, | |||
1701 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, 2 }, | |||
1702 | { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 }, | |||
1703 | { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, 2 }, | |||
1704 | { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 }, | |||
1705 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, 2 }, | |||
1706 | { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 }, | |||
1707 | { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, 2 }, | |||
1708 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 }, | |||
1709 | { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 }, | |||
1710 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 }, | |||
1711 | { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 }, | |||
1712 | { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, 2 }, | |||
1713 | { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, 2 }, | |||
1714 | { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, 2 }, | |||
1715 | ||||
1716 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, | |||
1717 | { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, 2 }, | |||
1718 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, | |||
1719 | { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, 2 }, | |||
1720 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, | |||
1721 | { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, 2 }, | |||
1722 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, | |||
1723 | { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, 2 }, | |||
1724 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, | |||
1725 | { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 }, | |||
1726 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, | |||
1727 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, | |||
1728 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, | |||
1729 | { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, | |||
1730 | { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, 2 }, | |||
1731 | { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, 2 }, | |||
1732 | { ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, 2 }, | |||
1733 | ||||
1734 | { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 2 }, | |||
1735 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // widen to zmm | |||
1736 | { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // vpmovwb | |||
1737 | { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // vpmovwb | |||
1738 | { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // vpmovwb | |||
1739 | }; | |||
1740 | ||||
// Cost table for cast/conversion operations that become cheap when AVX512DQ
// is available. Each entry is { opcode, dst type, src type, cost }, where
// cost approximates an instruction count on the first CPU supporting the
// feature (see the cost-model note at the top of this file).
// NOTE: entries whose element counts differ (e.g. v8i64 <- v16i1) are
// intentional; they match how type legalization widens/narrows the types.
static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
  // Mask sign extend has an instruction.
  { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 },
  { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, 1 },
  { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 },
  { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 },
  { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 },
  { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i1, 1 },
  { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 },
  { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 },

  // Mask zero extend is a sext + shift.
  { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 },
  { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, 2 },
  { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 },
  { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 },
  { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 },
  { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i1, 2 },
  { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 },
  { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 },

  // Truncation to mask types.
  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 },
  { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, 2 },
  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 },
  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 },
  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },
  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 },
  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 },
  { ISD::TRUNCATE, MVT::v16i1, MVT::v8i64, 2 },

  // AVX512DQ adds direct i64 <-> fp vector conversion instructions.
  { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
  { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },

  { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
  { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },

  { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, 1 },
  { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, 1 },

  { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 },
  { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 },
};
1783 | ||||
1784 | // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and | |||
1785 | // 256-bit wide vectors. | |||
1786 | ||||
// Cost table for conversions available with baseline AVX512F (512-bit
// vectors; no DQ/BW/VL assumed). Each entry is
// { opcode, dst type, src type, cost } with cost in approximate instructions;
// per-entry comments name the expected instruction sequence.
static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
  { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 },
  { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 },
  { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 },

  // Truncation to mask: widen to i32/i64 elements, shift the relevant bit
  // into place and test it (vptestmd/vptestmq).
  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 3 }, // sext+vpslld+vptestmd
  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 3 }, // sext+vpslld+vptestmd
  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // zmm vpslld+vptestmd
  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // zmm vpslld+vptestmd
  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // zmm vpslld+vptestmd
  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 }, // vpslld+vptestmd
  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // zmm vpsllq+vptestmq
  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // zmm vpsllq+vptestmq
  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, // vpsllq+vptestmq
  // Integer narrowing via the vpmov* down-converting moves.
  { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 2 }, // vpmovdb
  { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 2 }, // vpmovdb
  { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 2 }, // vpmovdb
  { ISD::TRUNCATE, MVT::v32i8, MVT::v16i32, 2 }, // vpmovdb
  { ISD::TRUNCATE, MVT::v64i8, MVT::v16i32, 2 }, // vpmovdb
  { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2 }, // vpmovdw
  { ISD::TRUNCATE, MVT::v32i16, MVT::v16i32, 2 }, // vpmovdw
  { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 2 }, // vpmovqb
  { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1 }, // vpshufb
  { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 2 }, // vpmovqb
  { ISD::TRUNCATE, MVT::v16i8, MVT::v8i64, 2 }, // vpmovqb
  { ISD::TRUNCATE, MVT::v32i8, MVT::v8i64, 2 }, // vpmovqb
  { ISD::TRUNCATE, MVT::v64i8, MVT::v8i64, 2 }, // vpmovqb
  { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 2 }, // vpmovqw
  { ISD::TRUNCATE, MVT::v16i16, MVT::v8i64, 2 }, // vpmovqw
  { ISD::TRUNCATE, MVT::v32i16, MVT::v8i64, 2 }, // vpmovqw
  { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 }, // vpmovqd
  { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // zmm vpmovqd
  { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 5 },// 2*vpmovqd+concat+vpmovdb

  // No vpmovwb without AVX512BW: go through i32 elements instead.
  { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, // extend to v16i32
  { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 8 },
  { ISD::TRUNCATE, MVT::v64i8, MVT::v32i16, 8 },

  // Sign extend is zmm vpternlogd+vptruncdb.
  // Zero extend is zmm broadcast load+vptruncdw.
  { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 3 },
  { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 4 },
  { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 3 },
  { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 4 },
  { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 3 },
  { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 4 },
  { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 3 },
  { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 4 },

  // Sign extend is zmm vpternlogd+vptruncdw.
  // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
  { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 3 },
  { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 4 },
  { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 3 },
  { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 4 },
  { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 3 },
  { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 4 },
  { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 3 },
  { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },

  { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // zmm vpternlogd
  { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // zmm vpternlogd+psrld
  { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // zmm vpternlogd
  { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // zmm vpternlogd+psrld
  { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // zmm vpternlogd
  { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // zmm vpternlogd+psrld
  { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // zmm vpternlogq
  { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // zmm vpternlogq+psrlq
  { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // zmm vpternlogq
  { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // zmm vpternlogq+psrlq

  { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, // vpternlogd
  { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, // vpternlogd+psrld
  { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 }, // vpternlogq
  { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 }, // vpternlogq+psrlq

  // Integer widening extends have single instructions (vpmovsx*/vpmovzx*).
  { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
  { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
  { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
  { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
  { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
  { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
  { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
  { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
  { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
  { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 },

  { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
  { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right

  { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
  { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
  { ISD::SINT_TO_FP, MVT::v8f64, MVT::v16i8, 2 },
  { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 1 },
  { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
  { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 1 },
  { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
  { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },

  { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
  { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
  { ISD::UINT_TO_FP, MVT::v8f64, MVT::v16i8, 2 },
  { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 1 },
  { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
  { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 1 },
  { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
  { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
  // i64 -> fp is expensive without AVX512DQ.
  { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 },
  { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 5 },

  { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 2 },
  { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f64, 7 },
  { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f64,15 },
  { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f32,11 },
  { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f64,31 },
  { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, 3 },
  { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f64, 7 },
  { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f32, 5 },
  { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f64,15 },
  { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 1 },
  { ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f64, 3 },

  { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 },
  { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, 3 },
  { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, 3 },
  { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 },
  { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 3 },
  { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, 3 },
};
1922 | ||||
// Cost table for conversions cheap with AVX512BW + AVX512VL, i.e. byte/word
// mask <-> vector conversions on 128/256-bit vectors. Entries are
// { opcode, dst type, src type, cost } in approximate instructions.
static const TypeConversionCostTblEntry AVX512BWVLConversionTbl[] {
  // Mask sign extend has an instruction.
  { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 },
  { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, 1 },
  { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 },
  { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, 1 },
  { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 },
  { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, 1 },
  { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 },
  { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, 1 },
  { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 },
  { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, 1 },
  { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 },
  { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 },
  { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
  { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 },
  { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1, 1 },
  { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v64i1, 1 },
  { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1, 1 },

  // Mask zero extend is a sext + shift.
  { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 },
  { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, 2 },
  { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 },
  { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, 2 },
  { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 },
  { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, 2 },
  { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 },
  { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, 2 },
  { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 },
  { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, 2 },
  { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 },
  { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 },
  { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
  { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 },
  { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1, 2 },
  { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v64i1, 2 },
  { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1, 2 },

  // Truncation to mask types.
  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 },
  { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, 2 },
  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 },
  { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, 2 },
  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 },
  { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, 2 },
  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 },
  { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, 2 },
  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 },
  { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 },
  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 },
  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 },
  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 },
  { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 },
  { ISD::TRUNCATE, MVT::v32i1, MVT::v16i16, 2 },
  { ISD::TRUNCATE, MVT::v64i1, MVT::v32i8, 2 },
  { ISD::TRUNCATE, MVT::v64i1, MVT::v16i16, 2 },

  { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 },
};
1982 | ||||
// Cost table for conversions cheap with AVX512DQ + AVX512VL: mask <-> i32/i64
// vectors and direct i64 <-> fp conversions on 128/256-bit vectors. Entries
// are { opcode, dst type, src type, cost } in approximate instructions.
static const TypeConversionCostTblEntry AVX512DQVLConversionTbl[] = {
  // Mask sign extend has an instruction.
  { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 },
  { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, 1 },
  { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 },
  { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i1, 1 },
  { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 },
  { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i1, 1 },
  { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, 1 },
  { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 },

  // Mask zero extend is a sext + shift.
  { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 },
  { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, 2 },
  { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 },
  { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i1, 2 },
  { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 },
  { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i1, 2 },
  { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, 2 },
  { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 },

  // Truncation to mask types.
  { ISD::TRUNCATE, MVT::v16i1, MVT::v4i64, 2 },
  { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, 2 },
  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 },
  { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, 2 },
  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 },
  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 },
  { ISD::TRUNCATE, MVT::v8i1, MVT::v4i64, 2 },
  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },

  // DQ+VL provides single-instruction i64 <-> fp on xmm/ymm.
  { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
  { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
  { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
  { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },

  { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
  { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
  { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
  { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },

  { ISD::FP_TO_SINT, MVT::v2i64, MVT::v4f32, 1 },
  { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, 1 },
  { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
  { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, 1 },

  { ISD::FP_TO_UINT, MVT::v2i64, MVT::v4f32, 1 },
  { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 },
  { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
  { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 },
};
2033 | ||||
// Cost table for conversions cheap with AVX512F + AVX512VL (AVX-512
// instructions on 128/256-bit vectors, no DQ/BW assumed). Entries are
// { opcode, dst type, src type, cost } in approximate instructions; per-entry
// comments name the expected instruction sequence.
static const TypeConversionCostTblEntry AVX512VLConversionTbl[] = {
  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 8 }, // split+2*v8i8
  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 8 }, // split+2*v8i16
  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // vpslld+vptestmd
  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // vpslld+vptestmd
  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // vpslld+vptestmd
  { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // vpsllq+vptestmq
  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // vpsllq+vptestmq
  { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // vpmovqd
  { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 }, // vpmovqb
  { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 }, // vpmovqw
  { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 }, // vpmovwb

  // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
  // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
  { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 5 },
  { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 6 },
  { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 5 },
  { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 6 },
  { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 5 },
  { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 6 },
  { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 10 },
  { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 12 },

  // sign extend is vpcmpeq+maskedmove+vpmovdw
  // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
  { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 4 },
  { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 5 },
  { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 4 },
  { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 5 },
  { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 4 },
  { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 5 },
  { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 10 },
  { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 12 },

  { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // vpternlogd
  { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // vpternlogd+psrld
  { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // vpternlogd
  { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // vpternlogd+psrld
  { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // vpternlogd
  { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // vpternlogd+psrld
  { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // vpternlogq
  { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // vpternlogq+psrlq
  { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // vpternlogq
  { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // vpternlogq+psrlq

  // Integer widening extends have single instructions.
  { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 1 },
  { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 1 },
  { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 1 },
  { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 1 },
  { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
  { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
  { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 1 },
  { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 1 },
  { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
  { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
  { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
  { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },

  { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
  { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 1 },
  { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
  { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 1 },

  { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 1 },
  { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 1 },
  { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
  { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 1 },
  { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
  { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 1 },
  { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
  { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
  { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
  { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
  { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 5 },
  { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
  { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 5 },

  { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, 2 },
  { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 2 },
  { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f32, 5 },

  { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 1 },
  { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 1 },
  { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
  { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 1 },
  { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 1 },
  { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 },
  { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 },
};
2130 | ||||
// Cost table for conversions when AVX2 is available (256-bit integer ops,
// Haswell baseline). Entries are { opcode, dst type, src type, cost } in
// approximate instructions.
static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
  // v1-mask extensions (no AVX-512 mask registers here; i1 vectors are
  // realized as vector compares/selects).
  { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
  { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
  { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
  { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
  { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
  { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 1 },

  { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 2 },
  { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 2 },
  { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 2 },
  { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 2 },
  { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
  { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
  { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 2 },
  { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 2 },
  { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
  { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
  { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
  { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
  { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
  { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },

  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },

  { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 4 },
  { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 4 },
  { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 1 },
  { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 1 },
  { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 1 },
  { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, 4 },
  { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, 4 },
  { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 1 },
  { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 1 },
  { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 5 },
  { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 },
  { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2 },

  { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 3 },
  { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 3 },

  { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, 1 },
  { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, 1 },
  { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, 1 },
  { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 3 },

  // No unsigned fp<->int instructions before AVX-512, hence the extra cost.
  { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 3 },
  { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 3 },
  { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, 1 },
  { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 3 },
  { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
  { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4 },
  { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 3 },
  { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, 4 },

  { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 2 },
  { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 2 },
  { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 2 },
  { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
  { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
  { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
  { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 3 },

  { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 2 },
  { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 2 },
  { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 2 },
  { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
  { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 },
  { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
  { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 2 },
  { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
  { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
  { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 4 },
};
2205 | ||||
// Cost table for conversions when only AVX1 is available (Sandy Bridge
// baseline: 256-bit fp, but integer ops are still 128-bit, so most 256-bit
// integer conversions split into two halves plus fixup shuffles). Entries
// are { opcode, dst type, src type, cost } in approximate instructions.
static const TypeConversionCostTblEntry AVXConversionTbl[] = {
  { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 6 },
  { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
  { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 },
  { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
  { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
  { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },

  { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 3 },
  { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 3 },
  { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 3 },
  { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 3 },
  { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
  { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
  { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 3 },
  { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 3 },
  { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
  { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
  { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 3 },
  { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 3 },

  { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 4 },
  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 5 },
  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 4 },
  { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 9 },
  { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, 11 },

  { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 },
  { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
  { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // and+extract+packuswb
  { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, 5 },
  { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
  { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, 5 },
  { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 3 }, // and+extract+2*packusdw
  { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 },

  { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
  { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 },
  { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 },
  { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 4 },
  { ISD::SINT_TO_FP, MVT::v4f64, MVT::v16i8, 2 },
  { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
  { ISD::SINT_TO_FP, MVT::v4f64, MVT::v8i16, 2 },
  { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
  { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
  { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 4 },
  { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 5 },
  { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 8 },

  // No unsigned fp<->int instructions before AVX-512, hence the extra cost.
  { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 7 },
  { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, 7 },
  { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, 6 },
  { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 4 },
  { ISD::UINT_TO_FP, MVT::v4f64, MVT::v16i8, 2 },
  { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
  { ISD::UINT_TO_FP, MVT::v4f64, MVT::v8i16, 2 },
  { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 4 },
  { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 4 },
  { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
  { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 },
  { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 },
  { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 10 },
  { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 10 },
  { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 18 },
  { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
  { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 10 },

  { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, 2 },
  { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f64, 2 },
  { ISD::FP_TO_SINT, MVT::v32i8, MVT::v8f32, 2 },
  { ISD::FP_TO_SINT, MVT::v32i8, MVT::v4f64, 2 },
  { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 2 },
  { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f64, 2 },
  { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, 2 },
  { ISD::FP_TO_SINT, MVT::v16i16, MVT::v4f64, 2 },
  { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, 2 },
  { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, 2 },
  { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 5 },

  { ISD::FP_TO_UINT, MVT::v16i8, MVT::v8f32, 2 },
  { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f64, 2 },
  { ISD::FP_TO_UINT, MVT::v32i8, MVT::v8f32, 2 },
  { ISD::FP_TO_UINT, MVT::v32i8, MVT::v4f64, 2 },
  { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 2 },
  { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f64, 2 },
  { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, 2 },
  { ISD::FP_TO_UINT, MVT::v16i16, MVT::v4f64, 2 },
  { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 3 },
  { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
  { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 6 },
  { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 7 },
  { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, 7 },

  { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 1 },
  { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 1 },
};
2302 | ||||
2303 | static const TypeConversionCostTblEntry SSE41ConversionTbl[] = { | |||
2304 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, 1 }, | |||
2305 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, 1 }, | |||
2306 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, 1 }, | |||
2307 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, 1 }, | |||
2308 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, 1 }, | |||
2309 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, 1 }, | |||
2310 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, 1 }, | |||
2311 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, 1 }, | |||
2312 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, 1 }, | |||
2313 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, 1 }, | |||
2314 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, 1 }, | |||
2315 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, 1 }, | |||
2316 | ||||
2317 | // These truncates end up widening elements. | |||
2318 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 1 }, // PMOVXZBQ | |||
2319 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 1 }, // PMOVXZWQ | |||
2320 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 1 }, // PMOVXZBD | |||
2321 | ||||
2322 | { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 2 }, | |||
2323 | { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 2 }, | |||
2324 | { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 2 }, | |||
2325 | ||||
2326 | { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 1 }, | |||
2327 | { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 1 }, | |||
2328 | { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 1 }, | |||
2329 | { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 1 }, | |||
2330 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 1 }, | |||
2331 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 }, | |||
2332 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 1 }, | |||
2333 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 }, | |||
2334 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, | |||
2335 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 1 }, | |||
2336 | { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 }, | |||
2337 | ||||
2338 | { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 1 }, | |||
2339 | { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 1 }, | |||
2340 | { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 4 }, | |||
2341 | { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 4 }, | |||
2342 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 1 }, | |||
2343 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 }, | |||
2344 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 1 }, | |||
2345 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 }, | |||
2346 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 3 }, | |||
2347 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 3 }, | |||
2348 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 2 }, | |||
2349 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 12 }, | |||
2350 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 22 }, | |||
2351 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 4 }, | |||
2352 | ||||
2353 | { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 1 }, | |||
2354 | { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 1 }, | |||
2355 | { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 1 }, | |||
2356 | { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 1 }, | |||
2357 | { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, 2 }, | |||
2358 | { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, 2 }, | |||
2359 | { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, 1 }, | |||
2360 | { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, 1 }, | |||
2361 | { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 }, | |||
2362 | { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, 1 }, | |||
2363 | ||||
2364 | { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 1 }, | |||
2365 | { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 }, | |||
2366 | { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 1 }, | |||
2367 | { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 4 }, | |||
2368 | { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, 2 }, | |||
2369 | { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, 2 }, | |||
2370 | { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, 1 }, | |||
2371 | { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, 1 }, | |||
2372 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 4 }, | |||
2373 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 }, | |||
2374 | }; | |||
2375 | ||||
2376 | static const TypeConversionCostTblEntry SSE2ConversionTbl[] = { | |||
2377 | // These are somewhat magic numbers justified by comparing the | |||
2378 | // output of llvm-mca for our various supported scheduler models | |||
2379 | // and basing it off the worst case scenario. | |||
2380 | { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 3 }, | |||
2381 | { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 3 }, | |||
2382 | { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 3 }, | |||
2383 | { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 3 }, | |||
2384 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 3 }, | |||
2385 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 4 }, | |||
2386 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 3 }, | |||
2387 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 4 }, | |||
2388 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 3 }, | |||
2389 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4 }, | |||
2390 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 8 }, | |||
2391 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 8 }, | |||
2392 | ||||
2393 | { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 3 }, | |||
2394 | { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 3 }, | |||
2395 | { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 8 }, | |||
2396 | { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 9 }, | |||
2397 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 4 }, | |||
2398 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 4 }, | |||
2399 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 4 }, | |||
2400 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 4 }, | |||
2401 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 7 }, | |||
2402 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 7 }, | |||
2403 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 }, | |||
2404 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 15 }, | |||
2405 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 18 }, | |||
2406 | ||||
2407 | { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 4 }, | |||
2408 | { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 4 }, | |||
2409 | { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 4 }, | |||
2410 | { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 4 }, | |||
2411 | { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, 6 }, | |||
2412 | { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, 6 }, | |||
2413 | { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, 5 }, | |||
2414 | { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, 5 }, | |||
2415 | { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 4 }, | |||
2416 | { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, 4 }, | |||
2417 | ||||
2418 | { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 4 }, | |||
2419 | { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 }, | |||
2420 | { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 4 }, | |||
2421 | { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 15 }, | |||
2422 | { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, 6 }, | |||
2423 | { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, 6 }, | |||
2424 | { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, 5 }, | |||
2425 | { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, 5 }, | |||
2426 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 8 }, | |||
2427 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 8 }, | |||
2428 | ||||
2429 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, 4 }, | |||
2430 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, 4 }, | |||
2431 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, 2 }, | |||
2432 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, 3 }, | |||
2433 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, 1 }, | |||
2434 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, 2 }, | |||
2435 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, 2 }, | |||
2436 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, 3 }, | |||
2437 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, 1 }, | |||
2438 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, 2 }, | |||
2439 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, 1 }, | |||
2440 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, 2 }, | |||
2441 | ||||
2442 | // These truncates are really widening elements. | |||
2443 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 1 }, // PSHUFD | |||
2444 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // PUNPCKLWD+DQ | |||
2445 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // PUNPCKLBW+WD+PSHUFD | |||
2446 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 1 }, // PUNPCKLWD | |||
2447 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // PUNPCKLBW+WD | |||
2448 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 1 }, // PUNPCKLBW | |||
2449 | ||||
2450 | { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 2 }, // PAND+PACKUSWB | |||
2451 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, | |||
2452 | { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 3 }, // PAND+2*PACKUSWB | |||
2453 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 }, | |||
2454 | { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 1 }, | |||
2455 | { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 3 }, | |||
2456 | { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 }, | |||
2457 | { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32,10 }, | |||
2458 | { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 4 }, // PAND+3*PACKUSWB | |||
2459 | { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 2 }, // PSHUFD+PSHUFLW | |||
2460 | { ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, 1 }, // PSHUFD | |||
2461 | }; | |||
2462 | ||||
2463 | // Attempt to map directly to (simple) MVT types to let us match custom entries. | |||
2464 | EVT SrcTy = TLI->getValueType(DL, Src); | |||
2465 | EVT DstTy = TLI->getValueType(DL, Dst); | |||
2466 | ||||
2467 | // The function getSimpleVT only handles simple value types. | |||
2468 | if (SrcTy.isSimple() && DstTy.isSimple()) { | |||
2469 | MVT SimpleSrcTy = SrcTy.getSimpleVT(); | |||
2470 | MVT SimpleDstTy = DstTy.getSimpleVT(); | |||
2471 | ||||
2472 | if (ST->useAVX512Regs()) { | |||
2473 | if (ST->hasBWI()) | |||
2474 | if (const auto *Entry = ConvertCostTableLookup( | |||
2475 | AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) | |||
2476 | return AdjustCost(Entry->Cost); | |||
2477 | ||||
2478 | if (ST->hasDQI()) | |||
2479 | if (const auto *Entry = ConvertCostTableLookup( | |||
2480 | AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) | |||
2481 | return AdjustCost(Entry->Cost); | |||
2482 | ||||
2483 | if (ST->hasAVX512()) | |||
2484 | if (const auto *Entry = ConvertCostTableLookup( | |||
2485 | AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) | |||
2486 | return AdjustCost(Entry->Cost); | |||
2487 | } | |||
2488 | ||||
2489 | if (ST->hasBWI()) | |||
2490 | if (const auto *Entry = ConvertCostTableLookup( | |||
2491 | AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) | |||
2492 | return AdjustCost(Entry->Cost); | |||
2493 | ||||
2494 | if (ST->hasDQI()) | |||
2495 | if (const auto *Entry = ConvertCostTableLookup( | |||
2496 | AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) | |||
2497 | return AdjustCost(Entry->Cost); | |||
2498 | ||||
2499 | if (ST->hasAVX512()) | |||
2500 | if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD, | |||
2501 | SimpleDstTy, SimpleSrcTy)) | |||
2502 | return AdjustCost(Entry->Cost); | |||
2503 | ||||
2504 | if (ST->hasAVX2()) { | |||
2505 | if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD, | |||
2506 | SimpleDstTy, SimpleSrcTy)) | |||
2507 | return AdjustCost(Entry->Cost); | |||
2508 | } | |||
2509 | ||||
2510 | if (ST->hasAVX()) { | |||
2511 | if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD, | |||
2512 | SimpleDstTy, SimpleSrcTy)) | |||
2513 | return AdjustCost(Entry->Cost); | |||
2514 | } | |||
2515 | ||||
2516 | if (ST->hasSSE41()) { | |||
2517 | if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD, | |||
2518 | SimpleDstTy, SimpleSrcTy)) | |||
2519 | return AdjustCost(Entry->Cost); | |||
2520 | } | |||
2521 | ||||
2522 | if (ST->hasSSE2()) { | |||
2523 | if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, | |||
2524 | SimpleDstTy, SimpleSrcTy)) | |||
2525 | return AdjustCost(Entry->Cost); | |||
2526 | } | |||
2527 | } | |||
2528 | ||||
2529 | // Fall back to legalized types. | |||
2530 | std::pair<InstructionCost, MVT> LTSrc = getTypeLegalizationCost(Src); | |||
2531 | std::pair<InstructionCost, MVT> LTDest = getTypeLegalizationCost(Dst); | |||
2532 | ||||
2533 | // If we're truncating to the same legalized type - just assume its free. | |||
2534 | if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second) | |||
2535 | return TTI::TCC_Free; | |||
2536 | ||||
2537 | if (ST->useAVX512Regs()) { | |||
2538 | if (ST->hasBWI()) | |||
2539 | if (const auto *Entry = ConvertCostTableLookup( | |||
2540 | AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second)) | |||
2541 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | |||
2542 | ||||
2543 | if (ST->hasDQI()) | |||
2544 | if (const auto *Entry = ConvertCostTableLookup( | |||
2545 | AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second)) | |||
2546 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | |||
2547 | ||||
2548 | if (ST->hasAVX512()) | |||
2549 | if (const auto *Entry = ConvertCostTableLookup( | |||
2550 | AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second)) | |||
2551 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | |||
2552 | } | |||
2553 | ||||
2554 | if (ST->hasBWI()) | |||
2555 | if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD, | |||
2556 | LTDest.second, LTSrc.second)) | |||
2557 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | |||
2558 | ||||
2559 | if (ST->hasDQI()) | |||
2560 | if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD, | |||
2561 | LTDest.second, LTSrc.second)) | |||
2562 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | |||
2563 | ||||
2564 | if (ST->hasAVX512()) | |||
2565 | if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD, | |||
2566 | LTDest.second, LTSrc.second)) | |||
2567 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | |||
2568 | ||||
2569 | if (ST->hasAVX2()) | |||
2570 | if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD, | |||
2571 | LTDest.second, LTSrc.second)) | |||
2572 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | |||
2573 | ||||
2574 | if (ST->hasAVX()) | |||
2575 | if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD, | |||
2576 | LTDest.second, LTSrc.second)) | |||
2577 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | |||
2578 | ||||
2579 | if (ST->hasSSE41()) | |||
2580 | if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD, | |||
2581 | LTDest.second, LTSrc.second)) | |||
2582 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | |||
2583 | ||||
2584 | if (ST->hasSSE2()) | |||
2585 | if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, | |||
2586 | LTDest.second, LTSrc.second)) | |||
2587 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | |||
2588 | ||||
2589 | // Fallback, for i8/i16 sitofp/uitofp cases we need to extend to i32 for | |||
2590 | // sitofp. | |||
2591 | if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) && | |||
2592 | 1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) { | |||
2593 | Type *ExtSrc = Src->getWithNewBitWidth(32); | |||
2594 | unsigned ExtOpc = | |||
2595 | (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt; | |||
2596 | ||||
2597 | // For scalar loads the extend would be free. | |||
2598 | InstructionCost ExtCost = 0; | |||
2599 | if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0)))) | |||
2600 | ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind); | |||
2601 | ||||
2602 | return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc, | |||
2603 | TTI::CastContextHint::None, CostKind); | |||
2604 | } | |||
2605 | ||||
2606 | // Fallback for fptosi/fptoui i8/i16 cases we need to truncate from fptosi | |||
2607 | // i32. | |||
2608 | if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) && | |||
2609 | 1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) { | |||
2610 | Type *TruncDst = Dst->getWithNewBitWidth(32); | |||
2611 | return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) + | |||
2612 | getCastInstrCost(Instruction::Trunc, Dst, TruncDst, | |||
2613 | TTI::CastContextHint::None, CostKind); | |||
2614 | } | |||
2615 | ||||
2616 | return AdjustCost( | |||
2617 | BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); | |||
2618 | } | |||
2619 | ||||
// Return the cost of a vector/scalar compare (ICmp/FCmp) or select for X86.
// The cost is looked up in per-feature-level tables (most capable ISA first);
// predicates that have no direct x86 vector compare encoding pay an extra
// per-legalized-vector cost on top of the table entry. Falls back to the
// target-independent implementation when no table entry matches.
InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                               Type *CondTy,
                                               CmpInst::Predicate VecPred,
                                               TTI::TargetCostKind CostKind,
                                               const Instruction *I) {
  // Assume a 3cy latency for fp select ops.
  if (CostKind == TTI::TCK_Latency && Opcode == Instruction::Select)
    if (ValTy->getScalarType()->isFloatingPointTy())
      return 3;

  // TODO: Handle other cost kinds. Only recip-throughput is specialized below.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                     I);

  // Legalize the type; LT.first is the split count, LT.second the legal MVT.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);

  MVT MTy = LT.second;

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // Extra instructions needed to synthesize predicates that x86 vector
  // compares cannot express directly (only EQ/GT exist pre-AVX512).
  InstructionCost ExtraCost = 0;
  if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
    // Some vector comparison predicates cost extra instructions.
    // XOP (for 128-bit, or any width without AVX2), AVX512 with >=32-bit
    // elements, and BWI all provide full predicate support, so no extra cost.
    // TODO: Should we invert this and assume worst case cmp costs
    // and reduce for particular predicates?
    if (MTy.isVector() &&
        !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
          (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
          ST->hasBWI())) {
      // Fallback to I if a specific predicate wasn't specified.
      CmpInst::Predicate Pred = VecPred;
      if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE ||
                Pred == CmpInst::BAD_FCMP_PREDICATE))
        Pred = cast<CmpInst>(I)->getPredicate();

      switch (Pred) {
      case CmpInst::Predicate::ICMP_NE:
        // xor(cmpeq(x,y),-1)
        ExtraCost = 1;
        break;
      case CmpInst::Predicate::ICMP_SGE:
      case CmpInst::Predicate::ICMP_SLE:
        // xor(cmpgt(x,y),-1)
        ExtraCost = 1;
        break;
      case CmpInst::Predicate::ICMP_ULT:
      case CmpInst::Predicate::ICMP_UGT:
        // cmpgt(xor(x,signbit),xor(y,signbit))
        // xor(cmpeq(pmaxu(x,y),x),-1)
        ExtraCost = 2;
        break;
      case CmpInst::Predicate::ICMP_ULE:
      case CmpInst::Predicate::ICMP_UGE:
        if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
            (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
          // cmpeq(psubus(x,y),0)
          // cmpeq(pminu(x,y),x)
          ExtraCost = 1;
        } else {
          // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
          ExtraCost = 3;
        }
        break;
      case CmpInst::Predicate::BAD_ICMP_PREDICATE:
      case CmpInst::Predicate::BAD_FCMP_PREDICATE:
        // Assume worst case scenario and add the maximum extra cost.
        ExtraCost = 3;
        break;
      default:
        break;
      }
    }
  }

  static const CostTblEntry SLMCostTbl[] = {
    // slm pcmpeq/pcmpgt throughput is 2
    { ISD::SETCC,   MVT::v2i64,   2 },
    // slm pblendvb/blendvpd/blendvps throughput is 4
    { ISD::SELECT,  MVT::v2f64,   4 }, // vblendvpd
    { ISD::SELECT,  MVT::v4f32,   4 }, // vblendvps
    { ISD::SELECT,  MVT::v2i64,   4 }, // pblendvb
    { ISD::SELECT,  MVT::v8i32,   4 }, // pblendvb
    { ISD::SELECT,  MVT::v8i16,   4 }, // pblendvb
    { ISD::SELECT,  MVT::v16i8,   4 }, // pblendvb
  };

  static const CostTblEntry AVX512BWCostTbl[] = {
    { ISD::SETCC,   MVT::v32i16,  1 },
    { ISD::SETCC,   MVT::v64i8,   1 },

    { ISD::SELECT,  MVT::v32i16,  1 },
    { ISD::SELECT,  MVT::v64i8,   1 },
  };

  static const CostTblEntry AVX512CostTbl[] = {
    { ISD::SETCC,   MVT::v8i64,   1 },
    { ISD::SETCC,   MVT::v16i32,  1 },
    { ISD::SETCC,   MVT::v8f64,   1 },
    { ISD::SETCC,   MVT::v16f32,  1 },

    { ISD::SELECT,  MVT::v8i64,   1 },
    { ISD::SELECT,  MVT::v4i64,   1 },
    { ISD::SELECT,  MVT::v2i64,   1 },
    { ISD::SELECT,  MVT::v16i32,  1 },
    { ISD::SELECT,  MVT::v8i32,   1 },
    { ISD::SELECT,  MVT::v4i32,   1 },
    { ISD::SELECT,  MVT::v8f64,   1 },
    { ISD::SELECT,  MVT::v4f64,   1 },
    { ISD::SELECT,  MVT::v2f64,   1 },
    { ISD::SELECT,  MVT::f64,     1 },
    { ISD::SELECT,  MVT::v16f32,  1 },
    { ISD::SELECT,  MVT::v8f32 ,  1 },
    { ISD::SELECT,  MVT::v4f32,   1 },
    { ISD::SELECT,  MVT::f32  ,   1 },

    { ISD::SETCC,   MVT::v32i16,  2 }, // FIXME: should probably be 4
    { ISD::SETCC,   MVT::v64i8,   2 }, // FIXME: should probably be 4

    { ISD::SELECT,  MVT::v32i16,  2 },
    { ISD::SELECT,  MVT::v16i16,  1 },
    { ISD::SELECT,  MVT::v8i16,   1 },
    { ISD::SELECT,  MVT::v64i8,   2 },
    { ISD::SELECT,  MVT::v32i8,   1 },
    { ISD::SELECT,  MVT::v16i8,   1 },
  };

  static const CostTblEntry AVX2CostTbl[] = {
    { ISD::SETCC,   MVT::v4i64,   1 },
    { ISD::SETCC,   MVT::v8i32,   1 },
    { ISD::SETCC,   MVT::v16i16,  1 },
    { ISD::SETCC,   MVT::v32i8,   1 },

    { ISD::SELECT,  MVT::v4f64,   2 }, // vblendvpd
    { ISD::SELECT,  MVT::v8f32,   2 }, // vblendvps
    { ISD::SELECT,  MVT::v4i64,   2 }, // pblendvb
    { ISD::SELECT,  MVT::v8i32,   2 }, // pblendvb
    { ISD::SELECT,  MVT::v16i16,  2 }, // pblendvb
    { ISD::SELECT,  MVT::v32i8,   2 }, // pblendvb
  };

  static const CostTblEntry AVX1CostTbl[] = {
    { ISD::SETCC,   MVT::v4f64,   1 },
    { ISD::SETCC,   MVT::v8f32,   1 },
    // AVX1 does not support 8-wide integer compare.
    { ISD::SETCC,   MVT::v4i64,   4 },
    { ISD::SETCC,   MVT::v8i32,   4 },
    { ISD::SETCC,   MVT::v16i16,  4 },
    { ISD::SETCC,   MVT::v32i8,   4 },

    { ISD::SELECT,  MVT::v4f64,   3 }, // vblendvpd
    { ISD::SELECT,  MVT::v8f32,   3 }, // vblendvps
    { ISD::SELECT,  MVT::v4i64,   3 }, // vblendvpd
    { ISD::SELECT,  MVT::v8i32,   3 }, // vblendvps
    { ISD::SELECT,  MVT::v16i16,  3 }, // vandps + vandnps + vorps
    { ISD::SELECT,  MVT::v32i8,   3 }, // vandps + vandnps + vorps
  };

  static const CostTblEntry SSE42CostTbl[] = {
    { ISD::SETCC,   MVT::v2i64,   1 },
  };

  static const CostTblEntry SSE41CostTbl[] = {
    { ISD::SETCC,   MVT::v2f64,   1 },
    { ISD::SETCC,   MVT::v4f32,   1 },

    { ISD::SELECT,  MVT::v2f64,   2 }, // blendvpd
    { ISD::SELECT,  MVT::f64,     2 }, // blendvpd
    { ISD::SELECT,  MVT::v4f32,   2 }, // blendvps
    { ISD::SELECT,  MVT::f32  ,   2 }, // blendvps
    { ISD::SELECT,  MVT::v2i64,   2 }, // pblendvb
    { ISD::SELECT,  MVT::v4i32,   2 }, // pblendvb
    { ISD::SELECT,  MVT::v8i16,   2 }, // pblendvb
    { ISD::SELECT,  MVT::v16i8,   2 }, // pblendvb
  };

  static const CostTblEntry SSE2CostTbl[] = {
    { ISD::SETCC,   MVT::v2f64,   2 },
    { ISD::SETCC,   MVT::f64,     1 },
    { ISD::SETCC,   MVT::v2i64,   5 }, // pcmpeqd/pcmpgtd expansion
    { ISD::SETCC,   MVT::v4i32,   1 },
    { ISD::SETCC,   MVT::v8i16,   1 },
    { ISD::SETCC,   MVT::v16i8,   1 },

    { ISD::SELECT,  MVT::v2f64,   2 }, // andpd + andnpd + orpd
    { ISD::SELECT,  MVT::f64,     2 }, // andpd + andnpd + orpd
    { ISD::SELECT,  MVT::v2i64,   2 }, // pand + pandn + por
    { ISD::SELECT,  MVT::v4i32,   2 }, // pand + pandn + por
    { ISD::SELECT,  MVT::v8i16,   2 }, // pand + pandn + por
    { ISD::SELECT,  MVT::v16i8,   2 }, // pand + pandn + por
  };

  static const CostTblEntry SSE1CostTbl[] = {
    { ISD::SETCC,   MVT::v4f32,   2 },
    { ISD::SETCC,   MVT::f32,     1 },

    { ISD::SELECT,  MVT::v4f32,   2 }, // andps + andnps + orps
    { ISD::SELECT,  MVT::f32,     2 }, // andps + andnps + orps
  };

  // Query the tables from the most specific/capable feature level downwards;
  // the first matching entry wins. The legalization split count (LT.first)
  // scales the per-legal-vector cost plus any predicate ExtraCost.
  if (ST->useSLMArithCosts())
    if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
      return LT.first * (ExtraCost + Entry->Cost);

  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
      return LT.first * (ExtraCost + Entry->Cost);

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
      return LT.first * (ExtraCost + Entry->Cost);

  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
      return LT.first * (ExtraCost + Entry->Cost);

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
      return LT.first * (ExtraCost + Entry->Cost);

  if (ST->hasSSE42())
    if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
      return LT.first * (ExtraCost + Entry->Cost);

  if (ST->hasSSE41())
    if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
      return LT.first * (ExtraCost + Entry->Cost);

  if (ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
      return LT.first * (ExtraCost + Entry->Cost);

  if (ST->hasSSE1())
    if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
      return LT.first * (ExtraCost + Entry->Cost);

  // No table entry matched - defer to the target-independent cost model.
  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
}
2860 | ||||
// Maximum element size (in bytes) the target supports for element-wise atomic
// memory intrinsics. 16 presumably corresponds to the 128-bit XMM access
// width -- TODO(review): confirm against the lowering code.
unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; }
2862 | ||||
2863 | InstructionCost | |||
2864 | X86TTIImpl::getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, | |||
2865 | TTI::TargetCostKind CostKind) { | |||
2866 | ||||
2867 | // Costs should match the codegen from: | |||
2868 | // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll | |||
2869 | // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll | |||
2870 | // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll | |||
2871 | // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll | |||
2872 | // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll | |||
2873 | ||||
2874 | // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not | |||
2875 | // specialized in these tables yet. | |||
2876 | static const CostTblEntry AVX512BITALGCostTbl[] = { | |||
2877 | { ISD::CTPOP, MVT::v32i16, 1 }, | |||
2878 | { ISD::CTPOP, MVT::v64i8, 1 }, | |||
2879 | { ISD::CTPOP, MVT::v16i16, 1 }, | |||
2880 | { ISD::CTPOP, MVT::v32i8, 1 }, | |||
2881 | { ISD::CTPOP, MVT::v8i16, 1 }, | |||
2882 | { ISD::CTPOP, MVT::v16i8, 1 }, | |||
2883 | }; | |||
2884 | static const CostTblEntry AVX512VPOPCNTDQCostTbl[] = { | |||
2885 | { ISD::CTPOP, MVT::v8i64, 1 }, | |||
2886 | { ISD::CTPOP, MVT::v16i32, 1 }, | |||
2887 | { ISD::CTPOP, MVT::v4i64, 1 }, | |||
2888 | { ISD::CTPOP, MVT::v8i32, 1 }, | |||
2889 | { ISD::CTPOP, MVT::v2i64, 1 }, | |||
2890 | { ISD::CTPOP, MVT::v4i32, 1 }, | |||
2891 | }; | |||
2892 | static const CostTblEntry AVX512CDCostTbl[] = { | |||
2893 | { ISD::CTLZ, MVT::v8i64, 1 }, | |||
2894 | { ISD::CTLZ, MVT::v16i32, 1 }, | |||
2895 | { ISD::CTLZ, MVT::v32i16, 8 }, | |||
2896 | { ISD::CTLZ, MVT::v64i8, 20 }, | |||
2897 | { ISD::CTLZ, MVT::v4i64, 1 }, | |||
2898 | { ISD::CTLZ, MVT::v8i32, 1 }, | |||
2899 | { ISD::CTLZ, MVT::v16i16, 4 }, | |||
2900 | { ISD::CTLZ, MVT::v32i8, 10 }, | |||
2901 | { ISD::CTLZ, MVT::v2i64, 1 }, | |||
2902 | { ISD::CTLZ, MVT::v4i32, 1 }, | |||
2903 | { ISD::CTLZ, MVT::v8i16, 4 }, | |||
2904 | { ISD::CTLZ, MVT::v16i8, 4 }, | |||
2905 | }; | |||
2906 | static const CostTblEntry AVX512BWCostTbl[] = { | |||
2907 | { ISD::ABS, MVT::v32i16, 1 }, | |||
2908 | { ISD::ABS, MVT::v64i8, 1 }, | |||
2909 | { ISD::BITREVERSE, MVT::v8i64, 3 }, | |||
2910 | { ISD::BITREVERSE, MVT::v16i32, 3 }, | |||
2911 | { ISD::BITREVERSE, MVT::v32i16, 3 }, | |||
2912 | { ISD::BITREVERSE, MVT::v64i8, 2 }, | |||
2913 | { ISD::BSWAP, MVT::v8i64, 1 }, | |||
2914 | { ISD::BSWAP, MVT::v16i32, 1 }, | |||
2915 | { ISD::BSWAP, MVT::v32i16, 1 }, | |||
2916 | { ISD::CTLZ, MVT::v8i64, 23 }, | |||
2917 | { ISD::CTLZ, MVT::v16i32, 22 }, | |||
2918 | { ISD::CTLZ, MVT::v32i16, 18 }, | |||
2919 | { ISD::CTLZ, MVT::v64i8, 17 }, | |||
2920 | { ISD::CTPOP, MVT::v8i64, 7 }, | |||
2921 | { ISD::CTPOP, MVT::v16i32, 11 }, | |||
2922 | { ISD::CTPOP, MVT::v32i16, 9 }, | |||
2923 | { ISD::CTPOP, MVT::v64i8, 6 }, | |||
2924 | { ISD::CTTZ, MVT::v8i64, 10 }, | |||
2925 | { ISD::CTTZ, MVT::v16i32, 14 }, | |||
2926 | { ISD::CTTZ, MVT::v32i16, 12 }, | |||
2927 | { ISD::CTTZ, MVT::v64i8, 9 }, | |||
2928 | { ISD::SADDSAT, MVT::v32i16, 1 }, | |||
2929 | { ISD::SADDSAT, MVT::v64i8, 1 }, | |||
2930 | { ISD::SMAX, MVT::v32i16, 1 }, | |||
2931 | { ISD::SMAX, MVT::v64i8, 1 }, | |||
2932 | { ISD::SMIN, MVT::v32i16, 1 }, | |||
2933 | { ISD::SMIN, MVT::v64i8, 1 }, | |||
2934 | { ISD::SSUBSAT, MVT::v32i16, 1 }, | |||
2935 | { ISD::SSUBSAT, MVT::v64i8, 1 }, | |||
2936 | { ISD::UADDSAT, MVT::v32i16, 1 }, | |||
2937 | { ISD::UADDSAT, MVT::v64i8, 1 }, | |||
2938 | { ISD::UMAX, MVT::v32i16, 1 }, | |||
2939 | { ISD::UMAX, MVT::v64i8, 1 }, | |||
2940 | { ISD::UMIN, MVT::v32i16, 1 }, | |||
2941 | { ISD::UMIN, MVT::v64i8, 1 }, | |||
2942 | { ISD::USUBSAT, MVT::v32i16, 1 }, | |||
2943 | { ISD::USUBSAT, MVT::v64i8, 1 }, | |||
2944 | }; | |||
2945 | static const CostTblEntry AVX512CostTbl[] = { | |||
2946 | { ISD::ABS, MVT::v8i64, 1 }, | |||
2947 | { ISD::ABS, MVT::v16i32, 1 }, | |||
2948 | { ISD::ABS, MVT::v32i16, 2 }, | |||
2949 | { ISD::ABS, MVT::v64i8, 2 }, | |||
2950 | { ISD::ABS, MVT::v4i64, 1 }, | |||
2951 | { ISD::ABS, MVT::v2i64, 1 }, | |||
2952 | { ISD::BITREVERSE, MVT::v8i64, 36 }, | |||
2953 | { ISD::BITREVERSE, MVT::v16i32, 24 }, | |||
2954 | { ISD::BITREVERSE, MVT::v32i16, 10 }, | |||
2955 | { ISD::BITREVERSE, MVT::v64i8, 10 }, | |||
2956 | { ISD::BSWAP, MVT::v8i64, 4 }, | |||
2957 | { ISD::BSWAP, MVT::v16i32, 4 }, | |||
2958 | { ISD::BSWAP, MVT::v32i16, 4 }, | |||
2959 | { ISD::CTLZ, MVT::v8i64, 29 }, | |||
2960 | { ISD::CTLZ, MVT::v16i32, 35 }, | |||
2961 | { ISD::CTLZ, MVT::v32i16, 28 }, | |||
2962 | { ISD::CTLZ, MVT::v64i8, 18 }, | |||
2963 | { ISD::CTPOP, MVT::v8i64, 16 }, | |||
2964 | { ISD::CTPOP, MVT::v16i32, 24 }, | |||
2965 | { ISD::CTPOP, MVT::v32i16, 18 }, | |||
2966 | { ISD::CTPOP, MVT::v64i8, 12 }, | |||
2967 | { ISD::CTTZ, MVT::v8i64, 20 }, | |||
2968 | { ISD::CTTZ, MVT::v16i32, 28 }, | |||
2969 | { ISD::CTTZ, MVT::v32i16, 24 }, | |||
2970 | { ISD::CTTZ, MVT::v64i8, 18 }, | |||
2971 | { ISD::SMAX, MVT::v8i64, 1 }, | |||
2972 | { ISD::SMAX, MVT::v16i32, 1 }, | |||
2973 | { ISD::SMAX, MVT::v32i16, 2 }, | |||
2974 | { ISD::SMAX, MVT::v64i8, 2 }, | |||
2975 | { ISD::SMAX, MVT::v4i64, 1 }, | |||
2976 | { ISD::SMAX, MVT::v2i64, 1 }, | |||
2977 | { ISD::SMIN, MVT::v8i64, 1 }, | |||
2978 | { ISD::SMIN, MVT::v16i32, 1 }, | |||
2979 | { ISD::SMIN, MVT::v32i16, 2 }, | |||
2980 | { ISD::SMIN, MVT::v64i8, 2 }, | |||
2981 | { ISD::SMIN, MVT::v4i64, 1 }, | |||
2982 | { ISD::SMIN, MVT::v2i64, 1 }, | |||
2983 | { ISD::UMAX, MVT::v8i64, 1 }, | |||
2984 | { ISD::UMAX, MVT::v16i32, 1 }, | |||
2985 | { ISD::UMAX, MVT::v32i16, 2 }, | |||
2986 | { ISD::UMAX, MVT::v64i8, 2 }, | |||
2987 | { ISD::UMAX, MVT::v4i64, 1 }, | |||
2988 | { ISD::UMAX, MVT::v2i64, 1 }, | |||
2989 | { ISD::UMIN, MVT::v8i64, 1 }, | |||
2990 | { ISD::UMIN, MVT::v16i32, 1 }, | |||
2991 | { ISD::UMIN, MVT::v32i16, 2 }, | |||
2992 | { ISD::UMIN, MVT::v64i8, 2 }, | |||
2993 | { ISD::UMIN, MVT::v4i64, 1 }, | |||
2994 | { ISD::UMIN, MVT::v2i64, 1 }, | |||
2995 | { ISD::USUBSAT, MVT::v16i32, 2 }, // pmaxud + psubd | |||
2996 | { ISD::USUBSAT, MVT::v2i64, 2 }, // pmaxuq + psubq | |||
2997 | { ISD::USUBSAT, MVT::v4i64, 2 }, // pmaxuq + psubq | |||
2998 | { ISD::USUBSAT, MVT::v8i64, 2 }, // pmaxuq + psubq | |||
2999 | { ISD::UADDSAT, MVT::v16i32, 3 }, // not + pminud + paddd | |||
3000 | { ISD::UADDSAT, MVT::v2i64, 3 }, // not + pminuq + paddq | |||
3001 | { ISD::UADDSAT, MVT::v4i64, 3 }, // not + pminuq + paddq | |||
3002 | { ISD::UADDSAT, MVT::v8i64, 3 }, // not + pminuq + paddq | |||
3003 | { ISD::SADDSAT, MVT::v32i16, 2 }, | |||
3004 | { ISD::SADDSAT, MVT::v64i8, 2 }, | |||
3005 | { ISD::SSUBSAT, MVT::v32i16, 2 }, | |||
3006 | { ISD::SSUBSAT, MVT::v64i8, 2 }, | |||
3007 | { ISD::UADDSAT, MVT::v32i16, 2 }, | |||
3008 | { ISD::UADDSAT, MVT::v64i8, 2 }, | |||
3009 | { ISD::USUBSAT, MVT::v32i16, 2 }, | |||
3010 | { ISD::USUBSAT, MVT::v64i8, 2 }, | |||
3011 | { ISD::FMAXNUM, MVT::f32, 2 }, | |||
3012 | { ISD::FMAXNUM, MVT::v4f32, 2 }, | |||
3013 | { ISD::FMAXNUM, MVT::v8f32, 2 }, | |||
3014 | { ISD::FMAXNUM, MVT::v16f32, 2 }, | |||
3015 | { ISD::FMAXNUM, MVT::f64, 2 }, | |||
3016 | { ISD::FMAXNUM, MVT::v2f64, 2 }, | |||
3017 | { ISD::FMAXNUM, MVT::v4f64, 2 }, | |||
3018 | { ISD::FMAXNUM, MVT::v8f64, 2 }, | |||
3019 | }; | |||
3020 | static const CostTblEntry XOPCostTbl[] = { | |||
3021 | { ISD::BITREVERSE, MVT::v4i64, 4 }, | |||
3022 | { ISD::BITREVERSE, MVT::v8i32, 4 }, | |||
3023 | { ISD::BITREVERSE, MVT::v16i16, 4 }, | |||
3024 | { ISD::BITREVERSE, MVT::v32i8, 4 }, | |||
3025 | { ISD::BITREVERSE, MVT::v2i64, 1 }, | |||
3026 | { ISD::BITREVERSE, MVT::v4i32, 1 }, | |||
3027 | { ISD::BITREVERSE, MVT::v8i16, 1 }, | |||
3028 | { ISD::BITREVERSE, MVT::v16i8, 1 }, | |||
3029 | { ISD::BITREVERSE, MVT::i64, 3 }, | |||
3030 | { ISD::BITREVERSE, MVT::i32, 3 }, | |||
3031 | { ISD::BITREVERSE, MVT::i16, 3 }, | |||
3032 | { ISD::BITREVERSE, MVT::i8, 3 } | |||
3033 | }; | |||
3034 | static const CostTblEntry AVX2CostTbl[] = { | |||
3035 | { ISD::ABS, MVT::v4i64, 2 }, // VBLENDVPD(X,VPSUBQ(0,X),X) | |||
3036 | { ISD::ABS, MVT::v8i32, 1 }, | |||
3037 | { ISD::ABS, MVT::v16i16, 1 }, | |||
3038 | { ISD::ABS, MVT::v32i8, 1 }, | |||
3039 | { ISD::BITREVERSE, MVT::v2i64, 3 }, | |||
3040 | { ISD::BITREVERSE, MVT::v4i64, 3 }, | |||
3041 | { ISD::BITREVERSE, MVT::v4i32, 3 }, | |||
3042 | { ISD::BITREVERSE, MVT::v8i32, 3 }, | |||
3043 | { ISD::BITREVERSE, MVT::v8i16, 3 }, | |||
3044 | { ISD::BITREVERSE, MVT::v16i16, 3 }, | |||
3045 | { ISD::BITREVERSE, MVT::v16i8, 3 }, | |||
3046 | { ISD::BITREVERSE, MVT::v32i8, 3 }, | |||
3047 | { ISD::BSWAP, MVT::v4i64, 1 }, | |||
3048 | { ISD::BSWAP, MVT::v8i32, 1 }, | |||
3049 | { ISD::BSWAP, MVT::v16i16, 1 }, | |||
3050 | { ISD::CTLZ, MVT::v2i64, 7 }, | |||
3051 | { ISD::CTLZ, MVT::v4i64, 7 }, | |||
3052 | { ISD::CTLZ, MVT::v4i32, 5 }, | |||
3053 | { ISD::CTLZ, MVT::v8i32, 5 }, | |||
3054 | { ISD::CTLZ, MVT::v8i16, 4 }, | |||
3055 | { ISD::CTLZ, MVT::v16i16, 4 }, | |||
3056 | { ISD::CTLZ, MVT::v16i8, 3 }, | |||
3057 | { ISD::CTLZ, MVT::v32i8, 3 }, | |||
3058 | { ISD::CTPOP, MVT::v2i64, 3 }, | |||
3059 | { ISD::CTPOP, MVT::v4i64, 3 }, | |||
3060 | { ISD::CTPOP, MVT::v4i32, 7 }, | |||
3061 | { ISD::CTPOP, MVT::v8i32, 7 }, | |||
3062 | { ISD::CTPOP, MVT::v8i16, 3 }, | |||
3063 | { ISD::CTPOP, MVT::v16i16, 3 }, | |||
3064 | { ISD::CTPOP, MVT::v16i8, 2 }, | |||
3065 | { ISD::CTPOP, MVT::v32i8, 2 }, | |||
3066 | { ISD::CTTZ, MVT::v2i64, 4 }, | |||
3067 | { ISD::CTTZ, MVT::v4i64, 4 }, | |||
3068 | { ISD::CTTZ, MVT::v4i32, 7 }, | |||
3069 | { ISD::CTTZ, MVT::v8i32, 7 }, | |||
3070 | { ISD::CTTZ, MVT::v8i16, 4 }, | |||
3071 | { ISD::CTTZ, MVT::v16i16, 4 }, | |||
3072 | { ISD::CTTZ, MVT::v16i8, 3 }, | |||
3073 | { ISD::CTTZ, MVT::v32i8, 3 }, | |||
3074 | { ISD::SADDSAT, MVT::v16i16, 1 }, | |||
3075 | { ISD::SADDSAT, MVT::v32i8, 1 }, | |||
3076 | { ISD::SMAX, MVT::v8i32, 1 }, | |||
3077 | { ISD::SMAX, MVT::v16i16, 1 }, | |||
3078 | { ISD::SMAX, MVT::v32i8, 1 }, | |||
3079 | { ISD::SMIN, MVT::v8i32, 1 }, | |||
3080 | { ISD::SMIN, MVT::v16i16, 1 }, | |||
3081 | { ISD::SMIN, MVT::v32i8, 1 }, | |||
3082 | { ISD::SSUBSAT, MVT::v16i16, 1 }, | |||
3083 | { ISD::SSUBSAT, MVT::v32i8, 1 }, | |||
3084 | { ISD::UADDSAT, MVT::v16i16, 1 }, | |||
3085 | { ISD::UADDSAT, MVT::v32i8, 1 }, | |||
3086 | { ISD::UADDSAT, MVT::v8i32, 3 }, // not + pminud + paddd | |||
3087 | { ISD::UMAX, MVT::v8i32, 1 }, | |||
3088 | { ISD::UMAX, MVT::v16i16, 1 }, | |||
3089 | { ISD::UMAX, MVT::v32i8, 1 }, | |||
3090 | { ISD::UMIN, MVT::v8i32, 1 }, | |||
3091 | { ISD::UMIN, MVT::v16i16, 1 }, | |||
3092 | { ISD::UMIN, MVT::v32i8, 1 }, | |||
3093 | { ISD::USUBSAT, MVT::v16i16, 1 }, | |||
3094 | { ISD::USUBSAT, MVT::v32i8, 1 }, | |||
3095 | { ISD::USUBSAT, MVT::v8i32, 2 }, // pmaxud + psubd | |||
3096 | { ISD::FMAXNUM, MVT::v8f32, 3 }, // MAXPS + CMPUNORDPS + BLENDVPS | |||
3097 | { ISD::FMAXNUM, MVT::v4f64, 3 }, // MAXPD + CMPUNORDPD + BLENDVPD | |||
3098 | { ISD::FSQRT, MVT::f32, 7 }, // Haswell from http://www.agner.org/ | |||
3099 | { ISD::FSQRT, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/ | |||
3100 | { ISD::FSQRT, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/ | |||
3101 | { ISD::FSQRT, MVT::f64, 14 }, // Haswell from http://www.agner.org/ | |||
3102 | { ISD::FSQRT, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/ | |||
3103 | { ISD::FSQRT, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/ | |||
3104 | }; | |||
3105 | static const CostTblEntry AVX1CostTbl[] = { | |||
3106 | { ISD::ABS, MVT::v4i64, 5 }, // VBLENDVPD(X,VPSUBQ(0,X),X) | |||
3107 | { ISD::ABS, MVT::v8i32, 3 }, | |||
3108 | { ISD::ABS, MVT::v16i16, 3 }, | |||
3109 | { ISD::ABS, MVT::v32i8, 3 }, | |||
3110 | { ISD::BITREVERSE, MVT::v4i64, 12 }, // 2 x 128-bit Op + extract/insert | |||
3111 | { ISD::BITREVERSE, MVT::v8i32, 12 }, // 2 x 128-bit Op + extract/insert | |||
3112 | { ISD::BITREVERSE, MVT::v16i16, 12 }, // 2 x 128-bit Op + extract/insert | |||
3113 | { ISD::BITREVERSE, MVT::v32i8, 12 }, // 2 x 128-bit Op + extract/insert | |||
3114 | { ISD::BSWAP, MVT::v4i64, 4 }, | |||
3115 | { ISD::BSWAP, MVT::v8i32, 4 }, | |||
3116 | { ISD::BSWAP, MVT::v16i16, 4 }, | |||
3117 | { ISD::CTLZ, MVT::v4i64, 48 }, // 2 x 128-bit Op + extract/insert | |||
3118 | { ISD::CTLZ, MVT::v8i32, 38 }, // 2 x 128-bit Op + extract/insert | |||
3119 | { ISD::CTLZ, MVT::v16i16, 30 }, // 2 x 128-bit Op + extract/insert | |||
3120 | { ISD::CTLZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert | |||
3121 | { ISD::CTPOP, MVT::v4i64, 16 }, // 2 x 128-bit Op + extract/insert | |||
3122 | { ISD::CTPOP, MVT::v8i32, 24 }, // 2 x 128-bit Op + extract/insert | |||
3123 | { ISD::CTPOP, MVT::v16i16, 20 }, // 2 x 128-bit Op + extract/insert | |||
3124 | { ISD::CTPOP, MVT::v32i8, 14 }, // 2 x 128-bit Op + extract/insert | |||
3125 | { ISD::CTTZ, MVT::v4i64, 22 }, // 2 x 128-bit Op + extract/insert | |||
3126 | { ISD::CTTZ, MVT::v8i32, 30 }, // 2 x 128-bit Op + extract/insert | |||
3127 | { ISD::CTTZ, MVT::v16i16, 26 }, // 2 x 128-bit Op + extract/insert | |||
3128 | { ISD::CTTZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert | |||
3129 | { ISD::SADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert | |||
3130 | { ISD::SADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert | |||
3131 | { ISD::SMAX, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert | |||
3132 | { ISD::SMAX, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert | |||
3133 | { ISD::SMAX, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert | |||
3134 | { ISD::SMIN, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert | |||
3135 | { ISD::SMIN, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert | |||
3136 | { ISD::SMIN, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert | |||
3137 | { ISD::SSUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert | |||
3138 | { ISD::SSUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert | |||
3139 | { ISD::UADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert | |||
3140 | { ISD::UADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert | |||
3141 | { ISD::UADDSAT, MVT::v8i32, 8 }, // 2 x 128-bit Op + extract/insert | |||
3142 | { ISD::UMAX, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert | |||
3143 | { ISD::UMAX, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert | |||
3144 | { ISD::UMAX, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert | |||
3145 | { ISD::UMIN, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert | |||
3146 | { ISD::UMIN, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert | |||
3147 | { ISD::UMIN, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert | |||
3148 | { ISD::USUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert | |||
3149 | { ISD::USUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert | |||
3150 | { ISD::USUBSAT, MVT::v8i32, 6 }, // 2 x 128-bit Op + extract/insert | |||
3151 | { ISD::FMAXNUM, MVT::f32, 3 }, // MAXSS + CMPUNORDSS + BLENDVPS | |||
3152 | { ISD::FMAXNUM, MVT::v4f32, 3 }, // MAXPS + CMPUNORDPS + BLENDVPS | |||
3153 | { ISD::FMAXNUM, MVT::v8f32, 5 }, // MAXPS + CMPUNORDPS + BLENDVPS + ? | |||
3154 | { ISD::FMAXNUM, MVT::f64, 3 }, // MAXSD + CMPUNORDSD + BLENDVPD | |||
3155 | { ISD::FMAXNUM, MVT::v2f64, 3 }, // MAXPD + CMPUNORDPD + BLENDVPD | |||
3156 | { ISD::FMAXNUM, MVT::v4f64, 5 }, // MAXPD + CMPUNORDPD + BLENDVPD + ? | |||
3157 | { ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/ | |||
3158 | { ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/ | |||
3159 | { ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/ | |||
3160 | { ISD::FSQRT, MVT::f64, 21 }, // SNB from http://www.agner.org/ | |||
3161 | { ISD::FSQRT, MVT::v2f64, 21 }, // SNB from http://www.agner.org/ | |||
3162 | { ISD::FSQRT, MVT::v4f64, 43 }, // SNB from http://www.agner.org/ | |||
3163 | }; | |||
3164 | static const CostTblEntry GLMCostTbl[] = { | |||
3165 | { ISD::FSQRT, MVT::f32, 19 }, // sqrtss | |||
3166 | { ISD::FSQRT, MVT::v4f32, 37 }, // sqrtps | |||
3167 | { ISD::FSQRT, MVT::f64, 34 }, // sqrtsd | |||
3168 | { ISD::FSQRT, MVT::v2f64, 67 }, // sqrtpd | |||
3169 | }; | |||
3170 | static const CostTblEntry SLMCostTbl[] = { | |||
3171 | { ISD::FSQRT, MVT::f32, 20 }, // sqrtss | |||
3172 | { ISD::FSQRT, MVT::v4f32, 40 }, // sqrtps | |||
3173 | { ISD::FSQRT, MVT::f64, 35 }, // sqrtsd | |||
3174 | { ISD::FSQRT, MVT::v2f64, 70 }, // sqrtpd | |||
3175 | }; | |||
3176 | static const CostTblEntry SSE42CostTbl[] = { | |||
3177 | { ISD::USUBSAT, MVT::v4i32, 2 }, // pmaxud + psubd | |||
3178 | { ISD::UADDSAT, MVT::v4i32, 3 }, // not + pminud + paddd | |||
3179 | { ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/ | |||
3180 | { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/ | |||
3181 | }; | |||
3182 | static const CostTblEntry SSE41CostTbl[] = { | |||
3183 | { ISD::ABS, MVT::v2i64, 2 }, // BLENDVPD(X,PSUBQ(0,X),X) | |||
3184 | { ISD::SMAX, MVT::v4i32, 1 }, | |||
3185 | { ISD::SMAX, MVT::v16i8, 1 }, | |||
3186 | { ISD::SMIN, MVT::v4i32, 1 }, | |||
3187 | { ISD::SMIN, MVT::v16i8, 1 }, | |||
3188 | { ISD::UMAX, MVT::v4i32, 1 }, | |||
3189 | { ISD::UMAX, MVT::v8i16, 1 }, | |||
3190 | { ISD::UMIN, MVT::v4i32, 1 }, | |||
3191 | { ISD::UMIN, MVT::v8i16, 1 }, | |||
3192 | }; | |||
3193 | static const CostTblEntry SSSE3CostTbl[] = { | |||
3194 | { ISD::ABS, MVT::v4i32, 1 }, | |||
3195 | { ISD::ABS, MVT::v8i16, 1 }, | |||
3196 | { ISD::ABS, MVT::v16i8, 1 }, | |||
3197 | { ISD::BITREVERSE, MVT::v2i64, 5 }, | |||
3198 | { ISD::BITREVERSE, MVT::v4i32, 5 }, | |||
3199 | { ISD::BITREVERSE, MVT::v8i16, 5 }, | |||
3200 | { ISD::BITREVERSE, MVT::v16i8, 5 }, | |||
3201 | { ISD::BSWAP, MVT::v2i64, 1 }, | |||
3202 | { ISD::BSWAP, MVT::v4i32, 1 }, | |||
3203 | { ISD::BSWAP, MVT::v8i16, 1 }, | |||
3204 | { ISD::CTLZ, MVT::v2i64, 23 }, | |||
3205 | { ISD::CTLZ, MVT::v4i32, 18 }, | |||
3206 | { ISD::CTLZ, MVT::v8i16, 14 }, | |||
3207 | { ISD::CTLZ, MVT::v16i8, 9 }, | |||
3208 | { ISD::CTPOP, MVT::v2i64, 7 }, | |||
3209 | { ISD::CTPOP, MVT::v4i32, 11 }, | |||
3210 | { ISD::CTPOP, MVT::v8i16, 9 }, | |||
3211 | { ISD::CTPOP, MVT::v16i8, 6 }, | |||
3212 | { ISD::CTTZ, MVT::v2i64, 10 }, | |||
3213 | { ISD::CTTZ, MVT::v4i32, 14 }, | |||
3214 | { ISD::CTTZ, MVT::v8i16, 12 }, | |||
3215 | { ISD::CTTZ, MVT::v16i8, 9 } | |||
3216 | }; | |||
3217 | static const CostTblEntry SSE2CostTbl[] = { | |||
3218 | { ISD::ABS, MVT::v2i64, 4 }, | |||
3219 | { ISD::ABS, MVT::v4i32, 3 }, | |||
3220 | { ISD::ABS, MVT::v8i16, 2 }, | |||
3221 | { ISD::ABS, MVT::v16i8, 2 }, | |||
3222 | { ISD::BITREVERSE, MVT::v2i64, 29 }, | |||
3223 | { ISD::BITREVERSE, MVT::v4i32, 27 }, | |||
3224 | { ISD::BITREVERSE, MVT::v8i16, 27 }, | |||
3225 | { ISD::BITREVERSE, MVT::v16i8, 20 }, | |||
3226 | { ISD::BSWAP, MVT::v2i64, 7 }, | |||
3227 | { ISD::BSWAP, MVT::v4i32, 7 }, | |||
3228 | { ISD::BSWAP, MVT::v8i16, 7 }, | |||
3229 | { ISD::CTLZ, MVT::v2i64, 25 }, | |||
3230 | { ISD::CTLZ, MVT::v4i32, 26 }, | |||
3231 | { ISD::CTLZ, MVT::v8i16, 20 }, | |||
3232 | { ISD::CTLZ, MVT::v16i8, 17 }, | |||
3233 | { ISD::CTPOP, MVT::v2i64, 12 }, | |||
3234 | { ISD::CTPOP, MVT::v4i32, 15 }, | |||
3235 | { ISD::CTPOP, MVT::v8i16, 13 }, | |||
3236 | { ISD::CTPOP, MVT::v16i8, 10 }, | |||
3237 | { ISD::CTTZ, MVT::v2i64, 14 }, | |||
3238 | { ISD::CTTZ, MVT::v4i32, 18 }, | |||
3239 | { ISD::CTTZ, MVT::v8i16, 16 }, | |||
3240 | { ISD::CTTZ, MVT::v16i8, 13 }, | |||
3241 | { ISD::SADDSAT, MVT::v8i16, 1 }, | |||
3242 | { ISD::SADDSAT, MVT::v16i8, 1 }, | |||
3243 | { ISD::SMAX, MVT::v8i16, 1 }, | |||
3244 | { ISD::SMIN, MVT::v8i16, 1 }, | |||
3245 | { ISD::SSUBSAT, MVT::v8i16, 1 }, | |||
3246 | { ISD::SSUBSAT, MVT::v16i8, 1 }, | |||
3247 | { ISD::UADDSAT, MVT::v8i16, 1 }, | |||
3248 | { ISD::UADDSAT, MVT::v16i8, 1 }, | |||
3249 | { ISD::UMAX, MVT::v8i16, 2 }, | |||
3250 | { ISD::UMAX, MVT::v16i8, 1 }, | |||
3251 | { ISD::UMIN, MVT::v8i16, 2 }, | |||
3252 | { ISD::UMIN, MVT::v16i8, 1 }, | |||
3253 | { ISD::USUBSAT, MVT::v8i16, 1 }, | |||
3254 | { ISD::USUBSAT, MVT::v16i8, 1 }, | |||
3255 | { ISD::FMAXNUM, MVT::f64, 4 }, | |||
3256 | { ISD::FMAXNUM, MVT::v2f64, 4 }, | |||
3257 | { ISD::FSQRT, MVT::f64, 32 }, // Nehalem from http://www.agner.org/ | |||
3258 | { ISD::FSQRT, MVT::v2f64, 32 }, // Nehalem from http://www.agner.org/ | |||
3259 | }; | |||
3260 | static const CostTblEntry SSE1CostTbl[] = { | |||
3261 | { ISD::FMAXNUM, MVT::f32, 4 }, | |||
3262 | { ISD::FMAXNUM, MVT::v4f32, 4 }, | |||
3263 | { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/ | |||
3264 | { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/ | |||
3265 | }; | |||
3266 | static const CostTblEntry BMI64CostTbl[] = { // 64-bit targets | |||
3267 | { ISD::CTTZ, MVT::i64, 1 }, | |||
3268 | }; | |||
3269 | static const CostTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets | |||
3270 | { ISD::CTTZ, MVT::i32, 1 }, | |||
3271 | { ISD::CTTZ, MVT::i16, 1 }, | |||
3272 | { ISD::CTTZ, MVT::i8, 1 }, | |||
3273 | }; | |||
3274 | static const CostTblEntry LZCNT64CostTbl[] = { // 64-bit targets | |||
3275 | { ISD::CTLZ, MVT::i64, 1 }, | |||
3276 | }; | |||
3277 | static const CostTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets | |||
3278 | { ISD::CTLZ, MVT::i32, 1 }, | |||
3279 | { ISD::CTLZ, MVT::i16, 1 }, | |||
3280 | { ISD::CTLZ, MVT::i8, 1 }, | |||
3281 | }; | |||
3282 | static const CostTblEntry POPCNT64CostTbl[] = { // 64-bit targets | |||
3283 | { ISD::CTPOP, MVT::i64, 1 }, | |||
3284 | }; | |||
3285 | static const CostTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets | |||
3286 | { ISD::CTPOP, MVT::i32, 1 }, | |||
3287 | { ISD::CTPOP, MVT::i16, 1 }, | |||
3288 | { ISD::CTPOP, MVT::i8, 1 }, | |||
3289 | }; | |||
3290 | static const CostTblEntry X64CostTbl[] = { // 64-bit targets | |||
3291 | { ISD::ABS, MVT::i64, 2 }, // SUB+CMOV | |||
3292 | { ISD::BITREVERSE, MVT::i64, 14 }, | |||
3293 | { ISD::BSWAP, MVT::i64, 1 }, | |||
3294 | { ISD::CTLZ, MVT::i64, 4 }, // BSR+XOR or BSR+XOR+CMOV | |||
3295 | { ISD::CTTZ, MVT::i64, 3 }, // TEST+BSF+CMOV/BRANCH | |||
3296 | { ISD::CTPOP, MVT::i64, 10 }, | |||
3297 | { ISD::SADDO, MVT::i64, 1 }, | |||
3298 | { ISD::UADDO, MVT::i64, 1 }, | |||
3299 | { ISD::UMULO, MVT::i64, 2 }, // mulq + seto | |||
3300 | }; | |||
3301 | static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets | |||
3302 | { ISD::ABS, MVT::i32, 2 }, // SUB+CMOV | |||
3303 | { ISD::ABS, MVT::i16, 2 }, // SUB+CMOV | |||
3304 | { ISD::BITREVERSE, MVT::i32, 14 }, | |||
3305 | { ISD::BITREVERSE, MVT::i16, 14 }, | |||
3306 | { ISD::BITREVERSE, MVT::i8, 11 }, | |||
3307 | { ISD::BSWAP, MVT::i32, 1 }, | |||
3308 | { ISD::BSWAP, MVT::i16, 1 }, // ROL | |||
3309 | { ISD::CTLZ, MVT::i32, 4 }, // BSR+XOR or BSR+XOR+CMOV | |||
3310 | { ISD::CTLZ, MVT::i16, 4 }, // BSR+XOR or BSR+XOR+CMOV | |||
3311 | { ISD::CTLZ, MVT::i8, 4 }, // BSR+XOR or BSR+XOR+CMOV | |||
3312 | { ISD::CTTZ, MVT::i32, 3 }, // TEST+BSF+CMOV/BRANCH | |||
3313 | { ISD::CTTZ, MVT::i16, 3 }, // TEST+BSF+CMOV/BRANCH | |||
3314 | { ISD::CTTZ, MVT::i8, 3 }, // TEST+BSF+CMOV/BRANCH | |||
3315 | { ISD::CTPOP, MVT::i32, 8 }, | |||
3316 | { ISD::CTPOP, MVT::i16, 9 }, | |||
3317 | { ISD::CTPOP, MVT::i8, 7 }, | |||
3318 | { ISD::SADDO, MVT::i32, 1 }, | |||
3319 | { ISD::SADDO, MVT::i16, 1 }, | |||
3320 | { ISD::SADDO, MVT::i8, 1 }, | |||
3321 | { ISD::UADDO, MVT::i32, 1 }, | |||
3322 | { ISD::UADDO, MVT::i16, 1 }, | |||
3323 | { ISD::UADDO, MVT::i8, 1 }, | |||
3324 | { ISD::UMULO, MVT::i32, 2 }, // mul + seto | |||
3325 | { ISD::UMULO, MVT::i16, 2 }, | |||
3326 | { ISD::UMULO, MVT::i8, 2 }, | |||
3327 | }; | |||
3328 | ||||
3329 | Type *RetTy = ICA.getReturnType(); | |||
3330 | Type *OpTy = RetTy; | |||
3331 | Intrinsic::ID IID = ICA.getID(); | |||
3332 | unsigned ISD = ISD::DELETED_NODE; | |||
3333 | switch (IID) { | |||
3334 | default: | |||
3335 | break; | |||
3336 | case Intrinsic::abs: | |||
3337 | ISD = ISD::ABS; | |||
3338 | break; | |||
3339 | case Intrinsic::bitreverse: | |||
3340 | ISD = ISD::BITREVERSE; | |||
3341 | break; | |||
3342 | case Intrinsic::bswap: | |||
3343 | ISD = ISD::BSWAP; | |||
3344 | break; | |||
3345 | case Intrinsic::ctlz: | |||
3346 | ISD = ISD::CTLZ; | |||
3347 | break; | |||
3348 | case Intrinsic::ctpop: | |||
3349 | ISD = ISD::CTPOP; | |||
3350 | break; | |||
3351 | case Intrinsic::cttz: | |||
3352 | ISD = ISD::CTTZ; | |||
3353 | break; | |||
3354 | case Intrinsic::maxnum: | |||
3355 | case Intrinsic::minnum: | |||
3356 | // FMINNUM has same costs so don't duplicate. | |||
3357 | ISD = ISD::FMAXNUM; | |||
3358 | break; | |||
3359 | case Intrinsic::sadd_sat: | |||
3360 | ISD = ISD::SADDSAT; | |||
3361 | break; | |||
3362 | case Intrinsic::smax: | |||
3363 | ISD = ISD::SMAX; | |||
3364 | break; | |||
3365 | case Intrinsic::smin: | |||
3366 | ISD = ISD::SMIN; | |||
3367 | break; | |||
3368 | case Intrinsic::ssub_sat: | |||
3369 | ISD = ISD::SSUBSAT; | |||
3370 | break; | |||
3371 | case Intrinsic::uadd_sat: | |||
3372 | ISD = ISD::UADDSAT; | |||
3373 | break; | |||
3374 | case Intrinsic::umax: | |||
3375 | ISD = ISD::UMAX; | |||
3376 | break; | |||
3377 | case Intrinsic::umin: | |||
3378 | ISD = ISD::UMIN; | |||
3379 | break; | |||
3380 | case Intrinsic::usub_sat: | |||
3381 | ISD = ISD::USUBSAT; | |||
3382 | break; | |||
3383 | case Intrinsic::sqrt: | |||
3384 | ISD = ISD::FSQRT; | |||
3385 | break; | |||
3386 | case Intrinsic::sadd_with_overflow: | |||
3387 | case Intrinsic::ssub_with_overflow: | |||
3388 | // SSUBO has same costs so don't duplicate. | |||
3389 | ISD = ISD::SADDO; | |||
3390 | OpTy = RetTy->getContainedType(0); | |||
3391 | break; | |||
3392 | case Intrinsic::uadd_with_overflow: | |||
3393 | case Intrinsic::usub_with_overflow: | |||
3394 | // USUBO has same costs so don't duplicate. | |||
3395 | ISD = ISD::UADDO; | |||
3396 | OpTy = RetTy->getContainedType(0); | |||
3397 | break; | |||
3398 | case Intrinsic::umul_with_overflow: | |||
3399 | case Intrinsic::smul_with_overflow: | |||
3400 | // SMULO has same costs so don't duplicate. | |||
3401 | ISD = ISD::UMULO; | |||
3402 | OpTy = RetTy->getContainedType(0); | |||
3403 | break; | |||
3404 | } | |||
3405 | ||||
3406 | if (ISD != ISD::DELETED_NODE) { | |||
3407 | // Legalize the type. | |||
3408 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(OpTy); | |||
3409 | MVT MTy = LT.second; | |||
3410 | ||||
3411 | // Attempt to lookup cost. | |||
3412 | if (ISD == ISD::BITREVERSE && ST->hasGFNI() && ST->hasSSSE3() && | |||
3413 | MTy.isVector()) { | |||
3414 | // With PSHUFB the code is very similar for all types. If we have integer | |||
3415 | // byte operations, we just need a GF2P8AFFINEQB for vXi8. For other types | |||
3416 | // we also need a PSHUFB. | |||
3417 | unsigned Cost = MTy.getVectorElementType() == MVT::i8 ? 1 : 2; | |||
3418 | ||||
3419 | // Without byte operations, we need twice as many GF2P8AFFINEQB and PSHUFB | |||
3420 | // instructions. We also need an extract and an insert. | |||
3421 | if (!(MTy.is128BitVector() || (ST->hasAVX2() && MTy.is256BitVector()) || | |||
3422 | (ST->hasBWI() && MTy.is512BitVector()))) | |||
3423 | Cost = Cost * 2 + 2; | |||
3424 | ||||
3425 | return LT.first * Cost; | |||
3426 | } | |||
3427 | ||||
3428 | auto adjustTableCost = [](const CostTblEntry &Entry, | |||
3429 | InstructionCost LegalizationCost, | |||
3430 | FastMathFlags FMF) { | |||
3431 | // If there are no NANs to deal with, then these are reduced to a | |||
3432 | // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we | |||
3433 | // assume is used in the non-fast case. | |||
3434 | if (Entry.ISD == ISD::FMAXNUM || Entry.ISD == ISD::FMINNUM) { | |||
3435 | if (FMF.noNaNs()) | |||
3436 | return LegalizationCost * 1; | |||
3437 | } | |||
3438 | return LegalizationCost * (int)Entry.Cost; | |||
3439 | }; | |||
3440 | ||||
3441 | if (ST->useGLMDivSqrtCosts()) | |||
3442 | if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy)) | |||
3443 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | |||
3444 | ||||
3445 | if (ST->useSLMArithCosts()) | |||
3446 | if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy)) | |||
3447 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | |||
3448 | ||||
3449 | if (ST->hasBITALG()) | |||
3450 | if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy)) | |||
3451 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | |||
3452 | ||||
3453 | if (ST->hasVPOPCNTDQ()) | |||
3454 | if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy)) | |||
3455 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | |||
3456 | ||||
3457 | if (ST->hasCDI()) | |||
3458 | if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy)) | |||
3459 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | |||
3460 | ||||
3461 | if (ST->hasBWI()) | |||
3462 | if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy)) | |||
3463 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | |||
3464 | ||||
3465 | if (ST->hasAVX512()) | |||
3466 | if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) | |||
3467 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | |||
3468 | ||||
3469 | if (ST->hasXOP()) | |||
3470 | if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy)) | |||
3471 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | |||
3472 | ||||
3473 | if (ST->hasAVX2()) | |||
3474 | if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy)) | |||
3475 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | |||
3476 | ||||
3477 | if (ST->hasAVX()) | |||
3478 | if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) | |||
3479 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | |||
3480 | ||||
3481 | if (ST->hasSSE42()) | |||
3482 | if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy)) | |||
3483 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | |||
3484 | ||||
3485 | if (ST->hasSSE41()) | |||
3486 | if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy)) | |||
3487 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | |||
3488 | ||||
3489 | if (ST->hasSSSE3()) | |||
3490 | if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy)) | |||
3491 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | |||
3492 | ||||
3493 | if (ST->hasSSE2()) | |||
3494 | if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy)) | |||
3495 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | |||
3496 | ||||
3497 | if (ST->hasSSE1()) | |||
3498 | if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy)) | |||
3499 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | |||
3500 | ||||
3501 | if (ST->hasBMI()) { | |||
3502 | if (ST->is64Bit()) | |||
3503 | if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy)) | |||
3504 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | |||
3505 | ||||
3506 | if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy)) | |||
3507 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | |||
3508 | } | |||
3509 | ||||
3510 | if (ST->hasLZCNT()) { | |||
3511 | if (ST->is64Bit()) | |||
3512 | if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy)) | |||
3513 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | |||
3514 | ||||
3515 | if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy)) | |||
3516 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | |||
3517 | } | |||
3518 | ||||
3519 | if (ST->hasPOPCNT()) { | |||
3520 | if (ST->is64Bit()) | |||
3521 | if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy)) | |||
3522 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | |||
3523 | ||||
3524 | if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy)) | |||
3525 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | |||
3526 | } | |||
3527 | ||||
3528 | if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) { | |||
3529 | if (const Instruction *II = ICA.getInst()) { | |||
3530 | if (II->hasOneUse() && isa<StoreInst>(II->user_back())) | |||
3531 | return TTI::TCC_Free; | |||
3532 | if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) { | |||
3533 | if (LI->hasOneUse()) | |||
3534 | return TTI::TCC_Free; | |||
3535 | } | |||
3536 | } | |||
3537 | } | |||
3538 | ||||
3539 | if (ST->is64Bit()) | |||
3540 | if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy)) | |||
3541 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | |||
3542 | ||||
3543 | if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy)) | |||
3544 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | |||
3545 | } | |||
3546 | ||||
3547 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); | |||
3548 | } | |||
3549 | ||||
3550 | InstructionCost | |||
3551 | X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, | |||
3552 | TTI::TargetCostKind CostKind) { | |||
3553 | if (ICA.isTypeBasedOnly()) | |||
3554 | return getTypeBasedIntrinsicInstrCost(ICA, CostKind); | |||
3555 | ||||
3556 | static const CostTblEntry AVX512BWCostTbl[] = { | |||
3557 | { ISD::ROTL, MVT::v32i16, 2 }, | |||
3558 | { ISD::ROTL, MVT::v16i16, 2 }, | |||
3559 | { ISD::ROTL, MVT::v8i16, 2 }, | |||
3560 | { ISD::ROTL, MVT::v64i8, 5 }, | |||
3561 | { ISD::ROTL, MVT::v32i8, 5 }, | |||
3562 | { ISD::ROTL, MVT::v16i8, 5 }, | |||
3563 | { ISD::ROTR, MVT::v32i16, 2 }, | |||
3564 | { ISD::ROTR, MVT::v16i16, 2 }, | |||
3565 | { ISD::ROTR, MVT::v8i16, 2 }, | |||
3566 | { ISD::ROTR, MVT::v64i8, 5 }, | |||
3567 | { ISD::ROTR, MVT::v32i8, 5 }, | |||
3568 | { ISD::ROTR, MVT::v16i8, 5 } | |||
3569 | }; | |||
3570 | static const CostTblEntry AVX512CostTbl[] = { | |||
3571 | { ISD::ROTL, MVT::v8i64, 1 }, | |||
3572 | { ISD::ROTL, MVT::v4i64, 1 }, | |||
3573 | { ISD::ROTL, MVT::v2i64, 1 }, | |||
3574 | { ISD::ROTL, MVT::v16i32, 1 }, | |||
3575 | { ISD::ROTL, MVT::v8i32, 1 }, | |||
3576 | { ISD::ROTL, MVT::v4i32, 1 }, | |||
3577 | { ISD::ROTR, MVT::v8i64, 1 }, | |||
3578 | { ISD::ROTR, MVT::v4i64, 1 }, | |||
3579 | { ISD::ROTR, MVT::v2i64, 1 }, | |||
3580 | { ISD::ROTR, MVT::v16i32, 1 }, | |||
3581 | { ISD::ROTR, MVT::v8i32, 1 }, | |||
3582 | { ISD::ROTR, MVT::v4i32, 1 } | |||
3583 | }; | |||
3584 | // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y)) | |||
3585 | static const CostTblEntry XOPCostTbl[] = { | |||
3586 | { ISD::ROTL, MVT::v4i64, 4 }, | |||
3587 | { ISD::ROTL, MVT::v8i32, 4 }, | |||
3588 | { ISD::ROTL, MVT::v16i16, 4 }, | |||
3589 | { ISD::ROTL, MVT::v32i8, 4 }, | |||
3590 | { ISD::ROTL, MVT::v2i64, 1 }, | |||
3591 | { ISD::ROTL, MVT::v4i32, 1 }, | |||
3592 | { ISD::ROTL, MVT::v8i16, 1 }, | |||
3593 | { ISD::ROTL, MVT::v16i8, 1 }, | |||
3594 | { ISD::ROTR, MVT::v4i64, 6 }, | |||
3595 | { ISD::ROTR, MVT::v8i32, 6 }, | |||
3596 | { ISD::ROTR, MVT::v16i16, 6 }, | |||
3597 | { ISD::ROTR, MVT::v32i8, 6 }, | |||
3598 | { ISD::ROTR, MVT::v2i64, 2 }, | |||
3599 | { ISD::ROTR, MVT::v4i32, 2 }, | |||
3600 | { ISD::ROTR, MVT::v8i16, 2 }, | |||
3601 | { ISD::ROTR, MVT::v16i8, 2 } | |||
3602 | }; | |||
3603 | static const CostTblEntry X64CostTbl[] = { // 64-bit targets | |||
3604 | { ISD::ROTL, MVT::i64, 1 }, | |||
3605 | { ISD::ROTR, MVT::i64, 1 }, | |||
3606 | { ISD::FSHL, MVT::i64, 4 } | |||
3607 | }; | |||
3608 | static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets | |||
3609 | { ISD::ROTL, MVT::i32, 1 }, | |||
3610 | { ISD::ROTL, MVT::i16, 1 }, | |||
3611 | { ISD::ROTL, MVT::i8, 1 }, | |||
3612 | { ISD::ROTR, MVT::i32, 1 }, | |||
3613 | { ISD::ROTR, MVT::i16, 1 }, | |||
3614 | { ISD::ROTR, MVT::i8, 1 }, | |||
3615 | { ISD::FSHL, MVT::i32, 4 }, | |||
3616 | { ISD::FSHL, MVT::i16, 4 }, | |||
3617 | { ISD::FSHL, MVT::i8, 4 } | |||
3618 | }; | |||
3619 | ||||
3620 | Intrinsic::ID IID = ICA.getID(); | |||
3621 | Type *RetTy = ICA.getReturnType(); | |||
3622 | const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); | |||
3623 | unsigned ISD = ISD::DELETED_NODE; | |||
3624 | switch (IID) { | |||
3625 | default: | |||
3626 | break; | |||
3627 | case Intrinsic::fshl: | |||
3628 | ISD = ISD::FSHL; | |||
3629 | if (Args[0] == Args[1]) | |||
3630 | ISD = ISD::ROTL; | |||
3631 | break; | |||
3632 | case Intrinsic::fshr: | |||
3633 | // FSHR has same costs so don't duplicate. | |||
3634 | ISD = ISD::FSHL; | |||
3635 | if (Args[0] == Args[1]) | |||
3636 | ISD = ISD::ROTR; | |||
3637 | break; | |||
3638 | } | |||
3639 | ||||
3640 | if (ISD != ISD::DELETED_NODE) { | |||
3641 | // Legalize the type. | |||
3642 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy); | |||
3643 | MVT MTy = LT.second; | |||
3644 | ||||
3645 | // Attempt to lookup cost. | |||
3646 | if (ST->hasBWI()) | |||
3647 | if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy)) | |||
3648 | return LT.first * Entry->Cost; | |||
3649 | ||||
3650 | if (ST->hasAVX512()) | |||
3651 | if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) | |||
3652 | return LT.first * Entry->Cost; | |||
3653 | ||||
3654 | if (ST->hasXOP()) | |||
3655 | if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy)) | |||
3656 | return LT.first * Entry->Cost; | |||
3657 | ||||
3658 | if (ST->is64Bit()) | |||
3659 | if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy)) | |||
3660 | return LT.first * Entry->Cost; | |||
3661 | ||||
3662 | if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy)) | |||
3663 | return LT.first * Entry->Cost; | |||
3664 | } | |||
3665 | ||||
3666 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); | |||
3667 | } | |||
3668 | ||||
3669 | InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, | |||
3670 | unsigned Index) { | |||
3671 | static const CostTblEntry SLMCostTbl[] = { | |||
3672 | { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 }, | |||
3673 | { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 }, | |||
3674 | { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 }, | |||
3675 | { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 7 } | |||
3676 | }; | |||
3677 | ||||
3678 | assert(Val->isVectorTy() && "This must be a vector type")(static_cast <bool> (Val->isVectorTy() && "This must be a vector type" ) ? void (0) : __assert_fail ("Val->isVectorTy() && \"This must be a vector type\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 3678, __extension__ __PRETTY_FUNCTION__)); | |||
3679 | Type *ScalarType = Val->getScalarType(); | |||
3680 | InstructionCost RegisterFileMoveCost = 0; | |||
3681 | ||||
3682 | // Non-immediate extraction/insertion can be handled as a sequence of | |||
3683 | // aliased loads+stores via the stack. | |||
3684 | if (Index == -1U && (Opcode == Instruction::ExtractElement || | |||
3685 | Opcode == Instruction::InsertElement)) { | |||
3686 | // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns: | |||
3687 | // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0. | |||
3688 | ||||
3689 | // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling. | |||
3690 | assert(isa<FixedVectorType>(Val) && "Fixed vector type expected")(static_cast <bool> (isa<FixedVectorType>(Val) && "Fixed vector type expected") ? void (0) : __assert_fail ("isa<FixedVectorType>(Val) && \"Fixed vector type expected\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 3690, __extension__ __PRETTY_FUNCTION__)); | |||
3691 | Align VecAlign = DL.getPrefTypeAlign(Val); | |||
3692 | Align SclAlign = DL.getPrefTypeAlign(ScalarType); | |||
3693 | ||||
3694 | // Extract - store vector to stack, load scalar. | |||
3695 | if (Opcode == Instruction::ExtractElement) { | |||
3696 | return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, | |||
3697 | TTI::TargetCostKind::TCK_RecipThroughput) + | |||
3698 | getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0, | |||
3699 | TTI::TargetCostKind::TCK_RecipThroughput); | |||
3700 | } | |||
3701 | // Insert - store vector to stack, store scalar, load vector. | |||
3702 | if (Opcode == Instruction::InsertElement) { | |||
3703 | return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, | |||
3704 | TTI::TargetCostKind::TCK_RecipThroughput) + | |||
3705 | getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0, | |||
3706 | TTI::TargetCostKind::TCK_RecipThroughput) + | |||
3707 | getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, | |||
3708 | TTI::TargetCostKind::TCK_RecipThroughput); | |||
3709 | } | |||
3710 | } | |||
3711 | ||||
3712 | if (Index != -1U && (Opcode
| |||
3713 | Opcode == Instruction::InsertElement)) { | |||
3714 | // Extraction of vXi1 elements are now efficiently handled by MOVMSK. | |||
3715 | if (Opcode
| |||
3716 | ScalarType->getScalarSizeInBits() == 1 && | |||
3717 | cast<FixedVectorType>(Val)->getNumElements() > 1) | |||
3718 | return 1; | |||
3719 | ||||
3720 | // Legalize the type. | |||
3721 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val); | |||
3722 | ||||
3723 | // This type is legalized to a scalar type. | |||
3724 | if (!LT.second.isVector()) | |||
3725 | return 0; | |||
3726 | ||||
3727 | // The type may be split. Normalize the index to the new type. | |||
3728 | unsigned SizeInBits = LT.second.getSizeInBits(); | |||
3729 | unsigned NumElts = LT.second.getVectorNumElements(); | |||
3730 | unsigned SubNumElts = NumElts; | |||
3731 | Index = Index % NumElts; | |||
3732 | ||||
3733 | // For >128-bit vectors, we need to extract higher 128-bit subvectors. | |||
3734 | // For inserts, we also need to insert the subvector back. | |||
3735 | if (SizeInBits > 128) { | |||
3736 | assert((SizeInBits % 128) == 0 && "Illegal vector")(static_cast <bool> ((SizeInBits % 128) == 0 && "Illegal vector") ? void (0) : __assert_fail ("(SizeInBits % 128) == 0 && \"Illegal vector\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 3736, __extension__ __PRETTY_FUNCTION__)); | |||
3737 | unsigned NumSubVecs = SizeInBits / 128; | |||
3738 | SubNumElts = NumElts / NumSubVecs; | |||
3739 | if (SubNumElts <= Index) { | |||
3740 | RegisterFileMoveCost += (Opcode
| |||
3741 | Index %= SubNumElts; | |||
| ||||
3742 | } | |||
3743 | } | |||
3744 | ||||
3745 | if (Index == 0) { | |||
3746 | // Floating point scalars are already located in index #0. | |||
3747 | // Many insertions to #0 can fold away for scalar fp-ops, so let's assume | |||
3748 | // true for all. | |||
3749 | if (ScalarType->isFloatingPointTy()) | |||
3750 | return RegisterFileMoveCost; | |||
3751 | ||||
3752 | // Assume movd/movq XMM -> GPR is relatively cheap on all targets. | |||
3753 | if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement) | |||
3754 | return 1 + RegisterFileMoveCost; | |||
3755 | } | |||
3756 | ||||
3757 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | |||
3758 | assert(ISD && "Unexpected vector opcode")(static_cast <bool> (ISD && "Unexpected vector opcode" ) ? void (0) : __assert_fail ("ISD && \"Unexpected vector opcode\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 3758, __extension__ __PRETTY_FUNCTION__)); | |||
3759 | MVT MScalarTy = LT.second.getScalarType(); | |||
3760 | if (ST->useSLMArithCosts()) | |||
3761 | if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy)) | |||
3762 | return Entry->Cost + RegisterFileMoveCost; | |||
3763 | ||||
3764 | // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets. | |||
3765 | if ((MScalarTy == MVT::i16 && ST->hasSSE2()) || | |||
3766 | (MScalarTy.isInteger() && ST->hasSSE41())) | |||
3767 | return 1 + RegisterFileMoveCost; | |||
3768 | ||||
3769 | // Assume insertps is relatively cheap on all targets. | |||
3770 | if (MScalarTy == MVT::f32 && ST->hasSSE41() && | |||
3771 | Opcode == Instruction::InsertElement) | |||
3772 | return 1 + RegisterFileMoveCost; | |||
3773 | ||||
3774 | // For extractions we just need to shuffle the element to index 0, which | |||
3775 | // should be very cheap (assume cost = 1). For insertions we need to shuffle | |||
3776 | // the elements to its destination. In both cases we must handle the | |||
3777 | // subvector move(s). | |||
3778 | // If the vector type is already less than 128-bits then don't reduce it. | |||
3779 | // TODO: Under what circumstances should we shuffle using the full width? | |||
3780 | InstructionCost ShuffleCost = 1; | |||
3781 | if (Opcode == Instruction::InsertElement) { | |||
3782 | auto *SubTy = cast<VectorType>(Val); | |||
3783 | EVT VT = TLI->getValueType(DL, Val); | |||
3784 | if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128) | |||
3785 | SubTy = FixedVectorType::get(ScalarType, SubNumElts); | |||
3786 | ShuffleCost = | |||
3787 | getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, None, 0, SubTy); | |||
3788 | } | |||
3789 | int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1; | |||
3790 | return ShuffleCost + IntOrFpCost + RegisterFileMoveCost; | |||
3791 | } | |||
3792 | ||||
3793 | // Add to the base cost if we know that the extracted element of a vector is | |||
3794 | // destined to be moved to and used in the integer register file. | |||
3795 | if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy()) | |||
3796 | RegisterFileMoveCost += 1; | |||
3797 | ||||
3798 | return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost; | |||
3799 | } | |||
3800 | ||||
/// Estimate the overhead of scalarizing a vector value \p Ty: the cost of
/// inserting the demanded scalar elements into the vector (\p Insert) and/or
/// extracting them back out (\p Extract). \p DemandedElts has one bit per
/// element of the (fixed) vector type; only demanded lanes are costed.
InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
                                                     const APInt &DemandedElts,
                                                     bool Insert,
                                                     bool Extract) {
  assert(DemandedElts.getBitWidth() ==
             cast<FixedVectorType>(Ty)->getNumElements() &&
         "Vector size mismatch");

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  MVT MScalarTy = LT.second.getScalarType();
  // Size of one legalized register; >128 means YMM/ZMM and lane handling.
  unsigned SizeInBits = LT.second.getSizeInBits();

  InstructionCost Cost = 0;

  // For insertions, a ISD::BUILD_VECTOR style vector initialization can be much
  // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT.
  if (Insert) {
    // These are the element types with a direct insert instruction
    // (pinsrw/pinsrb/pinsrd/pinsrq/insertps) on the available subtarget.
    if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
        (MScalarTy.isInteger() && ST->hasSSE41()) ||
        (MScalarTy == MVT::f32 && ST->hasSSE41())) {
      // For types we can insert directly, insertion into 128-bit sub vectors is
      // cheap, followed by a cheap chain of concatenations.
      if (SizeInBits <= 128) {
        Cost +=
            BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, false);
      } else {
        // In each 128-lane, if at least one index is demanded but not all
        // indices are demanded and this 128-lane is not the first 128-lane of
        // the legalized-vector, then this 128-lane needs a extracti128; If in
        // each 128-lane, there is at least one demanded index, this 128-lane
        // needs a inserti128.

        // The following cases will help you build a better understanding:
        // Assume we insert several elements into a v8i32 vector in avx2,
        // Case#1: inserting into 1th index needs vpinsrd + inserti128.
        // Case#2: inserting into 5th index needs extracti128 + vpinsrd +
        // inserti128.
        // Case#3: inserting into 4,5,6,7 index needs 4*vpinsrd + inserti128.
        // CostValue is the number of legal registers the type was split into.
        const int CostValue = *LT.first.getValue();
        assert(CostValue >= 0 && "Negative cost!");
        unsigned Num128Lanes = SizeInBits / 128 * CostValue;
        unsigned NumElts = LT.second.getVectorNumElements() * CostValue;
        // Widen the demanded mask to cover every element of every split part.
        APInt WidenedDemandedElts = DemandedElts.zext(NumElts);
        unsigned Scale = NumElts / Num128Lanes;
        // We iterate each 128-lane, and check if we need a
        // extracti128/inserti128 for this 128-lane.
        for (unsigned I = 0; I < NumElts; I += Scale) {
          APInt Mask = WidenedDemandedElts.getBitsSet(NumElts, I, I + Scale);
          APInt MaskedDE = Mask & WidenedDemandedElts;
          unsigned Population = MaskedDE.countPopulation();
          // extracti128 needed: partially-demanded, non-leading lane.
          Cost += (Population > 0 && Population != Scale &&
                   I % LT.second.getVectorNumElements() != 0);
          // inserti128 needed: any demanded element in this lane.
          Cost += Population > 0;
        }
        // One insert instruction per demanded element.
        Cost += DemandedElts.countPopulation();

        // For vXf32 cases, insertion into the 0'th index in each v4f32
        // 128-bit vector is free.
        // NOTE: This assumes legalization widens vXf32 vectors.
        if (MScalarTy == MVT::f32)
          for (unsigned i = 0, e = cast<FixedVectorType>(Ty)->getNumElements();
               i < e; i += 4)
            if (DemandedElts[i])
              Cost--;
      }
    } else if (LT.second.isVector()) {
      // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
      // integer element as a SCALAR_TO_VECTOR, then we build the vector as a
      // series of UNPCK followed by CONCAT_VECTORS - all of these can be
      // considered cheap.
      if (Ty->isIntOrIntVectorTy())
        Cost += DemandedElts.countPopulation();

      // Get the smaller of the legalized or original pow2-extended number of
      // vector elements, which represents the number of unpacks we'll end up
      // performing.
      unsigned NumElts = LT.second.getVectorNumElements();
      unsigned Pow2Elts =
          PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements());
      Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
    }
  }

  if (Extract) {
    // vXi1 can be efficiently extracted with MOVMSK.
    // TODO: AVX512 predicate mask handling.
    // NOTE: This doesn't work well for roundtrip scalarization.
    if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) {
      unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
      // MOVMSK covers up to 32 elements (AVX2) / 16 (pre-AVX2) per op.
      unsigned MaxElts = ST->hasAVX2() ? 32 : 16;
      unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts;
      return MOVMSKCost;
    }

    if (LT.second.isVector()) {
      // Number of legal registers the type was split into.
      int CostValue = *LT.first.getValue();
      assert(CostValue >= 0 && "Negative cost!");

      unsigned NumElts = LT.second.getVectorNumElements() * CostValue;
      assert(NumElts >= DemandedElts.getBitWidth() &&
             "Vector has been legalized to smaller element count");

      // If we're extracting elements from a 128-bit subvector lane, we only need
      // to extract each lane once, not for every element.
      if (SizeInBits > 128) {
        assert((SizeInBits % 128) == 0 && "Illegal vector");
        unsigned NumLegal128Lanes = SizeInBits / 128;
        unsigned Num128Lanes = NumLegal128Lanes * CostValue;
        APInt WidenedDemandedElts = DemandedElts.zext(NumElts);
        unsigned Scale = NumElts / Num128Lanes;

        // Add cost for each demanded 128-bit subvector extraction.
        // Luckily this is a lot easier than for insertion.
        APInt DemandedUpper128Lanes =
            APIntOps::ScaleBitMask(WidenedDemandedElts, Num128Lanes);
        auto *Ty128 = FixedVectorType::get(Ty->getElementType(), Scale);
        for (unsigned I = 0; I != Num128Lanes; ++I)
          if (DemandedUpper128Lanes[I])
            Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, None,
                                   I * Scale, Ty128);

        // Add all the demanded element extractions together, but adjust the
        // index to use the equivalent of the bottom 128 bit lane.
        for (unsigned I = 0; I != NumElts; ++I)
          if (WidenedDemandedElts[I]) {
            unsigned Idx = I % Scale;
            Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, Idx);
          }

        return Cost;
      }
    }

    // Fallback to default extraction.
    Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, false, Extract);
  }

  return Cost;
}
3940 | ||||
/// Cost of a replication shuffle: each of the \p VF source elements of width
/// \p EltTy is replicated \p ReplicationFactor times in the destination
/// vector. Only AVX512 targets are modelled here; everything else falls back
/// to the base implementation. Element types without a native variable
/// shuffle are promoted to a wider integer type and costed recursively.
InstructionCost
X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
                                      int VF, const APInt &DemandedDstElts,
                                      TTI::TargetCostKind CostKind) {
  const unsigned EltTyBits = DL.getTypeSizeInBits(EltTy);
  // We don't differentiate element types here, only element bit width.
  EltTy = IntegerType::getIntNTy(EltTy->getContext(), EltTyBits);

  // Fallback used whenever this AVX512-specific model doesn't apply.
  auto bailout = [&]() {
    return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF,
                                            DemandedDstElts, CostKind);
  };

  // For now, only deal with AVX512 cases.
  if (!ST->hasAVX512())
    return bailout();

  // Do we have a native shuffle for this element type, or should we promote?
  unsigned PromEltTyBits = EltTyBits;
  switch (EltTyBits) {
  case 32:
  case 64:
    break; // AVX512F.
  case 16:
    if (!ST->hasBWI())
      PromEltTyBits = 32; // promote to i32, AVX512F.
    break; // AVX512BW
  case 8:
    if (!ST->hasVBMI())
      PromEltTyBits = 32; // promote to i32, AVX512F.
    break; // AVX512VBMI
  case 1:
    // There is no support for shuffling i1 elements. We *must* promote.
    if (ST->hasBWI()) {
      if (ST->hasVBMI())
        PromEltTyBits = 8; // promote to i8, AVX512VBMI.
      else
        PromEltTyBits = 16; // promote to i16, AVX512BW.
      break;
    }
    if (ST->hasDQI()) {
      PromEltTyBits = 32; // promote to i32, AVX512F.
      break;
    }
    return bailout();
  default:
    return bailout();
  }
  auto *PromEltTy = IntegerType::getIntNTy(EltTy->getContext(), PromEltTyBits);

  auto *SrcVecTy = FixedVectorType::get(EltTy, VF);
  auto *PromSrcVecTy = FixedVectorType::get(PromEltTy, VF);

  int NumDstElements = VF * ReplicationFactor;
  auto *PromDstVecTy = FixedVectorType::get(PromEltTy, NumDstElements);
  auto *DstVecTy = FixedVectorType::get(EltTy, NumDstElements);

  // Legalize the types.
  MVT LegalSrcVecTy = getTypeLegalizationCost(SrcVecTy).second;
  MVT LegalPromSrcVecTy = getTypeLegalizationCost(PromSrcVecTy).second;
  MVT LegalPromDstVecTy = getTypeLegalizationCost(PromDstVecTy).second;
  MVT LegalDstVecTy = getTypeLegalizationCost(DstVecTy).second;
  // They should have legalized into vector types.
  if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() ||
      !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector())
    return bailout();

  if (PromEltTyBits != EltTyBits) {
    // If we have to perform the shuffle with wider elt type than our data type,
    // then we will first need to anyext (we don't care about the new bits)
    // the source elements, and then truncate Dst elements.
    // NOTE(review): the extend is costed as SExt below even though an
    // any-extend would suffice — presumably used as a conservative proxy;
    // confirm against the cast cost tables.
    InstructionCost PromotionCost;
    PromotionCost += getCastInstrCost(
        Instruction::SExt, /*Dst=*/PromSrcVecTy, /*Src=*/SrcVecTy,
        TargetTransformInfo::CastContextHint::None, CostKind);
    PromotionCost +=
        getCastInstrCost(Instruction::Trunc, /*Dst=*/DstVecTy,
                         /*Src=*/PromDstVecTy,
                         TargetTransformInfo::CastContextHint::None, CostKind);
    // Recurse with the promoted element type plus the promotion casts.
    return PromotionCost + getReplicationShuffleCost(PromEltTy,
                                                     ReplicationFactor, VF,
                                                     DemandedDstElts, CostKind);
  }

  assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits &&
         LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() &&
         "We expect that the legalization doesn't affect the element width, "
         "doesn't coalesce/split elements.");

  unsigned NumEltsPerDstVec = LegalDstVecTy.getVectorNumElements();
  unsigned NumDstVectors =
      divideCeil(DstVecTy->getNumElements(), NumEltsPerDstVec);

  auto *SingleDstVecTy = FixedVectorType::get(EltTy, NumEltsPerDstVec);

  // Not all the produced Dst elements may be demanded. In our case,
  // given that a single Dst vector is formed by a single shuffle,
  // if all elements that will form a single Dst vector aren't demanded,
  // then we won't need to do that shuffle, so adjust the cost accordingly.
  APInt DemandedDstVectors = APIntOps::ScaleBitMask(
      DemandedDstElts.zext(NumDstVectors * NumEltsPerDstVec), NumDstVectors);
  unsigned NumDstVectorsDemanded = DemandedDstVectors.countPopulation();

  InstructionCost SingleShuffleCost =
      getShuffleCost(TTI::SK_PermuteSingleSrc, SingleDstVecTy,
                     /*Mask=*/None, /*Index=*/0, /*SubTp=*/nullptr);
  return NumDstVectorsDemanded * SingleShuffleCost;
}
4049 | ||||
4050 | InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, | |||
4051 | MaybeAlign Alignment, | |||
4052 | unsigned AddressSpace, | |||
4053 | TTI::TargetCostKind CostKind, | |||
4054 | TTI::OperandValueKind OpdInfo, | |||
4055 | const Instruction *I) { | |||
4056 | // TODO: Handle other cost kinds. | |||
4057 | if (CostKind != TTI::TCK_RecipThroughput) { | |||
4058 | if (auto *SI = dyn_cast_or_null<StoreInst>(I)) { | |||
4059 | // Store instruction with index and scale costs 2 Uops. | |||
4060 | // Check the preceding GEP to identify non-const indices. | |||
4061 | if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) { | |||
4062 | if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); })) | |||
4063 | return TTI::TCC_Basic * 2; | |||
4064 | } | |||
4065 | } | |||
4066 | return TTI::TCC_Basic; | |||
4067 | } | |||
4068 | ||||
4069 | assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&(static_cast <bool> ((Opcode == Instruction::Load || Opcode == Instruction::Store) && "Invalid Opcode") ? void ( 0) : __assert_fail ("(Opcode == Instruction::Load || Opcode == Instruction::Store) && \"Invalid Opcode\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4070, __extension__ __PRETTY_FUNCTION__)) | |||
4070 | "Invalid Opcode")(static_cast <bool> ((Opcode == Instruction::Load || Opcode == Instruction::Store) && "Invalid Opcode") ? void ( 0) : __assert_fail ("(Opcode == Instruction::Load || Opcode == Instruction::Store) && \"Invalid Opcode\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4070, __extension__ __PRETTY_FUNCTION__)); | |||
4071 | // Type legalization can't handle structs | |||
4072 | if (TLI->getValueType(DL, Src, true) == MVT::Other) | |||
4073 | return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, | |||
4074 | CostKind); | |||
4075 | ||||
4076 | // Legalize the type. | |||
4077 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src); | |||
4078 | ||||
4079 | auto *VTy = dyn_cast<FixedVectorType>(Src); | |||
4080 | ||||
4081 | InstructionCost Cost = 0; | |||
4082 | ||||
4083 | // Add a cost for constant load to vector. | |||
4084 | if (Opcode == Instruction::Store && | |||
4085 | (OpdInfo == TTI::OK_UniformConstantValue || | |||
4086 | OpdInfo == TTI::OK_NonUniformConstantValue)) | |||
4087 | Cost += getMemoryOpCost(Instruction::Load, Src, DL.getABITypeAlign(Src), | |||
4088 | /*AddressSpace=*/0, CostKind); | |||
4089 | ||||
4090 | // Handle the simple case of non-vectors. | |||
4091 | // NOTE: this assumes that legalization never creates vector from scalars! | |||
4092 | if (!VTy || !LT.second.isVector()) { | |||
4093 | // Each load/store unit costs 1. | |||
4094 | return (LT.second.isFloatingPoint() ? Cost : 0) + LT.first * 1; | |||
4095 | } | |||
4096 | ||||
4097 | bool IsLoad = Opcode == Instruction::Load; | |||
4098 | ||||
4099 | Type *EltTy = VTy->getElementType(); | |||
4100 | ||||
4101 | const int EltTyBits = DL.getTypeSizeInBits(EltTy); | |||
4102 | ||||
4103 | // Source of truth: how many elements were there in the original IR vector? | |||
4104 | const unsigned SrcNumElt = VTy->getNumElements(); | |||
4105 | ||||
4106 | // How far have we gotten? | |||
4107 | int NumEltRemaining = SrcNumElt; | |||
4108 | // Note that we intentionally capture by-reference, NumEltRemaining changes. | |||
4109 | auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; }; | |||
4110 | ||||
4111 | const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8); | |||
4112 | ||||
4113 | // Note that even if we can store 64 bits of an XMM, we still operate on XMM. | |||
4114 | const unsigned XMMBits = 128; | |||
4115 | if (XMMBits % EltTyBits != 0) | |||
4116 | // Vector size must be a multiple of the element size. I.e. no padding. | |||
4117 | return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, | |||
4118 | CostKind); | |||
4119 | const int NumEltPerXMM = XMMBits / EltTyBits; | |||
4120 | ||||
4121 | auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM); | |||
4122 | ||||
4123 | for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0; | |||
4124 | NumEltRemaining > 0; CurrOpSizeBytes /= 2) { | |||
4125 | // How many elements would a single op deal with at once? | |||
4126 | if ((8 * CurrOpSizeBytes) % EltTyBits != 0) | |||
4127 | // Vector size must be a multiple of the element size. I.e. no padding. | |||
4128 | return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, | |||
4129 | CostKind); | |||
4130 | int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits; | |||
4131 | ||||
4132 | assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?")(static_cast <bool> (CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?") ? void (0) : __assert_fail ("CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && \"How'd we get here?\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4132, __extension__ __PRETTY_FUNCTION__)); | |||
4133 | assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||(static_cast <bool> ((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) || (CurrOpSizeBytes == MaxLegalOpSizeBytes )) && "Unless we haven't halved the op size yet, " "we have less than two op's sized units of work left." ) ? void (0) : __assert_fail ("(((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) || (CurrOpSizeBytes == MaxLegalOpSizeBytes)) && \"Unless we haven't halved the op size yet, \" \"we have less than two op's sized units of work left.\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4136, __extension__ __PRETTY_FUNCTION__)) | |||
4134 | (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&(static_cast <bool> ((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) || (CurrOpSizeBytes == MaxLegalOpSizeBytes )) && "Unless we haven't halved the op size yet, " "we have less than two op's sized units of work left." ) ? void (0) : __assert_fail ("(((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) || (CurrOpSizeBytes == MaxLegalOpSizeBytes)) && \"Unless we haven't halved the op size yet, \" \"we have less than two op's sized units of work left.\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4136, __extension__ __PRETTY_FUNCTION__)) | |||
4135 | "Unless we haven't halved the op size yet, "(static_cast <bool> ((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) || (CurrOpSizeBytes == MaxLegalOpSizeBytes )) && "Unless we haven't halved the op size yet, " "we have less than two op's sized units of work left." ) ? void (0) : __assert_fail ("(((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) || (CurrOpSizeBytes == MaxLegalOpSizeBytes)) && \"Unless we haven't halved the op size yet, \" \"we have less than two op's sized units of work left.\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4136, __extension__ __PRETTY_FUNCTION__)) | |||
4136 | "we have less than two op's sized units of work left.")(static_cast <bool> ((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) || (CurrOpSizeBytes == MaxLegalOpSizeBytes )) && "Unless we haven't halved the op size yet, " "we have less than two op's sized units of work left." ) ? void (0) : __assert_fail ("(((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) || (CurrOpSizeBytes == MaxLegalOpSizeBytes)) && \"Unless we haven't halved the op size yet, \" \"we have less than two op's sized units of work left.\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4136, __extension__ __PRETTY_FUNCTION__)); | |||
4137 | ||||
4138 | auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM | |||
4139 | ? FixedVectorType::get(EltTy, CurrNumEltPerOp) | |||
4140 | : XMMVecTy; | |||
4141 | ||||
4142 | assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&(static_cast <bool> (CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 && "After halving sizes, the vector elt count is no longer a multiple " "of number of elements per operation?") ? void (0) : __assert_fail ("CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 && \"After halving sizes, the vector elt count is no longer a multiple \" \"of number of elements per operation?\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4144, __extension__ __PRETTY_FUNCTION__)) | |||
4143 | "After halving sizes, the vector elt count is no longer a multiple "(static_cast <bool> (CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 && "After halving sizes, the vector elt count is no longer a multiple " "of number of elements per operation?") ? void (0) : __assert_fail ("CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 && \"After halving sizes, the vector elt count is no longer a multiple \" \"of number of elements per operation?\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4144, __extension__ __PRETTY_FUNCTION__)) | |||
4144 | "of number of elements per operation?")(static_cast <bool> (CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 && "After halving sizes, the vector elt count is no longer a multiple " "of number of elements per operation?") ? void (0) : __assert_fail ("CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 && \"After halving sizes, the vector elt count is no longer a multiple \" \"of number of elements per operation?\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4144, __extension__ __PRETTY_FUNCTION__)); | |||
4145 | auto *CoalescedVecTy = | |||
4146 | CurrNumEltPerOp == 1 | |||
4147 | ? CurrVecTy | |||
4148 | : FixedVectorType::get( | |||
4149 | IntegerType::get(Src->getContext(), | |||
4150 | EltTyBits * CurrNumEltPerOp), | |||
4151 | CurrVecTy->getNumElements() / CurrNumEltPerOp); | |||
4152 | assert(DL.getTypeSizeInBits(CoalescedVecTy) ==(static_cast <bool> (DL.getTypeSizeInBits(CoalescedVecTy ) == DL.getTypeSizeInBits(CurrVecTy) && "coalesciing elements doesn't change vector width." ) ? void (0) : __assert_fail ("DL.getTypeSizeInBits(CoalescedVecTy) == DL.getTypeSizeInBits(CurrVecTy) && \"coalesciing elements doesn't change vector width.\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4154, __extension__ __PRETTY_FUNCTION__)) | |||
4153 | DL.getTypeSizeInBits(CurrVecTy) &&(static_cast <bool> (DL.getTypeSizeInBits(CoalescedVecTy ) == DL.getTypeSizeInBits(CurrVecTy) && "coalesciing elements doesn't change vector width." ) ? void (0) : __assert_fail ("DL.getTypeSizeInBits(CoalescedVecTy) == DL.getTypeSizeInBits(CurrVecTy) && \"coalesciing elements doesn't change vector width.\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4154, __extension__ __PRETTY_FUNCTION__)) | |||
4154 | "coalesciing elements doesn't change vector width.")(static_cast <bool> (DL.getTypeSizeInBits(CoalescedVecTy ) == DL.getTypeSizeInBits(CurrVecTy) && "coalesciing elements doesn't change vector width." ) ? void (0) : __assert_fail ("DL.getTypeSizeInBits(CoalescedVecTy) == DL.getTypeSizeInBits(CurrVecTy) && \"coalesciing elements doesn't change vector width.\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4154, __extension__ __PRETTY_FUNCTION__)); | |||
4155 | ||||
4156 | while (NumEltRemaining > 0) { | |||
4157 | assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumtion?")(static_cast <bool> (SubVecEltsLeft >= 0 && "Subreg element count overconsumtion?" ) ? void (0) : __assert_fail ("SubVecEltsLeft >= 0 && \"Subreg element count overconsumtion?\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4157, __extension__ __PRETTY_FUNCTION__)); | |||
4158 | ||||
4159 | // Can we use this vector size, as per the remaining element count? | |||
4160 | // Iff the vector is naturally aligned, we can do a wide load regardless. | |||
4161 | if (NumEltRemaining < CurrNumEltPerOp && | |||
4162 | (!IsLoad || Alignment.valueOrOne() < CurrOpSizeBytes) && | |||
4163 | CurrOpSizeBytes != 1) | |||
4164 | break; // Try smalled vector size. | |||
4165 | ||||
4166 | bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0; | |||
4167 | ||||
4168 | // If we have fully processed the previous reg, we need to replenish it. | |||
4169 | if (SubVecEltsLeft == 0) { | |||
4170 | SubVecEltsLeft += CurrVecTy->getNumElements(); | |||
4171 | // And that's free only for the 0'th subvector of a legalized vector. | |||
4172 | if (!Is0thSubVec) | |||
4173 | Cost += getShuffleCost(IsLoad ? TTI::ShuffleKind::SK_InsertSubvector | |||
4174 | : TTI::ShuffleKind::SK_ExtractSubvector, | |||
4175 | VTy, None, NumEltDone(), CurrVecTy); | |||
4176 | } | |||
4177 | ||||
4178 | // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM, | |||
4179 | // for smaller widths (32/16/8) we have to insert/extract them separately. | |||
4180 | // Again, it's free for the 0'th subreg (if op is 32/64 bit wide, | |||
4181 | // but let's pretend that it is also true for 16/8 bit wide ops...) | |||
4182 | if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) { | |||
4183 | int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM; | |||
4184 | assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "")(static_cast <bool> (NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "") ? void (0) : __assert_fail ("NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && \"\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4184, __extension__ __PRETTY_FUNCTION__)); | |||
4185 | int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp; | |||
4186 | APInt DemandedElts = | |||
4187 | APInt::getBitsSet(CoalescedVecTy->getNumElements(), | |||
4188 | CoalescedVecEltIdx, CoalescedVecEltIdx + 1); | |||
4189 | assert(DemandedElts.countPopulation() == 1 && "Inserting single value")(static_cast <bool> (DemandedElts.countPopulation() == 1 && "Inserting single value") ? void (0) : __assert_fail ("DemandedElts.countPopulation() == 1 && \"Inserting single value\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4189, __extension__ __PRETTY_FUNCTION__)); | |||
4190 | Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad, | |||
4191 | !IsLoad); | |||
4192 | } | |||
4193 | ||||
4194 | // This isn't exactly right. We're using slow unaligned 32-byte accesses | |||
4195 | // as a proxy for a double-pumped AVX memory interface such as on | |||
4196 | // Sandybridge. | |||
4197 | if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow()) | |||
4198 | Cost += 2; | |||
4199 | else | |||
4200 | Cost += 1; | |||
4201 | ||||
4202 | SubVecEltsLeft -= CurrNumEltPerOp; | |||
4203 | NumEltRemaining -= CurrNumEltPerOp; | |||
4204 | Alignment = commonAlignment(Alignment.valueOrOne(), CurrOpSizeBytes); | |||
4205 | } | |||
4206 | } | |||
4207 | ||||
4208 | assert(NumEltRemaining <= 0 && "Should have processed all the elements.")(static_cast <bool> (NumEltRemaining <= 0 && "Should have processed all the elements.") ? void (0) : __assert_fail ("NumEltRemaining <= 0 && \"Should have processed all the elements.\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4208, __extension__ __PRETTY_FUNCTION__)); | |||
4209 | ||||
4210 | return Cost; | |||
4211 | } | |||
4212 | ||||
/// Compute the cost of a masked vector load/store of \p SrcTy.
///
/// Scalars fall back to the ordinary (unmasked) memory-op cost. Vector types
/// the target cannot mask natively are costed as full scalarization
/// (per-lane mask test + branch + scalar memop). Legal masked ops cost one
/// maskmov per legalized register (cheap on AVX-512, more expensive before).
InstructionCost
X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment,
                                  unsigned AddressSpace,
                                  TTI::TargetCostKind CostKind) {
  bool IsLoad = (Instruction::Load == Opcode);
  bool IsStore = (Instruction::Store == Opcode);

  auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy);
  if (!SrcVTy)
    // To calculate scalar take the regular cost, without mask
    return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind);

  unsigned NumElem = SrcVTy->getNumElements();
  // Model the mask as one i8 per lane of the data vector.
  auto *MaskTy =
      FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
  if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) ||
      (IsStore && !isLegalMaskedStore(SrcVTy, Alignment))) {
    // Scalarization: extract every mask lane, compare + branch on it, and do
    // one scalar memop per lane, plus the overhead of splitting/rebuilding
    // the data vector itself.
    APInt DemandedElts = APInt::getAllOnes(NumElem);
    InstructionCost MaskSplitCost =
        getScalarizationOverhead(MaskTy, DemandedElts, false, true);
    InstructionCost ScalarCompareCost = getCmpSelInstrCost(
        Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
        CmpInst::BAD_ICMP_PREDICATE, CostKind);
    InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
    InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
    InstructionCost ValueSplitCost =
        getScalarizationOverhead(SrcVTy, DemandedElts, IsLoad, IsStore);
    InstructionCost MemopCost =
        NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
                                         Alignment, AddressSpace, CostKind);
    return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
  }

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcVTy);
  auto VT = TLI->getValueType(DL, SrcVTy);
  InstructionCost Cost = 0;
  // Same lane count but a different legal MVT means the type was promoted.
  if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
      LT.second.getVectorNumElements() == NumElem)
    // Promotion requires extend/truncate for data and a shuffle for mask.
    Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, None, 0, nullptr) +
            getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, None, 0, nullptr);

  else if (LT.first * LT.second.getVectorNumElements() > NumElem) {
    // The legalized form has more lanes than the source: the mask must be
    // widened to match.
    auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
                                           LT.second.getVectorNumElements());
    // Expanding requires fill mask with zeroes
    Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, None, 0, MaskTy);
  }

  // Pre-AVX512 - each maskmov load costs 2 + store costs ~8.
  if (!ST->hasAVX512())
    return Cost + LT.first * (IsLoad ? 2 : 8);

  // AVX-512 masked load/store is cheaper
  return Cost + LT.first;
}
4271 | ||||
4272 | InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty, | |||
4273 | ScalarEvolution *SE, | |||
4274 | const SCEV *Ptr) { | |||
4275 | // Address computations in vectorized code with non-consecutive addresses will | |||
4276 | // likely result in more instructions compared to scalar code where the | |||
4277 | // computation can more often be merged into the index mode. The resulting | |||
4278 | // extra micro-ops can significantly decrease throughput. | |||
4279 | const unsigned NumVectorInstToHideOverhead = 10; | |||
4280 | ||||
4281 | // Cost modeling of Strided Access Computation is hidden by the indexing | |||
4282 | // modes of X86 regardless of the stride value. We dont believe that there | |||
4283 | // is a difference between constant strided access in gerenal and constant | |||
4284 | // strided value which is less than or equal to 64. | |||
4285 | // Even in the case of (loop invariant) stride whose value is not known at | |||
4286 | // compile time, the address computation will not incur more than one extra | |||
4287 | // ADD instruction. | |||
4288 | if (Ty->isVectorTy() && SE && !ST->hasAVX2()) { | |||
4289 | // TODO: AVX2 is the current cut-off because we don't have correct | |||
4290 | // interleaving costs for prior ISA's. | |||
4291 | if (!BaseT::isStridedAccess(Ptr)) | |||
4292 | return NumVectorInstToHideOverhead; | |||
4293 | if (!BaseT::getConstantStrideStep(SE, Ptr)) | |||
4294 | return 1; | |||
4295 | } | |||
4296 | ||||
4297 | return BaseT::getAddressComputationCost(Ty, SE, Ptr); | |||
4298 | } | |||
4299 | ||||
/// Cost of an unordered arithmetic reduction of \p ValTy down to a scalar.
///
/// Ordered (strict FP) reductions are delegated to the base implementation.
/// Otherwise the cost comes from measured per-CPU tables where available
/// (SLM / SSE2 / AVX1, plus dedicated tables for i1 allof/anyof patterns),
/// falling back to an explicit shuffle+op halving ladder for power-of-two
/// element counts.
InstructionCost
X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
                                       Optional<FastMathFlags> FMF,
                                       TTI::TargetCostKind CostKind) {
  if (TTI::requiresOrderedReduction(FMF))
    return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);

  // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
  // and make it as the cost.

  static const CostTblEntry SLMCostTblNoPairWise[] = {
    { ISD::FADD,  MVT::v2f64,   3 },
    { ISD::ADD,   MVT::v2i64,   5 },
  };

  static const CostTblEntry SSE2CostTblNoPairWise[] = {
    { ISD::FADD,  MVT::v2f64,   2 },
    { ISD::FADD,  MVT::v2f32,   2 },
    { ISD::FADD,  MVT::v4f32,   4 },
    { ISD::ADD,   MVT::v2i64,   2 },      // The data reported by the IACA tool is "1.6".
    { ISD::ADD,   MVT::v2i32,   2 }, // FIXME: chosen to be less than v4i32
    { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "3.3".
    { ISD::ADD,   MVT::v2i16,   2 },      // The data reported by the IACA tool is "4.3".
    { ISD::ADD,   MVT::v4i16,   3 },      // The data reported by the IACA tool is "4.3".
    { ISD::ADD,   MVT::v8i16,   4 },      // The data reported by the IACA tool is "4.3".
    { ISD::ADD,   MVT::v2i8,    2 },
    { ISD::ADD,   MVT::v4i8,    2 },
    { ISD::ADD,   MVT::v8i8,    2 },
    { ISD::ADD,   MVT::v16i8,   3 },
  };

  static const CostTblEntry AVX1CostTblNoPairWise[] = {
    { ISD::FADD,  MVT::v4f64,   3 },
    { ISD::FADD,  MVT::v4f32,   3 },
    { ISD::FADD,  MVT::v8f32,   4 },
    { ISD::ADD,   MVT::v2i64,   1 },      // The data reported by the IACA tool is "1.5".
    { ISD::ADD,   MVT::v4i64,   3 },
    { ISD::ADD,   MVT::v8i32,   5 },
    { ISD::ADD,   MVT::v16i16,  5 },
    { ISD::ADD,   MVT::v32i8,   4 },
  };

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // Before legalizing the type, give a chance to look up illegal narrow types
  // in the table.
  // FIXME: Is there a better way to do this?
  EVT VT = TLI->getValueType(DL, ValTy);
  if (VT.isSimple()) {
    MVT MTy = VT.getSimpleVT();
    if (ST->useSLMArithCosts())
      if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy))
        return Entry->Cost;

    if (ST->hasAVX())
      if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
        return Entry->Cost;

    if (ST->hasSSE2())
      if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
        return Entry->Cost;
  }

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);

  MVT MTy = LT.second;

  auto *ValVTy = cast<FixedVectorType>(ValTy);

  // Special case: vXi8 mul reductions are performed as vXi16.
  // Cost = zext to the wide type + the reduction at the wide type.
  if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) {
    auto *WideSclTy = IntegerType::get(ValVTy->getContext(), 16);
    auto *WideVecTy = FixedVectorType::get(WideSclTy, ValVTy->getNumElements());
    return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy,
                            TargetTransformInfo::CastContextHint::None,
                            CostKind) +
           getArithmeticReductionCost(Opcode, WideVecTy, FMF, CostKind);
  }

  InstructionCost ArithmeticCost = 0;
  if (LT.first != 1 && MTy.isVector() &&
      MTy.getVectorNumElements() < ValVTy->getNumElements()) {
    // Type needs to be split. We need LT.first - 1 arithmetic ops.
    auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
                                            MTy.getVectorNumElements());
    ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
    ArithmeticCost *= LT.first - 1;
  }

  // Retry the tables with the legalized type (plus the split cost above).
  if (ST->useSLMArithCosts())
    if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy))
      return ArithmeticCost + Entry->Cost;

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
      return ArithmeticCost + Entry->Cost;

  if (ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
      return ArithmeticCost + Entry->Cost;

  // FIXME: These assume a naive kshift+binop lowering, which is probably
  // conservative in most cases.
  static const CostTblEntry AVX512BoolReduction[] = {
    { ISD::AND,  MVT::v2i1,   3 },
    { ISD::AND,  MVT::v4i1,   5 },
    { ISD::AND,  MVT::v8i1,   7 },
    { ISD::AND,  MVT::v16i1,  9 },
    { ISD::AND,  MVT::v32i1, 11 },
    { ISD::AND,  MVT::v64i1, 13 },
    { ISD::OR,   MVT::v2i1,   3 },
    { ISD::OR,   MVT::v4i1,   5 },
    { ISD::OR,   MVT::v8i1,   7 },
    { ISD::OR,   MVT::v16i1,  9 },
    { ISD::OR,   MVT::v32i1, 11 },
    { ISD::OR,   MVT::v64i1, 13 },
  };

  static const CostTblEntry AVX2BoolReduction[] = {
    { ISD::AND,  MVT::v16i16,  2 }, // vpmovmskb + cmp
    { ISD::AND,  MVT::v32i8,   2 }, // vpmovmskb + cmp
    { ISD::OR,   MVT::v16i16,  2 }, // vpmovmskb + cmp
    { ISD::OR,   MVT::v32i8,   2 }, // vpmovmskb + cmp
  };

  static const CostTblEntry AVX1BoolReduction[] = {
    { ISD::AND,  MVT::v4i64,   2 }, // vmovmskpd + cmp
    { ISD::AND,  MVT::v8i32,   2 }, // vmovmskps + cmp
    { ISD::AND,  MVT::v16i16,  4 }, // vextractf128 + vpand + vpmovmskb + cmp
    { ISD::AND,  MVT::v32i8,   4 }, // vextractf128 + vpand + vpmovmskb + cmp
    { ISD::OR,   MVT::v4i64,   2 }, // vmovmskpd + cmp
    { ISD::OR,   MVT::v8i32,   2 }, // vmovmskps + cmp
    { ISD::OR,   MVT::v16i16,  4 }, // vextractf128 + vpor + vpmovmskb + cmp
    { ISD::OR,   MVT::v32i8,   4 }, // vextractf128 + vpor + vpmovmskb + cmp
  };

  static const CostTblEntry SSE2BoolReduction[] = {
    { ISD::AND,  MVT::v2i64,   2 }, // movmskpd + cmp
    { ISD::AND,  MVT::v4i32,   2 }, // movmskps + cmp
    { ISD::AND,  MVT::v8i16,   2 }, // pmovmskb + cmp
    { ISD::AND,  MVT::v16i8,   2 }, // pmovmskb + cmp
    { ISD::OR,   MVT::v2i64,   2 }, // movmskpd + cmp
    { ISD::OR,   MVT::v4i32,   2 }, // movmskps + cmp
    { ISD::OR,   MVT::v8i16,   2 }, // pmovmskb + cmp
    { ISD::OR,   MVT::v16i8,   2 }, // pmovmskb + cmp
  };

  // Handle bool allof/anyof patterns.
  if (ValVTy->getElementType()->isIntegerTy(1)) {
    // Note: shadows the outer ArithmeticCost; recomputed for the i1 case.
    InstructionCost ArithmeticCost = 0;
    if (LT.first != 1 && MTy.isVector() &&
        MTy.getVectorNumElements() < ValVTy->getNumElements()) {
      // Type needs to be split. We need LT.first - 1 arithmetic ops.
      auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
                                              MTy.getVectorNumElements());
      ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
      ArithmeticCost *= LT.first - 1;
    }

    if (ST->hasAVX512())
      if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy))
        return ArithmeticCost + Entry->Cost;
    if (ST->hasAVX2())
      if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
        return ArithmeticCost + Entry->Cost;
    if (ST->hasAVX())
      if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
        return ArithmeticCost + Entry->Cost;
    if (ST->hasSSE2())
      if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
        return ArithmeticCost + Entry->Cost;

    return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
  }

  unsigned NumVecElts = ValVTy->getNumElements();
  unsigned ScalarSize = ValVTy->getScalarSizeInBits();

  // Special case power of 2 reductions where the scalar type isn't changed
  // by type legalization.
  if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
    return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);

  InstructionCost ReductionCost = 0;

  auto *Ty = ValVTy;
  if (LT.first != 1 && MTy.isVector() &&
      MTy.getVectorNumElements() < ValVTy->getNumElements()) {
    // Type needs to be split. We need LT.first - 1 arithmetic ops.
    Ty = FixedVectorType::get(ValVTy->getElementType(),
                              MTy.getVectorNumElements());
    ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
    ReductionCost *= LT.first - 1;
    NumVecElts = MTy.getVectorNumElements();
  }

  // Now handle reduction with the legal type, taking into account size changes
  // at each level. Each iteration halves the element count: shuffle (or
  // extract/shift) the upper half down, then one vector op combines halves.
  while (NumVecElts > 1) {
    // Determine the size of the remaining vector we need to reduce.
    unsigned Size = NumVecElts * ScalarSize;
    NumVecElts /= 2;
    // If we're reducing from 256/512 bits, use an extract_subvector.
    if (Size > 128) {
      auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
      ReductionCost +=
          getShuffleCost(TTI::SK_ExtractSubvector, Ty, None, NumVecElts, SubTy);
      Ty = SubTy;
    } else if (Size == 128) {
      // Reducing from 128 bits is a permute of v2f64/v2i64.
      // NOTE(review): ValVTy is a vector type, so isFloatingPointTy() is
      // always false here and the integer shuffle type is always chosen —
      // presumably the element type was intended; verify upstream.
      FixedVectorType *ShufTy;
      if (ValVTy->isFloatingPointTy())
        ShufTy =
            FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2);
      else
        ShufTy =
            FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
      ReductionCost +=
          getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr);
    } else if (Size == 64) {
      // Reducing from 64 bits is a shuffle of v4f32/v4i32.
      // NOTE(review): same isFloatingPointTy() caveat as above.
      FixedVectorType *ShufTy;
      if (ValVTy->isFloatingPointTy())
        ShufTy =
            FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4);
      else
        ShufTy =
            FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
      ReductionCost +=
          getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr);
    } else {
      // Reducing from smaller size is a shift by immediate.
      auto *ShiftTy = FixedVectorType::get(
          Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size);
      ReductionCost += getArithmeticInstrCost(
          Instruction::LShr, ShiftTy, CostKind,
          TargetTransformInfo::OK_AnyValue,
          TargetTransformInfo::OK_UniformConstantValue,
          TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
    }

    // Add the arithmetic op for this level.
    ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind);
  }

  // Add the final extract element to the cost.
  return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
}
4549 | ||||
/// Cost of a single vector min/max operation on \p Ty.
///
/// The MIN forms (UMIN/SMIN/FMINNUM) stand in for both min and max — the
/// corresponding max instructions have identical cost on x86. Costs come
/// from per-ISA tables of native min/max instructions; when no native form
/// exists the op is modeled as cmp + select.
InstructionCost X86TTIImpl::getMinMaxCost(Type *Ty, Type *CondTy,
                                          bool IsUnsigned) {
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

  MVT MTy = LT.second;

  int ISD;
  if (Ty->isIntOrIntVectorTy()) {
    ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN;
  } else {
    assert(Ty->isFPOrFPVectorTy() &&
           "Expected float point or integer vector type.");
    ISD = ISD::FMINNUM;
  }

  static const CostTblEntry SSE1CostTbl[] = {
    {ISD::FMINNUM, MVT::v4f32, 1},
  };

  static const CostTblEntry SSE2CostTbl[] = {
    {ISD::FMINNUM, MVT::v2f64, 1},
    {ISD::SMIN,    MVT::v8i16, 1},
    {ISD::UMIN,    MVT::v16i8, 1},
  };

  static const CostTblEntry SSE41CostTbl[] = {
    {ISD::SMIN,    MVT::v4i32, 1},
    {ISD::UMIN,    MVT::v4i32, 1},
    {ISD::UMIN,    MVT::v8i16, 1},
    {ISD::SMIN,    MVT::v16i8, 1},
  };

  static const CostTblEntry SSE42CostTbl[] = {
    {ISD::UMIN,    MVT::v2i64, 3}, // xor+pcmpgtq+blendvpd
  };

  static const CostTblEntry AVX1CostTbl[] = {
    {ISD::FMINNUM, MVT::v8f32,  1},
    {ISD::FMINNUM, MVT::v4f64,  1},
    {ISD::SMIN,    MVT::v8i32,  3},
    {ISD::UMIN,    MVT::v8i32,  3},
    {ISD::SMIN,    MVT::v16i16, 3},
    {ISD::UMIN,    MVT::v16i16, 3},
    {ISD::SMIN,    MVT::v32i8,  3},
    {ISD::UMIN,    MVT::v32i8,  3},
  };

  static const CostTblEntry AVX2CostTbl[] = {
    {ISD::SMIN,    MVT::v8i32,  1},
    {ISD::UMIN,    MVT::v8i32,  1},
    {ISD::SMIN,    MVT::v16i16, 1},
    {ISD::UMIN,    MVT::v16i16, 1},
    {ISD::SMIN,    MVT::v32i8,  1},
    {ISD::UMIN,    MVT::v32i8,  1},
  };

  static const CostTblEntry AVX512CostTbl[] = {
    {ISD::FMINNUM, MVT::v16f32, 1},
    {ISD::FMINNUM, MVT::v8f64,  1},
    {ISD::SMIN,    MVT::v2i64,  1},
    {ISD::UMIN,    MVT::v2i64,  1},
    {ISD::SMIN,    MVT::v4i64,  1},
    {ISD::UMIN,    MVT::v4i64,  1},
    {ISD::SMIN,    MVT::v8i64,  1},
    {ISD::UMIN,    MVT::v8i64,  1},
    {ISD::SMIN,    MVT::v16i32, 1},
    {ISD::UMIN,    MVT::v16i32, 1},
  };

  static const CostTblEntry AVX512BWCostTbl[] = {
    {ISD::SMIN,    MVT::v32i16, 1},
    {ISD::UMIN,    MVT::v32i16, 1},
    {ISD::SMIN,    MVT::v64i8,  1},
    {ISD::UMIN,    MVT::v64i8,  1},
  };

  // If we have a native MIN/MAX instruction for this type, use it.
  // Tables are probed newest-ISA-first so the cheapest available form wins.
  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasSSE42())
    if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasSSE41())
    if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasSSE1())
    if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  unsigned CmpOpcode;
  if (Ty->isFPOrFPVectorTy()) {
    CmpOpcode = Instruction::FCmp;
  } else {
    assert(Ty->isIntOrIntVectorTy() &&
           "expecting floating point or integer type for min/max reduction");
    CmpOpcode = Instruction::ICmp;
  }

  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  // Otherwise fall back to cmp+select.
  InstructionCost Result =
      getCmpSelInstrCost(CmpOpcode, Ty, CondTy, CmpInst::BAD_ICMP_PREDICATE,
                         CostKind) +
      getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
                         CmpInst::BAD_ICMP_PREDICATE, CostKind);
  return Result;
}
4677 | ||||
InstructionCost
X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy,
                                   bool IsUnsigned,
                                   TTI::TargetCostKind CostKind) {
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);

  MVT MTy = LT.second;

  // Pick the ISD opcode used to key the cost tables below. Only the MIN
  // forms are listed; MAX reductions are assumed to cost the same.
  int ISD;
  if (ValTy->isIntOrIntVectorTy()) {
    ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN;

  } else {
    assert(ValTy->isFPOrFPVectorTy() &&
           "Expected float point or integer vector type.");
    ISD = ISD::FMINNUM;
  }

  // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
  // and make it as the cost.

  static const CostTblEntry SSE2CostTblNoPairWise[] = {
      {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw
      {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw
      {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw
  };

  static const CostTblEntry SSE41CostTblNoPairWise[] = {
      {ISD::SMIN, MVT::v2i16, 3}, // same as sse2
      {ISD::SMIN, MVT::v4i16, 5}, // same as sse2
      {ISD::UMIN, MVT::v2i16, 5}, // same as sse2
      {ISD::UMIN, MVT::v4i16, 7}, // same as sse2
      {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor
      {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax
      {ISD::SMIN, MVT::v2i8, 3},  // pminsb
      {ISD::SMIN, MVT::v4i8, 5},  // pminsb
      {ISD::SMIN, MVT::v8i8, 7},  // pminsb
      {ISD::SMIN, MVT::v16i8, 6},
      {ISD::UMIN, MVT::v2i8, 3},  // same as sse2
      {ISD::UMIN, MVT::v4i8, 5},  // same as sse2
      {ISD::UMIN, MVT::v8i8, 7},  // same as sse2
      {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax
  };

  static const CostTblEntry AVX1CostTblNoPairWise[] = {
      {ISD::SMIN, MVT::v16i16, 6},
      {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax
      {ISD::SMIN, MVT::v32i8, 8},
      {ISD::UMIN, MVT::v32i8, 8},
  };

  static const CostTblEntry AVX512BWCostTblNoPairWise[] = {
      {ISD::SMIN, MVT::v32i16, 8},
      {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax
      {ISD::SMIN, MVT::v64i8, 10},
      {ISD::UMIN, MVT::v64i8, 10},
  };

  // Before legalizing the type, give a chance to look up illegal narrow types
  // in the table.
  // FIXME: Is there a better way to do this?
  EVT VT = TLI->getValueType(DL, ValTy);
  if (VT.isSimple()) {
    // Deliberately shadows the legalized MTy above: here the tables are
    // queried with the *original* (possibly illegal) type.
    MVT MTy = VT.getSimpleVT();
    if (ST->hasBWI())
      if (const auto *Entry = CostTableLookup(AVX512BWCostTblNoPairWise, ISD, MTy))
        return Entry->Cost;

    if (ST->hasAVX())
      if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
        return Entry->Cost;

    if (ST->hasSSE41())
      if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy))
        return Entry->Cost;

    if (ST->hasSSE2())
      if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
        return Entry->Cost;
  }

  auto *ValVTy = cast<FixedVectorType>(ValTy);
  unsigned NumVecElts = ValVTy->getNumElements();

  auto *Ty = ValVTy;
  InstructionCost MinMaxCost = 0;
  if (LT.first != 1 && MTy.isVector() &&
      MTy.getVectorNumElements() < ValVTy->getNumElements()) {
    // Type needs to be split. We need LT.first - 1 operations ops.
    Ty = FixedVectorType::get(ValVTy->getElementType(),
                              MTy.getVectorNumElements());
    auto *SubCondTy = FixedVectorType::get(CondTy->getElementType(),
                                           MTy.getVectorNumElements());
    MinMaxCost = getMinMaxCost(Ty, SubCondTy, IsUnsigned);
    MinMaxCost *= LT.first - 1;
    NumVecElts = MTy.getVectorNumElements();
  }

  // Retry the table lookups, now with the legalized type (any split cost
  // accumulated above is added on top of the table entry).
  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTblNoPairWise, ISD, MTy))
      return MinMaxCost + Entry->Cost;

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
      return MinMaxCost + Entry->Cost;

  if (ST->hasSSE41())
    if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy))
      return MinMaxCost + Entry->Cost;

  if (ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
      return MinMaxCost + Entry->Cost;

  unsigned ScalarSize = ValTy->getScalarSizeInBits();

  // Special case power of 2 reductions where the scalar type isn't changed
  // by type legalization.
  if (!isPowerOf2_32(ValVTy->getNumElements()) ||
      ScalarSize != MTy.getScalarSizeInBits())
    return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsUnsigned, CostKind);

  // Now handle reduction with the legal type, taking into account size changes
  // at each level. Each iteration halves the vector and charges a shuffle (or
  // shift) plus one min/max on the narrower vector.
  while (NumVecElts > 1) {
    // Determine the size of the remaining vector we need to reduce.
    unsigned Size = NumVecElts * ScalarSize;
    NumVecElts /= 2;
    // If we're reducing from 256/512 bits, use an extract_subvector.
    if (Size > 128) {
      auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
      MinMaxCost +=
          getShuffleCost(TTI::SK_ExtractSubvector, Ty, None, NumVecElts, SubTy);
      Ty = SubTy;
    } else if (Size == 128) {
      // Reducing from 128 bits is a permute of v2f64/v2i64.
      VectorType *ShufTy;
      if (ValTy->isFloatingPointTy())
        ShufTy =
            FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2);
      else
        ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
      MinMaxCost +=
          getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr);
    } else if (Size == 64) {
      // Reducing from 64 bits is a shuffle of v4f32/v4i32.
      FixedVectorType *ShufTy;
      if (ValTy->isFloatingPointTy())
        ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4);
      else
        ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
      MinMaxCost +=
          getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr);
    } else {
      // Reducing from smaller size is a shift by immediate.
      // NOTE(review): 128 / Size requires Size != 0. This appears guaranteed
      // here because ScalarSize was checked above to equal the legal type's
      // scalar size (non-zero) - confirm no zero-sized scalar can reach this.
      auto *ShiftTy = FixedVectorType::get(
          Type::getIntNTy(ValTy->getContext(), Size), 128 / Size);
      MinMaxCost += getArithmeticInstrCost(
          Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput,
          TargetTransformInfo::OK_AnyValue,
          TargetTransformInfo::OK_UniformConstantValue,
          TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
    }

    // Add the arithmetic op for this level.
    auto *SubCondTy =
        FixedVectorType::get(CondTy->getElementType(), Ty->getNumElements());
    MinMaxCost += getMinMaxCost(Ty, SubCondTy, IsUnsigned);
  }

  // Add the final extract element to the cost.
  return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
}
4850 | ||||
4851 | /// Calculate the cost of materializing a 64-bit value. This helper | |||
4852 | /// method might only calculate a fraction of a larger immediate. Therefore it | |||
4853 | /// is valid to return a cost of ZERO. | |||
4854 | InstructionCost X86TTIImpl::getIntImmCost(int64_t Val) { | |||
4855 | if (Val == 0) | |||
4856 | return TTI::TCC_Free; | |||
4857 | ||||
4858 | if (isInt<32>(Val)) | |||
4859 | return TTI::TCC_Basic; | |||
4860 | ||||
4861 | return 2 * TTI::TCC_Basic; | |||
4862 | } | |||
4863 | ||||
InstructionCost X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                          TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // A zero-width constant has no meaningful cost; return the maximum value so
  // it is never considered profitable to hoist.
  if (BitSize == 0)
    return ~0U;

  // Never hoist constants larger than 128bit, because this might lead to
  // incorrect code generation or assertions in codegen.
  // Fixme: Create a cost model for types larger than i128 once the codegen
  // issues have been fixed.
  if (BitSize > 128)
    return TTI::TCC_Free;

  // Zero is always encodable for free (e.g. xor reg,reg).
  if (Imm == 0)
    return TTI::TCC_Free;

  // Sign-extend all constants to a multiple of 64-bit.
  APInt ImmVal = Imm;
  if (BitSize % 64 != 0)
    ImmVal = Imm.sext(alignTo(BitSize, 64));

  // Split the constant into 64-bit chunks and calculate the cost for each
  // chunk (the int64_t overload charges 0/1/2 units per chunk).
  InstructionCost Cost = 0;
  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
    APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
    int64_t Val = Tmp.getSExtValue();
    Cost += getIntImmCost(Val);
  }
  // We need at least one instruction to materialize the constant.
  return std::max<InstructionCost>(1, Cost);
}
4898 | ||||
InstructionCost X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                              const APInt &Imm, Type *Ty,
                                              TTI::TargetCostKind CostKind,
                                              Instruction *Inst) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  // ImmIdx records which operand position of this opcode can encode an
  // immediate directly into the x86 instruction; such operands may be free.
  unsigned ImmIdx = ~0U;
  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr. This prevents the
    // creation of new constants for every base constant that gets constant
    // folded with the offset.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::Store:
    ImmIdx = 0;
    break;
  case Instruction::ICmp:
    // This is an imperfect hack to prevent constant hoisting of
    // compares that might be trying to check if a 64-bit value fits in
    // 32-bits. The backend can optimize these cases using a right shift by 32.
    // Ideally we would check the compare predicate here. There also other
    // similar immediates the backend can use shifts for.
    if (Idx == 1 && Imm.getBitWidth() == 64) {
      uint64_t ImmVal = Imm.getZExtValue();
      if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
        return TTI::TCC_Free;
    }
    ImmIdx = 1;
    break;
  case Instruction::And:
    // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
    // by using a 32-bit operation with implicit zero extension. Detect such
    // immediates here as the normal path expects bit 31 to be sign extended.
    if (Idx == 1 && Imm.getBitWidth() == 64 && isUInt<32>(Imm.getZExtValue()))
      return TTI::TCC_Free;
    ImmIdx = 1;
    break;
  case Instruction::Add:
  case Instruction::Sub:
    // For add/sub, we can use the opposite instruction for INT32_MIN.
    if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.getZExtValue() == 0x80000000)
      return TTI::TCC_Free;
    ImmIdx = 1;
    break;
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
    // Division by constant is typically expanded later into a different
    // instruction sequence. This completely changes the constants.
    // Report them as "free" to stop ConstantHoist from marking them as opaque.
    return TTI::TCC_Free;
  case Instruction::Mul:
  case Instruction::Or:
  case Instruction::Xor:
    ImmIdx = 1;
    break;
  // Always return TCC_Free for the shift value of a shift instruction.
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    if (Idx == 1)
      return TTI::TCC_Free;
    break;
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    break;
  }

  if (Idx == ImmIdx) {
    // The immediate sits in an encodable operand slot: report it free when it
    // costs no more than materializing each 64-bit chunk would anyway.
    int NumConstants = divideCeil(BitSize, 64);
    InstructionCost Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
    return (Cost <= NumConstants * TTI::TCC_Basic)
               ? static_cast<int>(TTI::TCC_Free)
               : Cost;
  }

  // Otherwise the immediate must be materialized into a register.
  return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
4997 | ||||
InstructionCost X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                                const APInt &Imm, Type *Ty,
                                                TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  switch (IID) {
  default:
    return TTI::TCC_Free;
  // The arith-with-overflow intrinsics can encode a 32-bit immediate in the
  // second operand, so such operands are free to leave inline.
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
      return TTI::TCC_Free;
    break;
  // Stackmap/patchpoint leading meta operands, and any immediate that fits in
  // 64 bits, never need to be materialized separately.
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  // Fall back to the generic materialization cost for this constant.
  return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
5033 | ||||
5034 | InstructionCost X86TTIImpl::getCFInstrCost(unsigned Opcode, | |||
5035 | TTI::TargetCostKind CostKind, | |||
5036 | const Instruction *I) { | |||
5037 | if (CostKind != TTI::TCK_RecipThroughput) | |||
5038 | return Opcode == Instruction::PHI ? 0 : 1; | |||
5039 | // Branches are assumed to be predicted. | |||
5040 | return 0; | |||
5041 | } | |||
5042 | ||||
5043 | int X86TTIImpl::getGatherOverhead() const { | |||
5044 | // Some CPUs have more overhead for gather. The specified overhead is relative | |||
5045 | // to the Load operation. "2" is the number provided by Intel architects. This | |||
5046 | // parameter is used for cost estimation of Gather Op and comparison with | |||
5047 | // other alternatives. | |||
5048 | // TODO: Remove the explicit hasAVX512()?, That would mean we would only | |||
5049 | // enable gather with a -march. | |||
5050 | if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather())) | |||
5051 | return 2; | |||
5052 | ||||
5053 | return 1024; | |||
5054 | } | |||
5055 | ||||
5056 | int X86TTIImpl::getScatterOverhead() const { | |||
5057 | if (ST->hasAVX512()) | |||
5058 | return 2; | |||
5059 | ||||
5060 | return 1024; | |||
5061 | } | |||
5062 | ||||
// Return an average cost of Gather / Scatter instruction, maybe improved later.
//
// Opcode       - Load (gather) or Store (scatter).
// SrcVTy       - The data vector type being gathered or scattered.
// Ptr          - The pointer operand; inspected to see whether 32-bit indices
//                can be used instead of 64-bit ones.
// Alignment    - Per-element alignment.
// AddressSpace - Pointer address space.
//
// FIXME: Add TargetCostKind support.
InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy,
                                            const Value *Ptr, Align Alignment,
                                            unsigned AddressSpace) {

  assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
  unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();

  // Try to reduce index size from 64 bit (default for GEP)
  // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
  // operation will use 16 x 64 indices which do not fit in a zmm and needs
  // to split. Also check that the base pointer is the same for all lanes,
  // and that there's at most one variable index.
  auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) {
    unsigned IndexSize = DL.getPointerSizeInBits();
    const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
    if (IndexSize < 64 || !GEP)
      return IndexSize;

    unsigned NumOfVarIndices = 0;
    const Value *Ptrs = GEP->getPointerOperand();
    // A vector of distinct (non-splat) base pointers cannot use a narrower
    // index.
    if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
      return IndexSize;
    for (unsigned i = 1; i < GEP->getNumOperands(); ++i) {
      if (isa<Constant>(GEP->getOperand(i)))
        continue;
      Type *IndxTy = GEP->getOperand(i)->getType();
      if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy))
        IndxTy = IndexVTy->getElementType();
      // A 64-bit index that is not a sign-extension of something narrower,
      // or more than one variable index, forces the full 64-bit width.
      if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
           !isa<SExtInst>(GEP->getOperand(i))) ||
          ++NumOfVarIndices > 1)
        return IndexSize; // 64
    }
    return (unsigned)32;
  };

  // Trying to reduce IndexSize to 32 bits for vector 16.
  // By default the IndexSize is equal to pointer size.
  unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
                           ? getIndexSizeInBits(Ptr, DL)
                           : DL.getPointerSizeInBits();

  auto *IndexVTy = FixedVectorType::get(
      IntegerType::get(SrcVTy->getContext(), IndexSize), VF);
  std::pair<InstructionCost, MVT> IdxsLT = getTypeLegalizationCost(IndexVTy);
  std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(SrcVTy);
  // If either the index vector or the data vector must be split, model the
  // operation as SplitFactor narrower gathers/scatters (recursively).
  InstructionCost::CostType SplitFactor =
      *std::max(IdxsLT.first, SrcLT.first).getValue();
  if (SplitFactor > 1) {
    // Handle splitting of vector of pointers
    auto *SplitSrcTy =
        FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
    return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment,
                                         AddressSpace);
  }

  // The gather / scatter cost is given by Intel architects. It is a rough
  // number since we are looking at one instruction in a time.
  const int GSOverhead = (Opcode == Instruction::Load)
                             ? getGatherOverhead()
                             : getScatterOverhead();
  return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
                                           MaybeAlign(Alignment), AddressSpace,
                                           TTI::TCK_RecipThroughput);
}
5130 | ||||
/// Return the cost of full scalarization of gather / scatter operation.
///
/// Opcode - Load or Store instruction.
/// SrcVTy - The type of the data vector that should be gathered or scattered.
/// VariableMask - The mask is non-constant at compile time.
/// Alignment - Alignment for one element.
/// AddressSpace - pointer[s] address space.
///
/// FIXME: Add TargetCostKind support.
InstructionCost X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
                                            bool VariableMask, Align Alignment,
                                            unsigned AddressSpace) {
  Type *ScalarTy = SrcVTy->getScalarType();
  unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
  APInt DemandedElts = APInt::getAllOnes(VF);
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  // With a variable mask, every lane needs its mask bit extracted, compared,
  // and branched on before the scalar memory op.
  InstructionCost MaskUnpackCost = 0;
  if (VariableMask) {
    auto *MaskTy =
        FixedVectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
    MaskUnpackCost = getScalarizationOverhead(
        MaskTy, DemandedElts, /*Insert=*/false, /*Extract=*/true);
    InstructionCost ScalarCompareCost = getCmpSelInstrCost(
        Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()), nullptr,
        CmpInst::BAD_ICMP_PREDICATE, CostKind);
    InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
    MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
  }

  // Each lane's address must be extracted from the vector of pointers.
  InstructionCost AddressUnpackCost = getScalarizationOverhead(
      FixedVectorType::get(ScalarTy->getPointerTo(), VF), DemandedElts,
      /*Insert=*/false, /*Extract=*/true);

  // The cost of the scalar loads/stores.
  InstructionCost MemoryOpCost =
      VF * getMemoryOpCost(Opcode, ScalarTy, MaybeAlign(Alignment),
                           AddressSpace, CostKind);

  // The cost of forming the vector from loaded scalars/
  // scalarizing the vector to perform scalar stores.
  InstructionCost InsertExtractCost =
      getScalarizationOverhead(cast<FixedVectorType>(SrcVTy), DemandedElts,
                               /*Insert=*/Opcode == Instruction::Load,
                               /*Extract=*/Opcode == Instruction::Store);

  return AddressUnpackCost + MemoryOpCost + MaskUnpackCost + InsertExtractCost;
}
5179 | ||||
/// Calculate the cost of Gather / Scatter operation
InstructionCost X86TTIImpl::getGatherScatterOpCost(
    unsigned Opcode, Type *SrcVTy, const Value *Ptr, bool VariableMask,
    Align Alignment, TTI::TargetCostKind CostKind,
    const Instruction *I = nullptr) {
  // For non-throughput cost kinds, model a legal (non-forced-scalar)
  // gather/scatter as a single instruction; defer everything else to the
  // target-independent implementation.
  if (CostKind != TTI::TCK_RecipThroughput) {
    if ((Opcode == Instruction::Load &&
         isLegalMaskedGather(SrcVTy, Align(Alignment)) &&
         !forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
                                     Align(Alignment))) ||
        (Opcode == Instruction::Store &&
         isLegalMaskedScatter(SrcVTy, Align(Alignment)) &&
         !forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
                                      Align(Alignment))))
      return 1;
    return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask,
                                         Alignment, CostKind, I);
  }

  assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
  // Ptr may be a scalar pointer or a vector of pointers; extract the pointee
  // address space from either shape.
  PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
  if (!PtrTy && Ptr->getType()->isVectorTy())
    PtrTy = dyn_cast<PointerType>(
        cast<VectorType>(Ptr->getType())->getElementType());
  assert(PtrTy && "Unexpected type for Ptr argument");
  unsigned AddressSpace = PtrTy->getAddressSpace();

  // If the hardware gather/scatter is unavailable or deliberately avoided,
  // charge the fully scalarized sequence instead.
  if ((Opcode == Instruction::Load &&
       (!isLegalMaskedGather(SrcVTy, Align(Alignment)) ||
        forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
                                   Align(Alignment)))) ||
      (Opcode == Instruction::Store &&
       (!isLegalMaskedScatter(SrcVTy, Align(Alignment)) ||
        forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
                                    Align(Alignment)))))
    return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment,
                           AddressSpace);

  return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
}
5220 | ||||
5221 | bool X86TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1, | |||
5222 | const TargetTransformInfo::LSRCost &C2) { | |||
5223 | // X86 specific here are "instruction number 1st priority". | |||
5224 | return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost, | |||
5225 | C1.NumIVMuls, C1.NumBaseAdds, | |||
5226 | C1.ScaleCost, C1.ImmCost, C1.SetupCost) < | |||
5227 | std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost, | |||
5228 | C2.NumIVMuls, C2.NumBaseAdds, | |||
5229 | C2.ScaleCost, C2.ImmCost, C2.SetupCost); | |||
5230 | } | |||
5231 | ||||
5232 | bool X86TTIImpl::canMacroFuseCmp() { | |||
5233 | return ST->hasMacroFusion() || ST->hasBranchFusion(); | |||
5234 | } | |||
5235 | ||||
5236 | bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) { | |||
5237 | if (!ST->hasAVX()) | |||
5238 | return false; | |||
5239 | ||||
5240 | // The backend can't handle a single element vector. | |||
5241 | if (isa<VectorType>(DataTy) && | |||
5242 | cast<FixedVectorType>(DataTy)->getNumElements() == 1) | |||
5243 | return false; | |||
5244 | Type *ScalarTy = DataTy->getScalarType(); | |||
5245 | ||||
5246 | if (ScalarTy->isPointerTy()) | |||
5247 | return true; | |||
5248 | ||||
5249 | if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) | |||
5250 | return true; | |||
5251 | ||||
5252 | if (ScalarTy->isHalfTy() && ST->hasBWI()) | |||
5253 | return true; | |||
5254 | ||||
5255 | if (!ScalarTy->isIntegerTy()) | |||
5256 | return false; | |||
5257 | ||||
5258 | unsigned IntWidth = ScalarTy->getIntegerBitWidth(); | |||
5259 | return IntWidth == 32 || IntWidth == 64 || | |||
5260 | ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI()); | |||
5261 | } | |||
5262 | ||||
bool X86TTIImpl::isLegalMaskedStore(Type *DataType, Align Alignment) {
  // Masked-store legality mirrors masked-load legality on x86.
  return isLegalMaskedLoad(DataType, Alignment);
}
5266 | ||||
5267 | bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) { | |||
5268 | unsigned DataSize = DL.getTypeStoreSize(DataType); | |||
5269 | // The only supported nontemporal loads are for aligned vectors of 16 or 32 | |||
5270 | // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2 | |||
5271 | // (the equivalent stores only require AVX). | |||
5272 | if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32)) | |||
5273 | return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2(); | |||
5274 | ||||
5275 | return false; | |||
5276 | } | |||
5277 | ||||
5278 | bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) { | |||
5279 | unsigned DataSize = DL.getTypeStoreSize(DataType); | |||
5280 | ||||
5281 | // SSE4A supports nontemporal stores of float and double at arbitrary | |||
5282 | // alignment. | |||
5283 | if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy())) | |||
5284 | return true; | |||
5285 | ||||
5286 | // Besides the SSE4A subtarget exception above, only aligned stores are | |||
5287 | // available nontemporaly on any other subtarget. And only stores with a size | |||
5288 | // of 4..32 bytes (powers of 2, only) are permitted. | |||
5289 | if (Alignment < DataSize || DataSize < 4 || DataSize > 32 || | |||
5290 | !isPowerOf2_32(DataSize)) | |||
5291 | return false; | |||
5292 | ||||
5293 | // 32-byte vector nontemporal stores are supported by AVX (the equivalent | |||
5294 | // loads require AVX2). | |||
5295 | if (DataSize == 32) | |||
5296 | return ST->hasAVX(); | |||
5297 | if (DataSize == 16) | |||
5298 | return ST->hasSSE1(); | |||
5299 | return true; | |||
5300 | } | |||
5301 | ||||
5302 | bool X86TTIImpl::isLegalBroadcastLoad(Type *ElementTy, | |||
5303 | ElementCount NumElements) const { | |||
5304 | // movddup | |||
5305 | return ST->hasSSE3() && !NumElements.isScalable() && | |||
5306 | NumElements.getFixedValue() == 2 && | |||
5307 | ElementTy == Type::getDoubleTy(ElementTy->getContext()); | |||
5308 | } | |||
5309 | ||||
5310 | bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) { | |||
5311 | if (!isa<VectorType>(DataTy)) | |||
5312 | return false; | |||
5313 | ||||
5314 | if (!ST->hasAVX512()) | |||
5315 | return false; | |||
5316 | ||||
5317 | // The backend can't handle a single element vector. | |||
5318 | if (cast<FixedVectorType>(DataTy)->getNumElements() == 1) | |||
5319 | return false; | |||
5320 | ||||
5321 | Type *ScalarTy = cast<VectorType>(DataTy)->getElementType(); | |||
5322 | ||||
5323 | if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) | |||
5324 | return true; | |||
5325 | ||||
5326 | if (!ScalarTy->isIntegerTy()) | |||
5327 | return false; | |||
5328 | ||||
5329 | unsigned IntWidth = ScalarTy->getIntegerBitWidth(); | |||
5330 | return IntWidth == 32 || IntWidth == 64 || | |||
5331 | ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2()); | |||
5332 | } | |||
5333 | ||||
bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy) {
  // Compress-store legality mirrors expand-load legality on x86.
  return isLegalMaskedExpandLoad(DataTy);
}
5337 | ||||
5338 | bool X86TTIImpl::supportsGather() const { | |||
5339 | // Some CPUs have better gather performance than others. | |||
5340 | // TODO: Remove the explicit ST->hasAVX512()?, That would mean we would only | |||
5341 | // enable gather with a -march. | |||
5342 | return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2()); | |||
5343 | } | |||
5344 | ||||
5345 | bool X86TTIImpl::forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) { | |||
5346 | // Gather / Scatter for vector 2 is not profitable on KNL / SKX | |||
5347 | // Vector-4 of gather/scatter instruction does not exist on KNL. We can extend | |||
5348 | // it to 8 elements, but zeroing upper bits of the mask vector will add more | |||
5349 | // instructions. Right now we give the scalar cost of vector-4 for KNL. TODO: | |||
5350 | // Check, maybe the gather/scatter instruction is better in the VariableMask | |||
5351 | // case. | |||
5352 | unsigned NumElts = cast<FixedVectorType>(VTy)->getNumElements(); | |||
5353 | return NumElts == 1 || | |||
5354 | (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX()))); | |||
5355 | } | |||
5356 | ||||
5357 | bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) { | |||
5358 | if (!supportsGather()) | |||
5359 | return false; | |||
5360 | Type *ScalarTy = DataTy->getScalarType(); | |||
5361 | if (ScalarTy->isPointerTy()) | |||
5362 | return true; | |||
5363 | ||||
5364 | if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) | |||
5365 | return true; | |||
5366 | ||||
5367 | if (!ScalarTy->isIntegerTy()) | |||
5368 | return false; | |||
5369 | ||||
5370 | unsigned IntWidth = ScalarTy->getIntegerBitWidth(); | |||
5371 | return IntWidth == 32 || IntWidth == 64; | |||
5372 | } | |||
5373 | ||||
5374 | bool X86TTIImpl::isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, | |||
5375 | unsigned Opcode1, | |||
5376 | const SmallBitVector &OpcodeMask) const { | |||
5377 | // ADDSUBPS 4xf32 SSE3 | |||
5378 | // VADDSUBPS 4xf32 AVX | |||
5379 | // VADDSUBPS 8xf32 AVX2 | |||
5380 | // ADDSUBPD 2xf64 SSE3 | |||
5381 | // VADDSUBPD 2xf64 AVX | |||
5382 | // VADDSUBPD 4xf64 AVX2 | |||
5383 | ||||
5384 | unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements(); | |||
5385 | assert(OpcodeMask.size() == NumElements && "Mask and VecTy are incompatible")(static_cast <bool> (OpcodeMask.size() == NumElements && "Mask and VecTy are incompatible") ? void (0) : __assert_fail ("OpcodeMask.size() == NumElements && \"Mask and VecTy are incompatible\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 5385, __extension__ __PRETTY_FUNCTION__)); | |||
5386 | if (!isPowerOf2_32(NumElements)) | |||
5387 | return false; | |||
5388 | // Check the opcode pattern. We apply the mask on the opcode arguments and | |||
5389 | // then check if it is what we expect. | |||
5390 | for (int Lane : seq<int>(0, NumElements)) { | |||
5391 | unsigned Opc = OpcodeMask.test(Lane) ? Opcode1 : Opcode0; | |||
5392 | // We expect FSub for even lanes and FAdd for odd lanes. | |||
5393 | if (Lane % 2 == 0 && Opc != Instruction::FSub) | |||
5394 | return false; | |||
5395 | if (Lane % 2 == 1 && Opc != Instruction::FAdd) | |||
5396 | return false; | |||
5397 | } | |||
5398 | // Now check that the pattern is supported by the target ISA. | |||
5399 | Type *ElemTy = cast<VectorType>(VecTy)->getElementType(); | |||
5400 | if (ElemTy->isFloatTy()) | |||
5401 | return ST->hasSSE3() && NumElements % 4 == 0; | |||
5402 | if (ElemTy->isDoubleTy()) | |||
5403 | return ST->hasSSE3() && NumElements % 2 == 0; | |||
5404 | return false; | |||
5405 | } | |||
5406 | ||||
5407 | bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) { | |||
5408 | // AVX2 doesn't support scatter | |||
5409 | if (!ST->hasAVX512()) | |||
5410 | return false; | |||
5411 | return isLegalMaskedGather(DataType, Alignment); | |||
5412 | } | |||
5413 | ||||
5414 | bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) { | |||
5415 | EVT VT = TLI->getValueType(DL, DataType); | |||
5416 | return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT); | |||
5417 | } | |||
5418 | ||||
5419 | bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) { | |||
5420 | return false; | |||
5421 | } | |||
5422 | ||||
5423 | bool X86TTIImpl::areInlineCompatible(const Function *Caller, | |||
5424 | const Function *Callee) const { | |||
5425 | const TargetMachine &TM = getTLI()->getTargetMachine(); | |||
5426 | ||||
5427 | // Work this as a subsetting of subtarget features. | |||
5428 | const FeatureBitset &CallerBits = | |||
5429 | TM.getSubtargetImpl(*Caller)->getFeatureBits(); | |||
5430 | const FeatureBitset &CalleeBits = | |||
5431 | TM.getSubtargetImpl(*Callee)->getFeatureBits(); | |||
5432 | ||||
5433 | // Check whether features are the same (apart from the ignore list). | |||
5434 | FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList; | |||
5435 | FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList; | |||
5436 | if (RealCallerBits == RealCalleeBits) | |||
5437 | return true; | |||
5438 | ||||
5439 | // If the features are a subset, we need to additionally check for calls | |||
5440 | // that may become ABI-incompatible as a result of inlining. | |||
5441 | if ((RealCallerBits & RealCalleeBits) != RealCalleeBits) | |||
5442 | return false; | |||
5443 | ||||
5444 | for (const Instruction &I : instructions(Callee)) { | |||
5445 | if (const auto *CB = dyn_cast<CallBase>(&I)) { | |||
5446 | SmallVector<Type *, 8> Types; | |||
5447 | for (Value *Arg : CB->args()) | |||
5448 | Types.push_back(Arg->getType()); | |||
5449 | if (!CB->getType()->isVoidTy()) | |||
5450 | Types.push_back(CB->getType()); | |||
5451 | ||||
5452 | // Simple types are always ABI compatible. | |||
5453 | auto IsSimpleTy = [](Type *Ty) { | |||
5454 | return !Ty->isVectorTy() && !Ty->isAggregateType(); | |||
5455 | }; | |||
5456 | if (all_of(Types, IsSimpleTy)) | |||
5457 | continue; | |||
5458 | ||||
5459 | if (Function *NestedCallee = CB->getCalledFunction()) { | |||
5460 | // Assume that intrinsics are always ABI compatible. | |||
5461 | if (NestedCallee->isIntrinsic()) | |||
5462 | continue; | |||
5463 | ||||
5464 | // Do a precise compatibility check. | |||
5465 | if (!areTypesABICompatible(Caller, NestedCallee, Types)) | |||
5466 | return false; | |||
5467 | } else { | |||
5468 | // We don't know the target features of the callee, | |||
5469 | // assume it is incompatible. | |||
5470 | return false; | |||
5471 | } | |||
5472 | } | |||
5473 | } | |||
5474 | return true; | |||
5475 | } | |||
5476 | ||||
5477 | bool X86TTIImpl::areTypesABICompatible(const Function *Caller, | |||
5478 | const Function *Callee, | |||
5479 | const ArrayRef<Type *> &Types) const { | |||
5480 | if (!BaseT::areTypesABICompatible(Caller, Callee, Types)) | |||
5481 | return false; | |||
5482 | ||||
5483 | // If we get here, we know the target features match. If one function | |||
5484 | // considers 512-bit vectors legal and the other does not, consider them | |||
5485 | // incompatible. | |||
5486 | const TargetMachine &TM = getTLI()->getTargetMachine(); | |||
5487 | ||||
5488 | if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() == | |||
5489 | TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs()) | |||
5490 | return true; | |||
5491 | ||||
5492 | // Consider the arguments compatible if they aren't vectors or aggregates. | |||
5493 | // FIXME: Look at the size of vectors. | |||
5494 | // FIXME: Look at the element types of aggregates to see if there are vectors. | |||
5495 | return llvm::none_of(Types, | |||
5496 | [](Type *T) { return T->isVectorTy() || T->isAggregateType(); }); | |||
5497 | } | |||
5498 | ||||
5499 | X86TTIImpl::TTI::MemCmpExpansionOptions | |||
5500 | X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { | |||
5501 | TTI::MemCmpExpansionOptions Options; | |||
5502 | Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); | |||
5503 | Options.NumLoadsPerBlock = 2; | |||
5504 | // All GPR and vector loads can be unaligned. | |||
5505 | Options.AllowOverlappingLoads = true; | |||
5506 | if (IsZeroCmp) { | |||
5507 | // Only enable vector loads for equality comparison. Right now the vector | |||
5508 | // version is not as fast for three way compare (see #33329). | |||
5509 | const unsigned PreferredWidth = ST->getPreferVectorWidth(); | |||
5510 | if (PreferredWidth >= 512 && ST->hasAVX512()) Options.LoadSizes.push_back(64); | |||
5511 | if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32); | |||
5512 | if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16); | |||
5513 | } | |||
5514 | if (ST->is64Bit()) { | |||
5515 | Options.LoadSizes.push_back(8); | |||
5516 | } | |||
5517 | Options.LoadSizes.push_back(4); | |||
5518 | Options.LoadSizes.push_back(2); | |||
5519 | Options.LoadSizes.push_back(1); | |||
5520 | return Options; | |||
5521 | } | |||
5522 | ||||
5523 | bool X86TTIImpl::prefersVectorizedAddressing() const { | |||
5524 | return supportsGather(); | |||
5525 | } | |||
5526 | ||||
5527 | bool X86TTIImpl::supportsEfficientVectorElementLoadStore() const { | |||
5528 | return false; | |||
5529 | } | |||
5530 | ||||
5531 | bool X86TTIImpl::enableInterleavedAccessVectorization() { | |||
5532 | // TODO: We expect this to be beneficial regardless of arch, | |||
5533 | // but there are currently some unexplained performance artifacts on Atom. | |||
5534 | // As a temporary solution, disable on Atom. | |||
5535 | return !(ST->isAtom()); | |||
5536 | } | |||
5537 | ||||
// Get estimation for interleaved load/store operations and strided load.
// \p Indices contains indices for strided load.
// \p Factor - the factor of interleaving.
// AVX-512 provides 3-src shuffles that significantly reduces the cost.
InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
    unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
    ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
    TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
  // VecTy for interleave memop is <VF*Factor x Elt>.
  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
  // VecTy = <12 x i32>.

  // Calculate the number of memory operations (NumOfMemOps), required
  // for load/store the VecTy.
  MVT LegalVT = getTypeLegalizationCost(VecTy).second;
  unsigned VecTySize = DL.getTypeStoreSize(VecTy);
  unsigned LegalVTSize = LegalVT.getStoreSize();
  // Round up: a partially-filled legal register still needs a memory op.
  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;

  // Get the cost of one memory operation.
  auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
                                             LegalVT.getVectorNumElements());
  InstructionCost MemOpCost;
  // Masked accesses (either predicated execution or gap masking) are costed
  // via the masked-memory-op hook instead of the plain one.
  bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps;
  if (UseMaskedMemOp)
    MemOpCost = getMaskedMemoryOpCost(Opcode, SingleMemOpTy, Alignment,
                                      AddressSpace, CostKind);
  else
    MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, MaybeAlign(Alignment),
                                AddressSpace, CostKind);

  // VF = number of elements per interleave group member; VT is the
  // per-member vector type used for the cost-table lookups below.
  unsigned VF = VecTy->getNumElements() / Factor;
  MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);

  InstructionCost MaskCost;
  if (UseMaskedMemOp) {
    // Build the set of elements actually accessed: for each requested member
    // index, mark its strided positions Index, Index+Factor, Index+2*Factor...
    APInt DemandedLoadStoreElts = APInt::getZero(VecTy->getNumElements());
    for (unsigned Index : Indices) {
      assert(Index < Factor && "Invalid index for interleaved memory op")(static_cast <bool> (Index < Factor && "Invalid index for interleaved memory op" ) ? void (0) : __assert_fail ("Index < Factor && \"Invalid index for interleaved memory op\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 5576, __extension__ __PRETTY_FUNCTION__));
      for (unsigned Elm = 0; Elm < VF; Elm++)
        DemandedLoadStoreElts.setBit(Index + Elm * Factor);
    }

    Type *I1Type = Type::getInt1Ty(VecTy->getContext());

    // Cost of materializing the interleaved i1 mask. When gaps are masked,
    // only the demanded element positions matter; otherwise all elements do.
    MaskCost = getReplicationShuffleCost(
        I1Type, Factor, VF,
        UseMaskForGaps ? DemandedLoadStoreElts
                       : APInt::getAllOnes(VecTy->getNumElements()),
        CostKind);

    // The Gaps mask is invariant and created outside the loop, therefore the
    // cost of creating it is not accounted for here. However if we have both
    // a MaskForGaps and some other mask that guards the execution of the
    // memory access, we need to account for the cost of And-ing the two masks
    // inside the loop.
    if (UseMaskForGaps) {
      auto *MaskVT = FixedVectorType::get(I1Type, VecTy->getNumElements());
      MaskCost += getArithmeticInstrCost(BinaryOperator::And, MaskVT, CostKind);
    }
  }

  if (Opcode == Instruction::Load) {
    // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
    // contain the cost of the optimized shuffle sequence that the
    // X86InterleavedAccess pass will generate.
    // The cost of loads and stores are computed separately from the table.

    // X86InterleavedAccess support only the following interleaved-access group.
    static const CostTblEntry AVX512InterleavedLoadTbl[] = {
        {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8
        {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8
        {3, MVT::v64i8, 22}, //(load 96i8 and) deinterleave into 3 x 32i8
    };

    if (const auto *Entry =
            CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
      return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
    //If an entry does not exist, fallback to the default implementation.

    // Kind of shuffle depends on number of loaded values.
    // If we load the entire data in one register, we can use a 1-src shuffle.
    // Otherwise, we'll merge 2 sources in each operation.
    TTI::ShuffleKind ShuffleKind =
        (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;

    InstructionCost ShuffleCost =
        getShuffleCost(ShuffleKind, SingleMemOpTy, None, 0, nullptr);

    // An empty Indices list means all members of the group are requested.
    unsigned NumOfLoadsInInterleaveGrp =
        Indices.size() ? Indices.size() : Factor;
    auto *ResultTy = FixedVectorType::get(VecTy->getElementType(),
                                          VecTy->getNumElements() / Factor);
    // Number of legal-typed result registers produced by the group.
    InstructionCost NumOfResults =
        getTypeLegalizationCost(ResultTy).first * NumOfLoadsInInterleaveGrp;

    // About a half of the loads may be folded in shuffles when we have only
    // one result. If we have more than one result, or the loads are masked,
    // we do not fold loads at all.
    unsigned NumOfUnfoldedLoads =
        UseMaskedMemOp || NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;

    // Get a number of shuffle operations per result.
    unsigned NumOfShufflesPerResult =
        std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));

    // The SK_MergeTwoSrc shuffle clobbers one of src operands.
    // When we have more than one destination, we need additional instructions
    // to keep sources.
    InstructionCost NumOfMoves = 0;
    if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
      NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;

    // Total = shuffles + mask materialization + unfolded loads + copies.
    InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
                           MaskCost + NumOfUnfoldedLoads * MemOpCost +
                           NumOfMoves;

    return Cost;
  }

  // Store.
  assert(Opcode == Instruction::Store &&(static_cast <bool> (Opcode == Instruction::Store && "Expected Store Instruction at this point") ? void (0) : __assert_fail ("Opcode == Instruction::Store && \"Expected Store Instruction at this point\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 5660, __extension__ __PRETTY_FUNCTION__))
         "Expected Store Instruction at this point")(static_cast <bool> (Opcode == Instruction::Store && "Expected Store Instruction at this point") ? void (0) : __assert_fail ("Opcode == Instruction::Store && \"Expected Store Instruction at this point\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 5660, __extension__ __PRETTY_FUNCTION__));
  // X86InterleavedAccess support only the following interleaved-access group.
  static const CostTblEntry AVX512InterleavedStoreTbl[] = {
      {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
      {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
      {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 96i8 (and store)

      {4, MVT::v8i8, 10},  // interleave 4 x 8i8  into 32i8  (and store)
      {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8  (and store)
      {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
      {4, MVT::v64i8, 24}  // interleave 4 x 32i8 into 256i8 (and store)
  };

  if (const auto *Entry =
          CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
    return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
  //If an entry does not exist, fallback to the default implementation.

  // There is no strided stores meanwhile. And store can't be folded in
  // shuffle.
  unsigned NumOfSources = Factor; // The number of values to be merged.
  InstructionCost ShuffleCost =
      getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, None, 0, nullptr);
  // Merging Factor sources takes Factor-1 two-source shuffles.
  unsigned NumOfShufflesPerStore = NumOfSources - 1;

  // The SK_MergeTwoSrc shuffle clobbers one of src operands.
  // We need additional instructions to keep sources.
  unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
  // Total = mask materialization + per-store (memory op + merge shuffles)
  //         + source-preserving copies.
  InstructionCost Cost =
      MaskCost +
      NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
      NumOfMoves;
  return Cost;
}
5694 | ||||
5695 | InstructionCost X86TTIImpl::getInterleavedMemoryOpCost( | |||
5696 | unsigned Opcode, Type *BaseTy, unsigned Factor, ArrayRef<unsigned> Indices, | |||
5697 | Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, | |||
5698 | bool UseMaskForCond, bool UseMaskForGaps) { | |||
5699 | auto *VecTy = cast<FixedVectorType>(BaseTy); | |||
5700 | ||||
5701 | auto isSupportedOnAVX512 = [&](Type *VecTy, bool HasBW) { | |||
5702 | Type *EltTy = cast<VectorType>(VecTy)->getElementType(); | |||
5703 | if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) || | |||
5704 | EltTy->isIntegerTy(32) || EltTy->isPointerTy()) | |||
5705 | return true; | |||
5706 | if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy()) | |||
5707 | return HasBW; | |||
5708 | return false; | |||
5709 | }; | |||
5710 | if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI())) | |||
5711 | return getInterleavedMemoryOpCostAVX512( | |||
5712 | Opcode, VecTy, Factor, Indices, Alignment, | |||
5713 | AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps); | |||
5714 | ||||
5715 | if (UseMaskForCond || UseMaskForGaps) | |||
5716 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, | |||
5717 | Alignment, AddressSpace, CostKind, | |||
5718 | UseMaskForCond, UseMaskForGaps); | |||
5719 | ||||
5720 | // Get estimation for interleaved load/store operations for SSE-AVX2. | |||
5721 | // As opposed to AVX-512, SSE-AVX2 do not have generic shuffles that allow | |||
5722 | // computing the cost using a generic formula as a function of generic | |||
5723 | // shuffles. We therefore use a lookup table instead, filled according to | |||
5724 | // the instruction sequences that codegen currently generates. | |||
5725 | ||||
5726 | // VecTy for interleave memop is <VF*Factor x Elt>. | |||
5727 | // So, for VF=4, Interleave Factor = 3, Element type = i32 we have | |||
5728 | // VecTy = <12 x i32>. | |||
5729 | MVT LegalVT = getTypeLegalizationCost(VecTy).second; | |||
5730 | ||||
5731 | // This function can be called with VecTy=<6xi128>, Factor=3, in which case | |||
5732 | // the VF=2, while v2i128 is an unsupported MVT vector type | |||
5733 | // (see MachineValueType.h::getVectorVT()). | |||
5734 | if (!LegalVT.isVector()) | |||
5735 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, | |||
5736 | Alignment, AddressSpace, CostKind); | |||
5737 | ||||
5738 | unsigned VF = VecTy->getNumElements() / Factor; | |||
5739 | Type *ScalarTy = VecTy->getElementType(); | |||
5740 | // Deduplicate entries, model floats/pointers as appropriately-sized integers. | |||
5741 | if (!ScalarTy->isIntegerTy()) | |||
5742 | ScalarTy = | |||
5743 | Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy)); | |||
5744 | ||||
5745 | // Get the cost of all the memory operations. | |||
5746 | // FIXME: discount dead loads. | |||
5747 | InstructionCost MemOpCosts = getMemoryOpCost( | |||
5748 | Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, CostKind); | |||
5749 | ||||
5750 | auto *VT = FixedVectorType::get(ScalarTy, VF); | |||
5751 | EVT ETy = TLI->getValueType(DL, VT); | |||
5752 | if (!ETy.isSimple()) | |||
5753 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, | |||
5754 | Alignment, AddressSpace, CostKind); | |||
5755 | ||||
5756 | // TODO: Complete for other data-types and strides. | |||
5757 | // Each combination of Stride, element bit width and VF results in a different | |||
5758 | // sequence; The cost tables are therefore accessed with: | |||
5759 | // Factor (stride) and VectorType=VFxiN. | |||
5760 | // The Cost accounts only for the shuffle sequence; | |||
5761 | // The cost of the loads/stores is accounted for separately. | |||
5762 | // | |||
5763 | static const CostTblEntry AVX2InterleavedLoadTbl[] = { | |||
5764 | {2, MVT::v2i8, 2}, // (load 4i8 and) deinterleave into 2 x 2i8 | |||
5765 | {2, MVT::v4i8, 2}, // (load 8i8 and) deinterleave into 2 x 4i8 | |||
5766 | {2, MVT::v8i8, 2}, // (load 16i8 and) deinterleave into 2 x 8i8 | |||
5767 | {2, MVT::v16i8, 4}, // (load 32i8 and) deinterleave into 2 x 16i8 | |||
5768 | {2, MVT::v32i8, 6}, // (load 64i8 and) deinterleave into 2 x 32i8 | |||
5769 | ||||
5770 | {2, MVT::v8i16, 6}, // (load 16i16 and) deinterleave into 2 x 8i16 | |||
5771 | {2, MVT::v16i16, 9}, // (load 32i16 and) deinterleave into 2 x 16i16 | |||
5772 | {2, MVT::v32i16, 18}, // (load 64i16 and) deinterleave into 2 x 32i16 | |||
5773 | ||||
5774 | {2, MVT::v8i32, 4}, // (load 16i32 and) deinterleave into 2 x 8i32 | |||
5775 | {2, MVT::v16i32, 8}, // (load 32i32 and) deinterleave into 2 x 16i32 | |||
5776 | {2, MVT::v32i32, 16}, // (load 64i32 and) deinterleave into 2 x 32i32 | |||
5777 | ||||
5778 | {2, MVT::v4i64, 4}, // (load 8i64 and) deinterleave into 2 x 4i64 | |||
5779 | {2, MVT::v8i64, 8}, // (load 16i64 and) deinterleave into 2 x 8i64 | |||
5780 | {2, MVT::v16i64, 16}, // (load 32i64 and) deinterleave into 2 x 16i64 | |||
5781 | {2, MVT::v32i64, 32}, // (load 64i64 and) deinterleave into 2 x 32i64 | |||
5782 | ||||
5783 | {3, MVT::v2i8, 3}, // (load 6i8 and) deinterleave into 3 x 2i8 | |||
5784 | {3, MVT::v4i8, 3}, // (load 12i8 and) deinterleave into 3 x 4i8 | |||
5785 | {3, MVT::v8i8, 6}, // (load 24i8 and) deinterleave into 3 x 8i8 | |||
5786 | {3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8 | |||
5787 | {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8 | |||
5788 | ||||
5789 | {3, MVT::v2i16, 5}, // (load 6i16 and) deinterleave into 3 x 2i16 | |||
5790 | {3, MVT::v4i16, 7}, // (load 12i16 and) deinterleave into 3 x 4i16 | |||
5791 | {3, MVT::v8i16, 9}, // (load 24i16 and) deinterleave into 3 x 8i16 | |||
5792 | {3, MVT::v16i16, 28}, // (load 48i16 and) deinterleave into 3 x 16i16 | |||
5793 | {3, MVT::v32i16, 56}, // (load 96i16 and) deinterleave into 3 x 32i16 | |||
5794 | ||||
5795 | {3, MVT::v2i32, 3}, // (load 6i32 and) deinterleave into 3 x 2i32 | |||
5796 | {3, MVT::v4i32, 3}, // (load 12i32 and) deinterleave into 3 x 4i32 | |||
5797 | {3, MVT::v8i32, 7}, // (load 24i32 and) deinterleave into 3 x 8i32 | |||
5798 | {3, MVT::v16i32, 14}, // (load 48i32 and) deinterleave into 3 x 16i32 | |||
5799 | {3, MVT::v32i32, 32}, // (load 96i32 and) deinterleave into 3 x 32i32 | |||
5800 | ||||
5801 | {3, MVT::v2i64, 1}, // (load 6i64 and) deinterleave into 3 x 2i64 | |||
5802 | {3, MVT::v4i64, 5}, // (load 12i64 and) deinterleave into 3 x 4i64 | |||
5803 | {3, MVT::v8i64, 10}, // (load 24i64 and) deinterleave into 3 x 8i64 | |||
5804 | {3, MVT::v16i64, 20}, // (load 48i64 and) deinterleave into 3 x 16i64 | |||
5805 | ||||
5806 | {4, MVT::v2i8, 4}, // (load 8i8 and) deinterleave into 4 x 2i8 | |||
5807 | {4, MVT::v4i8, 4}, // (load 16i8 and) deinterleave into 4 x 4i8 | |||
5808 | {4, MVT::v8i8, 12}, // (load 32i8 and) deinterleave into 4 x 8i8 | |||
5809 | {4, MVT::v16i8, 24}, // (load 64i8 and) deinterleave into 4 x 16i8 | |||
5810 | {4, MVT::v32i8, 56}, // (load 128i8 and) deinterleave into 4 x 32i8 | |||
5811 | ||||
5812 | {4, MVT::v2i16, 6}, // (load 8i16 and) deinterleave into 4 x 2i16 | |||
5813 | {4, MVT::v4i16, 17}, // (load 16i16 and) deinterleave into 4 x 4i16 | |||
5814 | {4, MVT::v8i16, 33}, // (load 32i16 and) deinterleave into 4 x 8i16 | |||
5815 | {4, MVT::v16i16, 75}, // (load 64i16 and) deinterleave into 4 x 16i16 | |||
5816 | {4, MVT::v32i16, 150}, // (load 128i16 and) deinterleave into 4 x 32i16 | |||
5817 | ||||
5818 | {4, MVT::v2i32, 4}, // (load 8i32 and) deinterleave into 4 x 2i32 | |||
5819 | {4, MVT::v4i32, 8}, // (load 16i32 and) deinterleave into 4 x 4i32 | |||
5820 | {4, MVT::v8i32, 16}, // (load 32i32 and) deinterleave into 4 x 8i32 | |||
5821 | {4, MVT::v16i32, 32}, // (load 64i32 and) deinterleave into 4 x 16i32 | |||
5822 | {4, MVT::v32i32, 68}, // (load 128i32 and) deinterleave into 4 x 32i32 | |||
5823 | ||||
5824 | {4, MVT::v2i64, 6}, // (load 8i64 and) deinterleave into 4 x 2i64 | |||
5825 | {4, MVT::v4i64, 8}, // (load 16i64 and) deinterleave into 4 x 4i64 | |||
5826 | {4, MVT::v8i64, 20}, // (load 32i64 and) deinterleave into 4 x 8i64 | |||
5827 | {4, MVT::v16i64, 40}, // (load 64i64 and) deinterleave into 4 x 16i64 | |||
5828 | ||||
5829 | {6, MVT::v2i8, 6}, // (load 12i8 and) deinterleave into 6 x 2i8 | |||
5830 | {6, MVT::v4i8, 14}, // (load 24i8 and) deinterleave into 6 x 4i8 | |||
5831 | {6, MVT::v8i8, 18}, // (load 48i8 and) deinterleave into 6 x 8i8 | |||
5832 | {6, MVT::v16i8, 43}, // (load 96i8 and) deinterleave into 6 x 16i8 | |||
5833 | {6, MVT::v32i8, 82}, // (load 192i8 and) deinterleave into 6 x 32i8 | |||
5834 | ||||
5835 | {6, MVT::v2i16, 13}, // (load 12i16 and) deinterleave into 6 x 2i16 | |||
5836 | {6, MVT::v4i16, 9}, // (load 24i16 and) deinterleave into 6 x 4i16 | |||
5837 | {6, MVT::v8i16, 39}, // (load 48i16 and) deinterleave into 6 x 8i16 | |||
5838 | {6, MVT::v16i16, 106}, // (load 96i16 and) deinterleave into 6 x 16i16 | |||
5839 | {6, MVT::v32i16, 212}, // (load 192i16 and) deinterleave into 6 x 32i16 | |||
5840 | ||||
5841 | {6, MVT::v2i32, 6}, // (load 12i32 and) deinterleave into 6 x 2i32 | |||
5842 | {6, MVT::v4i32, 15}, // (load 24i32 and) deinterleave into 6 x 4i32 | |||
5843 | {6, MVT::v8i32, 31}, // (load 48i32 and) deinterleave into 6 x 8i32 | |||
5844 | {6, MVT::v16i32, 64}, // (load 96i32 and) deinterleave into 6 x 16i32 | |||
5845 | ||||
5846 | {6, MVT::v2i64, 6}, // (load 12i64 and) deinterleave into 6 x 2i64 | |||
5847 | {6, MVT::v4i64, 18}, // (load 24i64 and) deinterleave into 6 x 4i64 | |||
5848 | {6, MVT::v8i64, 36}, // (load 48i64 and) deinterleave into 6 x 8i64 | |||
5849 | ||||
5850 | {8, MVT::v8i32, 40} // (load 64i32 and) deinterleave into 8 x 8i32 | |||
5851 | }; | |||
5852 | ||||
5853 | static const CostTblEntry SSSE3InterleavedLoadTbl[] = { | |||
5854 | {2, MVT::v4i16, 2}, // (load 8i16 and) deinterleave into 2 x 4i16 | |||
5855 | }; | |||
5856 | ||||
5857 | static const CostTblEntry SSE2InterleavedLoadTbl[] = { | |||
5858 | {2, MVT::v2i16, 2}, // (load 4i16 and) deinterleave into 2 x 2i16 | |||
5859 | {2, MVT::v4i16, 7}, // (load 8i16 and) deinterleave into 2 x 4i16 | |||
5860 | ||||
5861 | {2, MVT::v2i32, 2}, // (load 4i32 and) deinterleave into 2 x 2i32 | |||
5862 | {2, MVT::v4i32, 2}, // (load 8i32 and) deinterleave into 2 x 4i32 | |||
5863 | ||||
5864 | {2, MVT::v2i64, 2}, // (load 4i64 and) deinterleave into 2 x 2i64 | |||
5865 | }; | |||
5866 | ||||
5867 | static const CostTblEntry AVX2InterleavedStoreTbl[] = { | |||
5868 | {2, MVT::v16i8, 3}, // interleave 2 x 16i8 into 32i8 (and store) | |||
5869 | {2, MVT::v32i8, 4}, // interleave 2 x 32i8 into 64i8 (and store) | |||
5870 | ||||
5871 | {2, MVT::v8i16, 3}, // interleave 2 x 8i16 into 16i16 (and store) | |||
5872 | {2, MVT::v16i16, 4}, // interleave 2 x 16i16 into 32i16 (and store) | |||
5873 | {2, MVT::v32i16, 8}, // interleave 2 x 32i16 into 64i16 (and store) | |||
5874 | ||||
5875 | {2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32 (and store) | |||
5876 | {2, MVT::v8i32, 4}, // interleave 2 x 8i32 into 16i32 (and store) | |||
5877 | {2, MVT::v16i32, 8}, // interleave 2 x 16i32 into 32i32 (and store) | |||
5878 | {2, MVT::v32i32, 16}, // interleave 2 x 32i32 into 64i32 (and store) | |||
5879 | ||||
5880 | {2, MVT::v2i64, 2}, // interleave 2 x 2i64 into 4i64 (and store) | |||
5881 | {2, MVT::v4i64, 4}, // interleave 2 x 4i64 into 8i64 (and store) | |||
5882 | {2, MVT::v8i64, 8}, // interleave 2 x 8i64 into 16i64 (and store) | |||
5883 | {2, MVT::v16i64, 16}, // interleave 2 x 16i64 into 32i64 (and store) | |||
5884 | {2, MVT::v32i64, 32}, // interleave 2 x 32i64 into 64i64 (and store) | |||
5885 | ||||
5886 | {3, MVT::v2i8, 4}, // interleave 3 x 2i8 into 6i8 (and store) | |||
5887 | {3, MVT::v4i8, 4}, // interleave 3 x 4i8 into 12i8 (and store) | |||
5888 | {3, MVT::v8i8, 6}, // interleave 3 x 8i8 into 24i8 (and store) | |||
5889 | {3, MVT::v16i8, 11}, // interleave 3 x 16i8 into 48i8 (and store) | |||
5890 | {3, MVT::v32i8, 13}, // interleave 3 x 32i8 into 96i8 (and store) | |||
5891 | ||||
5892 | {3, MVT::v2i16, 4}, // interleave 3 x 2i16 into 6i16 (and store) | |||
5893 | {3, MVT::v4i16, 6}, // interleave 3 x 4i16 into 12i16 (and store) | |||
5894 | {3, MVT::v8i16, 12}, // interleave 3 x 8i16 into 24i16 (and store) | |||
5895 | {3, MVT::v16i16, 27}, // interleave 3 x 16i16 into 48i16 (and store) | |||
5896 | {3, MVT::v32i16, 54}, // interleave 3 x 32i16 into 96i16 (and store) | |||
5897 | ||||
5898 | {3, MVT::v2i32, 4}, // interleave 3 x 2i32 into 6i32 (and store) | |||
5899 | {3, MVT::v4i32, 5}, // interleave 3 x 4i32 into 12i32 (and store) | |||
5900 | {3, MVT::v8i32, 11}, // interleave 3 x 8i32 into 24i32 (and store) | |||
5901 | {3, MVT::v16i32, 22}, // interleave 3 x 16i32 into 48i32 (and store) | |||
5902 | {3, MVT::v32i32, 48}, // interleave 3 x 32i32 into 96i32 (and store) | |||
5903 | ||||
5904 | {3, MVT::v2i64, 4}, // interleave 3 x 2i64 into 6i64 (and store) | |||
5905 | {3, MVT::v4i64, 6}, // interleave 3 x 4i64 into 12i64 (and store) | |||
5906 | {3, MVT::v8i64, 12}, // interleave 3 x 8i64 into 24i64 (and store) | |||
5907 | {3, MVT::v16i64, 24}, // interleave 3 x 16i64 into 48i64 (and store) | |||
5908 | ||||
5909 | {4, MVT::v2i8, 4}, // interleave 4 x 2i8 into 8i8 (and store) | |||
5910 | {4, MVT::v4i8, 4}, // interleave 4 x 4i8 into 16i8 (and store) | |||
5911 | {4, MVT::v8i8, 4}, // interleave 4 x 8i8 into 32i8 (and store) | |||
5912 | {4, MVT::v16i8, 8}, // interleave 4 x 16i8 into 64i8 (and store) | |||
5913 | {4, MVT::v32i8, 12}, // interleave 4 x 32i8 into 128i8 (and store) | |||
5914 | ||||
5915 | {4, MVT::v2i16, 2}, // interleave 4 x 2i16 into 8i16 (and store) | |||
5916 | {4, MVT::v4i16, 6}, // interleave 4 x 4i16 into 16i16 (and store) | |||
5917 | {4, MVT::v8i16, 10}, // interleave 4 x 8i16 into 32i16 (and store) | |||
5918 | {4, MVT::v16i16, 32}, // interleave 4 x 16i16 into 64i16 (and store) | |||
5919 | {4, MVT::v32i16, 64}, // interleave 4 x 32i16 into 128i16 (and store) | |||
5920 | ||||
5921 | {4, MVT::v2i32, 5}, // interleave 4 x 2i32 into 8i32 (and store) | |||
5922 | {4, MVT::v4i32, 6}, // interleave 4 x 4i32 into 16i32 (and store) | |||
5923 | {4, MVT::v8i32, 16}, // interleave 4 x 8i32 into 32i32 (and store) | |||
5924 | {4, MVT::v16i32, 32}, // interleave 4 x 16i32 into 64i32 (and store) | |||
5925 | {4, MVT::v32i32, 64}, // interleave 4 x 32i32 into 128i32 (and store) | |||
5926 | ||||
5927 | {4, MVT::v2i64, 6}, // interleave 4 x 2i64 into 8i64 (and store) | |||
5928 | {4, MVT::v4i64, 8}, // interleave 4 x 4i64 into 16i64 (and store) | |||
5929 | {4, MVT::v8i64, 20}, // interleave 4 x 8i64 into 32i64 (and store) | |||
5930 | {4, MVT::v16i64, 40}, // interleave 4 x 16i64 into 64i64 (and store) | |||
5931 | ||||
5932 | {6, MVT::v2i8, 7}, // interleave 6 x 2i8 into 12i8 (and store) | |||
5933 | {6, MVT::v4i8, 9}, // interleave 6 x 4i8 into 24i8 (and store) | |||
5934 | {6, MVT::v8i8, 16}, // interleave 6 x 8i8 into 48i8 (and store) | |||
5935 | {6, MVT::v16i8, 27}, // interleave 6 x 16i8 into 96i8 (and store) | |||
5936 | {6, MVT::v32i8, 90}, // interleave 6 x 32i8 into 192i8 (and store) | |||
5937 | ||||
5938 | {6, MVT::v2i16, 10}, // interleave 6 x 2i16 into 12i16 (and store) | |||
5939 | {6, MVT::v4i16, 15}, // interleave 6 x 4i16 into 24i16 (and store) | |||
5940 | {6, MVT::v8i16, 21}, // interleave 6 x 8i16 into 48i16 (and store) | |||
5941 | {6, MVT::v16i16, 58}, // interleave 6 x 16i16 into 96i16 (and store) | |||
5942 | {6, MVT::v32i16, 90}, // interleave 6 x 32i16 into 192i16 (and store) | |||
5943 | ||||
5944 | {6, MVT::v2i32, 9}, // interleave 6 x 2i32 into 12i32 (and store) | |||
5945 | {6, MVT::v4i32, 12}, // interleave 6 x 4i32 into 24i32 (and store) | |||
5946 | {6, MVT::v8i32, 33}, // interleave 6 x 8i32 into 48i32 (and store) | |||
5947 | {6, MVT::v16i32, 66}, // interleave 6 x 16i32 into 96i32 (and store) | |||
5948 | ||||
5949 | {6, MVT::v2i64, 8}, // interleave 6 x 2i64 into 12i64 (and store) | |||
5950 | {6, MVT::v4i64, 15}, // interleave 6 x 4i64 into 24i64 (and store) | |||
5951 | {6, MVT::v8i64, 30}, // interleave 6 x 8i64 into 48i64 (and store) | |||
5952 | }; | |||
5953 | ||||
  // Shuffle cost of interleaving `Factor` small vectors for a strided store
  // when only baseline SSE2 is available. Entry layout is
  // {Factor, per-member element vector type, shuffle cost}; the cost covers
  // just the interleaving shuffles — the stores themselves are charged
  // separately (the caller adds this to MemOpCosts).
  static const CostTblEntry SSE2InterleavedStoreTbl[] = {
      {2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8 (and store)
      {2, MVT::v4i8, 1}, // interleave 2 x 4i8 into 8i8 (and store)
      {2, MVT::v8i8, 1}, // interleave 2 x 8i8 into 16i8 (and store)

      {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16 (and store)
      {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16 (and store)

      {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32 (and store)
  };
5964 | ||||
5965 | if (Opcode == Instruction::Load) { | |||
5966 | auto GetDiscountedCost = [Factor, NumMembers = Indices.size(), | |||
5967 | MemOpCosts](const CostTblEntry *Entry) { | |||
5968 | // NOTE: this is just an approximation! | |||
5969 | // It can over/under -estimate the cost! | |||
5970 | return MemOpCosts + divideCeil(NumMembers * Entry->Cost, Factor); | |||
5971 | }; | |||
5972 | ||||
5973 | if (ST->hasAVX2()) | |||
5974 | if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor, | |||
5975 | ETy.getSimpleVT())) | |||
5976 | return GetDiscountedCost(Entry); | |||
5977 | ||||
5978 | if (ST->hasSSSE3()) | |||
5979 | if (const auto *Entry = CostTableLookup(SSSE3InterleavedLoadTbl, Factor, | |||
5980 | ETy.getSimpleVT())) | |||
5981 | return GetDiscountedCost(Entry); | |||
5982 | ||||
5983 | if (ST->hasSSE2()) | |||
5984 | if (const auto *Entry = CostTableLookup(SSE2InterleavedLoadTbl, Factor, | |||
5985 | ETy.getSimpleVT())) | |||
5986 | return GetDiscountedCost(Entry); | |||
5987 | } else { | |||
5988 | assert(Opcode == Instruction::Store &&(static_cast <bool> (Opcode == Instruction::Store && "Expected Store Instruction at this point") ? void (0) : __assert_fail ("Opcode == Instruction::Store && \"Expected Store Instruction at this point\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 5989, __extension__ __PRETTY_FUNCTION__)) | |||
5989 | "Expected Store Instruction at this point")(static_cast <bool> (Opcode == Instruction::Store && "Expected Store Instruction at this point") ? void (0) : __assert_fail ("Opcode == Instruction::Store && \"Expected Store Instruction at this point\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 5989, __extension__ __PRETTY_FUNCTION__)); | |||
5990 | assert((!Indices.size() || Indices.size() == Factor) &&(static_cast <bool> ((!Indices.size() || Indices.size() == Factor) && "Interleaved store only supports fully-interleaved groups." ) ? void (0) : __assert_fail ("(!Indices.size() || Indices.size() == Factor) && \"Interleaved store only supports fully-interleaved groups.\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 5991, __extension__ __PRETTY_FUNCTION__)) | |||
5991 | "Interleaved store only supports fully-interleaved groups.")(static_cast <bool> ((!Indices.size() || Indices.size() == Factor) && "Interleaved store only supports fully-interleaved groups." ) ? void (0) : __assert_fail ("(!Indices.size() || Indices.size() == Factor) && \"Interleaved store only supports fully-interleaved groups.\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 5991, __extension__ __PRETTY_FUNCTION__)); | |||
5992 | if (ST->hasAVX2()) | |||
5993 | if (const auto *Entry = CostTableLookup(AVX2InterleavedStoreTbl, Factor, | |||
5994 | ETy.getSimpleVT())) | |||
5995 | return MemOpCosts + Entry->Cost; | |||
5996 | ||||
5997 | if (ST->hasSSE2()) | |||
5998 | if (const auto *Entry = CostTableLookup(SSE2InterleavedStoreTbl, Factor, | |||
5999 | ETy.getSimpleVT())) | |||
6000 | return MemOpCosts + Entry->Cost; | |||
6001 | } | |||
6002 | ||||
6003 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, | |||
6004 | Alignment, AddressSpace, CostKind, | |||
6005 | UseMaskForCond, UseMaskForGaps); | |||
6006 | } | |||
6007 | ||||
6008 | InstructionCost X86TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, | |||
6009 | int64_t BaseOffset, | |||
6010 | bool HasBaseReg, int64_t Scale, | |||
6011 | unsigned AddrSpace) const { | |||
6012 | // Scaling factors are not free at all. | |||
6013 | // An indexed folded instruction, i.e., inst (reg1, reg2, scale), | |||
6014 | // will take 2 allocations in the out of order engine instead of 1 | |||
6015 | // for plain addressing mode, i.e. inst (reg1). | |||
6016 | // E.g., | |||
6017 | // vaddps (%rsi,%rdx), %ymm0, %ymm1 | |||
6018 | // Requires two allocations (one for the load, one for the computation) | |||
6019 | // whereas: | |||
6020 | // vaddps (%rsi), %ymm0, %ymm1 | |||
6021 | // Requires just 1 allocation, i.e., freeing allocations for other operations | |||
6022 | // and having less micro operations to execute. | |||
6023 | // | |||
6024 | // For some X86 architectures, this is even worse because for instance for | |||
6025 | // stores, the complex addressing mode forces the instruction to use the | |||
6026 | // "load" ports instead of the dedicated "store" port. | |||
6027 | // E.g., on Haswell: | |||
6028 | // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3. | |||
6029 | // vmovaps %ymm1, (%r8) can use port 2, 3, or 7. | |||
6030 | TargetLoweringBase::AddrMode AM; | |||
6031 | AM.BaseGV = BaseGV; | |||
6032 | AM.BaseOffs = BaseOffset; | |||
6033 | AM.HasBaseReg = HasBaseReg; | |||
6034 | AM.Scale = Scale; | |||
6035 | if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace)) | |||
6036 | // Scale represents reg2 * scale, thus account for 1 | |||
6037 | // as soon as we use a second register. | |||
6038 | return AM.Scale != 0; | |||
6039 | return -1; | |||
6040 | } |