File: | build/source/llvm/lib/Target/X86/X86TargetTransformInfo.cpp |
Warning: | line 4358, column 49 Called C++ object pointer is null |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
1 | //===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===// | ||||
2 | // | ||||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||||
4 | // See https://llvm.org/LICENSE.txt for license information. | ||||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||||
6 | // | ||||
7 | //===----------------------------------------------------------------------===// | ||||
8 | /// \file | ||||
9 | /// This file implements a TargetTransformInfo analysis pass specific to the | ||||
10 | /// X86 target machine. It uses the target's detailed information to provide | ||||
11 | /// more precise answers to certain TTI queries, while letting the target | ||||
12 | /// independent and default TTI implementations handle the rest. | ||||
13 | /// | ||||
14 | //===----------------------------------------------------------------------===// | ||||
15 | /// About Cost Model numbers used below it's necessary to say the following: | ||||
16 | /// the numbers correspond to some "generic" X86 CPU instead of usage of a | ||||
17 | /// specific CPU model. Usually the numbers correspond to the CPU where the | ||||
18 | /// feature first appeared. For example, if we do Subtarget.hasSSE42() in | ||||
19 | /// the lookups below the cost is based on Nehalem as that was the first CPU | ||||
20 | /// to support that feature level and thus has most likely the worst case cost, | ||||
21 | /// although we may discard an outlying worst cost from one CPU (e.g. Atom). | ||||
22 | /// | ||||
23 | /// Some examples of other technologies/CPUs: | ||||
24 | /// SSE 3 - Pentium4 / Athlon64 | ||||
25 | /// SSE 4.1 - Penryn | ||||
26 | /// SSE 4.2 - Nehalem / Silvermont | ||||
27 | /// AVX - Sandy Bridge / Jaguar / Bulldozer | ||||
28 | /// AVX2 - Haswell / Ryzen | ||||
29 | /// AVX-512 - Xeon Phi / Skylake | ||||
30 | /// | ||||
31 | /// And some examples of instruction target dependent costs (latency) | ||||
32 | /// divss sqrtss rsqrtss | ||||
33 | /// AMD K7 11-16 19 3 | ||||
34 | /// Piledriver 9-24 13-15 5 | ||||
35 | /// Jaguar 14 16 2 | ||||
36 | /// Pentium II,III 18 30 2 | ||||
37 | /// Nehalem 7-14 7-18 3 | ||||
38 | /// Haswell 10-13 11 5 | ||||
39 | /// | ||||
40 | /// Interpreting the 4 TargetCostKind types: | ||||
41 | /// TCK_RecipThroughput and TCK_Latency should try to match the worst case | ||||
42 | /// values reported by the CPU scheduler models (and llvm-mca). | ||||
43 | /// TCK_CodeSize should match the instruction count (e.g. divss = 1), NOT the | ||||
44 | /// actual encoding size of the instruction. | ||||
45 | /// TCK_SizeAndLatency should match the worst case micro-op counts reported by | ||||
46 | /// by the CPU scheduler models (and llvm-mca), to ensure that they are | ||||
47 | /// compatible with the MicroOpBufferSize and LoopMicroOpBufferSize values which are | ||||
48 | /// often used as the cost thresholds where TCK_SizeAndLatency is requested. | ||||
49 | //===----------------------------------------------------------------------===// | ||||
50 | |||||
51 | #include "X86TargetTransformInfo.h" | ||||
52 | #include "llvm/Analysis/TargetTransformInfo.h" | ||||
53 | #include "llvm/CodeGen/BasicTTIImpl.h" | ||||
54 | #include "llvm/CodeGen/CostTable.h" | ||||
55 | #include "llvm/CodeGen/TargetLowering.h" | ||||
56 | #include "llvm/IR/InstIterator.h" | ||||
57 | #include "llvm/IR/IntrinsicInst.h" | ||||
58 | #include "llvm/Support/Debug.h" | ||||
59 | #include <optional> | ||||
60 | |||||
61 | using namespace llvm; | ||||
62 | |||||
#define DEBUG_TYPE "x86tti"
64 | |||||
65 | //===----------------------------------------------------------------------===// | ||||
66 | // | ||||
67 | // X86 cost model. | ||||
68 | // | ||||
69 | //===----------------------------------------------------------------------===// | ||||
70 | |||||
71 | // Helper struct to store/access costs for each cost kind. | ||||
72 | // TODO: Move this to allow other targets to use it? | ||||
73 | struct CostKindCosts { | ||||
74 | unsigned RecipThroughputCost = ~0U; | ||||
75 | unsigned LatencyCost = ~0U; | ||||
76 | unsigned CodeSizeCost = ~0U; | ||||
77 | unsigned SizeAndLatencyCost = ~0U; | ||||
78 | |||||
79 | std::optional<unsigned> | ||||
80 | operator[](TargetTransformInfo::TargetCostKind Kind) const { | ||||
81 | unsigned Cost = ~0U; | ||||
82 | switch (Kind) { | ||||
83 | case TargetTransformInfo::TCK_RecipThroughput: | ||||
84 | Cost = RecipThroughputCost; | ||||
85 | break; | ||||
86 | case TargetTransformInfo::TCK_Latency: | ||||
87 | Cost = LatencyCost; | ||||
88 | break; | ||||
89 | case TargetTransformInfo::TCK_CodeSize: | ||||
90 | Cost = CodeSizeCost; | ||||
91 | break; | ||||
92 | case TargetTransformInfo::TCK_SizeAndLatency: | ||||
93 | Cost = SizeAndLatencyCost; | ||||
94 | break; | ||||
95 | } | ||||
96 | if (Cost == ~0U) | ||||
97 | return std::nullopt; | ||||
98 | return Cost; | ||||
99 | } | ||||
100 | }; | ||||
101 | using CostKindTblEntry = CostTblEntryT<CostKindCosts>; | ||||
102 | |||||
103 | TargetTransformInfo::PopcntSupportKind | ||||
104 | X86TTIImpl::getPopcntSupport(unsigned TyWidth) { | ||||
105 | assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2")(static_cast <bool> (isPowerOf2_32(TyWidth) && "Ty width must be power of 2" ) ? void (0) : __assert_fail ("isPowerOf2_32(TyWidth) && \"Ty width must be power of 2\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 105, __extension__ __PRETTY_FUNCTION__)); | ||||
106 | // TODO: Currently the __builtin_popcount() implementation using SSE3 | ||||
107 | // instructions is inefficient. Once the problem is fixed, we should | ||||
108 | // call ST->hasSSE3() instead of ST->hasPOPCNT(). | ||||
109 | return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software; | ||||
110 | } | ||||
111 | |||||
112 | std::optional<unsigned> X86TTIImpl::getCacheSize( | ||||
113 | TargetTransformInfo::CacheLevel Level) const { | ||||
114 | switch (Level) { | ||||
115 | case TargetTransformInfo::CacheLevel::L1D: | ||||
116 | // - Penryn | ||||
117 | // - Nehalem | ||||
118 | // - Westmere | ||||
119 | // - Sandy Bridge | ||||
120 | // - Ivy Bridge | ||||
121 | // - Haswell | ||||
122 | // - Broadwell | ||||
123 | // - Skylake | ||||
124 | // - Kabylake | ||||
125 | return 32 * 1024; // 32 KByte | ||||
126 | case TargetTransformInfo::CacheLevel::L2D: | ||||
127 | // - Penryn | ||||
128 | // - Nehalem | ||||
129 | // - Westmere | ||||
130 | // - Sandy Bridge | ||||
131 | // - Ivy Bridge | ||||
132 | // - Haswell | ||||
133 | // - Broadwell | ||||
134 | // - Skylake | ||||
135 | // - Kabylake | ||||
136 | return 256 * 1024; // 256 KByte | ||||
137 | } | ||||
138 | |||||
139 | llvm_unreachable("Unknown TargetTransformInfo::CacheLevel")::llvm::llvm_unreachable_internal("Unknown TargetTransformInfo::CacheLevel" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 139); | ||||
140 | } | ||||
141 | |||||
142 | std::optional<unsigned> X86TTIImpl::getCacheAssociativity( | ||||
143 | TargetTransformInfo::CacheLevel Level) const { | ||||
144 | // - Penryn | ||||
145 | // - Nehalem | ||||
146 | // - Westmere | ||||
147 | // - Sandy Bridge | ||||
148 | // - Ivy Bridge | ||||
149 | // - Haswell | ||||
150 | // - Broadwell | ||||
151 | // - Skylake | ||||
152 | // - Kabylake | ||||
153 | switch (Level) { | ||||
154 | case TargetTransformInfo::CacheLevel::L1D: | ||||
155 | [[fallthrough]]; | ||||
156 | case TargetTransformInfo::CacheLevel::L2D: | ||||
157 | return 8; | ||||
158 | } | ||||
159 | |||||
160 | llvm_unreachable("Unknown TargetTransformInfo::CacheLevel")::llvm::llvm_unreachable_internal("Unknown TargetTransformInfo::CacheLevel" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 160); | ||||
161 | } | ||||
162 | |||||
163 | unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const { | ||||
164 | bool Vector = (ClassID == 1); | ||||
165 | if (Vector && !ST->hasSSE1()) | ||||
166 | return 0; | ||||
167 | |||||
168 | if (ST->is64Bit()) { | ||||
169 | if (Vector && ST->hasAVX512()) | ||||
170 | return 32; | ||||
171 | return 16; | ||||
172 | } | ||||
173 | return 8; | ||||
174 | } | ||||
175 | |||||
176 | TypeSize | ||||
177 | X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { | ||||
178 | unsigned PreferVectorWidth = ST->getPreferVectorWidth(); | ||||
179 | switch (K) { | ||||
180 | case TargetTransformInfo::RGK_Scalar: | ||||
181 | return TypeSize::getFixed(ST->is64Bit() ? 64 : 32); | ||||
182 | case TargetTransformInfo::RGK_FixedWidthVector: | ||||
183 | if (ST->hasAVX512() && PreferVectorWidth >= 512) | ||||
184 | return TypeSize::getFixed(512); | ||||
185 | if (ST->hasAVX() && PreferVectorWidth >= 256) | ||||
186 | return TypeSize::getFixed(256); | ||||
187 | if (ST->hasSSE1() && PreferVectorWidth >= 128) | ||||
188 | return TypeSize::getFixed(128); | ||||
189 | return TypeSize::getFixed(0); | ||||
190 | case TargetTransformInfo::RGK_ScalableVector: | ||||
191 | return TypeSize::getScalable(0); | ||||
192 | } | ||||
193 | |||||
194 | llvm_unreachable("Unsupported register kind")::llvm::llvm_unreachable_internal("Unsupported register kind" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 194); | ||||
195 | } | ||||
196 | |||||
197 | unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const { | ||||
198 | return getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) | ||||
199 | .getFixedValue(); | ||||
200 | } | ||||
201 | |||||
202 | unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) { | ||||
203 | // If the loop will not be vectorized, don't interleave the loop. | ||||
204 | // Let regular unroll to unroll the loop, which saves the overflow | ||||
205 | // check and memory check cost. | ||||
206 | if (VF == 1) | ||||
207 | return 1; | ||||
208 | |||||
209 | if (ST->isAtom()) | ||||
210 | return 1; | ||||
211 | |||||
212 | // Sandybridge and Haswell have multiple execution ports and pipelined | ||||
213 | // vector units. | ||||
214 | if (ST->hasAVX()) | ||||
215 | return 4; | ||||
216 | |||||
217 | return 2; | ||||
218 | } | ||||
219 | |||||
220 | InstructionCost X86TTIImpl::getArithmeticInstrCost( | ||||
221 | unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, | ||||
222 | TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, | ||||
223 | ArrayRef<const Value *> Args, | ||||
224 | const Instruction *CxtI) { | ||||
225 | |||||
226 | // vXi8 multiplications are always promoted to vXi16. | ||||
227 | if (Opcode == Instruction::Mul && Ty->isVectorTy() && | ||||
228 | Ty->getScalarSizeInBits() == 8) { | ||||
229 | Type *WideVecTy = | ||||
230 | VectorType::getExtendedElementVectorType(cast<VectorType>(Ty)); | ||||
231 | return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty, | ||||
232 | TargetTransformInfo::CastContextHint::None, | ||||
233 | CostKind) + | ||||
234 | getCastInstrCost(Instruction::Trunc, Ty, WideVecTy, | ||||
235 | TargetTransformInfo::CastContextHint::None, | ||||
236 | CostKind) + | ||||
237 | getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info); | ||||
238 | } | ||||
239 | |||||
240 | // Legalize the type. | ||||
241 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); | ||||
242 | |||||
243 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | ||||
244 | assert(ISD && "Invalid opcode")(static_cast <bool> (ISD && "Invalid opcode") ? void (0) : __assert_fail ("ISD && \"Invalid opcode\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 244, __extension__ __PRETTY_FUNCTION__)); | ||||
245 | |||||
246 | if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() && | ||||
247 | LT.second.getScalarType() == MVT::i32) { | ||||
248 | // Check if the operands can be represented as a smaller datatype. | ||||
249 | bool Op1Signed = false, Op2Signed = false; | ||||
250 | unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed); | ||||
251 | unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed); | ||||
252 | unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize); | ||||
253 | bool SignedMode = Op1Signed || Op2Signed; | ||||
254 | |||||
255 | // If both are representable as i15 and at least one is constant, | ||||
256 | // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we | ||||
257 | // can treat this as PMADDWD which has the same costs as a vXi16 multiply. | ||||
258 | if (OpMinSize <= 15 && !ST->isPMADDWDSlow()) { | ||||
259 | bool Op1Constant = | ||||
260 | isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]); | ||||
261 | bool Op2Constant = | ||||
262 | isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]); | ||||
263 | bool Op1Sext = isa<SExtInst>(Args[0]) && | ||||
264 | (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41())); | ||||
265 | bool Op2Sext = isa<SExtInst>(Args[1]) && | ||||
266 | (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41())); | ||||
267 | |||||
268 | bool IsZeroExtended = !Op1Signed || !Op2Signed; | ||||
269 | bool IsConstant = Op1Constant || Op2Constant; | ||||
270 | bool IsSext = Op1Sext || Op2Sext; | ||||
271 | if (IsConstant || IsZeroExtended || IsSext) | ||||
272 | LT.second = | ||||
273 | MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements()); | ||||
274 | } | ||||
275 | |||||
276 | // Check if the vXi32 operands can be shrunk into a smaller datatype. | ||||
277 | // This should match the codegen from reduceVMULWidth. | ||||
278 | // TODO: Make this generic (!ST->SSE41 || ST->isPMULLDSlow()). | ||||
279 | if (ST->useSLMArithCosts() && LT.second == MVT::v4i32) { | ||||
280 | if (OpMinSize <= 7) | ||||
281 | return LT.first * 3; // pmullw/sext | ||||
282 | if (!SignedMode && OpMinSize <= 8) | ||||
283 | return LT.first * 3; // pmullw/zext | ||||
284 | if (OpMinSize <= 15) | ||||
285 | return LT.first * 5; // pmullw/pmulhw/pshuf | ||||
286 | if (!SignedMode && OpMinSize <= 16) | ||||
287 | return LT.first * 5; // pmullw/pmulhw/pshuf | ||||
288 | } | ||||
289 | } | ||||
290 | |||||
291 | // Vector multiply by pow2 will be simplified to shifts. | ||||
292 | // Vector multiply by -pow2 will be simplified to shifts/negates. | ||||
293 | if (ISD == ISD::MUL && Op2Info.isConstant() && | ||||
294 | (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2())) { | ||||
295 | InstructionCost Cost = | ||||
296 | getArithmeticInstrCost(Instruction::Shl, Ty, CostKind, | ||||
297 | Op1Info.getNoProps(), Op2Info.getNoProps()); | ||||
298 | if (Op2Info.isNegatedPowerOf2()) | ||||
299 | Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind); | ||||
300 | return Cost; | ||||
301 | } | ||||
302 | |||||
303 | // On X86, vector signed division by constants power-of-two are | ||||
304 | // normally expanded to the sequence SRA + SRL + ADD + SRA. | ||||
305 | // The OperandValue properties may not be the same as that of the previous | ||||
306 | // operation; conservatively assume OP_None. | ||||
307 | if ((ISD == ISD::SDIV || ISD == ISD::SREM) && | ||||
308 | Op2Info.isConstant() && Op2Info.isPowerOf2()) { | ||||
309 | InstructionCost Cost = | ||||
310 | 2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, | ||||
311 | Op1Info.getNoProps(), Op2Info.getNoProps()); | ||||
312 | Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, | ||||
313 | Op1Info.getNoProps(), Op2Info.getNoProps()); | ||||
314 | Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind, | ||||
315 | Op1Info.getNoProps(), Op2Info.getNoProps()); | ||||
316 | |||||
317 | if (ISD == ISD::SREM) { | ||||
318 | // For SREM: (X % C) is the equivalent of (X - (X/C)*C) | ||||
319 | Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(), | ||||
320 | Op2Info.getNoProps()); | ||||
321 | Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info.getNoProps(), | ||||
322 | Op2Info.getNoProps()); | ||||
323 | } | ||||
324 | |||||
325 | return Cost; | ||||
326 | } | ||||
327 | |||||
328 | // Vector unsigned division/remainder will be simplified to shifts/masks. | ||||
329 | if ((ISD == ISD::UDIV || ISD == ISD::UREM) && | ||||
330 | Op2Info.isConstant() && Op2Info.isPowerOf2()) { | ||||
331 | if (ISD == ISD::UDIV) | ||||
332 | return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, | ||||
333 | Op1Info.getNoProps(), Op2Info.getNoProps()); | ||||
334 | // UREM | ||||
335 | return getArithmeticInstrCost(Instruction::And, Ty, CostKind, | ||||
336 | Op1Info.getNoProps(), Op2Info.getNoProps()); | ||||
337 | } | ||||
338 | |||||
339 | static const CostKindTblEntry AVX512BWUniformConstCostTable[] = { | ||||
340 | { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand. | ||||
341 | { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand. | ||||
342 | { ISD::SRA, MVT::v16i8, { 1, 8, 4, 5 } }, // psrlw, pand, pxor, psubb. | ||||
343 | { ISD::SHL, MVT::v32i8, { 1, 8, 2, 3 } }, // psllw + pand. | ||||
344 | { ISD::SRL, MVT::v32i8, { 1, 8, 2, 3 } }, // psrlw + pand. | ||||
345 | { ISD::SRA, MVT::v32i8, { 1, 9, 4, 5 } }, // psrlw, pand, pxor, psubb. | ||||
346 | { ISD::SHL, MVT::v64i8, { 1, 8, 2, 3 } }, // psllw + pand. | ||||
347 | { ISD::SRL, MVT::v64i8, { 1, 8, 2, 3 } }, // psrlw + pand. | ||||
348 | { ISD::SRA, MVT::v64i8, { 1, 9, 4, 6 } }, // psrlw, pand, pxor, psubb. | ||||
349 | |||||
350 | { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // psllw | ||||
351 | { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw | ||||
352 | { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw | ||||
353 | { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // psllw | ||||
354 | { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw | ||||
355 | { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw | ||||
356 | }; | ||||
357 | |||||
358 | if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasBWI()) | ||||
359 | if (const auto *Entry = | ||||
360 | CostTableLookup(AVX512BWUniformConstCostTable, ISD, LT.second)) | ||||
361 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
362 | return LT.first * *KindCost; | ||||
363 | |||||
364 | static const CostKindTblEntry AVX512UniformConstCostTable[] = { | ||||
365 | { ISD::SHL, MVT::v64i8, { 2, 12, 5, 6 } }, // psllw + pand. | ||||
366 | { ISD::SRL, MVT::v64i8, { 2, 12, 5, 6 } }, // psrlw + pand. | ||||
367 | { ISD::SRA, MVT::v64i8, { 3, 10, 12, 12 } }, // psrlw, pand, pxor, psubb. | ||||
368 | |||||
369 | { ISD::SHL, MVT::v16i16, { 2, 7, 4, 4 } }, // psllw + split. | ||||
370 | { ISD::SRL, MVT::v16i16, { 2, 7, 4, 4 } }, // psrlw + split. | ||||
371 | { ISD::SRA, MVT::v16i16, { 2, 7, 4, 4 } }, // psraw + split. | ||||
372 | |||||
373 | { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } }, // pslld | ||||
374 | { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } }, // psrld | ||||
375 | { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } }, // psrad | ||||
376 | { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } }, // pslld | ||||
377 | { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } }, // psrld | ||||
378 | { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } }, // psrad | ||||
379 | |||||
380 | { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } }, // psraq | ||||
381 | { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } }, // psllq | ||||
382 | { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } }, // psrlq | ||||
383 | { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } }, // psraq | ||||
384 | { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } }, // psllq | ||||
385 | { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } }, // psrlq | ||||
386 | { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } }, // psraq | ||||
387 | |||||
388 | { ISD::SDIV, MVT::v16i32, { 6 } }, // pmuludq sequence | ||||
389 | { ISD::SREM, MVT::v16i32, { 8 } }, // pmuludq+mul+sub sequence | ||||
390 | { ISD::UDIV, MVT::v16i32, { 5 } }, // pmuludq sequence | ||||
391 | { ISD::UREM, MVT::v16i32, { 7 } }, // pmuludq+mul+sub sequence | ||||
392 | }; | ||||
393 | |||||
394 | if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX512()) | ||||
395 | if (const auto *Entry = | ||||
396 | CostTableLookup(AVX512UniformConstCostTable, ISD, LT.second)) | ||||
397 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
398 | return LT.first * *KindCost; | ||||
399 | |||||
400 | static const CostKindTblEntry AVX2UniformConstCostTable[] = { | ||||
401 | { ISD::SHL, MVT::v16i8, { 1, 8, 2, 3 } }, // psllw + pand. | ||||
402 | { ISD::SRL, MVT::v16i8, { 1, 8, 2, 3 } }, // psrlw + pand. | ||||
403 | { ISD::SRA, MVT::v16i8, { 2, 10, 5, 6 } }, // psrlw, pand, pxor, psubb. | ||||
404 | { ISD::SHL, MVT::v32i8, { 2, 8, 2, 4 } }, // psllw + pand. | ||||
405 | { ISD::SRL, MVT::v32i8, { 2, 8, 2, 4 } }, // psrlw + pand. | ||||
406 | { ISD::SRA, MVT::v32i8, { 3, 10, 5, 9 } }, // psrlw, pand, pxor, psubb. | ||||
407 | |||||
408 | { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw | ||||
409 | { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw | ||||
410 | { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw | ||||
411 | { ISD::SHL, MVT::v16i16,{ 2, 2, 1, 2 } }, // psllw | ||||
412 | { ISD::SRL, MVT::v16i16,{ 2, 2, 1, 2 } }, // psrlw | ||||
413 | { ISD::SRA, MVT::v16i16,{ 2, 2, 1, 2 } }, // psraw | ||||
414 | |||||
415 | { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld | ||||
416 | { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld | ||||
417 | { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad | ||||
418 | { ISD::SHL, MVT::v8i32, { 2, 2, 1, 2 } }, // pslld | ||||
419 | { ISD::SRL, MVT::v8i32, { 2, 2, 1, 2 } }, // psrld | ||||
420 | { ISD::SRA, MVT::v8i32, { 2, 2, 1, 2 } }, // psrad | ||||
421 | |||||
422 | { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq | ||||
423 | { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq | ||||
424 | { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle. | ||||
425 | { ISD::SHL, MVT::v4i64, { 2, 2, 1, 2 } }, // psllq | ||||
426 | { ISD::SRL, MVT::v4i64, { 2, 2, 1, 2 } }, // psrlq | ||||
427 | { ISD::SRA, MVT::v4i64, { 4, 4, 3, 6 } }, // psrad + shuffle + split. | ||||
428 | |||||
429 | { ISD::SDIV, MVT::v8i32, { 6 } }, // pmuludq sequence | ||||
430 | { ISD::SREM, MVT::v8i32, { 8 } }, // pmuludq+mul+sub sequence | ||||
431 | { ISD::UDIV, MVT::v8i32, { 5 } }, // pmuludq sequence | ||||
432 | { ISD::UREM, MVT::v8i32, { 7 } }, // pmuludq+mul+sub sequence | ||||
433 | }; | ||||
434 | |||||
435 | if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX2()) | ||||
436 | if (const auto *Entry = | ||||
437 | CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second)) | ||||
438 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
439 | return LT.first * *KindCost; | ||||
440 | |||||
441 | static const CostKindTblEntry AVXUniformConstCostTable[] = { | ||||
442 | { ISD::SHL, MVT::v16i8, { 2, 7, 2, 3 } }, // psllw + pand. | ||||
443 | { ISD::SRL, MVT::v16i8, { 2, 7, 2, 3 } }, // psrlw + pand. | ||||
444 | { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb. | ||||
445 | { ISD::SHL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psllw + pand) + split. | ||||
446 | { ISD::SRL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psrlw + pand) + split. | ||||
447 | { ISD::SRA, MVT::v32i8, { 7, 7, 12, 13 } }, // 2*(psrlw, pand, pxor, psubb) + split. | ||||
448 | |||||
449 | { ISD::SHL, MVT::v8i16, { 1, 2, 1, 1 } }, // psllw. | ||||
450 | { ISD::SRL, MVT::v8i16, { 1, 2, 1, 1 } }, // psrlw. | ||||
451 | { ISD::SRA, MVT::v8i16, { 1, 2, 1, 1 } }, // psraw. | ||||
452 | { ISD::SHL, MVT::v16i16,{ 3, 6, 4, 5 } }, // psllw + split. | ||||
453 | { ISD::SRL, MVT::v16i16,{ 3, 6, 4, 5 } }, // psrlw + split. | ||||
454 | { ISD::SRA, MVT::v16i16,{ 3, 6, 4, 5 } }, // psraw + split. | ||||
455 | |||||
456 | { ISD::SHL, MVT::v4i32, { 1, 2, 1, 1 } }, // pslld. | ||||
457 | { ISD::SRL, MVT::v4i32, { 1, 2, 1, 1 } }, // psrld. | ||||
458 | { ISD::SRA, MVT::v4i32, { 1, 2, 1, 1 } }, // psrad. | ||||
459 | { ISD::SHL, MVT::v8i32, { 3, 6, 4, 5 } }, // pslld + split. | ||||
460 | { ISD::SRL, MVT::v8i32, { 3, 6, 4, 5 } }, // psrld + split. | ||||
461 | { ISD::SRA, MVT::v8i32, { 3, 6, 4, 5 } }, // psrad + split. | ||||
462 | |||||
463 | { ISD::SHL, MVT::v2i64, { 1, 2, 1, 1 } }, // psllq. | ||||
464 | { ISD::SRL, MVT::v2i64, { 1, 2, 1, 1 } }, // psrlq. | ||||
465 | { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle. | ||||
466 | { ISD::SHL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psllq + split. | ||||
467 | { ISD::SRL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psllq + split. | ||||
468 | { ISD::SRA, MVT::v4i64, { 5, 7, 8, 9 } }, // 2 x psrad + shuffle + split. | ||||
469 | |||||
470 | { ISD::SDIV, MVT::v8i32, { 14 } }, // 2*pmuludq sequence + split. | ||||
471 | { ISD::SREM, MVT::v8i32, { 18 } }, // 2*pmuludq+mul+sub sequence + split. | ||||
472 | { ISD::UDIV, MVT::v8i32, { 12 } }, // 2*pmuludq sequence + split. | ||||
473 | { ISD::UREM, MVT::v8i32, { 16 } }, // 2*pmuludq+mul+sub sequence + split. | ||||
474 | }; | ||||
475 | |||||
476 | // XOP has faster vXi8 shifts. | ||||
477 | if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX() && | ||||
478 | (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8)) | ||||
479 | if (const auto *Entry = | ||||
480 | CostTableLookup(AVXUniformConstCostTable, ISD, LT.second)) | ||||
481 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
482 | return LT.first * *KindCost; | ||||
483 | |||||
484 | static const CostKindTblEntry SSE2UniformConstCostTable[] = { | ||||
485 | { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand. | ||||
486 | { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand. | ||||
487 | { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb. | ||||
488 | |||||
489 | { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw. | ||||
490 | { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw. | ||||
491 | { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw. | ||||
492 | |||||
493 | { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld | ||||
494 | { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld. | ||||
495 | { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad. | ||||
496 | |||||
497 | { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq. | ||||
498 | { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq. | ||||
499 | { ISD::SRA, MVT::v2i64, { 3, 5, 6, 6 } }, // 2 x psrad + shuffle. | ||||
500 | |||||
501 | { ISD::SDIV, MVT::v4i32, { 6 } }, // pmuludq sequence | ||||
502 | { ISD::SREM, MVT::v4i32, { 8 } }, // pmuludq+mul+sub sequence | ||||
503 | { ISD::UDIV, MVT::v4i32, { 5 } }, // pmuludq sequence | ||||
504 | { ISD::UREM, MVT::v4i32, { 7 } }, // pmuludq+mul+sub sequence | ||||
505 | }; | ||||
506 | |||||
507 | // XOP has faster vXi8 shifts. | ||||
508 | if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasSSE2() && | ||||
509 | (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8)) | ||||
510 | if (const auto *Entry = | ||||
511 | CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second)) | ||||
512 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
513 | return LT.first * *KindCost; | ||||
514 | |||||
515 | static const CostKindTblEntry AVX512BWConstCostTable[] = { | ||||
516 | { ISD::SDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence | ||||
517 | { ISD::SREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence | ||||
518 | { ISD::UDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence | ||||
519 | { ISD::UREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence | ||||
520 | |||||
521 | { ISD::SDIV, MVT::v32i16, { 6 } }, // vpmulhw sequence | ||||
522 | { ISD::SREM, MVT::v32i16, { 8 } }, // vpmulhw+mul+sub sequence | ||||
523 | { ISD::UDIV, MVT::v32i16, { 6 } }, // vpmulhuw sequence | ||||
524 | { ISD::UREM, MVT::v32i16, { 8 } }, // vpmulhuw+mul+sub sequence | ||||
525 | }; | ||||
526 | |||||
527 | if (Op2Info.isConstant() && ST->hasBWI()) | ||||
528 | if (const auto *Entry = | ||||
529 | CostTableLookup(AVX512BWConstCostTable, ISD, LT.second)) | ||||
530 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
531 | return LT.first * *KindCost; | ||||
532 | |||||
533 | static const CostKindTblEntry AVX512ConstCostTable[] = { | ||||
534 | { ISD::SDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence | ||||
535 | { ISD::SREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence | ||||
536 | { ISD::UDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence | ||||
537 | { ISD::UREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence | ||||
538 | |||||
539 | { ISD::SDIV, MVT::v32i16, { 12 } }, // 2*vpmulhw sequence | ||||
540 | { ISD::SREM, MVT::v32i16, { 16 } }, // 2*vpmulhw+mul+sub sequence | ||||
541 | { ISD::UDIV, MVT::v32i16, { 12 } }, // 2*vpmulhuw sequence | ||||
542 | { ISD::UREM, MVT::v32i16, { 16 } }, // 2*vpmulhuw+mul+sub sequence | ||||
543 | |||||
544 | { ISD::SDIV, MVT::v16i32, { 15 } }, // vpmuldq sequence | ||||
545 | { ISD::SREM, MVT::v16i32, { 17 } }, // vpmuldq+mul+sub sequence | ||||
546 | { ISD::UDIV, MVT::v16i32, { 15 } }, // vpmuludq sequence | ||||
547 | { ISD::UREM, MVT::v16i32, { 17 } }, // vpmuludq+mul+sub sequence | ||||
548 | }; | ||||
549 | |||||
550 | if (Op2Info.isConstant() && ST->hasAVX512()) | ||||
551 | if (const auto *Entry = | ||||
552 | CostTableLookup(AVX512ConstCostTable, ISD, LT.second)) | ||||
553 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
554 | return LT.first * *KindCost; | ||||
555 | |||||
556 | static const CostKindTblEntry AVX2ConstCostTable[] = { | ||||
557 | { ISD::SDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence | ||||
558 | { ISD::SREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence | ||||
559 | { ISD::UDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence | ||||
560 | { ISD::UREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence | ||||
561 | |||||
562 | { ISD::SDIV, MVT::v16i16, { 6 } }, // vpmulhw sequence | ||||
563 | { ISD::SREM, MVT::v16i16, { 8 } }, // vpmulhw+mul+sub sequence | ||||
564 | { ISD::UDIV, MVT::v16i16, { 6 } }, // vpmulhuw sequence | ||||
565 | { ISD::UREM, MVT::v16i16, { 8 } }, // vpmulhuw+mul+sub sequence | ||||
566 | |||||
567 | { ISD::SDIV, MVT::v8i32, { 15 } }, // vpmuldq sequence | ||||
568 | { ISD::SREM, MVT::v8i32, { 19 } }, // vpmuldq+mul+sub sequence | ||||
569 | { ISD::UDIV, MVT::v8i32, { 15 } }, // vpmuludq sequence | ||||
570 | { ISD::UREM, MVT::v8i32, { 19 } }, // vpmuludq+mul+sub sequence | ||||
571 | }; | ||||
572 | |||||
573 | if (Op2Info.isConstant() && ST->hasAVX2()) | ||||
574 | if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second)) | ||||
575 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
576 | return LT.first * *KindCost; | ||||
577 | |||||
578 | static const CostKindTblEntry AVXConstCostTable[] = { | ||||
579 | { ISD::SDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split. | ||||
580 | { ISD::SREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split. | ||||
581 | { ISD::UDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split. | ||||
582 | { ISD::UREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split. | ||||
583 | |||||
584 | { ISD::SDIV, MVT::v16i16, { 14 } }, // 2*pmulhw sequence + split. | ||||
585 | { ISD::SREM, MVT::v16i16, { 18 } }, // 2*pmulhw+mul+sub sequence + split. | ||||
586 | { ISD::UDIV, MVT::v16i16, { 14 } }, // 2*pmulhuw sequence + split. | ||||
587 | { ISD::UREM, MVT::v16i16, { 18 } }, // 2*pmulhuw+mul+sub sequence + split. | ||||
588 | |||||
589 | { ISD::SDIV, MVT::v8i32, { 32 } }, // vpmuludq sequence | ||||
590 | { ISD::SREM, MVT::v8i32, { 38 } }, // vpmuludq+mul+sub sequence | ||||
591 | { ISD::UDIV, MVT::v8i32, { 32 } }, // 2*pmuludq sequence + split. | ||||
592 | { ISD::UREM, MVT::v8i32, { 42 } }, // 2*pmuludq+mul+sub sequence + split. | ||||
593 | }; | ||||
594 | |||||
595 | if (Op2Info.isConstant() && ST->hasAVX()) | ||||
596 | if (const auto *Entry = CostTableLookup(AVXConstCostTable, ISD, LT.second)) | ||||
597 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
598 | return LT.first * *KindCost; | ||||
599 | |||||
600 | static const CostKindTblEntry SSE41ConstCostTable[] = { | ||||
601 | { ISD::SDIV, MVT::v4i32, { 15 } }, // vpmuludq sequence | ||||
602 | { ISD::SREM, MVT::v4i32, { 20 } }, // vpmuludq+mul+sub sequence | ||||
603 | }; | ||||
604 | |||||
605 | if (Op2Info.isConstant() && ST->hasSSE41()) | ||||
606 | if (const auto *Entry = | ||||
607 | CostTableLookup(SSE41ConstCostTable, ISD, LT.second)) | ||||
608 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
609 | return LT.first * *KindCost; | ||||
610 | |||||
611 | static const CostKindTblEntry SSE2ConstCostTable[] = { | ||||
612 | { ISD::SDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence | ||||
613 | { ISD::SREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence | ||||
614 | { ISD::UDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence | ||||
615 | { ISD::UREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence | ||||
616 | |||||
617 | { ISD::SDIV, MVT::v8i16, { 6 } }, // pmulhw sequence | ||||
618 | { ISD::SREM, MVT::v8i16, { 8 } }, // pmulhw+mul+sub sequence | ||||
619 | { ISD::UDIV, MVT::v8i16, { 6 } }, // pmulhuw sequence | ||||
620 | { ISD::UREM, MVT::v8i16, { 8 } }, // pmulhuw+mul+sub sequence | ||||
621 | |||||
622 | { ISD::SDIV, MVT::v4i32, { 19 } }, // pmuludq sequence | ||||
623 | { ISD::SREM, MVT::v4i32, { 24 } }, // pmuludq+mul+sub sequence | ||||
624 | { ISD::UDIV, MVT::v4i32, { 15 } }, // pmuludq sequence | ||||
625 | { ISD::UREM, MVT::v4i32, { 20 } }, // pmuludq+mul+sub sequence | ||||
626 | }; | ||||
627 | |||||
628 | if (Op2Info.isConstant() && ST->hasSSE2()) | ||||
629 | if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second)) | ||||
630 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
631 | return LT.first * *KindCost; | ||||
632 | |||||
633 | static const CostKindTblEntry AVX512BWUniformCostTable[] = { | ||||
634 | { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand. | ||||
635 | { ISD::SRL, MVT::v16i8, { 3,10, 5, 8 } }, // psrlw + pand. | ||||
636 | { ISD::SRA, MVT::v16i8, { 4,12, 8,12 } }, // psrlw, pand, pxor, psubb. | ||||
637 | { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand. | ||||
638 | { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand. | ||||
639 | { ISD::SRA, MVT::v32i8, { 5,10,10,13 } }, // psrlw, pand, pxor, psubb. | ||||
640 | { ISD::SHL, MVT::v64i8, { 4, 7, 6, 8 } }, // psllw + pand. | ||||
641 | { ISD::SRL, MVT::v64i8, { 4, 8, 7,10 } }, // psrlw + pand. | ||||
642 | { ISD::SRA, MVT::v64i8, { 5,10,10,15 } }, // psrlw, pand, pxor, psubb. | ||||
643 | |||||
644 | { ISD::SHL, MVT::v32i16, { 2, 4, 2, 3 } }, // psllw | ||||
645 | { ISD::SRL, MVT::v32i16, { 2, 4, 2, 3 } }, // psrlw | ||||
646 | { ISD::SRA, MVT::v32i16, { 2, 4, 2, 3 } }, // psrqw | ||||
647 | }; | ||||
648 | |||||
649 | if (ST->hasBWI() && Op2Info.isUniform()) | ||||
650 | if (const auto *Entry = | ||||
651 | CostTableLookup(AVX512BWUniformCostTable, ISD, LT.second)) | ||||
652 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
653 | return LT.first * *KindCost; | ||||
654 | |||||
655 | static const CostKindTblEntry AVX512UniformCostTable[] = { | ||||
656 | { ISD::SHL, MVT::v32i16, { 5,10, 5, 7 } }, // psllw + split. | ||||
657 | { ISD::SRL, MVT::v32i16, { 5,10, 5, 7 } }, // psrlw + split. | ||||
658 | { ISD::SRA, MVT::v32i16, { 5,10, 5, 7 } }, // psraw + split. | ||||
659 | |||||
660 | { ISD::SHL, MVT::v16i32, { 2, 4, 2, 3 } }, // pslld | ||||
661 | { ISD::SRL, MVT::v16i32, { 2, 4, 2, 3 } }, // psrld | ||||
662 | { ISD::SRA, MVT::v16i32, { 2, 4, 2, 3 } }, // psrad | ||||
663 | |||||
664 | { ISD::SRA, MVT::v2i64, { 1, 2, 1, 2 } }, // psraq | ||||
665 | { ISD::SHL, MVT::v4i64, { 1, 4, 1, 2 } }, // psllq | ||||
666 | { ISD::SRL, MVT::v4i64, { 1, 4, 1, 2 } }, // psrlq | ||||
667 | { ISD::SRA, MVT::v4i64, { 1, 4, 1, 2 } }, // psraq | ||||
668 | { ISD::SHL, MVT::v8i64, { 1, 4, 1, 2 } }, // psllq | ||||
669 | { ISD::SRL, MVT::v8i64, { 1, 4, 1, 2 } }, // psrlq | ||||
670 | { ISD::SRA, MVT::v8i64, { 1, 4, 1, 2 } }, // psraq | ||||
671 | }; | ||||
672 | |||||
673 | if (ST->hasAVX512() && Op2Info.isUniform()) | ||||
674 | if (const auto *Entry = | ||||
675 | CostTableLookup(AVX512UniformCostTable, ISD, LT.second)) | ||||
676 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
677 | return LT.first * *KindCost; | ||||
678 | |||||
679 | static const CostKindTblEntry AVX2UniformCostTable[] = { | ||||
680 | // Uniform splats are cheaper for the following instructions. | ||||
681 | { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand. | ||||
682 | { ISD::SRL, MVT::v16i8, { 3, 9, 5, 8 } }, // psrlw + pand. | ||||
683 | { ISD::SRA, MVT::v16i8, { 4, 5, 9,13 } }, // psrlw, pand, pxor, psubb. | ||||
684 | { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand. | ||||
685 | { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand. | ||||
686 | { ISD::SRA, MVT::v32i8, { 6, 9,11,16 } }, // psrlw, pand, pxor, psubb. | ||||
687 | |||||
688 | { ISD::SHL, MVT::v8i16, { 1, 2, 1, 2 } }, // psllw. | ||||
689 | { ISD::SRL, MVT::v8i16, { 1, 2, 1, 2 } }, // psrlw. | ||||
690 | { ISD::SRA, MVT::v8i16, { 1, 2, 1, 2 } }, // psraw. | ||||
691 | { ISD::SHL, MVT::v16i16, { 2, 4, 2, 3 } }, // psllw. | ||||
692 | { ISD::SRL, MVT::v16i16, { 2, 4, 2, 3 } }, // psrlw. | ||||
693 | { ISD::SRA, MVT::v16i16, { 2, 4, 2, 3 } }, // psraw. | ||||
694 | |||||
695 | { ISD::SHL, MVT::v4i32, { 1, 2, 1, 2 } }, // pslld | ||||
696 | { ISD::SRL, MVT::v4i32, { 1, 2, 1, 2 } }, // psrld | ||||
697 | { ISD::SRA, MVT::v4i32, { 1, 2, 1, 2 } }, // psrad | ||||
698 | { ISD::SHL, MVT::v8i32, { 2, 4, 2, 3 } }, // pslld | ||||
699 | { ISD::SRL, MVT::v8i32, { 2, 4, 2, 3 } }, // psrld | ||||
700 | { ISD::SRA, MVT::v8i32, { 2, 4, 2, 3 } }, // psrad | ||||
701 | |||||
702 | { ISD::SHL, MVT::v2i64, { 1, 2, 1, 2 } }, // psllq | ||||
703 | { ISD::SRL, MVT::v2i64, { 1, 2, 1, 2 } }, // psrlq | ||||
704 | { ISD::SRA, MVT::v2i64, { 2, 4, 5, 7 } }, // 2 x psrad + shuffle. | ||||
705 | { ISD::SHL, MVT::v4i64, { 2, 4, 1, 2 } }, // psllq | ||||
706 | { ISD::SRL, MVT::v4i64, { 2, 4, 1, 2 } }, // psrlq | ||||
707 | { ISD::SRA, MVT::v4i64, { 4, 6, 5, 9 } }, // 2 x psrad + shuffle. | ||||
708 | }; | ||||
709 | |||||
710 | if (ST->hasAVX2() && Op2Info.isUniform()) | ||||
711 | if (const auto *Entry = | ||||
712 | CostTableLookup(AVX2UniformCostTable, ISD, LT.second)) | ||||
713 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
714 | return LT.first * *KindCost; | ||||
715 | |||||
716 | static const CostKindTblEntry AVXUniformCostTable[] = { | ||||
717 | { ISD::SHL, MVT::v16i8, { 4, 4, 6, 8 } }, // psllw + pand. | ||||
718 | { ISD::SRL, MVT::v16i8, { 4, 8, 5, 8 } }, // psrlw + pand. | ||||
719 | { ISD::SRA, MVT::v16i8, { 6, 6, 9,13 } }, // psrlw, pand, pxor, psubb. | ||||
720 | { ISD::SHL, MVT::v32i8, { 7, 8,11,14 } }, // psllw + pand + split. | ||||
721 | { ISD::SRL, MVT::v32i8, { 7, 9,10,14 } }, // psrlw + pand + split. | ||||
722 | { ISD::SRA, MVT::v32i8, { 10,11,16,21 } }, // psrlw, pand, pxor, psubb + split. | ||||
723 | |||||
724 | { ISD::SHL, MVT::v8i16, { 1, 3, 1, 2 } }, // psllw. | ||||
725 | { ISD::SRL, MVT::v8i16, { 1, 3, 1, 2 } }, // psrlw. | ||||
726 | { ISD::SRA, MVT::v8i16, { 1, 3, 1, 2 } }, // psraw. | ||||
727 | { ISD::SHL, MVT::v16i16, { 3, 7, 5, 7 } }, // psllw + split. | ||||
728 | { ISD::SRL, MVT::v16i16, { 3, 7, 5, 7 } }, // psrlw + split. | ||||
729 | { ISD::SRA, MVT::v16i16, { 3, 7, 5, 7 } }, // psraw + split. | ||||
730 | |||||
731 | { ISD::SHL, MVT::v4i32, { 1, 3, 1, 2 } }, // pslld. | ||||
732 | { ISD::SRL, MVT::v4i32, { 1, 3, 1, 2 } }, // psrld. | ||||
733 | { ISD::SRA, MVT::v4i32, { 1, 3, 1, 2 } }, // psrad. | ||||
734 | { ISD::SHL, MVT::v8i32, { 3, 7, 5, 7 } }, // pslld + split. | ||||
735 | { ISD::SRL, MVT::v8i32, { 3, 7, 5, 7 } }, // psrld + split. | ||||
736 | { ISD::SRA, MVT::v8i32, { 3, 7, 5, 7 } }, // psrad + split. | ||||
737 | |||||
738 | { ISD::SHL, MVT::v2i64, { 1, 3, 1, 2 } }, // psllq. | ||||
739 | { ISD::SRL, MVT::v2i64, { 1, 3, 1, 2 } }, // psrlq. | ||||
740 | { ISD::SRA, MVT::v2i64, { 3, 4, 5, 7 } }, // 2 x psrad + shuffle. | ||||
741 | { ISD::SHL, MVT::v4i64, { 3, 7, 4, 6 } }, // psllq + split. | ||||
742 | { ISD::SRL, MVT::v4i64, { 3, 7, 4, 6 } }, // psrlq + split. | ||||
743 | { ISD::SRA, MVT::v4i64, { 6, 7,10,13 } }, // 2 x (2 x psrad + shuffle) + split. | ||||
744 | }; | ||||
745 | |||||
746 | // XOP has faster vXi8 shifts. | ||||
747 | if (ST->hasAVX() && Op2Info.isUniform() && | ||||
748 | (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8)) | ||||
749 | if (const auto *Entry = | ||||
750 | CostTableLookup(AVXUniformCostTable, ISD, LT.second)) | ||||
751 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
752 | return LT.first * *KindCost; | ||||
753 | |||||
754 | static const CostKindTblEntry SSE2UniformCostTable[] = { | ||||
755 | // Uniform splats are cheaper for the following instructions. | ||||
756 | { ISD::SHL, MVT::v16i8, { 9, 10, 6, 9 } }, // psllw + pand. | ||||
757 | { ISD::SRL, MVT::v16i8, { 9, 13, 5, 9 } }, // psrlw + pand. | ||||
758 | { ISD::SRA, MVT::v16i8, { 11, 15, 9,13 } }, // pcmpgtb sequence. | ||||
759 | |||||
760 | { ISD::SHL, MVT::v8i16, { 2, 2, 1, 2 } }, // psllw. | ||||
761 | { ISD::SRL, MVT::v8i16, { 2, 2, 1, 2 } }, // psrlw. | ||||
762 | { ISD::SRA, MVT::v8i16, { 2, 2, 1, 2 } }, // psraw. | ||||
763 | |||||
764 | { ISD::SHL, MVT::v4i32, { 2, 2, 1, 2 } }, // pslld | ||||
765 | { ISD::SRL, MVT::v4i32, { 2, 2, 1, 2 } }, // psrld. | ||||
766 | { ISD::SRA, MVT::v4i32, { 2, 2, 1, 2 } }, // psrad. | ||||
767 | |||||
768 | { ISD::SHL, MVT::v2i64, { 2, 2, 1, 2 } }, // psllq. | ||||
769 | { ISD::SRL, MVT::v2i64, { 2, 2, 1, 2 } }, // psrlq. | ||||
770 | { ISD::SRA, MVT::v2i64, { 5, 9, 5, 7 } }, // 2*psrlq + xor + sub. | ||||
771 | }; | ||||
772 | |||||
773 | if (ST->hasSSE2() && Op2Info.isUniform() && | ||||
774 | (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8)) | ||||
775 | if (const auto *Entry = | ||||
776 | CostTableLookup(SSE2UniformCostTable, ISD, LT.second)) | ||||
777 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
778 | return LT.first * *KindCost; | ||||
779 | |||||
780 | static const CostKindTblEntry AVX512DQCostTable[] = { | ||||
781 | { ISD::MUL, MVT::v2i64, { 2, 15, 1, 3 } }, // pmullq | ||||
782 | { ISD::MUL, MVT::v4i64, { 2, 15, 1, 3 } }, // pmullq | ||||
783 | { ISD::MUL, MVT::v8i64, { 3, 15, 1, 3 } } // pmullq | ||||
784 | }; | ||||
785 | |||||
786 | // Look for AVX512DQ lowering tricks for custom cases. | ||||
787 | if (ST->hasDQI()) | ||||
788 | if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second)) | ||||
789 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
790 | return LT.first * *KindCost; | ||||
791 | |||||
792 | static const CostKindTblEntry AVX512BWCostTable[] = { | ||||
793 | { ISD::SHL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsllvw/pack sequence. | ||||
794 | { ISD::SRL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsrlvw/pack sequence. | ||||
795 | { ISD::SRA, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsravw/pack sequence. | ||||
796 | { ISD::SHL, MVT::v32i8, { 4, 23,11,16 } }, // extend/vpsllvw/pack sequence. | ||||
797 | { ISD::SRL, MVT::v32i8, { 4, 30,12,18 } }, // extend/vpsrlvw/pack sequence. | ||||
798 | { ISD::SRA, MVT::v32i8, { 6, 13,24,30 } }, // extend/vpsravw/pack sequence. | ||||
799 | { ISD::SHL, MVT::v64i8, { 6, 19,13,15 } }, // extend/vpsllvw/pack sequence. | ||||
800 | { ISD::SRL, MVT::v64i8, { 7, 27,15,18 } }, // extend/vpsrlvw/pack sequence. | ||||
801 | { ISD::SRA, MVT::v64i8, { 15, 15,30,30 } }, // extend/vpsravw/pack sequence. | ||||
802 | |||||
803 | { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsllvw | ||||
804 | { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsrlvw | ||||
805 | { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsravw | ||||
806 | { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsllvw | ||||
807 | { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsrlvw | ||||
808 | { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsravw | ||||
809 | { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsllvw | ||||
810 | { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsrlvw | ||||
811 | { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsravw | ||||
812 | |||||
813 | { ISD::ADD, MVT::v64i8, { 1, 1, 1, 1 } }, // paddb | ||||
814 | { ISD::ADD, MVT::v32i16, { 1, 1, 1, 1 } }, // paddw | ||||
815 | |||||
816 | { ISD::ADD, MVT::v32i8, { 1, 1, 1, 1 } }, // paddb | ||||
817 | { ISD::ADD, MVT::v16i16, { 1, 1, 1, 1 } }, // paddw | ||||
818 | { ISD::ADD, MVT::v8i32, { 1, 1, 1, 1 } }, // paddd | ||||
819 | { ISD::ADD, MVT::v4i64, { 1, 1, 1, 1 } }, // paddq | ||||
820 | |||||
821 | { ISD::SUB, MVT::v64i8, { 1, 1, 1, 1 } }, // psubb | ||||
822 | { ISD::SUB, MVT::v32i16, { 1, 1, 1, 1 } }, // psubw | ||||
823 | |||||
824 | { ISD::MUL, MVT::v32i16, { 1, 5, 1, 1 } }, // pmullw | ||||
825 | |||||
826 | { ISD::SUB, MVT::v32i8, { 1, 1, 1, 1 } }, // psubb | ||||
827 | { ISD::SUB, MVT::v16i16, { 1, 1, 1, 1 } }, // psubw | ||||
828 | { ISD::SUB, MVT::v8i32, { 1, 1, 1, 1 } }, // psubd | ||||
829 | { ISD::SUB, MVT::v4i64, { 1, 1, 1, 1 } }, // psubq | ||||
830 | }; | ||||
831 | |||||
832 | // Look for AVX512BW lowering tricks for custom cases. | ||||
833 | if (ST->hasBWI()) | ||||
834 | if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second)) | ||||
835 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
836 | return LT.first * *KindCost; | ||||
837 | |||||
838 | static const CostKindTblEntry AVX512CostTable[] = { | ||||
839 | { ISD::SHL, MVT::v64i8, { 15, 19,27,33 } }, // vpblendv+split sequence. | ||||
840 | { ISD::SRL, MVT::v64i8, { 15, 19,30,36 } }, // vpblendv+split sequence. | ||||
841 | { ISD::SRA, MVT::v64i8, { 37, 37,51,63 } }, // vpblendv+split sequence. | ||||
842 | |||||
843 | { ISD::SHL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence. | ||||
844 | { ISD::SRL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence. | ||||
845 | { ISD::SRA, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsravd/pack sequence. | ||||
846 | |||||
847 | { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, | ||||
848 | { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, | ||||
849 | { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, | ||||
850 | { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } }, | ||||
851 | { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } }, | ||||
852 | { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } }, | ||||
853 | { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } }, | ||||
854 | { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } }, | ||||
855 | { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } }, | ||||
856 | |||||
857 | { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, | ||||
858 | { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, | ||||
859 | { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } }, | ||||
860 | { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } }, | ||||
861 | { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } }, | ||||
862 | { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } }, | ||||
863 | { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } }, | ||||
864 | { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } }, | ||||
865 | { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } }, | ||||
866 | |||||
867 | { ISD::ADD, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*paddb + split | ||||
868 | { ISD::ADD, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*paddw + split | ||||
869 | |||||
870 | { ISD::SUB, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*psubb + split | ||||
871 | { ISD::SUB, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*psubw + split | ||||
872 | |||||
873 | { ISD::AND, MVT::v32i8, { 1, 1, 1, 1 } }, | ||||
874 | { ISD::AND, MVT::v16i16, { 1, 1, 1, 1 } }, | ||||
875 | { ISD::AND, MVT::v8i32, { 1, 1, 1, 1 } }, | ||||
876 | { ISD::AND, MVT::v4i64, { 1, 1, 1, 1 } }, | ||||
877 | |||||
878 | { ISD::OR, MVT::v32i8, { 1, 1, 1, 1 } }, | ||||
879 | { ISD::OR, MVT::v16i16, { 1, 1, 1, 1 } }, | ||||
880 | { ISD::OR, MVT::v8i32, { 1, 1, 1, 1 } }, | ||||
881 | { ISD::OR, MVT::v4i64, { 1, 1, 1, 1 } }, | ||||
882 | |||||
883 | { ISD::XOR, MVT::v32i8, { 1, 1, 1, 1 } }, | ||||
884 | { ISD::XOR, MVT::v16i16, { 1, 1, 1, 1 } }, | ||||
885 | { ISD::XOR, MVT::v8i32, { 1, 1, 1, 1 } }, | ||||
886 | { ISD::XOR, MVT::v4i64, { 1, 1, 1, 1 } }, | ||||
887 | |||||
888 | { ISD::MUL, MVT::v16i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org) | ||||
889 | { ISD::MUL, MVT::v8i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org) | ||||
890 | { ISD::MUL, MVT::v4i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org) | ||||
891 | { ISD::MUL, MVT::v8i64, { 6, 9, 8, 8 } }, // 3*pmuludq/3*shift/2*add | ||||
892 | { ISD::MUL, MVT::i64, { 1 } }, // Skylake from http://www.agner.org/ | ||||
893 | |||||
894 | { ISD::FNEG, MVT::v8f64, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/ | ||||
895 | { ISD::FADD, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ | ||||
896 | { ISD::FADD, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ | ||||
897 | { ISD::FSUB, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ | ||||
898 | { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ | ||||
899 | { ISD::FMUL, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ | ||||
900 | { ISD::FMUL, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ | ||||
901 | { ISD::FMUL, MVT::v2f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ | ||||
902 | { ISD::FMUL, MVT::f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ | ||||
903 | |||||
904 | { ISD::FDIV, MVT::f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/ | ||||
905 | { ISD::FDIV, MVT::v2f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/ | ||||
906 | { ISD::FDIV, MVT::v4f64, { 8, 14, 1, 1 } }, // Skylake from http://www.agner.org/ | ||||
907 | { ISD::FDIV, MVT::v8f64, { 16, 23, 1, 3 } }, // Skylake from http://www.agner.org/ | ||||
908 | |||||
909 | { ISD::FNEG, MVT::v16f32, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/ | ||||
910 | { ISD::FADD, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ | ||||
911 | { ISD::FADD, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ | ||||
912 | { ISD::FSUB, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ | ||||
913 | { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ | ||||
914 | { ISD::FMUL, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ | ||||
915 | { ISD::FMUL, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ | ||||
916 | { ISD::FMUL, MVT::v4f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ | ||||
917 | { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ | ||||
918 | |||||
919 | { ISD::FDIV, MVT::f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/ | ||||
920 | { ISD::FDIV, MVT::v4f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/ | ||||
921 | { ISD::FDIV, MVT::v8f32, { 5, 11, 1, 1 } }, // Skylake from http://www.agner.org/ | ||||
922 | { ISD::FDIV, MVT::v16f32, { 10, 18, 1, 3 } }, // Skylake from http://www.agner.org/ | ||||
923 | }; | ||||
924 | |||||
925 | if (ST->hasAVX512()) | ||||
926 | if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second)) | ||||
927 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
928 | return LT.first * *KindCost; | ||||
929 | |||||
930 | static const CostKindTblEntry AVX2ShiftCostTable[] = { | ||||
931 | // Shifts on vXi64/vXi32 on AVX2 is legal even though we declare to | ||||
932 | // customize them to detect the cases where shift amount is a scalar one. | ||||
933 | { ISD::SHL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsllvd (Haswell from agner.org) | ||||
934 | { ISD::SRL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsrlvd (Haswell from agner.org) | ||||
935 | { ISD::SRA, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsravd (Haswell from agner.org) | ||||
936 | { ISD::SHL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsllvd (Haswell from agner.org) | ||||
937 | { ISD::SRL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsrlvd (Haswell from agner.org) | ||||
938 | { ISD::SRA, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsravd (Haswell from agner.org) | ||||
939 | { ISD::SHL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsllvq (Haswell from agner.org) | ||||
940 | { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsrlvq (Haswell from agner.org) | ||||
941 | { ISD::SHL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsllvq (Haswell from agner.org) | ||||
942 | { ISD::SRL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsrlvq (Haswell from agner.org) | ||||
943 | }; | ||||
944 | |||||
945 | if (ST->hasAVX512()) { | ||||
946 | if (ISD == ISD::SHL && LT.second == MVT::v32i16 && Op2Info.isConstant()) | ||||
947 | // On AVX512, a packed v32i16 shift left by a constant build_vector | ||||
948 | // is lowered into a vector multiply (vpmullw). | ||||
949 | return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, | ||||
950 | Op1Info.getNoProps(), Op2Info.getNoProps()); | ||||
951 | } | ||||
952 | |||||
953 | // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts). | ||||
954 | if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) { | ||||
955 | if (ISD == ISD::SHL && LT.second == MVT::v16i16 && | ||||
956 | Op2Info.isConstant()) | ||||
957 | // On AVX2, a packed v16i16 shift left by a constant build_vector | ||||
958 | // is lowered into a vector multiply (vpmullw). | ||||
959 | return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, | ||||
960 | Op1Info.getNoProps(), Op2Info.getNoProps()); | ||||
961 | |||||
962 | if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second)) | ||||
963 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
964 | return LT.first * *KindCost; | ||||
965 | } | ||||
966 | |||||
967 | static const CostKindTblEntry XOPShiftCostTable[] = { | ||||
968 | // 128bit shifts take 1cy, but right shifts require negation beforehand. | ||||
969 | { ISD::SHL, MVT::v16i8, { 1, 3, 1, 1 } }, | ||||
970 | { ISD::SRL, MVT::v16i8, { 2, 3, 1, 1 } }, | ||||
971 | { ISD::SRA, MVT::v16i8, { 2, 3, 1, 1 } }, | ||||
972 | { ISD::SHL, MVT::v8i16, { 1, 3, 1, 1 } }, | ||||
973 | { ISD::SRL, MVT::v8i16, { 2, 3, 1, 1 } }, | ||||
974 | { ISD::SRA, MVT::v8i16, { 2, 3, 1, 1 } }, | ||||
975 | { ISD::SHL, MVT::v4i32, { 1, 3, 1, 1 } }, | ||||
976 | { ISD::SRL, MVT::v4i32, { 2, 3, 1, 1 } }, | ||||
977 | { ISD::SRA, MVT::v4i32, { 2, 3, 1, 1 } }, | ||||
978 | { ISD::SHL, MVT::v2i64, { 1, 3, 1, 1 } }, | ||||
979 | { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } }, | ||||
980 | { ISD::SRA, MVT::v2i64, { 2, 3, 1, 1 } }, | ||||
981 | // 256bit shifts require splitting if AVX2 didn't catch them above. | ||||
982 | { ISD::SHL, MVT::v32i8, { 4, 7, 5, 6 } }, | ||||
983 | { ISD::SRL, MVT::v32i8, { 6, 7, 5, 6 } }, | ||||
984 | { ISD::SRA, MVT::v32i8, { 6, 7, 5, 6 } }, | ||||
985 | { ISD::SHL, MVT::v16i16, { 4, 7, 5, 6 } }, | ||||
986 | { ISD::SRL, MVT::v16i16, { 6, 7, 5, 6 } }, | ||||
987 | { ISD::SRA, MVT::v16i16, { 6, 7, 5, 6 } }, | ||||
988 | { ISD::SHL, MVT::v8i32, { 4, 7, 5, 6 } }, | ||||
989 | { ISD::SRL, MVT::v8i32, { 6, 7, 5, 6 } }, | ||||
990 | { ISD::SRA, MVT::v8i32, { 6, 7, 5, 6 } }, | ||||
991 | { ISD::SHL, MVT::v4i64, { 4, 7, 5, 6 } }, | ||||
992 | { ISD::SRL, MVT::v4i64, { 6, 7, 5, 6 } }, | ||||
993 | { ISD::SRA, MVT::v4i64, { 6, 7, 5, 6 } }, | ||||
994 | }; | ||||
995 | |||||
996 | // Look for XOP lowering tricks. | ||||
997 | if (ST->hasXOP()) { | ||||
998 | // If the right shift is constant then we'll fold the negation so | ||||
999 | // it's as cheap as a left shift. | ||||
1000 | int ShiftISD = ISD; | ||||
1001 | if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) && Op2Info.isConstant()) | ||||
1002 | ShiftISD = ISD::SHL; | ||||
1003 | if (const auto *Entry = | ||||
1004 | CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second)) | ||||
1005 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
1006 | return LT.first * *KindCost; | ||||
1007 | } | ||||
1008 | |||||
1009 | if (ISD == ISD::SHL && !Op2Info.isUniform() && Op2Info.isConstant()) { | ||||
1010 | MVT VT = LT.second; | ||||
1011 | // Vector shift left by non uniform constant can be lowered | ||||
1012 | // into vector multiply. | ||||
1013 | if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) || | ||||
1014 | ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX())) | ||||
1015 | ISD = ISD::MUL; | ||||
1016 | } | ||||
1017 | |||||
1018 | static const CostKindTblEntry GLMCostTable[] = { | ||||
1019 | { ISD::FDIV, MVT::f32, { 18, 19, 1, 1 } }, // divss | ||||
1020 | { ISD::FDIV, MVT::v4f32, { 35, 36, 1, 1 } }, // divps | ||||
1021 | { ISD::FDIV, MVT::f64, { 33, 34, 1, 1 } }, // divsd | ||||
1022 | { ISD::FDIV, MVT::v2f64, { 65, 66, 1, 1 } }, // divpd | ||||
1023 | }; | ||||
1024 | |||||
1025 | if (ST->useGLMDivSqrtCosts()) | ||||
1026 | if (const auto *Entry = CostTableLookup(GLMCostTable, ISD, LT.second)) | ||||
1027 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
1028 | return LT.first * *KindCost; | ||||
1029 | |||||
1030 | static const CostKindTblEntry SLMCostTable[] = { | ||||
1031 | { ISD::MUL, MVT::v4i32, { 11, 11, 1, 7 } }, // pmulld | ||||
1032 | { ISD::MUL, MVT::v8i16, { 2, 5, 1, 1 } }, // pmullw | ||||
1033 | { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // mulsd | ||||
1034 | { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // mulss | ||||
1035 | { ISD::FMUL, MVT::v2f64, { 4, 7, 1, 1 } }, // mulpd | ||||
1036 | { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // mulps | ||||
1037 | { ISD::FDIV, MVT::f32, { 17, 19, 1, 1 } }, // divss | ||||
1038 | { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 6 } }, // divps | ||||
1039 | { ISD::FDIV, MVT::f64, { 32, 34, 1, 1 } }, // divsd | ||||
1040 | { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 6 } }, // divpd | ||||
1041 | { ISD::FADD, MVT::v2f64, { 2, 4, 1, 1 } }, // addpd | ||||
1042 | { ISD::FSUB, MVT::v2f64, { 2, 4, 1, 1 } }, // subpd | ||||
1043 | // v2i64/v4i64 mul is custom lowered as a series of long: | ||||
1044 | // multiplies(3), shifts(3) and adds(2) | ||||
1045 | // slm muldq version throughput is 2 and addq throughput 4 | ||||
1046 | // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) + | ||||
1047 | // 3X4 (addq throughput) = 17 | ||||
1048 | { ISD::MUL, MVT::v2i64, { 17, 22, 9, 9 } }, | ||||
1049 | // slm addq\subq throughput is 4 | ||||
1050 | { ISD::ADD, MVT::v2i64, { 4, 2, 1, 2 } }, | ||||
1051 | { ISD::SUB, MVT::v2i64, { 4, 2, 1, 2 } }, | ||||
1052 | }; | ||||
1053 | |||||
1054 | if (ST->useSLMArithCosts()) | ||||
1055 | if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, LT.second)) | ||||
1056 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
1057 | return LT.first * *KindCost; | ||||
1058 | |||||
1059 | static const CostKindTblEntry AVX2CostTable[] = { | ||||
1060 | { ISD::SHL, MVT::v16i8, { 6, 21,11,16 } }, // vpblendvb sequence. | ||||
1061 | { ISD::SHL, MVT::v32i8, { 6, 23,11,22 } }, // vpblendvb sequence. | ||||
1062 | { ISD::SHL, MVT::v8i16, { 5, 18, 5,10 } }, // extend/vpsrlvd/pack sequence. | ||||
1063 | { ISD::SHL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsrlvd/pack sequence. | ||||
1064 | |||||
1065 | { ISD::SRL, MVT::v16i8, { 6, 27,12,18 } }, // vpblendvb sequence. | ||||
1066 | { ISD::SRL, MVT::v32i8, { 8, 30,12,24 } }, // vpblendvb sequence. | ||||
1067 | { ISD::SRL, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsrlvd/pack sequence. | ||||
1068 | { ISD::SRL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsrlvd/pack sequence. | ||||
1069 | |||||
1070 | { ISD::SRA, MVT::v16i8, { 17, 17,24,30 } }, // vpblendvb sequence. | ||||
1071 | { ISD::SRA, MVT::v32i8, { 18, 20,24,43 } }, // vpblendvb sequence. | ||||
1072 | { ISD::SRA, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsravd/pack sequence. | ||||
1073 | { ISD::SRA, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsravd/pack sequence. | ||||
1074 | { ISD::SRA, MVT::v2i64, { 4, 5, 5, 5 } }, // srl/xor/sub sequence. | ||||
1075 | { ISD::SRA, MVT::v4i64, { 8, 8, 5, 9 } }, // srl/xor/sub sequence. | ||||
1076 | |||||
1077 | { ISD::SUB, MVT::v32i8, { 1, 1, 1, 2 } }, // psubb | ||||
1078 | { ISD::ADD, MVT::v32i8, { 1, 1, 1, 2 } }, // paddb | ||||
1079 | { ISD::SUB, MVT::v16i16, { 1, 1, 1, 2 } }, // psubw | ||||
1080 | { ISD::ADD, MVT::v16i16, { 1, 1, 1, 2 } }, // paddw | ||||
1081 | { ISD::SUB, MVT::v8i32, { 1, 1, 1, 2 } }, // psubd | ||||
1082 | { ISD::ADD, MVT::v8i32, { 1, 1, 1, 2 } }, // paddd | ||||
1083 | { ISD::SUB, MVT::v4i64, { 1, 1, 1, 2 } }, // psubq | ||||
1084 | { ISD::ADD, MVT::v4i64, { 1, 1, 1, 2 } }, // paddq | ||||
1085 | |||||
1086 | { ISD::MUL, MVT::v16i16, { 2, 5, 1, 1 } }, // pmullw | ||||
1087 | { ISD::MUL, MVT::v8i32, { 4, 10, 1, 2 } }, // pmulld | ||||
1088 | { ISD::MUL, MVT::v4i32, { 2, 10, 1, 2 } }, // pmulld | ||||
1089 | { ISD::MUL, MVT::v4i64, { 6, 10, 8,13 } }, // 3*pmuludq/3*shift/2*add | ||||
1090 | { ISD::MUL, MVT::v2i64, { 6, 10, 8, 8 } }, // 3*pmuludq/3*shift/2*add | ||||
1091 | |||||
1092 | { ISD::FNEG, MVT::v4f64, { 1, 1, 1, 2 } }, // vxorpd | ||||
1093 | { ISD::FNEG, MVT::v8f32, { 1, 1, 1, 2 } }, // vxorps | ||||
1094 | |||||
1095 | { ISD::FADD, MVT::f64, { 1, 4, 1, 1 } }, // vaddsd | ||||
1096 | { ISD::FADD, MVT::f32, { 1, 4, 1, 1 } }, // vaddss | ||||
1097 | { ISD::FADD, MVT::v2f64, { 1, 4, 1, 1 } }, // vaddpd | ||||
1098 | { ISD::FADD, MVT::v4f32, { 1, 4, 1, 1 } }, // vaddps | ||||
1099 | { ISD::FADD, MVT::v4f64, { 1, 4, 1, 2 } }, // vaddpd | ||||
1100 | { ISD::FADD, MVT::v8f32, { 1, 4, 1, 2 } }, // vaddps | ||||
1101 | |||||
1102 | { ISD::FSUB, MVT::f64, { 1, 4, 1, 1 } }, // vsubsd | ||||
1103 | { ISD::FSUB, MVT::f32, { 1, 4, 1, 1 } }, // vsubss | ||||
1104 | { ISD::FSUB, MVT::v2f64, { 1, 4, 1, 1 } }, // vsubpd | ||||
1105 | { ISD::FSUB, MVT::v4f32, { 1, 4, 1, 1 } }, // vsubps | ||||
1106 | { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 2 } }, // vsubpd | ||||
1107 | { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 2 } }, // vsubps | ||||
1108 | |||||
1109 | { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // vmulsd | ||||
1110 | { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // vmulss | ||||
1111 | { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // vmulpd | ||||
1112 | { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // vmulps | ||||
1113 | { ISD::FMUL, MVT::v4f64, { 1, 5, 1, 2 } }, // vmulpd | ||||
1114 | { ISD::FMUL, MVT::v8f32, { 1, 5, 1, 2 } }, // vmulps | ||||
1115 | |||||
1116 | { ISD::FDIV, MVT::f32, { 7, 13, 1, 1 } }, // vdivss | ||||
1117 | { ISD::FDIV, MVT::v4f32, { 7, 13, 1, 1 } }, // vdivps | ||||
1118 | { ISD::FDIV, MVT::v8f32, { 14, 21, 1, 3 } }, // vdivps | ||||
1119 | { ISD::FDIV, MVT::f64, { 14, 20, 1, 1 } }, // vdivsd | ||||
1120 | { ISD::FDIV, MVT::v2f64, { 14, 20, 1, 1 } }, // vdivpd | ||||
1121 | { ISD::FDIV, MVT::v4f64, { 28, 35, 1, 3 } }, // vdivpd | ||||
1122 | }; | ||||
1123 | |||||
1124 | // Look for AVX2 lowering tricks for custom cases. | ||||
1125 | if (ST->hasAVX2()) | ||||
1126 | if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second)) | ||||
1127 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
1128 | return LT.first * *KindCost; | ||||
1129 | |||||
1130 | static const CostKindTblEntry AVX1CostTable[] = { | ||||
1131 | // We don't have to scalarize unsupported ops. We can issue two half-sized | ||||
1132 | // operations and we only need to extract the upper YMM half. | ||||
1133 | // Two ops + 1 extract + 1 insert = 4. | ||||
1134 | { ISD::MUL, MVT::v16i16, { 4, 8, 5, 6 } }, // pmullw + split | ||||
1135 | { ISD::MUL, MVT::v8i32, { 5, 8, 5, 10 } }, // pmulld + split | ||||
1136 | { ISD::MUL, MVT::v4i32, { 2, 5, 1, 3 } }, // pmulld | ||||
1137 | { ISD::MUL, MVT::v4i64, { 12, 15, 19, 20 } }, | ||||
1138 | |||||
1139 | { ISD::AND, MVT::v32i8, { 1, 1, 1, 2 } }, // vandps | ||||
1140 | { ISD::AND, MVT::v16i16, { 1, 1, 1, 2 } }, // vandps | ||||
1141 | { ISD::AND, MVT::v8i32, { 1, 1, 1, 2 } }, // vandps | ||||
1142 | { ISD::AND, MVT::v4i64, { 1, 1, 1, 2 } }, // vandps | ||||
1143 | |||||
1144 | { ISD::OR, MVT::v32i8, { 1, 1, 1, 2 } }, // vorps | ||||
1145 | { ISD::OR, MVT::v16i16, { 1, 1, 1, 2 } }, // vorps | ||||
1146 | { ISD::OR, MVT::v8i32, { 1, 1, 1, 2 } }, // vorps | ||||
1147 | { ISD::OR, MVT::v4i64, { 1, 1, 1, 2 } }, // vorps | ||||
1148 | |||||
1149 | { ISD::XOR, MVT::v32i8, { 1, 1, 1, 2 } }, // vxorps | ||||
1150 | { ISD::XOR, MVT::v16i16, { 1, 1, 1, 2 } }, // vxorps | ||||
1151 | { ISD::XOR, MVT::v8i32, { 1, 1, 1, 2 } }, // vxorps | ||||
1152 | { ISD::XOR, MVT::v4i64, { 1, 1, 1, 2 } }, // vxorps | ||||
1153 | |||||
1154 | { ISD::SUB, MVT::v32i8, { 4, 2, 5, 6 } }, // psubb + split | ||||
1155 | { ISD::ADD, MVT::v32i8, { 4, 2, 5, 6 } }, // paddb + split | ||||
1156 | { ISD::SUB, MVT::v16i16, { 4, 2, 5, 6 } }, // psubw + split | ||||
1157 | { ISD::ADD, MVT::v16i16, { 4, 2, 5, 6 } }, // paddw + split | ||||
1158 | { ISD::SUB, MVT::v8i32, { 4, 2, 5, 6 } }, // psubd + split | ||||
1159 | { ISD::ADD, MVT::v8i32, { 4, 2, 5, 6 } }, // paddd + split | ||||
1160 | { ISD::SUB, MVT::v4i64, { 4, 2, 5, 6 } }, // psubq + split | ||||
1161 | { ISD::ADD, MVT::v4i64, { 4, 2, 5, 6 } }, // paddq + split | ||||
1162 | { ISD::SUB, MVT::v2i64, { 1, 1, 1, 1 } }, // psubq | ||||
1163 | { ISD::ADD, MVT::v2i64, { 1, 1, 1, 1 } }, // paddq | ||||
1164 | |||||
1165 | { ISD::SHL, MVT::v16i8, { 10, 21,11,17 } }, // pblendvb sequence. | ||||
1166 | { ISD::SHL, MVT::v32i8, { 22, 22,27,40 } }, // pblendvb sequence + split. | ||||
1167 | { ISD::SHL, MVT::v8i16, { 6, 9,11,11 } }, // pblendvb sequence. | ||||
1168 | { ISD::SHL, MVT::v16i16, { 13, 16,24,25 } }, // pblendvb sequence + split. | ||||
1169 | { ISD::SHL, MVT::v4i32, { 3, 11, 4, 6 } }, // pslld/paddd/cvttps2dq/pmulld | ||||
1170 | { ISD::SHL, MVT::v8i32, { 9, 11,12,17 } }, // pslld/paddd/cvttps2dq/pmulld + split | ||||
1171 | { ISD::SHL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend. | ||||
1172 | { ISD::SHL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split. | ||||
1173 | |||||
1174 | { ISD::SRL, MVT::v16i8, { 11, 27,12,18 } }, // pblendvb sequence. | ||||
1175 | { ISD::SRL, MVT::v32i8, { 23, 23,30,43 } }, // pblendvb sequence + split. | ||||
1176 | { ISD::SRL, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence. | ||||
1177 | { ISD::SRL, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split. | ||||
1178 | { ISD::SRL, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend. | ||||
1179 | { ISD::SRL, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split. | ||||
1180 | { ISD::SRL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend. | ||||
1181 | { ISD::SRL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split. | ||||
1182 | |||||
1183 | { ISD::SRA, MVT::v16i8, { 21, 22,24,36 } }, // pblendvb sequence. | ||||
1184 | { ISD::SRA, MVT::v32i8, { 44, 45,51,76 } }, // pblendvb sequence + split. | ||||
1185 | { ISD::SRA, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence. | ||||
1186 | { ISD::SRA, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split. | ||||
1187 | { ISD::SRA, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend. | ||||
1188 | { ISD::SRA, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split. | ||||
1189 | { ISD::SRA, MVT::v2i64, { 5, 6,10,14 } }, // Shift each lane + blend. | ||||
1190 | { ISD::SRA, MVT::v4i64, { 12, 12,22,30 } }, // Shift each lane + blend + split. | ||||
1191 | |||||
1192 | { ISD::FNEG, MVT::v4f64, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/ | ||||
1193 | { ISD::FNEG, MVT::v8f32, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/ | ||||
1194 | |||||
1195 | { ISD::FADD, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ | ||||
1196 | { ISD::FADD, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ | ||||
1197 | { ISD::FADD, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ | ||||
1198 | { ISD::FADD, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ | ||||
1199 | { ISD::FADD, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/ | ||||
1200 | { ISD::FADD, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/ | ||||
1201 | |||||
1202 | { ISD::FSUB, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ | ||||
1203 | { ISD::FSUB, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ | ||||
1204 | { ISD::FSUB, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ | ||||
1205 | { ISD::FSUB, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ | ||||
1206 | { ISD::FSUB, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/ | ||||
1207 | { ISD::FSUB, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/ | ||||
1208 | |||||
1209 | { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/ | ||||
1210 | { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/ | ||||
1211 | { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/ | ||||
1212 | { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/ | ||||
1213 | { ISD::FMUL, MVT::v4f64, { 4, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/ | ||||
1214 | { ISD::FMUL, MVT::v8f32, { 2, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/ | ||||
1215 | |||||
1216 | { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/ | ||||
1217 | { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/ | ||||
1218 | { ISD::FDIV, MVT::v8f32, { 28, 29, 1, 3 } }, // SNB from http://www.agner.org/ | ||||
1219 | { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/ | ||||
1220 | { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/ | ||||
1221 | { ISD::FDIV, MVT::v4f64, { 44, 45, 1, 3 } }, // SNB from http://www.agner.org/ | ||||
1222 | }; | ||||
1223 | |||||
1224 | if (ST->hasAVX()) | ||||
1225 | if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second)) | ||||
1226 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
1227 | return LT.first * *KindCost; | ||||
1228 | |||||
1229 | static const CostKindTblEntry SSE42CostTable[] = { | ||||
1230 | { ISD::FADD, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ | ||||
1231 | { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ | ||||
1232 | { ISD::FADD, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ | ||||
1233 | { ISD::FADD, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ | ||||
1234 | |||||
1235 | { ISD::FSUB, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ | ||||
1236 | { ISD::FSUB, MVT::f32 , { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ | ||||
1237 | { ISD::FSUB, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ | ||||
1238 | { ISD::FSUB, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ | ||||
1239 | |||||
1240 | { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/ | ||||
1241 | { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/ | ||||
1242 | { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/ | ||||
1243 | { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/ | ||||
1244 | |||||
1245 | { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/ | ||||
1246 | { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/ | ||||
1247 | { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/ | ||||
1248 | { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/ | ||||
1249 | |||||
1250 | { ISD::MUL, MVT::v2i64, { 6, 10,10,10 } } // 3*pmuludq/3*shift/2*add | ||||
1251 | }; | ||||
1252 | |||||
1253 | if (ST->hasSSE42()) | ||||
1254 | if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second)) | ||||
1255 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
1256 | return LT.first * *KindCost; | ||||
1257 | |||||
1258 | static const CostKindTblEntry SSE41CostTable[] = { | ||||
1259 | { ISD::SHL, MVT::v16i8, { 15, 24,17,22 } }, // pblendvb sequence. | ||||
1260 | { ISD::SHL, MVT::v8i16, { 11, 14,11,11 } }, // pblendvb sequence. | ||||
1261 | { ISD::SHL, MVT::v4i32, { 14, 20, 4,10 } }, // pslld/paddd/cvttps2dq/pmulld | ||||
1262 | |||||
1263 | { ISD::SRL, MVT::v16i8, { 16, 27,18,24 } }, // pblendvb sequence. | ||||
1264 | { ISD::SRL, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence. | ||||
1265 | { ISD::SRL, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend. | ||||
1266 | { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence. | ||||
1267 | |||||
1268 | { ISD::SRA, MVT::v16i8, { 38, 41,30,36 } }, // pblendvb sequence. | ||||
1269 | { ISD::SRA, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence. | ||||
1270 | { ISD::SRA, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend. | ||||
1271 | { ISD::SRA, MVT::v2i64, { 8, 17, 5, 7 } }, // splat+shuffle sequence. | ||||
1272 | |||||
1273 | { ISD::MUL, MVT::v4i32, { 2, 11, 1, 1 } } // pmulld (Nehalem from agner.org) | ||||
1274 | }; | ||||
1275 | |||||
1276 | if (ST->hasSSE41()) | ||||
1277 | if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second)) | ||||
1278 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
1279 | return LT.first * *KindCost; | ||||
1280 | |||||
1281 | static const CostKindTblEntry SSE2CostTable[] = { | ||||
1282 | // We don't correctly identify costs of casts because they are marked as | ||||
1283 | // custom. | ||||
1284 | { ISD::SHL, MVT::v16i8, { 13, 21,26,28 } }, // cmpgtb sequence. | ||||
1285 | { ISD::SHL, MVT::v8i16, { 24, 27,16,20 } }, // cmpgtw sequence. | ||||
1286 | { ISD::SHL, MVT::v4i32, { 17, 19,10,12 } }, // pslld/paddd/cvttps2dq/pmuludq. | ||||
1287 | { ISD::SHL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence. | ||||
1288 | |||||
1289 | { ISD::SRL, MVT::v16i8, { 14, 28,27,30 } }, // cmpgtb sequence. | ||||
1290 | { ISD::SRL, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence. | ||||
1291 | { ISD::SRL, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend. | ||||
1292 | { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence. | ||||
1293 | |||||
1294 | { ISD::SRA, MVT::v16i8, { 27, 30,54,54 } }, // unpacked cmpgtb sequence. | ||||
1295 | { ISD::SRA, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence. | ||||
1296 | { ISD::SRA, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend. | ||||
1297 | { ISD::SRA, MVT::v2i64, { 8, 11,12,16 } }, // srl/xor/sub splat+shuffle sequence. | ||||
1298 | |||||
1299 | { ISD::AND, MVT::v16i8, { 1, 1, 1, 1 } }, // pand | ||||
1300 | { ISD::AND, MVT::v8i16, { 1, 1, 1, 1 } }, // pand | ||||
1301 | { ISD::AND, MVT::v4i32, { 1, 1, 1, 1 } }, // pand | ||||
1302 | { ISD::AND, MVT::v2i64, { 1, 1, 1, 1 } }, // pand | ||||
1303 | |||||
1304 | { ISD::OR, MVT::v16i8, { 1, 1, 1, 1 } }, // por | ||||
1305 | { ISD::OR, MVT::v8i16, { 1, 1, 1, 1 } }, // por | ||||
1306 | { ISD::OR, MVT::v4i32, { 1, 1, 1, 1 } }, // por | ||||
1307 | { ISD::OR, MVT::v2i64, { 1, 1, 1, 1 } }, // por | ||||
1308 | |||||
1309 | { ISD::XOR, MVT::v16i8, { 1, 1, 1, 1 } }, // pxor | ||||
1310 | { ISD::XOR, MVT::v8i16, { 1, 1, 1, 1 } }, // pxor | ||||
1311 | { ISD::XOR, MVT::v4i32, { 1, 1, 1, 1 } }, // pxor | ||||
1312 | { ISD::XOR, MVT::v2i64, { 1, 1, 1, 1 } }, // pxor | ||||
1313 | |||||
1314 | { ISD::ADD, MVT::v2i64, { 1, 2, 1, 2 } }, // paddq | ||||
1315 | { ISD::SUB, MVT::v2i64, { 1, 2, 1, 2 } }, // psubq | ||||
1316 | |||||
1317 | { ISD::MUL, MVT::v8i16, { 1, 5, 1, 1 } }, // pmullw | ||||
1318 | { ISD::MUL, MVT::v4i32, { 6, 8, 7, 7 } }, // 3*pmuludq/4*shuffle | ||||
1319 | { ISD::MUL, MVT::v2i64, { 8, 10, 8, 8 } }, // 3*pmuludq/3*shift/2*add | ||||
1320 | |||||
1321 | { ISD::FDIV, MVT::f32, { 23, 23, 1, 1 } }, // Pentium IV from http://www.agner.org/ | ||||
1322 | { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 1 } }, // Pentium IV from http://www.agner.org/ | ||||
1323 | { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // Pentium IV from http://www.agner.org/ | ||||
1324 | { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 1 } }, // Pentium IV from http://www.agner.org/ | ||||
1325 | |||||
1326 | { ISD::FNEG, MVT::f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/ | ||||
1327 | { ISD::FNEG, MVT::f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/ | ||||
1328 | { ISD::FNEG, MVT::v4f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/ | ||||
1329 | { ISD::FNEG, MVT::v2f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/ | ||||
1330 | |||||
1331 | { ISD::FADD, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/ | ||||
1332 | { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/ | ||||
1333 | { ISD::FADD, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/ | ||||
1334 | |||||
1335 | { ISD::FSUB, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/ | ||||
1336 | { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/ | ||||
1337 | { ISD::FSUB, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/ | ||||
1338 | |||||
1339 | { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/ | ||||
1340 | { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/ | ||||
1341 | }; | ||||
1342 | |||||
1343 | if (ST->hasSSE2()) | ||||
1344 | if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second)) | ||||
1345 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
1346 | return LT.first * *KindCost; | ||||
1347 | |||||
1348 | static const CostKindTblEntry SSE1CostTable[] = { | ||||
1349 | { ISD::FDIV, MVT::f32, { 17, 18, 1, 1 } }, // Pentium III from http://www.agner.org/ | ||||
1350 | { ISD::FDIV, MVT::v4f32, { 34, 48, 1, 1 } }, // Pentium III from http://www.agner.org/ | ||||
1351 | |||||
1352 | { ISD::FNEG, MVT::f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/ | ||||
1353 | { ISD::FNEG, MVT::v4f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/ | ||||
1354 | |||||
1355 | { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/ | ||||
1356 | { ISD::FADD, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/ | ||||
1357 | |||||
1358 | { ISD::FSUB, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/ | ||||
1359 | { ISD::FSUB, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/ | ||||
1360 | |||||
1361 | { ISD::FMUL, MVT::f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/ | ||||
1362 | { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/ | ||||
1363 | }; | ||||
1364 | |||||
1365 | if (ST->hasSSE1()) | ||||
1366 | if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second)) | ||||
1367 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
1368 | return LT.first * *KindCost; | ||||
1369 | |||||
1370 | static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets | ||||
1371 | { ISD::ADD, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/ | ||||
1372 | { ISD::SUB, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/ | ||||
1373 | { ISD::MUL, MVT::i64, { 2 } }, // Nehalem from http://www.agner.org/ | ||||
1374 | }; | ||||
1375 | |||||
1376 | if (ST->is64Bit()) | ||||
1377 | if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second)) | ||||
1378 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
1379 | return LT.first * *KindCost; | ||||
1380 | |||||
1381 | static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets | ||||
1382 | { ISD::ADD, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/ | ||||
1383 | { ISD::ADD, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/ | ||||
1384 | { ISD::ADD, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/ | ||||
1385 | |||||
1386 | { ISD::SUB, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/ | ||||
1387 | { ISD::SUB, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/ | ||||
1388 | { ISD::SUB, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/ | ||||
1389 | |||||
1390 | { ISD::FNEG, MVT::f64, { 2, 2, 1, 3 } }, // (x87) | ||||
1391 | { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // (x87) | ||||
1392 | { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // (x87) | ||||
1393 | { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // (x87) | ||||
1394 | { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // (x87) | ||||
1395 | }; | ||||
1396 | |||||
1397 | if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second)) | ||||
1398 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
1399 | return LT.first * *KindCost; | ||||
1400 | |||||
1401 | // It is not a good idea to vectorize division. We have to scalarize it and | ||||
1402 | // in the process we will often end up having to spill regular | ||||
1403 | // registers. The overhead of division is going to dominate most kernels | ||||
1404 | // anyways so try hard to prevent vectorization of division - it is | ||||
1405 | // generally a bad idea. Assume somewhat arbitrarily that we have to be able | ||||
1406 | // to hide "20 cycles" for each lane. | ||||
1407 | if (CostKind == TTI::TCK_RecipThroughput && LT.second.isVector() && | ||||
1408 | (ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV || | ||||
1409 | ISD == ISD::UREM)) { | ||||
1410 | InstructionCost ScalarCost = | ||||
1411 | getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind, | ||||
1412 | Op1Info.getNoProps(), Op2Info.getNoProps()); | ||||
1413 | return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost; | ||||
1414 | } | ||||
1415 | |||||
1416 | // Handle some basic single instruction code size cases. | ||||
1417 | if (CostKind == TTI::TCK_CodeSize) { | ||||
1418 | switch (ISD) { | ||||
1419 | case ISD::FADD: | ||||
1420 | case ISD::FSUB: | ||||
1421 | case ISD::FMUL: | ||||
1422 | case ISD::FDIV: | ||||
1423 | case ISD::FNEG: | ||||
1424 | case ISD::AND: | ||||
1425 | case ISD::OR: | ||||
1426 | case ISD::XOR: | ||||
1427 | return LT.first; | ||||
1428 | break; | ||||
1429 | } | ||||
1430 | } | ||||
1431 | |||||
1432 | // Fallback to the default implementation. | ||||
1433 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info, | ||||
1434 | Args, CxtI); | ||||
1435 | } | ||||
1436 | |||||
1437 | InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, | ||||
1438 | VectorType *BaseTp, | ||||
1439 | ArrayRef<int> Mask, | ||||
1440 | TTI::TargetCostKind CostKind, | ||||
1441 | int Index, VectorType *SubTp, | ||||
1442 | ArrayRef<const Value *> Args) { | ||||
1443 | // 64-bit packed float vectors (v2f32) are widened to type v4f32. | ||||
1444 | // 64-bit packed integer vectors (v2i32) are widened to type v4i32. | ||||
1445 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(BaseTp); | ||||
1446 | |||||
1447 | Kind = improveShuffleKindFromMask(Kind, Mask); | ||||
1448 | |||||
1449 | // Treat Transpose as 2-op shuffles - there's no difference in lowering. | ||||
1450 | if (Kind == TTI::SK_Transpose) | ||||
1451 | Kind = TTI::SK_PermuteTwoSrc; | ||||
1452 | |||||
1453 | // For Broadcasts we are splatting the first element from the first input | ||||
1454 | // register, so only need to reference that input and all the output | ||||
1455 | // registers are the same. | ||||
1456 | if (Kind == TTI::SK_Broadcast) | ||||
1457 | LT.first = 1; | ||||
1458 | |||||
1459 | // Subvector extractions are free if they start at the beginning of a | ||||
1460 | // vector and cheap if the subvectors are aligned. | ||||
1461 | if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) { | ||||
1462 | int NumElts = LT.second.getVectorNumElements(); | ||||
1463 | if ((Index % NumElts) == 0) | ||||
1464 | return 0; | ||||
1465 | std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp); | ||||
1466 | if (SubLT.second.isVector()) { | ||||
1467 | int NumSubElts = SubLT.second.getVectorNumElements(); | ||||
1468 | if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0) | ||||
1469 | return SubLT.first; | ||||
1470 | // Handle some cases for widening legalization. For now we only handle | ||||
1471 | // cases where the original subvector was naturally aligned and evenly | ||||
1472 | // fit in its legalized subvector type. | ||||
1473 | // FIXME: Remove some of the alignment restrictions. | ||||
1474 | // FIXME: We can use permq for 64-bit or larger extracts from 256-bit | ||||
1475 | // vectors. | ||||
1476 | int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements(); | ||||
1477 | if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 && | ||||
1478 | (NumSubElts % OrigSubElts) == 0 && | ||||
1479 | LT.second.getVectorElementType() == | ||||
1480 | SubLT.second.getVectorElementType() && | ||||
1481 | LT.second.getVectorElementType().getSizeInBits() == | ||||
1482 | BaseTp->getElementType()->getPrimitiveSizeInBits()) { | ||||
1483 | assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&(static_cast <bool> (NumElts >= NumSubElts && NumElts > OrigSubElts && "Unexpected number of elements!" ) ? void (0) : __assert_fail ("NumElts >= NumSubElts && NumElts > OrigSubElts && \"Unexpected number of elements!\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 1484, __extension__ __PRETTY_FUNCTION__)) | ||||
1484 | "Unexpected number of elements!")(static_cast <bool> (NumElts >= NumSubElts && NumElts > OrigSubElts && "Unexpected number of elements!" ) ? void (0) : __assert_fail ("NumElts >= NumSubElts && NumElts > OrigSubElts && \"Unexpected number of elements!\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 1484, __extension__ __PRETTY_FUNCTION__)); | ||||
1485 | auto *VecTy = FixedVectorType::get(BaseTp->getElementType(), | ||||
1486 | LT.second.getVectorNumElements()); | ||||
1487 | auto *SubTy = FixedVectorType::get(BaseTp->getElementType(), | ||||
1488 | SubLT.second.getVectorNumElements()); | ||||
1489 | int ExtractIndex = alignDown((Index % NumElts), NumSubElts); | ||||
1490 | InstructionCost ExtractCost = | ||||
1491 | getShuffleCost(TTI::SK_ExtractSubvector, VecTy, std::nullopt, | ||||
1492 | CostKind, ExtractIndex, SubTy); | ||||
1493 | |||||
1494 | // If the original size is 32-bits or more, we can use pshufd. Otherwise | ||||
1495 | // if we have SSSE3 we can use pshufb. | ||||
1496 | if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3()) | ||||
1497 | return ExtractCost + 1; // pshufd or pshufb | ||||
1498 | |||||
1499 | assert(SubTp->getPrimitiveSizeInBits() == 16 &&(static_cast <bool> (SubTp->getPrimitiveSizeInBits() == 16 && "Unexpected vector size") ? void (0) : __assert_fail ("SubTp->getPrimitiveSizeInBits() == 16 && \"Unexpected vector size\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 1500, __extension__ __PRETTY_FUNCTION__)) | ||||
1500 | "Unexpected vector size")(static_cast <bool> (SubTp->getPrimitiveSizeInBits() == 16 && "Unexpected vector size") ? void (0) : __assert_fail ("SubTp->getPrimitiveSizeInBits() == 16 && \"Unexpected vector size\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 1500, __extension__ __PRETTY_FUNCTION__)); | ||||
1501 | |||||
1502 | return ExtractCost + 2; // worst case pshufhw + pshufd | ||||
1503 | } | ||||
1504 | } | ||||
1505 | } | ||||
1506 | |||||
1507 | // Subvector insertions are cheap if the subvectors are aligned. | ||||
1508 | // Note that in general, the insertion starting at the beginning of a vector | ||||
1509 | // isn't free, because we need to preserve the rest of the wide vector. | ||||
1510 | if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) { | ||||
1511 | int NumElts = LT.second.getVectorNumElements(); | ||||
1512 | std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp); | ||||
1513 | if (SubLT.second.isVector()) { | ||||
1514 | int NumSubElts = SubLT.second.getVectorNumElements(); | ||||
1515 | if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0) | ||||
1516 | return SubLT.first; | ||||
1517 | } | ||||
1518 | |||||
1519 | // If the insertion isn't aligned, treat it like a 2-op shuffle. | ||||
1520 | Kind = TTI::SK_PermuteTwoSrc; | ||||
1521 | } | ||||
1522 | |||||
1523 | // Handle some common (illegal) sub-vector types as they are often very cheap | ||||
1524 | // to shuffle even on targets without PSHUFB. | ||||
1525 | EVT VT = TLI->getValueType(DL, BaseTp); | ||||
1526 | if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 && | ||||
1527 | !ST->hasSSSE3()) { | ||||
1528 | static const CostTblEntry SSE2SubVectorShuffleTbl[] = { | ||||
1529 | {TTI::SK_Broadcast, MVT::v4i16, 1}, // pshuflw | ||||
1530 | {TTI::SK_Broadcast, MVT::v2i16, 1}, // pshuflw | ||||
1531 | {TTI::SK_Broadcast, MVT::v8i8, 2}, // punpck/pshuflw | ||||
1532 | {TTI::SK_Broadcast, MVT::v4i8, 2}, // punpck/pshuflw | ||||
1533 | {TTI::SK_Broadcast, MVT::v2i8, 1}, // punpck | ||||
1534 | |||||
1535 | {TTI::SK_Reverse, MVT::v4i16, 1}, // pshuflw | ||||
1536 | {TTI::SK_Reverse, MVT::v2i16, 1}, // pshuflw | ||||
1537 | {TTI::SK_Reverse, MVT::v4i8, 3}, // punpck/pshuflw/packus | ||||
1538 | {TTI::SK_Reverse, MVT::v2i8, 1}, // punpck | ||||
1539 | |||||
1540 | {TTI::SK_Splice, MVT::v4i16, 2}, // punpck+psrldq | ||||
1541 | {TTI::SK_Splice, MVT::v2i16, 2}, // punpck+psrldq | ||||
1542 | {TTI::SK_Splice, MVT::v4i8, 2}, // punpck+psrldq | ||||
1543 | {TTI::SK_Splice, MVT::v2i8, 2}, // punpck+psrldq | ||||
1544 | |||||
1545 | {TTI::SK_PermuteTwoSrc, MVT::v4i16, 2}, // punpck/pshuflw | ||||
1546 | {TTI::SK_PermuteTwoSrc, MVT::v2i16, 2}, // punpck/pshuflw | ||||
1547 | {TTI::SK_PermuteTwoSrc, MVT::v8i8, 7}, // punpck/pshuflw | ||||
1548 | {TTI::SK_PermuteTwoSrc, MVT::v4i8, 4}, // punpck/pshuflw | ||||
1549 | {TTI::SK_PermuteTwoSrc, MVT::v2i8, 2}, // punpck | ||||
1550 | |||||
1551 | {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw | ||||
1552 | {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw | ||||
1553 | {TTI::SK_PermuteSingleSrc, MVT::v8i8, 5}, // punpck/pshuflw | ||||
1554 | {TTI::SK_PermuteSingleSrc, MVT::v4i8, 3}, // punpck/pshuflw | ||||
1555 | {TTI::SK_PermuteSingleSrc, MVT::v2i8, 1}, // punpck | ||||
1556 | }; | ||||
1557 | |||||
1558 | if (ST->hasSSE2()) | ||||
1559 | if (const auto *Entry = | ||||
1560 | CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT())) | ||||
1561 | return Entry->Cost; | ||||
1562 | } | ||||
1563 | |||||
1564 | // We are going to permute multiple sources and the result will be in multiple | ||||
1565 | // destinations. Providing an accurate cost only for splits where the element | ||||
1566 | // type remains the same. | ||||
1567 | if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) { | ||||
1568 | MVT LegalVT = LT.second; | ||||
1569 | if (LegalVT.isVector() && | ||||
1570 | LegalVT.getVectorElementType().getSizeInBits() == | ||||
1571 | BaseTp->getElementType()->getPrimitiveSizeInBits() && | ||||
1572 | LegalVT.getVectorNumElements() < | ||||
1573 | cast<FixedVectorType>(BaseTp)->getNumElements()) { | ||||
1574 | |||||
1575 | unsigned VecTySize = DL.getTypeStoreSize(BaseTp); | ||||
1576 | unsigned LegalVTSize = LegalVT.getStoreSize(); | ||||
1577 | // Number of source vectors after legalization: | ||||
1578 | unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize; | ||||
1579 | // Number of destination vectors after legalization: | ||||
1580 | InstructionCost NumOfDests = LT.first; | ||||
1581 | |||||
1582 | auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(), | ||||
1583 | LegalVT.getVectorNumElements()); | ||||
1584 | |||||
1585 | if (!Mask.empty() && NumOfDests.isValid()) { | ||||
1586 | // Try to perform better estimation of the permutation. | ||||
1587 | // 1. Split the source/destination vectors into real registers. | ||||
1588 | // 2. Do the mask analysis to identify which real registers are | ||||
1589 | // permuted. If more than 1 source registers are used for the | ||||
1590 | // destination register building, the cost for this destination register | ||||
1591 | // is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one | ||||
1592 | // source register is used, build mask and calculate the cost as a cost | ||||
1593 | // of PermuteSingleSrc. | ||||
1594 | // Also, for the single register permute we try to identify if the | ||||
1595 | // destination register is just a copy of the source register or the | ||||
1596 | // copy of the previous destination register (the cost is | ||||
1597 | // TTI::TCC_Basic). If the source register is just reused, the cost for | ||||
1598 | // this operation is 0. | ||||
1599 | unsigned E = *NumOfDests.getValue(); | ||||
1600 | unsigned NormalizedVF = | ||||
1601 | LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E); | ||||
1602 | unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements(); | ||||
1603 | unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements(); | ||||
1604 | SmallVector<int> NormalizedMask(NormalizedVF, UndefMaskElem); | ||||
1605 | copy(Mask, NormalizedMask.begin()); | ||||
1606 | unsigned PrevSrcReg = 0; | ||||
1607 | ArrayRef<int> PrevRegMask; | ||||
1608 | InstructionCost Cost = 0; | ||||
1609 | processShuffleMasks( | ||||
1610 | NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {}, | ||||
1611 | [this, SingleOpTy, CostKind, &PrevSrcReg, &PrevRegMask, | ||||
1612 | &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) { | ||||
1613 | if (!ShuffleVectorInst::isIdentityMask(RegMask)) { | ||||
1614 | // Check if the previous register can be just copied to the next | ||||
1615 | // one. | ||||
1616 | if (PrevRegMask.empty() || PrevSrcReg != SrcReg || | ||||
1617 | PrevRegMask != RegMask) | ||||
1618 | Cost += getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy, | ||||
1619 | RegMask, CostKind, 0, nullptr); | ||||
1620 | else | ||||
1621 | // Just a copy of previous destination register. | ||||
1622 | Cost += TTI::TCC_Basic; | ||||
1623 | return; | ||||
1624 | } | ||||
1625 | if (SrcReg != DestReg && | ||||
1626 | any_of(RegMask, [](int I) { return I != UndefMaskElem; })) { | ||||
1627 | // Just a copy of the source register. | ||||
1628 | Cost += TTI::TCC_Basic; | ||||
1629 | } | ||||
1630 | PrevSrcReg = SrcReg; | ||||
1631 | PrevRegMask = RegMask; | ||||
1632 | }, | ||||
1633 | [this, SingleOpTy, CostKind, &Cost](ArrayRef<int> RegMask, | ||||
1634 | unsigned /*Unused*/, | ||||
1635 | unsigned /*Unused*/) { | ||||
1636 | Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask, | ||||
1637 | CostKind, 0, nullptr); | ||||
1638 | }); | ||||
1639 | return Cost; | ||||
1640 | } | ||||
1641 | |||||
1642 | InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests; | ||||
1643 | return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, | ||||
1644 | std::nullopt, CostKind, 0, nullptr); | ||||
1645 | } | ||||
1646 | |||||
1647 | return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp); | ||||
1648 | } | ||||
1649 | |||||
1650 | // For 2-input shuffles, we must account for splitting the 2 inputs into many. | ||||
1651 | if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) { | ||||
1652 | // We assume that source and destination have the same vector type. | ||||
1653 | InstructionCost NumOfDests = LT.first; | ||||
1654 | InstructionCost NumOfShufflesPerDest = LT.first * 2 - 1; | ||||
1655 | LT.first = NumOfDests * NumOfShufflesPerDest; | ||||
1656 | } | ||||
1657 | |||||
1658 | static const CostTblEntry AVX512VBMIShuffleTbl[] = { | ||||
1659 | {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb | ||||
1660 | {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb | ||||
1661 | |||||
1662 | {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb | ||||
1663 | {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb | ||||
1664 | |||||
1665 | {TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b | ||||
1666 | {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b | ||||
1667 | {TTI::SK_PermuteTwoSrc, MVT::v16i8, 2} // vpermt2b | ||||
1668 | }; | ||||
1669 | |||||
1670 | if (ST->hasVBMI()) | ||||
1671 | if (const auto *Entry = | ||||
1672 | CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second)) | ||||
1673 | return LT.first * Entry->Cost; | ||||
1674 | |||||
1675 | static const CostTblEntry AVX512BWShuffleTbl[] = { | ||||
1676 | {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw | ||||
1677 | {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw | ||||
1678 | {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb | ||||
1679 | |||||
1680 | {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw | ||||
1681 | {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw | ||||
1682 | {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw | ||||
1683 | {TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2 | ||||
1684 | |||||
1685 | {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw | ||||
1686 | {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw | ||||
1687 | {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw | ||||
1688 | {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw | ||||
1689 | {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16 | ||||
1690 | |||||
1691 | {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w | ||||
1692 | {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w | ||||
1693 | {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w | ||||
1694 | {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w | ||||
1695 | {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1 | ||||
1696 | |||||
1697 | {TTI::SK_Select, MVT::v32i16, 1}, // vblendmw | ||||
1698 | {TTI::SK_Select, MVT::v64i8, 1}, // vblendmb | ||||
1699 | |||||
1700 | {TTI::SK_Splice, MVT::v32i16, 2}, // vshufi64x2 + palignr | ||||
1701 | {TTI::SK_Splice, MVT::v32f16, 2}, // vshufi64x2 + palignr | ||||
1702 | {TTI::SK_Splice, MVT::v64i8, 2}, // vshufi64x2 + palignr | ||||
1703 | }; | ||||
1704 | |||||
1705 | if (ST->hasBWI()) | ||||
1706 | if (const auto *Entry = | ||||
1707 | CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second)) | ||||
1708 | return LT.first * Entry->Cost; | ||||
1709 | |||||
1710 | static const CostKindTblEntry AVX512ShuffleTbl[] = { | ||||
1711 | {TTI::SK_Broadcast, MVT::v8f64, { 1, 1, 1, 1 } }, // vbroadcastsd | ||||
1712 | {TTI::SK_Broadcast, MVT::v16f32, { 1, 1, 1, 1 } }, // vbroadcastss | ||||
1713 | {TTI::SK_Broadcast, MVT::v8i64, { 1, 1, 1, 1 } }, // vpbroadcastq | ||||
1714 | {TTI::SK_Broadcast, MVT::v16i32, { 1, 1, 1, 1 } }, // vpbroadcastd | ||||
1715 | {TTI::SK_Broadcast, MVT::v32i16, { 1, 1, 1, 1 } }, // vpbroadcastw | ||||
1716 | {TTI::SK_Broadcast, MVT::v32f16, { 1, 1, 1, 1 } }, // vpbroadcastw | ||||
1717 | {TTI::SK_Broadcast, MVT::v64i8, { 1, 1, 1, 1 } }, // vpbroadcastb | ||||
1718 | |||||
1719 | {TTI::SK_Reverse, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd | ||||
1720 | {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps | ||||
1721 | {TTI::SK_Reverse, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq | ||||
1722 | {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd | ||||
1723 | {TTI::SK_Reverse, MVT::v32i16, { 7, 7, 7, 7 } }, // per mca | ||||
1724 | {TTI::SK_Reverse, MVT::v32f16, { 7, 7, 7, 7 } }, // per mca | ||||
1725 | {TTI::SK_Reverse, MVT::v64i8, { 7, 7, 7, 7 } }, // per mca | ||||
1726 | |||||
1727 | {TTI::SK_Splice, MVT::v8f64, { 1, 1, 1, 1 } }, // vpalignd | ||||
1728 | {TTI::SK_Splice, MVT::v4f64, { 1, 1, 1, 1 } }, // vpalignd | ||||
1729 | {TTI::SK_Splice, MVT::v16f32, { 1, 1, 1, 1 } }, // vpalignd | ||||
1730 | {TTI::SK_Splice, MVT::v8f32, { 1, 1, 1, 1 } }, // vpalignd | ||||
1731 | {TTI::SK_Splice, MVT::v8i64, { 1, 1, 1, 1 } }, // vpalignd | ||||
1732 | {TTI::SK_Splice, MVT::v4i64, { 1, 1, 1, 1 } }, // vpalignd | ||||
1733 | {TTI::SK_Splice, MVT::v16i32, { 1, 1, 1, 1 } }, // vpalignd | ||||
1734 | {TTI::SK_Splice, MVT::v8i32, { 1, 1, 1, 1 } }, // vpalignd | ||||
1735 | {TTI::SK_Splice, MVT::v32i16, { 4, 4, 4, 4 } }, // split + palignr | ||||
1736 | {TTI::SK_Splice, MVT::v32f16, { 4, 4, 4, 4 } }, // split + palignr | ||||
1737 | {TTI::SK_Splice, MVT::v64i8, { 4, 4, 4, 4 } }, // split + palignr | ||||
1738 | |||||
1739 | {TTI::SK_PermuteSingleSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd | ||||
1740 | {TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermpd | ||||
1741 | {TTI::SK_PermuteSingleSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermpd | ||||
1742 | {TTI::SK_PermuteSingleSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps | ||||
1743 | {TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermps | ||||
1744 | {TTI::SK_PermuteSingleSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermps | ||||
1745 | {TTI::SK_PermuteSingleSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq | ||||
1746 | {TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermq | ||||
1747 | {TTI::SK_PermuteSingleSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermq | ||||
1748 | {TTI::SK_PermuteSingleSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd | ||||
1749 | {TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermd | ||||
1750 | {TTI::SK_PermuteSingleSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermd | ||||
1751 | {TTI::SK_PermuteSingleSrc, MVT::v16i8, { 1, 3, 1, 1 } }, // pshufb | ||||
1752 | |||||
1753 | {TTI::SK_PermuteTwoSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermt2pd | ||||
1754 | {TTI::SK_PermuteTwoSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermt2ps | ||||
1755 | {TTI::SK_PermuteTwoSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermt2q | ||||
1756 | {TTI::SK_PermuteTwoSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermt2d | ||||
1757 | {TTI::SK_PermuteTwoSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermt2pd | ||||
1758 | {TTI::SK_PermuteTwoSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermt2ps | ||||
1759 | {TTI::SK_PermuteTwoSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermt2q | ||||
1760 | {TTI::SK_PermuteTwoSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermt2d | ||||
1761 | {TTI::SK_PermuteTwoSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermt2pd | ||||
1762 | {TTI::SK_PermuteTwoSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermt2ps | ||||
1763 | {TTI::SK_PermuteTwoSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermt2q | ||||
1764 | {TTI::SK_PermuteTwoSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermt2d | ||||
1765 | |||||
1766 | // FIXME: This just applies the type legalization cost rules above | ||||
1767 | // assuming these completely split. | ||||
1768 | {TTI::SK_PermuteSingleSrc, MVT::v32i16, { 14, 14, 14, 14 } }, | ||||
1769 | {TTI::SK_PermuteSingleSrc, MVT::v32f16, { 14, 14, 14, 14 } }, | ||||
1770 | {TTI::SK_PermuteSingleSrc, MVT::v64i8, { 14, 14, 14, 14 } }, | ||||
1771 | {TTI::SK_PermuteTwoSrc, MVT::v32i16, { 42, 42, 42, 42 } }, | ||||
1772 | {TTI::SK_PermuteTwoSrc, MVT::v32f16, { 42, 42, 42, 42 } }, | ||||
1773 | {TTI::SK_PermuteTwoSrc, MVT::v64i8, { 42, 42, 42, 42 } }, | ||||
1774 | |||||
1775 | {TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vpternlogq | ||||
1776 | {TTI::SK_Select, MVT::v32f16, { 1, 1, 1, 1 } }, // vpternlogq | ||||
1777 | {TTI::SK_Select, MVT::v64i8, { 1, 1, 1, 1 } }, // vpternlogq | ||||
1778 | {TTI::SK_Select, MVT::v8f64, { 1, 1, 1, 1 } }, // vblendmpd | ||||
1779 | {TTI::SK_Select, MVT::v16f32, { 1, 1, 1, 1 } }, // vblendmps | ||||
1780 | {TTI::SK_Select, MVT::v8i64, { 1, 1, 1, 1 } }, // vblendmq | ||||
1781 | {TTI::SK_Select, MVT::v16i32, { 1, 1, 1, 1 } }, // vblendmd | ||||
1782 | }; | ||||
1783 | |||||
1784 | if (ST->hasAVX512()) | ||||
1785 | if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second)) | ||||
1786 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
1787 | return LT.first * *KindCost; | ||||
1788 | |||||
1789 | static const CostTblEntry AVX2ShuffleTbl[] = { | ||||
1790 | {TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastpd | ||||
1791 | {TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastps | ||||
1792 | {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq | ||||
1793 | {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd | ||||
1794 | {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw | ||||
1795 | {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw | ||||
1796 | {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb | ||||
1797 | |||||
1798 | {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd | ||||
1799 | {TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps | ||||
1800 | {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq | ||||
1801 | {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd | ||||
1802 | {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb | ||||
1803 | {TTI::SK_Reverse, MVT::v16f16, 2}, // vperm2i128 + pshufb | ||||
1804 | {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb | ||||
1805 | |||||
1806 | {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb | ||||
1807 | {TTI::SK_Select, MVT::v16f16, 1}, // vpblendvb | ||||
1808 | {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb | ||||
1809 | |||||
1810 | {TTI::SK_Splice, MVT::v8i32, 2}, // vperm2i128 + vpalignr | ||||
1811 | {TTI::SK_Splice, MVT::v8f32, 2}, // vperm2i128 + vpalignr | ||||
1812 | {TTI::SK_Splice, MVT::v16i16, 2}, // vperm2i128 + vpalignr | ||||
1813 | {TTI::SK_Splice, MVT::v16f16, 2}, // vperm2i128 + vpalignr | ||||
1814 | {TTI::SK_Splice, MVT::v32i8, 2}, // vperm2i128 + vpalignr | ||||
1815 | |||||
1816 | {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd | ||||
1817 | {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps | ||||
1818 | {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq | ||||
1819 | {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd | ||||
1820 | {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb | ||||
1821 | // + vpblendvb | ||||
1822 | {TTI::SK_PermuteSingleSrc, MVT::v16f16, 4}, // vperm2i128 + 2*vpshufb | ||||
1823 | // + vpblendvb | ||||
1824 | {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb | ||||
1825 | // + vpblendvb | ||||
1826 | |||||
1827 | {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd | ||||
1828 | {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3}, // 2*vpermps + vblendps | ||||
1829 | {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd | ||||
1830 | {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd | ||||
1831 | {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb | ||||
1832 | // + vpblendvb | ||||
1833 | {TTI::SK_PermuteTwoSrc, MVT::v16f16, 7}, // 2*vperm2i128 + 4*vpshufb | ||||
1834 | // + vpblendvb | ||||
1835 | {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb | ||||
1836 | // + vpblendvb | ||||
1837 | }; | ||||
1838 | |||||
1839 | if (ST->hasAVX2()) | ||||
1840 | if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second)) | ||||
1841 | return LT.first * Entry->Cost; | ||||
1842 | |||||
1843 | static const CostTblEntry XOPShuffleTbl[] = { | ||||
1844 | {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd | ||||
1845 | {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps | ||||
1846 | {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd | ||||
1847 | {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps | ||||
1848 | {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm | ||||
1849 | // + vinsertf128 | ||||
1850 | {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm | ||||
1851 | // + vinsertf128 | ||||
1852 | |||||
1853 | {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm | ||||
1854 | // + vinsertf128 | ||||
1855 | {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm | ||||
1856 | {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm | ||||
1857 | // + vinsertf128 | ||||
1858 | {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm | ||||
1859 | }; | ||||
1860 | |||||
1861 | if (ST->hasXOP()) | ||||
1862 | if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second)) | ||||
1863 | return LT.first * Entry->Cost; | ||||
1864 | |||||
1865 | static const CostTblEntry AVX1ShuffleTbl[] = { | ||||
1866 | {TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd | ||||
1867 | {TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps | ||||
1868 | {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd | ||||
1869 | {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps | ||||
1870 | {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128 | ||||
1871 | {TTI::SK_Broadcast, MVT::v16f16, 3}, // vpshuflw + vpshufd + vinsertf128 | ||||
1872 | {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128 | ||||
1873 | |||||
1874 | {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd | ||||
1875 | {TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps | ||||
1876 | {TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd | ||||
1877 | {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps | ||||
1878 | {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb | ||||
1879 | // + vinsertf128 | ||||
1880 | {TTI::SK_Reverse, MVT::v16f16, 4}, // vextractf128 + 2*pshufb | ||||
1881 | // + vinsertf128 | ||||
1882 | {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb | ||||
1883 | // + vinsertf128 | ||||
1884 | |||||
1885 | {TTI::SK_Select, MVT::v4i64, 1}, // vblendpd | ||||
1886 | {TTI::SK_Select, MVT::v4f64, 1}, // vblendpd | ||||
1887 | {TTI::SK_Select, MVT::v8i32, 1}, // vblendps | ||||
1888 | {TTI::SK_Select, MVT::v8f32, 1}, // vblendps | ||||
1889 | {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor | ||||
1890 | {TTI::SK_Select, MVT::v16f16, 3}, // vpand + vpandn + vpor | ||||
1891 | {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor | ||||
1892 | |||||
1893 | {TTI::SK_Splice, MVT::v4i64, 2}, // vperm2f128 + shufpd | ||||
1894 | {TTI::SK_Splice, MVT::v4f64, 2}, // vperm2f128 + shufpd | ||||
1895 | {TTI::SK_Splice, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps | ||||
1896 | {TTI::SK_Splice, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps | ||||
1897 | {TTI::SK_Splice, MVT::v16i16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128 | ||||
1898 | {TTI::SK_Splice, MVT::v16f16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128 | ||||
1899 | {TTI::SK_Splice, MVT::v32i8, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128 | ||||
1900 | |||||
1901 | {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd | ||||
1902 | {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd | ||||
1903 | {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps | ||||
1904 | {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps | ||||
1905 | {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb | ||||
1906 | // + 2*por + vinsertf128 | ||||
1907 | {TTI::SK_PermuteSingleSrc, MVT::v16f16, 8}, // vextractf128 + 4*pshufb | ||||
1908 | // + 2*por + vinsertf128 | ||||
1909 | {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb | ||||
1910 | // + 2*por + vinsertf128 | ||||
1911 | |||||
1912 | {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd | ||||
1913 | {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd | ||||
1914 | {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps | ||||
1915 | {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps | ||||
1916 | {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb | ||||
1917 | // + 4*por + vinsertf128 | ||||
1918 | {TTI::SK_PermuteTwoSrc, MVT::v16f16, 15}, // 2*vextractf128 + 8*pshufb | ||||
1919 | // + 4*por + vinsertf128 | ||||
1920 | {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb | ||||
1921 | // + 4*por + vinsertf128 | ||||
1922 | }; | ||||
1923 | |||||
1924 | if (ST->hasAVX()) | ||||
1925 | if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second)) | ||||
1926 | return LT.first * Entry->Cost; | ||||
1927 | |||||
1928 | static const CostTblEntry SSE41ShuffleTbl[] = { | ||||
1929 | {TTI::SK_Select, MVT::v2i64, 1}, // pblendw | ||||
1930 | {TTI::SK_Select, MVT::v2f64, 1}, // movsd | ||||
1931 | {TTI::SK_Select, MVT::v4i32, 1}, // pblendw | ||||
1932 | {TTI::SK_Select, MVT::v4f32, 1}, // blendps | ||||
1933 | {TTI::SK_Select, MVT::v8i16, 1}, // pblendw | ||||
1934 | {TTI::SK_Select, MVT::v8f16, 1}, // pblendw | ||||
1935 | {TTI::SK_Select, MVT::v16i8, 1} // pblendvb | ||||
1936 | }; | ||||
1937 | |||||
1938 | if (ST->hasSSE41()) | ||||
1939 | if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second)) | ||||
1940 | return LT.first * Entry->Cost; | ||||
1941 | |||||
1942 | static const CostTblEntry SSSE3ShuffleTbl[] = { | ||||
1943 | {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb | ||||
1944 | {TTI::SK_Broadcast, MVT::v8f16, 1}, // pshufb | ||||
1945 | {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb | ||||
1946 | |||||
1947 | {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb | ||||
1948 | {TTI::SK_Reverse, MVT::v8f16, 1}, // pshufb | ||||
1949 | {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb | ||||
1950 | |||||
1951 | {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por | ||||
1952 | {TTI::SK_Select, MVT::v8f16, 3}, // 2*pshufb + por | ||||
1953 | {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por | ||||
1954 | |||||
1955 | {TTI::SK_Splice, MVT::v4i32, 1}, // palignr | ||||
1956 | {TTI::SK_Splice, MVT::v4f32, 1}, // palignr | ||||
1957 | {TTI::SK_Splice, MVT::v8i16, 1}, // palignr | ||||
1958 | {TTI::SK_Splice, MVT::v8f16, 1}, // palignr | ||||
1959 | {TTI::SK_Splice, MVT::v16i8, 1}, // palignr | ||||
1960 | |||||
1961 | {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb | ||||
1962 | {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // pshufb | ||||
1963 | {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb | ||||
1964 | |||||
1965 | {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por | ||||
1966 | {TTI::SK_PermuteTwoSrc, MVT::v8f16, 3}, // 2*pshufb + por | ||||
1967 | {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por | ||||
1968 | }; | ||||
1969 | |||||
1970 | if (ST->hasSSSE3()) | ||||
1971 | if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second)) | ||||
1972 | return LT.first * Entry->Cost; | ||||
1973 | |||||
1974 | static const CostTblEntry SSE2ShuffleTbl[] = { | ||||
1975 | {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd | ||||
1976 | {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd | ||||
1977 | {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd | ||||
1978 | {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd | ||||
1979 | {TTI::SK_Broadcast, MVT::v8f16, 2}, // pshuflw + pshufd | ||||
1980 | {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd | ||||
1981 | |||||
1982 | {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd | ||||
1983 | {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd | ||||
1984 | {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd | ||||
1985 | {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd | ||||
1986 | {TTI::SK_Reverse, MVT::v8f16, 3}, // pshuflw + pshufhw + pshufd | ||||
1987 | {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw | ||||
1988 | // + 2*pshufd + 2*unpck + packus | ||||
1989 | |||||
1990 | {TTI::SK_Select, MVT::v2i64, 1}, // movsd | ||||
1991 | {TTI::SK_Select, MVT::v2f64, 1}, // movsd | ||||
1992 | {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps | ||||
1993 | {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por | ||||
1994 | {TTI::SK_Select, MVT::v8f16, 3}, // pand + pandn + por | ||||
1995 | {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por | ||||
1996 | |||||
1997 | {TTI::SK_Splice, MVT::v2i64, 1}, // shufpd | ||||
1998 | {TTI::SK_Splice, MVT::v2f64, 1}, // shufpd | ||||
1999 | {TTI::SK_Splice, MVT::v4i32, 2}, // 2*{unpck,movsd,pshufd} | ||||
2000 | {TTI::SK_Splice, MVT::v8i16, 3}, // psrldq + psrlldq + por | ||||
2001 | {TTI::SK_Splice, MVT::v8f16, 3}, // psrldq + psrlldq + por | ||||
2002 | {TTI::SK_Splice, MVT::v16i8, 3}, // psrldq + psrlldq + por | ||||
2003 | |||||
2004 | {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd | ||||
2005 | {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd | ||||
2006 | {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd | ||||
2007 | {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw | ||||
2008 | // + pshufd/unpck | ||||
2009 | {TTI::SK_PermuteSingleSrc, MVT::v8f16, 5}, // 2*pshuflw + 2*pshufhw | ||||
2010 | // + pshufd/unpck | ||||
2011 | { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw | ||||
2012 | // + 2*pshufd + 2*unpck + 2*packus | ||||
2013 | |||||
2014 | { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd | ||||
2015 | { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd | ||||
2016 | { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd} | ||||
2017 | { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute | ||||
2018 | { TTI::SK_PermuteTwoSrc, MVT::v8f16, 8 }, // blend+permute | ||||
2019 | { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute | ||||
2020 | }; | ||||
2021 | |||||
2022 | static const CostTblEntry SSE3BroadcastLoadTbl[] = { | ||||
2023 | {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup | ||||
2024 | }; | ||||
2025 | |||||
2026 | if (ST->hasSSE2()) { | ||||
2027 | bool IsLoad = | ||||
2028 | llvm::any_of(Args, [](const auto &V) { return isa<LoadInst>(V); }); | ||||
2029 | if (ST->hasSSE3() && IsLoad) | ||||
2030 | if (const auto *Entry = | ||||
2031 | CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) { | ||||
2032 | assert(isLegalBroadcastLoad(BaseTp->getElementType(),(static_cast <bool> (isLegalBroadcastLoad(BaseTp->getElementType (), LT.second.getVectorElementCount()) && "Table entry missing from isLegalBroadcastLoad()" ) ? void (0) : __assert_fail ("isLegalBroadcastLoad(BaseTp->getElementType(), LT.second.getVectorElementCount()) && \"Table entry missing from isLegalBroadcastLoad()\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 2034, __extension__ __PRETTY_FUNCTION__)) | ||||
2033 | LT.second.getVectorElementCount()) &&(static_cast <bool> (isLegalBroadcastLoad(BaseTp->getElementType (), LT.second.getVectorElementCount()) && "Table entry missing from isLegalBroadcastLoad()" ) ? void (0) : __assert_fail ("isLegalBroadcastLoad(BaseTp->getElementType(), LT.second.getVectorElementCount()) && \"Table entry missing from isLegalBroadcastLoad()\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 2034, __extension__ __PRETTY_FUNCTION__)) | ||||
2034 | "Table entry missing from isLegalBroadcastLoad()")(static_cast <bool> (isLegalBroadcastLoad(BaseTp->getElementType (), LT.second.getVectorElementCount()) && "Table entry missing from isLegalBroadcastLoad()" ) ? void (0) : __assert_fail ("isLegalBroadcastLoad(BaseTp->getElementType(), LT.second.getVectorElementCount()) && \"Table entry missing from isLegalBroadcastLoad()\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 2034, __extension__ __PRETTY_FUNCTION__)); | ||||
2035 | return LT.first * Entry->Cost; | ||||
2036 | } | ||||
2037 | |||||
2038 | if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second)) | ||||
2039 | return LT.first * Entry->Cost; | ||||
2040 | } | ||||
2041 | |||||
2042 | static const CostTblEntry SSE1ShuffleTbl[] = { | ||||
2043 | { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps | ||||
2044 | { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps | ||||
2045 | { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps | ||||
2046 | { TTI::SK_Splice, MVT::v4f32, 2 }, // 2*shufps | ||||
2047 | { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps | ||||
2048 | { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps | ||||
2049 | }; | ||||
2050 | |||||
2051 | if (ST->hasSSE1()) | ||||
2052 | if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second)) | ||||
2053 | return LT.first * Entry->Cost; | ||||
2054 | |||||
2055 | return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp); | ||||
2056 | } | ||||
2057 | |||||
2058 | InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, | ||||
2059 | Type *Src, | ||||
2060 | TTI::CastContextHint CCH, | ||||
2061 | TTI::TargetCostKind CostKind, | ||||
2062 | const Instruction *I) { | ||||
2063 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | ||||
2064 | assert(ISD && "Invalid opcode")(static_cast <bool> (ISD && "Invalid opcode") ? void (0) : __assert_fail ("ISD && \"Invalid opcode\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 2064, __extension__ __PRETTY_FUNCTION__)); | ||||
2065 | |||||
2066 | // TODO: Allow non-throughput costs that aren't binary. | ||||
2067 | auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost { | ||||
2068 | if (CostKind != TTI::TCK_RecipThroughput) | ||||
2069 | return Cost == 0 ? 0 : 1; | ||||
2070 | return Cost; | ||||
2071 | }; | ||||
2072 | |||||
2073 | // The cost tables include both specific, custom (non-legal) src/dst type | ||||
2074 | // conversions and generic, legalized types. We test for customs first, before | ||||
2075 | // falling back to legalization. | ||||
2076 | // FIXME: Need a better design of the cost table to handle non-simple types of | ||||
2077 | // potential massive combinations (elem_num x src_type x dst_type). | ||||
2078 | static const TypeConversionCostTblEntry AVX512BWConversionTbl[] { | ||||
2079 | { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 1 }, | ||||
2080 | { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 1 }, | ||||
2081 | |||||
2082 | // Mask sign extend has an instruction. | ||||
2083 | { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 }, | ||||
2084 | { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, 1 }, | ||||
2085 | { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 }, | ||||
2086 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, 1 }, | ||||
2087 | { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 }, | ||||
2088 | { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, 1 }, | ||||
2089 | { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 }, | ||||
2090 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, 1 }, | ||||
2091 | { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 }, | ||||
2092 | { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, 1 }, | ||||
2093 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 }, | ||||
2094 | { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 }, | ||||
2095 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 }, | ||||
2096 | { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 }, | ||||
2097 | { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, 1 }, | ||||
2098 | { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, 1 }, | ||||
2099 | { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, 1 }, | ||||
2100 | |||||
2101 | // Mask zero extend is a sext + shift. | ||||
2102 | { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 }, | ||||
2103 | { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, 2 }, | ||||
2104 | { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 }, | ||||
2105 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, 2 }, | ||||
2106 | { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 }, | ||||
2107 | { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, 2 }, | ||||
2108 | { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 }, | ||||
2109 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, 2 }, | ||||
2110 | { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 }, | ||||
2111 | { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, 2 }, | ||||
2112 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 }, | ||||
2113 | { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 }, | ||||
2114 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 }, | ||||
2115 | { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 }, | ||||
2116 | { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, 2 }, | ||||
2117 | { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, 2 }, | ||||
2118 | { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, 2 }, | ||||
2119 | |||||
2120 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, | ||||
2121 | { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, 2 }, | ||||
2122 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, | ||||
2123 | { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, 2 }, | ||||
2124 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, | ||||
2125 | { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, 2 }, | ||||
2126 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, | ||||
2127 | { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, 2 }, | ||||
2128 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, | ||||
2129 | { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 }, | ||||
2130 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, | ||||
2131 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, | ||||
2132 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, | ||||
2133 | { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, | ||||
2134 | { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, 2 }, | ||||
2135 | { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, 2 }, | ||||
2136 | { ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, 2 }, | ||||
2137 | |||||
2138 | { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 2 }, | ||||
2139 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // widen to zmm | ||||
2140 | { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // vpmovwb | ||||
2141 | { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // vpmovwb | ||||
2142 | { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // vpmovwb | ||||
2143 | }; | ||||
2144 | |||||
2145 | static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = { | ||||
2146 | // Mask sign extend has an instruction. | ||||
2147 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, | ||||
2148 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, 1 }, | ||||
2149 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, | ||||
2150 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, | ||||
2151 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, | ||||
2152 | { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i1, 1 }, | ||||
2153 | { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 }, | ||||
2154 | { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, | ||||
2155 | |||||
2156 | // Mask zero extend is a sext + shift. | ||||
2157 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, | ||||
2158 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, 2 }, | ||||
2159 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, | ||||
2160 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, | ||||
2161 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, | ||||
2162 | { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i1, 2 }, | ||||
2163 | { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 }, | ||||
2164 | { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, | ||||
2165 | |||||
2166 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, | ||||
2167 | { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, 2 }, | ||||
2168 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, | ||||
2169 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, | ||||
2170 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, | ||||
2171 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, | ||||
2172 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 }, | ||||
2173 | { ISD::TRUNCATE, MVT::v16i1, MVT::v8i64, 2 }, | ||||
2174 | |||||
2175 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 }, | ||||
2176 | { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 }, | ||||
2177 | |||||
2178 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 }, | ||||
2179 | { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 }, | ||||
2180 | |||||
2181 | { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, 1 }, | ||||
2182 | { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, 1 }, | ||||
2183 | |||||
2184 | { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 }, | ||||
2185 | { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 }, | ||||
2186 | }; | ||||
2187 | |||||
2188 | // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and | ||||
2189 | // 256-bit wide vectors. | ||||
2190 | |||||
  // Conversion costs once AVX512F (512-bit foundation) is available.
  // Keyed by (ISD opcode, legalized dst type, legalized src type); the cost
  // is an approximate instruction count for a "generic" CPU at this feature
  // level (see the cost-model note at the top of this file). Entry order
  // matters: the lookup returns the first matching row.
  static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
    { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 },
    { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 },
    { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 },

    // Truncation to vXi1 masks: compare-style sequences (shift + vptestm),
    // plus a sign extend first when the source is narrower than 32 bits.
    { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
    { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
    { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
    { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 3 }, // sext+vpslld+vptestmd
    { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
    { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
    { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
    { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 3 }, // sext+vpslld+vptestmd
    { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // zmm vpslld+vptestmd
    { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // zmm vpslld+vptestmd
    { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // zmm vpslld+vptestmd
    { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 }, // vpslld+vptestmd
    { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // zmm vpsllq+vptestmq
    { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // zmm vpsllq+vptestmq
    { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, // vpsllq+vptestmq
    // Integer down-converts use the AVX512 vpmov* truncating moves.
    { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 2 }, // vpmovdb
    { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 2 }, // vpmovdb
    { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 2 }, // vpmovdb
    { ISD::TRUNCATE, MVT::v32i8, MVT::v16i32, 2 }, // vpmovdb
    { ISD::TRUNCATE, MVT::v64i8, MVT::v16i32, 2 }, // vpmovdb
    { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2 }, // vpmovdw
    { ISD::TRUNCATE, MVT::v32i16, MVT::v16i32, 2 }, // vpmovdw
    { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 2 }, // vpmovqb
    { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1 }, // vpshufb
    { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 2 }, // vpmovqb
    { ISD::TRUNCATE, MVT::v16i8, MVT::v8i64, 2 }, // vpmovqb
    { ISD::TRUNCATE, MVT::v32i8, MVT::v8i64, 2 }, // vpmovqb
    { ISD::TRUNCATE, MVT::v64i8, MVT::v8i64, 2 }, // vpmovqb
    { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 2 }, // vpmovqw
    { ISD::TRUNCATE, MVT::v16i16, MVT::v8i64, 2 }, // vpmovqw
    { ISD::TRUNCATE, MVT::v32i16, MVT::v8i64, 2 }, // vpmovqw
    { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 }, // vpmovqd
    { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // zmm vpmovqd
    { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 5 },// 2*vpmovqd+concat+vpmovdb

    // i16 -> i8 truncates without AVX512BW go via a widen to v16i32.
    { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, // extend to v16i32
    { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 8 },
    { ISD::TRUNCATE, MVT::v64i8, MVT::v32i16, 8 },

    // Sign extend is zmm vpternlogd+vptruncdb.
    // Zero extend is zmm broadcast load+vptruncdw.
    { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 3 },
    { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 4 },
    { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 3 },
    { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 4 },
    { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 3 },
    { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 4 },
    { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 3 },
    { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 4 },

    // Sign extend is zmm vpternlogd+vptruncdw.
    // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
    { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 3 },
    { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 4 },
    { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 3 },
    { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 4 },
    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 3 },
    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 4 },
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 3 },
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },

    { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // zmm vpternlogd
    { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // zmm vpternlogd+psrld
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // zmm vpternlogd
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // zmm vpternlogd+psrld
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // zmm vpternlogd
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // zmm vpternlogd+psrld
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // zmm vpternlogq
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // zmm vpternlogq+psrlq
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // zmm vpternlogq
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // zmm vpternlogq+psrlq

    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, // vpternlogd
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, // vpternlogd+psrld
    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 }, // vpternlogq
    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 }, // vpternlogq+psrlq

    // 512-bit widening extends all have single-instruction forms.
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 },

    { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
    { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right

    { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
    { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
    { ISD::SINT_TO_FP, MVT::v8f64, MVT::v16i8, 2 },
    { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 1 },
    { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
    { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 1 },
    { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
    { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },

    { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
    { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
    { ISD::UINT_TO_FP, MVT::v8f64, MVT::v16i8, 2 },
    { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 1 },
    { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
    { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 1 },
    { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
    { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
    // i64 -> fp without AVX512DQ is expensive (no vcvtuqq2ps/pd).
    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 },
    { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 5 },

    { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 2 },
    { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f64, 7 },
    { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f64,15 },
    { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f32,11 },
    { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f64,31 },
    { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, 3 },
    { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f64, 7 },
    { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f32, 5 },
    { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f64,15 },
    { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 1 },
    { ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f64, 3 },

    { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 },
    { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, 3 },
    { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, 3 },
    { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 },
    { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 3 },
    { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, 3 },
  };
2326 | |||||
2327 | static const TypeConversionCostTblEntry AVX512BWVLConversionTbl[] { | ||||
2328 | // Mask sign extend has an instruction. | ||||
2329 | { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 }, | ||||
2330 | { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, 1 }, | ||||
2331 | { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 }, | ||||
2332 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, 1 }, | ||||
2333 | { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 }, | ||||
2334 | { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, 1 }, | ||||
2335 | { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 }, | ||||
2336 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, 1 }, | ||||
2337 | { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 }, | ||||
2338 | { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, 1 }, | ||||
2339 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 }, | ||||
2340 | { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 }, | ||||
2341 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 }, | ||||
2342 | { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 }, | ||||
2343 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1, 1 }, | ||||
2344 | { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v64i1, 1 }, | ||||
2345 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1, 1 }, | ||||
2346 | |||||
2347 | // Mask zero extend is a sext + shift. | ||||
2348 | { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 }, | ||||
2349 | { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, 2 }, | ||||
2350 | { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 }, | ||||
2351 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, 2 }, | ||||
2352 | { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 }, | ||||
2353 | { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, 2 }, | ||||
2354 | { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 }, | ||||
2355 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, 2 }, | ||||
2356 | { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 }, | ||||
2357 | { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, 2 }, | ||||
2358 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 }, | ||||
2359 | { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 }, | ||||
2360 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 }, | ||||
2361 | { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 }, | ||||
2362 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1, 2 }, | ||||
2363 | { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v64i1, 2 }, | ||||
2364 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1, 2 }, | ||||
2365 | |||||
2366 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, | ||||
2367 | { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, 2 }, | ||||
2368 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, | ||||
2369 | { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, 2 }, | ||||
2370 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, | ||||
2371 | { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, 2 }, | ||||
2372 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, | ||||
2373 | { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, 2 }, | ||||
2374 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, | ||||
2375 | { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 }, | ||||
2376 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, | ||||
2377 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, | ||||
2378 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, | ||||
2379 | { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, | ||||
2380 | { ISD::TRUNCATE, MVT::v32i1, MVT::v16i16, 2 }, | ||||
2381 | { ISD::TRUNCATE, MVT::v64i1, MVT::v32i8, 2 }, | ||||
2382 | { ISD::TRUNCATE, MVT::v64i1, MVT::v16i16, 2 }, | ||||
2383 | |||||
2384 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, | ||||
2385 | }; | ||||
2386 | |||||
  // Conversion costs when both AVX512DQ and AVX512VL are available
  // (notably i64 <-> fp conversions on 128/256-bit vectors). Keyed by
  // (ISD opcode, legalized dst type, legalized src type); cost is an
  // approximate instruction count. Entry order matters: first match wins.
  static const TypeConversionCostTblEntry AVX512DQVLConversionTbl[] = {
    // Mask sign extend has an instruction.
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 },

    // Mask zero extend is a sext + shift.
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 },

    // Truncation to a vXi1 mask.
    { ISD::TRUNCATE, MVT::v16i1, MVT::v4i64, 2 },
    { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, 2 },
    { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 },
    { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, 2 },
    { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 },
    { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 },
    { ISD::TRUNCATE, MVT::v8i1, MVT::v4i64, 2 },
    { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },

    // AVX512DQ provides direct i64 <-> fp conversion instructions.
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
    { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },

    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
    { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },

    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v4f32, 1 },
    { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, 1 },
    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
    { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, 1 },

    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v4f32, 1 },
    { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 },
    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
    { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 },
  };
2437 | |||||
  // Conversion costs when AVX512VL (128/256-bit forms of AVX512F ops) is
  // available. Keyed by (ISD opcode, legalized dst type, legalized src
  // type); cost is an approximate instruction count. Entry order matters:
  // the lookup returns the first matching row.
  static const TypeConversionCostTblEntry AVX512VLConversionTbl[] = {
    { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
    { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
    { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
    { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 8 }, // split+2*v8i8
    { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
    { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
    { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
    { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 8 }, // split+2*v8i16
    { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // vpslld+vptestmd
    { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // vpslld+vptestmd
    { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // vpslld+vptestmd
    { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, 2 }, // vpslld+vptestmd
    { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // vpsllq+vptestmq
    { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // vpsllq+vptestmq
    { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // vpmovqd
    { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 }, // vpmovqb
    { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 }, // vpmovqw
    { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 }, // vpmovwb

    // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
    // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
    { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 5 },
    { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 6 },
    { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 5 },
    { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 6 },
    { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 5 },
    { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 6 },
    { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 10 },
    { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 12 },

    // sign extend is vpcmpeq+maskedmove+vpmovdw
    // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
    { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 4 },
    { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 5 },
    { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 4 },
    { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 5 },
    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 4 },
    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 5 },
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 10 },
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 12 },

    { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // vpternlogd
    { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // vpternlogd+psrld
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // vpternlogd
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // vpternlogd+psrld
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // vpternlogd
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // vpternlogd+psrld
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, 1 }, // vpternlogd
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, 2 }, // vpternlogd+psrld

    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // vpternlogq
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // vpternlogq+psrlq
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // vpternlogq
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // vpternlogq+psrlq

    // 256-bit widening extends have single-instruction forms.
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 1 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 1 },
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 1 },
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 1 },
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 1 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 1 },
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },

    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 1 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 1 },

    { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 1 },
    { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 1 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 1 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 1 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
    { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 5 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
    { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 5 },

    { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, 2 },
    { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 2 },
    { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f32, 5 },

    { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 1 },
    { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 1 },
    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 1 },
    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 1 },
    { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 },
    { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 },
  };
2538 | |||||
  // Conversion costs once AVX2 (256-bit integer ops, Haswell/Ryzen class)
  // is available. Keyed by (ISD opcode, legalized dst type, legalized src
  // type); cost is an approximate instruction count. Entry order matters:
  // first match wins.
  static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 1 },

    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 2 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 2 },
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 2 },
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 2 },
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 2 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 2 },
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },

    { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },

    { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 4 },
    { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 4 },
    { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 1 },
    { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 1 },
    { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 1 },
    { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, 4 },
    { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, 4 },
    { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 1 },
    { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 1 },
    { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 5 },
    { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 },
    { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2 },

    { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 3 },
    { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 3 },

    { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, 1 },
    { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, 1 },
    { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, 1 },
    { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 3 },

    // No unsigned fp<->int instructions before AVX512, so these are
    // emulated with signed conversions plus fixups.
    { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 3 },
    { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 3 },
    { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, 1 },
    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 3 },
    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4 },
    { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 3 },
    { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, 4 },

    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 2 },
    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 2 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 2 },
    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
    { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
    { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 3 },

    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 2 },
    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 2 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 2 },
    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 2 },
    { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
    { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 4 },
  };
2613 | |||||
  // Conversion costs once AVX (256-bit float ops, Sandy Bridge class) is
  // available. Integer 256-bit ops are not native here, so many integer
  // conversions split into 128-bit halves. Keyed by (ISD opcode, legalized
  // dst type, legalized src type); cost is an approximate instruction
  // count. Entry order matters: first match wins.
  static const TypeConversionCostTblEntry AVXConversionTbl[] = {
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 6 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 },
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },

    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 3 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 3 },
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 3 },
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 3 },
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 3 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 3 },
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 3 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 3 },

    { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 4 },
    { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 5 },
    { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 4 },
    { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 9 },
    { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, 11 },

    { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 },
    { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
    { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // and+extract+packuswb
    { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, 5 },
    { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
    { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, 5 },
    { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 3 }, // and+extract+2*packusdw
    { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 },

    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
    { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 },
    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 },
    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 4 },
    { ISD::SINT_TO_FP, MVT::v4f64, MVT::v16i8, 2 },
    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
    { ISD::SINT_TO_FP, MVT::v4f64, MVT::v8i16, 2 },
    { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
    { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 4 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 5 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 8 },

    // Unsigned int -> fp has no direct instruction pre-AVX512; these are
    // multi-instruction emulation sequences.
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 7 },
    { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, 7 },
    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, 6 },
    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 4 },
    { ISD::UINT_TO_FP, MVT::v4f64, MVT::v16i8, 2 },
    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
    { ISD::UINT_TO_FP, MVT::v4f64, MVT::v8i16, 2 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 4 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 4 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
    { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 },
    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 },
    { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 10 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 10 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 18 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
    { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 10 },

    { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, 2 },
    { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f64, 2 },
    { ISD::FP_TO_SINT, MVT::v32i8, MVT::v8f32, 2 },
    { ISD::FP_TO_SINT, MVT::v32i8, MVT::v4f64, 2 },
    { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 2 },
    { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f64, 2 },
    { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, 2 },
    { ISD::FP_TO_SINT, MVT::v16i16, MVT::v4f64, 2 },
    { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, 2 },
    { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, 2 },
    { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 5 },

    { ISD::FP_TO_UINT, MVT::v16i8, MVT::v8f32, 2 },
    { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f64, 2 },
    { ISD::FP_TO_UINT, MVT::v32i8, MVT::v8f32, 2 },
    { ISD::FP_TO_UINT, MVT::v32i8, MVT::v4f64, 2 },
    { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 2 },
    { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f64, 2 },
    { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, 2 },
    { ISD::FP_TO_UINT, MVT::v16i16, MVT::v4f64, 2 },
    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 3 },
    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 6 },
    { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 7 },
    { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, 7 },

    { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 1 },
    { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 1 },
  };
2710 | |||||
  // Conversion costs once SSE4.1 (Penryn class) is available, notably the
  // pmovsx/pmovzx extend instructions. Keyed by (ISD opcode, legalized dst
  // type, legalized src type); cost is an approximate instruction count.
  // Entry order matters: first match wins.
  static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
    // pmovsx*/pmovzx* give single-instruction widening extends.
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, 1 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, 1 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, 1 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, 1 },
    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, 1 },
    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, 1 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, 1 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, 1 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, 1 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, 1 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, 1 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, 1 },

    // These truncates end up widening elements.
    { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 1 }, // PMOVXZBQ
    { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 1 }, // PMOVXZWQ
    { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 1 }, // PMOVXZBD
    { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 2 },
    { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 2 },
    { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 2 },

    { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 1 },
    { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 1 },
    { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 1 },
    { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 1 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 1 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 1 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 1 },
    { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },

    { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 1 },
    { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 1 },
    { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 4 },
    { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 4 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 1 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 1 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 3 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 3 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 2 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 12 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 22 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 4 },

    { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 1 },
    { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 1 },
    { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 1 },
    { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 1 },
    { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, 2 },
    { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, 2 },
    { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, 1 },
    { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, 1 },
    { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, 1 },

    { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 1 },
    { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 },
    { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 1 },
    { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 4 },
    { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, 2 },
    { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, 2 },
    { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, 1 },
    { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, 1 },
    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 4 },
    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
  };
2783 | |||||
2784 | static const TypeConversionCostTblEntry SSE2ConversionTbl[] = { | ||||
2785 | // These are somewhat magic numbers justified by comparing the | ||||
2786 | // output of llvm-mca for our various supported scheduler models | ||||
2787 | // and basing it off the worst case scenario. | ||||
2788 | { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 3 }, | ||||
2789 | { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 3 }, | ||||
2790 | { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 3 }, | ||||
2791 | { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 3 }, | ||||
2792 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 3 }, | ||||
2793 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 4 }, | ||||
2794 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 3 }, | ||||
2795 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 4 }, | ||||
2796 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 3 }, | ||||
2797 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4 }, | ||||
2798 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 8 }, | ||||
2799 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 8 }, | ||||
2800 | |||||
2801 | { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 3 }, | ||||
2802 | { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 3 }, | ||||
2803 | { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 8 }, | ||||
2804 | { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 9 }, | ||||
2805 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 4 }, | ||||
2806 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 4 }, | ||||
2807 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 4 }, | ||||
2808 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 4 }, | ||||
2809 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 7 }, | ||||
2810 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 7 }, | ||||
2811 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 }, | ||||
2812 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 15 }, | ||||
2813 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 18 }, | ||||
2814 | |||||
2815 | { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 4 }, | ||||
2816 | { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 4 }, | ||||
2817 | { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 4 }, | ||||
2818 | { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 4 }, | ||||
2819 | { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, 6 }, | ||||
2820 | { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, 6 }, | ||||
2821 | { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, 5 }, | ||||
2822 | { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, 5 }, | ||||
2823 | { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 4 }, | ||||
2824 | { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, 4 }, | ||||
2825 | |||||
2826 | { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 4 }, | ||||
2827 | { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 }, | ||||
2828 | { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 4 }, | ||||
2829 | { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 15 }, | ||||
2830 | { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, 6 }, | ||||
2831 | { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, 6 }, | ||||
2832 | { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, 5 }, | ||||
2833 | { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, 5 }, | ||||
2834 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 8 }, | ||||
2835 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 8 }, | ||||
2836 | |||||
2837 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, 4 }, | ||||
2838 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, 4 }, | ||||
2839 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, 2 }, | ||||
2840 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, 3 }, | ||||
2841 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, 1 }, | ||||
2842 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, 2 }, | ||||
2843 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, 2 }, | ||||
2844 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, 3 }, | ||||
2845 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, 1 }, | ||||
2846 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, 2 }, | ||||
2847 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, 1 }, | ||||
2848 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, 2 }, | ||||
2849 | |||||
2850 | // These truncates are really widening elements. | ||||
2851 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 1 }, // PSHUFD | ||||
2852 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // PUNPCKLWD+DQ | ||||
2853 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // PUNPCKLBW+WD+PSHUFD | ||||
2854 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 1 }, // PUNPCKLWD | ||||
2855 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // PUNPCKLBW+WD | ||||
2856 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 1 }, // PUNPCKLBW | ||||
2857 | |||||
2858 | { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 2 }, // PAND+PACKUSWB | ||||
2859 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, | ||||
2860 | { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 3 }, // PAND+2*PACKUSWB | ||||
2861 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 }, | ||||
2862 | { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 1 }, | ||||
2863 | { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 3 }, | ||||
2864 | { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 }, | ||||
2865 | { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32,10 }, | ||||
2866 | { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 4 }, // PAND+3*PACKUSWB | ||||
2867 | { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 2 }, // PSHUFD+PSHUFLW | ||||
2868 | { ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, 1 }, // PSHUFD | ||||
2869 | }; | ||||
2870 | |||||
2871 | // Attempt to map directly to (simple) MVT types to let us match custom entries. | ||||
2872 | EVT SrcTy = TLI->getValueType(DL, Src); | ||||
2873 | EVT DstTy = TLI->getValueType(DL, Dst); | ||||
2874 | |||||
2875 | // The function getSimpleVT only handles simple value types. | ||||
2876 | if (SrcTy.isSimple() && DstTy.isSimple()) { | ||||
2877 | MVT SimpleSrcTy = SrcTy.getSimpleVT(); | ||||
2878 | MVT SimpleDstTy = DstTy.getSimpleVT(); | ||||
2879 | |||||
2880 | if (ST->useAVX512Regs()) { | ||||
2881 | if (ST->hasBWI()) | ||||
2882 | if (const auto *Entry = ConvertCostTableLookup( | ||||
2883 | AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) | ||||
2884 | return AdjustCost(Entry->Cost); | ||||
2885 | |||||
2886 | if (ST->hasDQI()) | ||||
2887 | if (const auto *Entry = ConvertCostTableLookup( | ||||
2888 | AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) | ||||
2889 | return AdjustCost(Entry->Cost); | ||||
2890 | |||||
2891 | if (ST->hasAVX512()) | ||||
2892 | if (const auto *Entry = ConvertCostTableLookup( | ||||
2893 | AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) | ||||
2894 | return AdjustCost(Entry->Cost); | ||||
2895 | } | ||||
2896 | |||||
2897 | if (ST->hasBWI()) | ||||
2898 | if (const auto *Entry = ConvertCostTableLookup( | ||||
2899 | AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) | ||||
2900 | return AdjustCost(Entry->Cost); | ||||
2901 | |||||
2902 | if (ST->hasDQI()) | ||||
2903 | if (const auto *Entry = ConvertCostTableLookup( | ||||
2904 | AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) | ||||
2905 | return AdjustCost(Entry->Cost); | ||||
2906 | |||||
2907 | if (ST->hasAVX512()) | ||||
2908 | if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD, | ||||
2909 | SimpleDstTy, SimpleSrcTy)) | ||||
2910 | return AdjustCost(Entry->Cost); | ||||
2911 | |||||
2912 | if (ST->hasAVX2()) { | ||||
2913 | if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD, | ||||
2914 | SimpleDstTy, SimpleSrcTy)) | ||||
2915 | return AdjustCost(Entry->Cost); | ||||
2916 | } | ||||
2917 | |||||
2918 | if (ST->hasAVX()) { | ||||
2919 | if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD, | ||||
2920 | SimpleDstTy, SimpleSrcTy)) | ||||
2921 | return AdjustCost(Entry->Cost); | ||||
2922 | } | ||||
2923 | |||||
2924 | if (ST->hasSSE41()) { | ||||
2925 | if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD, | ||||
2926 | SimpleDstTy, SimpleSrcTy)) | ||||
2927 | return AdjustCost(Entry->Cost); | ||||
2928 | } | ||||
2929 | |||||
2930 | if (ST->hasSSE2()) { | ||||
2931 | if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, | ||||
2932 | SimpleDstTy, SimpleSrcTy)) | ||||
2933 | return AdjustCost(Entry->Cost); | ||||
2934 | } | ||||
2935 | } | ||||
2936 | |||||
2937 | // Fall back to legalized types. | ||||
2938 | std::pair<InstructionCost, MVT> LTSrc = getTypeLegalizationCost(Src); | ||||
2939 | std::pair<InstructionCost, MVT> LTDest = getTypeLegalizationCost(Dst); | ||||
2940 | |||||
2941 | // If we're truncating to the same legalized type - just assume its free. | ||||
2942 | if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second) | ||||
2943 | return TTI::TCC_Free; | ||||
2944 | |||||
2945 | if (ST->useAVX512Regs()) { | ||||
2946 | if (ST->hasBWI()) | ||||
2947 | if (const auto *Entry = ConvertCostTableLookup( | ||||
2948 | AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second)) | ||||
2949 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | ||||
2950 | |||||
2951 | if (ST->hasDQI()) | ||||
2952 | if (const auto *Entry = ConvertCostTableLookup( | ||||
2953 | AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second)) | ||||
2954 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | ||||
2955 | |||||
2956 | if (ST->hasAVX512()) | ||||
2957 | if (const auto *Entry = ConvertCostTableLookup( | ||||
2958 | AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second)) | ||||
2959 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | ||||
2960 | } | ||||
2961 | |||||
2962 | if (ST->hasBWI()) | ||||
2963 | if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD, | ||||
2964 | LTDest.second, LTSrc.second)) | ||||
2965 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | ||||
2966 | |||||
2967 | if (ST->hasDQI()) | ||||
2968 | if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD, | ||||
2969 | LTDest.second, LTSrc.second)) | ||||
2970 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | ||||
2971 | |||||
2972 | if (ST->hasAVX512()) | ||||
2973 | if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD, | ||||
2974 | LTDest.second, LTSrc.second)) | ||||
2975 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | ||||
2976 | |||||
2977 | if (ST->hasAVX2()) | ||||
2978 | if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD, | ||||
2979 | LTDest.second, LTSrc.second)) | ||||
2980 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | ||||
2981 | |||||
2982 | if (ST->hasAVX()) | ||||
2983 | if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD, | ||||
2984 | LTDest.second, LTSrc.second)) | ||||
2985 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | ||||
2986 | |||||
2987 | if (ST->hasSSE41()) | ||||
2988 | if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD, | ||||
2989 | LTDest.second, LTSrc.second)) | ||||
2990 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | ||||
2991 | |||||
2992 | if (ST->hasSSE2()) | ||||
2993 | if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, | ||||
2994 | LTDest.second, LTSrc.second)) | ||||
2995 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | ||||
2996 | |||||
2997 | // Fallback, for i8/i16 sitofp/uitofp cases we need to extend to i32 for | ||||
2998 | // sitofp. | ||||
2999 | if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) && | ||||
3000 | 1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) { | ||||
3001 | Type *ExtSrc = Src->getWithNewBitWidth(32); | ||||
3002 | unsigned ExtOpc = | ||||
3003 | (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt; | ||||
3004 | |||||
3005 | // For scalar loads the extend would be free. | ||||
3006 | InstructionCost ExtCost = 0; | ||||
3007 | if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0)))) | ||||
3008 | ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind); | ||||
3009 | |||||
3010 | return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc, | ||||
3011 | TTI::CastContextHint::None, CostKind); | ||||
3012 | } | ||||
3013 | |||||
3014 | // Fallback for fptosi/fptoui i8/i16 cases we need to truncate from fptosi | ||||
3015 | // i32. | ||||
3016 | if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) && | ||||
3017 | 1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) { | ||||
3018 | Type *TruncDst = Dst->getWithNewBitWidth(32); | ||||
3019 | return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) + | ||||
3020 | getCastInstrCost(Instruction::Trunc, Dst, TruncDst, | ||||
3021 | TTI::CastContextHint::None, CostKind); | ||||
3022 | } | ||||
3023 | |||||
3024 | return AdjustCost( | ||||
3025 | BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); | ||||
3026 | } | ||||
3027 | |||||
/// X86-specific cost model for compare (icmp/fcmp) and select instructions.
/// Costs come from per-subtarget tables keyed on the legalized MVT; each
/// table entry holds one cost per TTI::TargetCostKind, in enum order:
/// { reciprocal-throughput, latency, code-size, size-and-latency }.
/// Falls back to the target-independent BaseT implementation when no table
/// entry matches.
InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                               Type *CondTy,
                                               CmpInst::Predicate VecPred,
                                               TTI::TargetCostKind CostKind,
                                               const Instruction *I) {
  // Early out if this type isn't scalar/vector integer/float.
  if (!(ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy()))
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                     I);

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);

  MVT MTy = LT.second;

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // Extra instruction count needed to expand predicates the hardware cannot
  // express with a single vector compare (e.g. NE lowered as EQ + XOR with
  // all-ones). Only relevant for vector compares on subtargets that lack a
  // full predicate set (no XOP/AVX512-with-wide-elts/BWI coverage).
  InstructionCost ExtraCost = 0;
  if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
    // Some vector comparison predicates cost extra instructions.
    // TODO: Should we invert this and assume worst case cmp costs
    // and reduce for particular predicates?
    if (MTy.isVector() &&
        !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
          (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
          ST->hasBWI())) {
      // Fallback to I if a specific predicate wasn't specified.
      CmpInst::Predicate Pred = VecPred;
      if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE ||
                Pred == CmpInst::BAD_FCMP_PREDICATE))
        Pred = cast<CmpInst>(I)->getPredicate();

      switch (Pred) {
      case CmpInst::Predicate::ICMP_NE:
        // xor(cmpeq(x,y),-1)
        ExtraCost = 1;
        break;
      case CmpInst::Predicate::ICMP_SGE:
      case CmpInst::Predicate::ICMP_SLE:
        // xor(cmpgt(x,y),-1)
        ExtraCost = 1;
        break;
      case CmpInst::Predicate::ICMP_ULT:
      case CmpInst::Predicate::ICMP_UGT:
        // cmpgt(xor(x,signbit),xor(y,signbit))
        // xor(cmpeq(pmaxu(x,y),x),-1)
        ExtraCost = 2;
        break;
      case CmpInst::Predicate::ICMP_ULE:
      case CmpInst::Predicate::ICMP_UGE:
        // Cheaper expansions exist when an unsigned saturating-sub or
        // unsigned-min instruction is available for this element width.
        if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
            (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
          // cmpeq(psubus(x,y),0)
          // cmpeq(pminu(x,y),x)
          ExtraCost = 1;
        } else {
          // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
          ExtraCost = 3;
        }
        break;
      case CmpInst::Predicate::FCMP_ONE:
      case CmpInst::Predicate::FCMP_UEQ:
        // Without AVX we need to expand FCMP_ONE/FCMP_UEQ cases.
        // Use FCMP_UEQ expansion - FCMP_ONE should be the same.
        // CondTy may be null, so guard before recursing with it.
        if (CondTy && !ST->hasAVX())
          return getCmpSelInstrCost(Opcode, ValTy, CondTy,
                                    CmpInst::Predicate::FCMP_UNO, CostKind) +
                 getCmpSelInstrCost(Opcode, ValTy, CondTy,
                                    CmpInst::Predicate::FCMP_OEQ, CostKind) +
                 getArithmeticInstrCost(Instruction::Or, CondTy, CostKind);

        break;
      case CmpInst::Predicate::BAD_ICMP_PREDICATE:
      case CmpInst::Predicate::BAD_FCMP_PREDICATE:
        // Assume worst case scenario and add the maximum extra cost.
        ExtraCost = 3;
        break;
      default:
        break;
      }
    }
  }

  // Per-subtarget cost tables. Entry costs are
  // { recip-throughput, latency, code-size, size-and-latency }.

  static const CostKindTblEntry SLMCostTbl[] = {
    // slm pcmpeq/pcmpgt throughput is 2
    { ISD::SETCC, MVT::v2i64, { 2, 5, 1, 2 } },
    // slm pblendvb/blendvpd/blendvps throughput is 4
    { ISD::SELECT, MVT::v2f64, { 4, 4, 1, 3 } }, // vblendvpd
    { ISD::SELECT, MVT::v4f32, { 4, 4, 1, 3 } }, // vblendvps
    { ISD::SELECT, MVT::v2i64, { 4, 4, 1, 3 } }, // pblendvb
    { ISD::SELECT, MVT::v8i32, { 4, 4, 1, 3 } }, // pblendvb
    { ISD::SELECT, MVT::v8i16, { 4, 4, 1, 3 } }, // pblendvb
    { ISD::SELECT, MVT::v16i8, { 4, 4, 1, 3 } }, // pblendvb
  };

  static const CostKindTblEntry AVX512BWCostTbl[] = {
    { ISD::SETCC, MVT::v32i16, { 1, 1, 1, 1 } },
    { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::SETCC, MVT::v64i8, { 1, 1, 1, 1 } },
    { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 1 } },

    { ISD::SELECT, MVT::v32i16, { 1, 1, 1, 1 } },
    { ISD::SELECT, MVT::v64i8, { 1, 1, 1, 1 } },
  };

  static const CostKindTblEntry AVX512CostTbl[] = {
    { ISD::SETCC, MVT::v8f64, { 1, 4, 1, 1 } },
    { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 1 } },
    { ISD::SETCC, MVT::v16f32, { 1, 4, 1, 1 } },
    { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 1 } },

    { ISD::SETCC, MVT::v8i64, { 1, 1, 1, 1 } },
    { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 1 } },
    { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
    { ISD::SETCC, MVT::v16i32, { 1, 1, 1, 1 } },
    { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 1 } },
    // i16/i8 vector compares need BWI; without it they are expanded.
    { ISD::SETCC, MVT::v32i16, { 3, 7, 5, 5 } },
    { ISD::SETCC, MVT::v64i8, { 3, 7, 5, 5 } },

    { ISD::SELECT, MVT::v8i64, { 1, 1, 1, 1 } },
    { ISD::SELECT, MVT::v4i64, { 1, 1, 1, 1 } },
    { ISD::SELECT, MVT::v2i64, { 1, 1, 1, 1 } },
    { ISD::SELECT, MVT::v16i32, { 1, 1, 1, 1 } },
    { ISD::SELECT, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::SELECT, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::SELECT, MVT::v8f64, { 1, 1, 1, 1 } },
    { ISD::SELECT, MVT::v4f64, { 1, 1, 1, 1 } },
    { ISD::SELECT, MVT::v2f64, { 1, 1, 1, 1 } },
    { ISD::SELECT, MVT::f64, { 1, 1, 1, 1 } },
    { ISD::SELECT, MVT::v16f32, { 1, 1, 1, 1 } },
    { ISD::SELECT, MVT::v8f32 , { 1, 1, 1, 1 } },
    { ISD::SELECT, MVT::v4f32, { 1, 1, 1, 1 } },
    { ISD::SELECT, MVT::f32 , { 1, 1, 1, 1 } },

    { ISD::SELECT, MVT::v32i16, { 2, 2, 4, 4 } },
    { ISD::SELECT, MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::SELECT, MVT::v8i16, { 1, 1, 1, 1 } },
    { ISD::SELECT, MVT::v64i8, { 2, 2, 4, 4 } },
    { ISD::SELECT, MVT::v32i8, { 1, 1, 1, 1 } },
    { ISD::SELECT, MVT::v16i8, { 1, 1, 1, 1 } },
  };

  static const CostKindTblEntry AVX2CostTbl[] = {
    { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 2 } },
    { ISD::SETCC, MVT::v2f64, { 1, 4, 1, 1 } },
    { ISD::SETCC, MVT::f64, { 1, 4, 1, 1 } },
    { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 2 } },
    { ISD::SETCC, MVT::v4f32, { 1, 4, 1, 1 } },
    { ISD::SETCC, MVT::f32, { 1, 4, 1, 1 } },

    { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 2 } },
    { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 2 } },
    { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 2 } },
    { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 2 } },

    { ISD::SELECT, MVT::v4f64, { 2, 2, 1, 2 } }, // vblendvpd
    { ISD::SELECT, MVT::v8f32, { 2, 2, 1, 2 } }, // vblendvps
    { ISD::SELECT, MVT::v4i64, { 2, 2, 1, 2 } }, // pblendvb
    { ISD::SELECT, MVT::v8i32, { 2, 2, 1, 2 } }, // pblendvb
    { ISD::SELECT, MVT::v16i16, { 2, 2, 1, 2 } }, // pblendvb
    { ISD::SELECT, MVT::v32i8, { 2, 2, 1, 2 } }, // pblendvb
  };

  static const CostKindTblEntry XOPCostTbl[] = {
    { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
    { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
  };

  static const CostKindTblEntry AVX1CostTbl[] = {
    { ISD::SETCC, MVT::v4f64, { 2, 3, 1, 2 } },
    { ISD::SETCC, MVT::v2f64, { 1, 3, 1, 1 } },
    { ISD::SETCC, MVT::f64, { 1, 3, 1, 1 } },
    { ISD::SETCC, MVT::v8f32, { 2, 3, 1, 2 } },
    { ISD::SETCC, MVT::v4f32, { 1, 3, 1, 1 } },
    { ISD::SETCC, MVT::f32, { 1, 3, 1, 1 } },

    // AVX1 does not support 8-wide integer compare.
    { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
    { ISD::SETCC, MVT::v8i32, { 4, 2, 5, 6 } },
    { ISD::SETCC, MVT::v16i16, { 4, 2, 5, 6 } },
    { ISD::SETCC, MVT::v32i8, { 4, 2, 5, 6 } },

    { ISD::SELECT, MVT::v4f64, { 3, 3, 1, 2 } }, // vblendvpd
    { ISD::SELECT, MVT::v8f32, { 3, 3, 1, 2 } }, // vblendvps
    { ISD::SELECT, MVT::v4i64, { 3, 3, 1, 2 } }, // vblendvpd
    { ISD::SELECT, MVT::v8i32, { 3, 3, 1, 2 } }, // vblendvps
    { ISD::SELECT, MVT::v16i16, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
    { ISD::SELECT, MVT::v32i8, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
  };

  static const CostKindTblEntry SSE42CostTbl[] = {
    { ISD::SETCC, MVT::v2i64, { 1, 2, 1, 2 } },
  };

  static const CostKindTblEntry SSE41CostTbl[] = {
    { ISD::SETCC, MVT::v2f64, { 1, 5, 1, 1 } },
    { ISD::SETCC, MVT::v4f32, { 1, 5, 1, 1 } },

    { ISD::SELECT, MVT::v2f64, { 2, 2, 1, 2 } }, // blendvpd
    { ISD::SELECT, MVT::f64, { 2, 2, 1, 2 } }, // blendvpd
    { ISD::SELECT, MVT::v4f32, { 2, 2, 1, 2 } }, // blendvps
    { ISD::SELECT, MVT::f32 , { 2, 2, 1, 2 } }, // blendvps
    { ISD::SELECT, MVT::v2i64, { 2, 2, 1, 2 } }, // pblendvb
    { ISD::SELECT, MVT::v4i32, { 2, 2, 1, 2 } }, // pblendvb
    { ISD::SELECT, MVT::v8i16, { 2, 2, 1, 2 } }, // pblendvb
    { ISD::SELECT, MVT::v16i8, { 2, 2, 1, 2 } }, // pblendvb
  };

  static const CostKindTblEntry SSE2CostTbl[] = {
    { ISD::SETCC, MVT::v2f64, { 2, 5, 1, 1 } },
    { ISD::SETCC, MVT::f64, { 1, 5, 1, 1 } },

    { ISD::SETCC, MVT::v2i64, { 5, 4, 5, 5 } }, // pcmpeqd/pcmpgtd expansion
    { ISD::SETCC, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::SETCC, MVT::v8i16, { 1, 1, 1, 1 } },
    { ISD::SETCC, MVT::v16i8, { 1, 1, 1, 1 } },

    { ISD::SELECT, MVT::v2f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
    { ISD::SELECT, MVT::f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
    { ISD::SELECT, MVT::v2i64, { 2, 2, 3, 3 } }, // pand + pandn + por
    { ISD::SELECT, MVT::v4i32, { 2, 2, 3, 3 } }, // pand + pandn + por
    { ISD::SELECT, MVT::v8i16, { 2, 2, 3, 3 } }, // pand + pandn + por
    { ISD::SELECT, MVT::v16i8, { 2, 2, 3, 3 } }, // pand + pandn + por
  };

  static const CostKindTblEntry SSE1CostTbl[] = {
    { ISD::SETCC, MVT::v4f32, { 2, 5, 1, 1 } },
    { ISD::SETCC, MVT::f32, { 1, 5, 1, 1 } },

    { ISD::SELECT, MVT::v4f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
    { ISD::SELECT, MVT::f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
  };

  // Look up the legalized type, checking the most specific subtarget feature
  // set first so that better (cheaper) implementations win. The returned cost
  // is scaled by LT.first (the number of legalization steps/splits) with the
  // predicate-expansion ExtraCost folded into each split.
  if (ST->useSLMArithCosts())
    if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * (ExtraCost + *KindCost);

  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * (ExtraCost + *KindCost);

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * (ExtraCost + *KindCost);

  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * (ExtraCost + *KindCost);

  if (ST->hasXOP())
    if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * (ExtraCost + *KindCost);

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * (ExtraCost + *KindCost);

  if (ST->hasSSE42())
    if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * (ExtraCost + *KindCost);

  if (ST->hasSSE41())
    if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * (ExtraCost + *KindCost);

  if (ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * (ExtraCost + *KindCost);

  if (ST->hasSSE1())
    if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * (ExtraCost + *KindCost);

  // Assume a 3cy latency for fp select ops.
  if (CostKind == TTI::TCK_Latency && Opcode == Instruction::Select)
    if (ValTy->getScalarType()->isFloatingPointTy())
      return 3;

  // No table entry matched - defer to the generic implementation.
  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
}
3319 | |||||
3320 | unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; } | ||||
3321 | |||||
3322 | InstructionCost | ||||
3323 | X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, | ||||
3324 | TTI::TargetCostKind CostKind) { | ||||
3325 | // Costs should match the codegen from: | ||||
3326 | // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll | ||||
3327 | // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll | ||||
3328 | // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll | ||||
3329 | // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll | ||||
3330 | // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll | ||||
3331 | |||||
3332 | // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not | ||||
3333 | // specialized in these tables yet. | ||||
3334 | static const CostKindTblEntry AVX512VBMI2CostTbl[] = { | ||||
3335 | { ISD::FSHL, MVT::v8i64, { 1, 1, 1, 1 } }, | ||||
3336 | { ISD::FSHL, MVT::v4i64, { 1, 1, 1, 1 } }, | ||||
3337 | { ISD::FSHL, MVT::v2i64, { 1, 1, 1, 1 } }, | ||||
3338 | { ISD::FSHL, MVT::v16i32, { 1, 1, 1, 1 } }, | ||||
3339 | { ISD::FSHL, MVT::v8i32, { 1, 1, 1, 1 } }, | ||||
3340 | { ISD::FSHL, MVT::v4i32, { 1, 1, 1, 1 } }, | ||||
3341 | { ISD::FSHL, MVT::v32i16, { 1, 1, 1, 1 } }, | ||||
3342 | { ISD::FSHL, MVT::v16i16, { 1, 1, 1, 1 } }, | ||||
3343 | { ISD::FSHL, MVT::v8i16, { 1, 1, 1, 1 } }, | ||||
3344 | { ISD::ROTL, MVT::v32i16, { 1, 1, 1, 1 } }, | ||||
3345 | { ISD::ROTL, MVT::v16i16, { 1, 1, 1, 1 } }, | ||||
3346 | { ISD::ROTL, MVT::v8i16, { 1, 1, 1, 1 } }, | ||||
3347 | { ISD::ROTR, MVT::v32i16, { 1, 1, 1, 1 } }, | ||||
3348 | { ISD::ROTR, MVT::v16i16, { 1, 1, 1, 1 } }, | ||||
3349 | { ISD::ROTR, MVT::v8i16, { 1, 1, 1, 1 } }, | ||||
3350 | }; | ||||
3351 | static const CostKindTblEntry AVX512BITALGCostTbl[] = { | ||||
3352 | { ISD::CTPOP, MVT::v32i16, { 1, 1, 1, 1 } }, | ||||
3353 | { ISD::CTPOP, MVT::v64i8, { 1, 1, 1, 1 } }, | ||||
3354 | { ISD::CTPOP, MVT::v16i16, { 1, 1, 1, 1 } }, | ||||
3355 | { ISD::CTPOP, MVT::v32i8, { 1, 1, 1, 1 } }, | ||||
3356 | { ISD::CTPOP, MVT::v8i16, { 1, 1, 1, 1 } }, | ||||
3357 | { ISD::CTPOP, MVT::v16i8, { 1, 1, 1, 1 } }, | ||||
3358 | }; | ||||
3359 | static const CostKindTblEntry AVX512VPOPCNTDQCostTbl[] = { | ||||
3360 | { ISD::CTPOP, MVT::v8i64, { 1, 1, 1, 1 } }, | ||||
3361 | { ISD::CTPOP, MVT::v16i32, { 1, 1, 1, 1 } }, | ||||
3362 | { ISD::CTPOP, MVT::v4i64, { 1, 1, 1, 1 } }, | ||||
3363 | { ISD::CTPOP, MVT::v8i32, { 1, 1, 1, 1 } }, | ||||
3364 | { ISD::CTPOP, MVT::v2i64, { 1, 1, 1, 1 } }, | ||||
3365 | { ISD::CTPOP, MVT::v4i32, { 1, 1, 1, 1 } }, | ||||
3366 | }; | ||||
  // Costs when AVX512CD is available: CTLZ on i32/i64 vectors maps to
  // vplzcntd/vplzcntq (cheap); i8/i16 element types have no native lzcnt
  // and are noticeably more expensive. CTTZ rows cover only the i32/i64
  // types that can be synthesized on top of the lzcnt instructions.
  static const CostKindTblEntry AVX512CDCostTbl[] = {
    { ISD::CTLZ, MVT::v8i64, { 1, 5, 1, 1 } },
    { ISD::CTLZ, MVT::v16i32, { 1, 5, 1, 1 } },
    { ISD::CTLZ, MVT::v32i16, { 18, 27, 23, 27 } },
    { ISD::CTLZ, MVT::v64i8, { 3, 16, 9, 11 } },
    { ISD::CTLZ, MVT::v4i64, { 1, 5, 1, 1 } },
    { ISD::CTLZ, MVT::v8i32, { 1, 5, 1, 1 } },
    { ISD::CTLZ, MVT::v16i16, { 8, 19, 11, 13 } },
    { ISD::CTLZ, MVT::v32i8, { 2, 11, 9, 10 } },
    { ISD::CTLZ, MVT::v2i64, { 1, 5, 1, 1 } },
    { ISD::CTLZ, MVT::v4i32, { 1, 5, 1, 1 } },
    { ISD::CTLZ, MVT::v8i16, { 3, 15, 4, 6 } },
    { ISD::CTLZ, MVT::v16i8, { 2, 10, 9, 10 } },

    { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
    { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
    { ISD::CTTZ, MVT::v4i64, { 1, 8, 6, 6 } },
    { ISD::CTTZ, MVT::v8i32, { 1, 8, 6, 6 } },
    { ISD::CTTZ, MVT::v2i64, { 1, 8, 6, 6 } },
    { ISD::CTTZ, MVT::v4i32, { 1, 8, 6, 6 } },
  };
  // Costs when AVX512BW is available (512-bit byte/word ops plus masking).
  // Single-cost rows (e.g. { 1 }) predate the 4-field scheme and supply only
  // the first cost kind; lookups for the other kinds presumably fall through
  // to later tables or defaults — TODO confirm against the lookup helper.
  static const CostKindTblEntry AVX512BWCostTbl[] = {
    { ISD::ABS, MVT::v32i16, { 1, 1, 1, 1 } },
    { ISD::ABS, MVT::v64i8, { 1, 1, 1, 1 } },
    { ISD::BITREVERSE, MVT::v8i64, { 3 } },
    { ISD::BITREVERSE, MVT::v16i32, { 3 } },
    { ISD::BITREVERSE, MVT::v32i16, { 3 } },
    { ISD::BITREVERSE, MVT::v64i8, { 2 } },
    { ISD::BSWAP, MVT::v8i64, { 1 } },
    { ISD::BSWAP, MVT::v16i32, { 1 } },
    { ISD::BSWAP, MVT::v32i16, { 1 } },
    { ISD::CTLZ, MVT::v8i64, { 8, 22, 23, 23 } },
    { ISD::CTLZ, MVT::v16i32, { 8, 23, 25, 25 } },
    { ISD::CTLZ, MVT::v32i16, { 4, 15, 15, 16 } },
    { ISD::CTLZ, MVT::v64i8, { 3, 12, 10, 9 } },
    { ISD::CTPOP, MVT::v2i64, { 3, 7, 10, 10 } },
    { ISD::CTPOP, MVT::v4i64, { 3, 7, 10, 10 } },
    { ISD::CTPOP, MVT::v8i64, { 3, 8, 10, 12 } },
    { ISD::CTPOP, MVT::v4i32, { 7, 11, 14, 14 } },
    { ISD::CTPOP, MVT::v8i32, { 7, 11, 14, 14 } },
    { ISD::CTPOP, MVT::v16i32, { 7, 12, 14, 16 } },
    { ISD::CTPOP, MVT::v8i16, { 2, 7, 11, 11 } },
    { ISD::CTPOP, MVT::v16i16, { 2, 7, 11, 11 } },
    { ISD::CTPOP, MVT::v32i16, { 3, 7, 11, 13 } },
    { ISD::CTPOP, MVT::v16i8, { 2, 4, 8, 8 } },
    { ISD::CTPOP, MVT::v32i8, { 2, 4, 8, 8 } },
    { ISD::CTPOP, MVT::v64i8, { 2, 5, 8, 10 } },
    { ISD::CTTZ, MVT::v8i16, { 3, 9, 14, 14 } },
    { ISD::CTTZ, MVT::v16i16, { 3, 9, 14, 14 } },
    { ISD::CTTZ, MVT::v32i16, { 3, 10, 14, 16 } },
    { ISD::CTTZ, MVT::v16i8, { 2, 6, 11, 11 } },
    { ISD::CTTZ, MVT::v32i8, { 2, 6, 11, 11 } },
    { ISD::CTTZ, MVT::v64i8, { 3, 7, 11, 13 } },
    { ISD::ROTL, MVT::v32i16, { 2, 8, 6, 8 } },
    { ISD::ROTL, MVT::v16i16, { 2, 8, 6, 7 } },
    { ISD::ROTL, MVT::v8i16, { 2, 7, 6, 7 } },
    { ISD::ROTL, MVT::v64i8, { 5, 6, 11, 12 } },
    { ISD::ROTL, MVT::v32i8, { 5, 15, 7, 10 } },
    { ISD::ROTL, MVT::v16i8, { 5, 15, 7, 10 } },
    { ISD::ROTR, MVT::v32i16, { 2, 8, 6, 8 } },
    { ISD::ROTR, MVT::v16i16, { 2, 8, 6, 7 } },
    { ISD::ROTR, MVT::v8i16, { 2, 7, 6, 7 } },
    { ISD::ROTR, MVT::v64i8, { 5, 6, 12, 14 } },
    { ISD::ROTR, MVT::v32i8, { 5, 14, 6, 9 } },
    { ISD::ROTR, MVT::v16i8, { 5, 14, 6, 9 } },
    { ISD::SADDSAT, MVT::v32i16, { 1 } },
    { ISD::SADDSAT, MVT::v64i8, { 1 } },
    { ISD::SMAX, MVT::v32i16, { 1, 1, 1, 1 } },
    { ISD::SMAX, MVT::v64i8, { 1, 1, 1, 1 } },
    { ISD::SMIN, MVT::v32i16, { 1, 1, 1, 1 } },
    { ISD::SMIN, MVT::v64i8, { 1, 1, 1, 1 } },
    { ISD::SSUBSAT, MVT::v32i16, { 1 } },
    { ISD::SSUBSAT, MVT::v64i8, { 1 } },
    { ISD::UADDSAT, MVT::v32i16, { 1 } },
    { ISD::UADDSAT, MVT::v64i8, { 1 } },
    { ISD::UMAX, MVT::v32i16, { 1, 1, 1, 1 } },
    { ISD::UMAX, MVT::v64i8, { 1, 1, 1, 1 } },
    { ISD::UMIN, MVT::v32i16, { 1, 1, 1, 1 } },
    { ISD::UMIN, MVT::v64i8, { 1, 1, 1, 1 } },
    { ISD::USUBSAT, MVT::v32i16, { 1 } },
    { ISD::USUBSAT, MVT::v64i8, { 1 } },
  };
  // Costs for baseline AVX512F: 512-bit i32/i64 ops are native, while
  // byte/word 512-bit rows (v32i16/v64i8) are costed higher because without
  // AVX512BW they must be emulated (the BW table above supersedes these when
  // that feature is present).
  static const CostKindTblEntry AVX512CostTbl[] = {
    { ISD::ABS, MVT::v8i64, { 1, 1, 1, 1 } },
    { ISD::ABS, MVT::v4i64, { 1, 1, 1, 1 } },
    { ISD::ABS, MVT::v2i64, { 1, 1, 1, 1 } },
    { ISD::ABS, MVT::v16i32, { 1, 1, 1, 1 } },
    { ISD::ABS, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::ABS, MVT::v32i16, { 2, 7, 4, 4 } },
    { ISD::ABS, MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::ABS, MVT::v64i8, { 2, 7, 4, 4 } },
    { ISD::ABS, MVT::v32i8, { 1, 1, 1, 1 } },
    { ISD::BITREVERSE, MVT::v8i64, { 36 } },
    { ISD::BITREVERSE, MVT::v16i32, { 24 } },
    { ISD::BITREVERSE, MVT::v32i16, { 10 } },
    { ISD::BITREVERSE, MVT::v64i8, { 10 } },
    { ISD::BSWAP, MVT::v8i64, { 4 } },
    { ISD::BSWAP, MVT::v16i32, { 4 } },
    { ISD::BSWAP, MVT::v32i16, { 4 } },
    { ISD::CTLZ, MVT::v8i64, { 10, 28, 32, 32 } },
    { ISD::CTLZ, MVT::v16i32, { 12, 30, 38, 38 } },
    { ISD::CTLZ, MVT::v32i16, { 8, 15, 29, 29 } },
    { ISD::CTLZ, MVT::v64i8, { 6, 11, 19, 19 } },
    { ISD::CTPOP, MVT::v8i64, { 16, 16, 19, 19 } },
    { ISD::CTPOP, MVT::v16i32, { 24, 19, 27, 27 } },
    { ISD::CTPOP, MVT::v32i16, { 18, 15, 22, 22 } },
    { ISD::CTPOP, MVT::v64i8, { 12, 11, 16, 16 } },
    { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
    { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
    { ISD::CTTZ, MVT::v32i16, { 7, 17, 27, 27 } },
    { ISD::CTTZ, MVT::v64i8, { 6, 13, 21, 21 } },
    // Rotates on i32/i64 elements are native (vprold/vprolq etc.).
    { ISD::ROTL, MVT::v8i64, { 1, 1, 1, 1 } },
    { ISD::ROTL, MVT::v4i64, { 1, 1, 1, 1 } },
    { ISD::ROTL, MVT::v2i64, { 1, 1, 1, 1 } },
    { ISD::ROTL, MVT::v16i32, { 1, 1, 1, 1 } },
    { ISD::ROTL, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::ROTL, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::ROTR, MVT::v8i64, { 1, 1, 1, 1 } },
    { ISD::ROTR, MVT::v4i64, { 1, 1, 1, 1 } },
    { ISD::ROTR, MVT::v2i64, { 1, 1, 1, 1 } },
    { ISD::ROTR, MVT::v16i32, { 1, 1, 1, 1 } },
    { ISD::ROTR, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::ROTR, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::SMAX, MVT::v8i64, { 1, 3, 1, 1 } },
    { ISD::SMAX, MVT::v16i32, { 1, 1, 1, 1 } },
    { ISD::SMAX, MVT::v32i16, { 3, 7, 5, 5 } },
    { ISD::SMAX, MVT::v64i8, { 3, 7, 5, 5 } },
    { ISD::SMAX, MVT::v4i64, { 1, 3, 1, 1 } },
    { ISD::SMAX, MVT::v2i64, { 1, 3, 1, 1 } },
    { ISD::SMIN, MVT::v8i64, { 1, 3, 1, 1 } },
    { ISD::SMIN, MVT::v16i32, { 1, 1, 1, 1 } },
    { ISD::SMIN, MVT::v32i16, { 3, 7, 5, 5 } },
    { ISD::SMIN, MVT::v64i8, { 3, 7, 5, 5 } },
    { ISD::SMIN, MVT::v4i64, { 1, 3, 1, 1 } },
    { ISD::SMIN, MVT::v2i64, { 1, 3, 1, 1 } },
    { ISD::UMAX, MVT::v8i64, { 1, 3, 1, 1 } },
    { ISD::UMAX, MVT::v16i32, { 1, 1, 1, 1 } },
    { ISD::UMAX, MVT::v32i16, { 3, 7, 5, 5 } },
    { ISD::UMAX, MVT::v64i8, { 3, 7, 5, 5 } },
    { ISD::UMAX, MVT::v4i64, { 1, 3, 1, 1 } },
    { ISD::UMAX, MVT::v2i64, { 1, 3, 1, 1 } },
    { ISD::UMIN, MVT::v8i64, { 1, 3, 1, 1 } },
    { ISD::UMIN, MVT::v16i32, { 1, 1, 1, 1 } },
    { ISD::UMIN, MVT::v32i16, { 3, 7, 5, 5 } },
    { ISD::UMIN, MVT::v64i8, { 3, 7, 5, 5 } },
    { ISD::UMIN, MVT::v4i64, { 1, 3, 1, 1 } },
    { ISD::UMIN, MVT::v2i64, { 1, 3, 1, 1 } },
    { ISD::USUBSAT, MVT::v16i32, { 2 } }, // pmaxud + psubd
    { ISD::USUBSAT, MVT::v2i64, { 2 } }, // pmaxuq + psubq
    { ISD::USUBSAT, MVT::v4i64, { 2 } }, // pmaxuq + psubq
    { ISD::USUBSAT, MVT::v8i64, { 2 } }, // pmaxuq + psubq
    { ISD::UADDSAT, MVT::v16i32, { 3 } }, // not + pminud + paddd
    { ISD::UADDSAT, MVT::v2i64, { 3 } }, // not + pminuq + paddq
    { ISD::UADDSAT, MVT::v4i64, { 3 } }, // not + pminuq + paddq
    { ISD::UADDSAT, MVT::v8i64, { 3 } }, // not + pminuq + paddq
    { ISD::SADDSAT, MVT::v32i16, { 2 } },
    { ISD::SADDSAT, MVT::v64i8, { 2 } },
    { ISD::SSUBSAT, MVT::v32i16, { 2 } },
    { ISD::SSUBSAT, MVT::v64i8, { 2 } },
    { ISD::UADDSAT, MVT::v32i16, { 2 } },
    { ISD::UADDSAT, MVT::v64i8, { 2 } },
    { ISD::USUBSAT, MVT::v32i16, { 2 } },
    { ISD::USUBSAT, MVT::v64i8, { 2 } },
    { ISD::FMAXNUM, MVT::f32, { 2 } },
    { ISD::FMAXNUM, MVT::v4f32, { 2 } },
    { ISD::FMAXNUM, MVT::v8f32, { 2 } },
    { ISD::FMAXNUM, MVT::v16f32, { 2 } },
    { ISD::FMAXNUM, MVT::f64, { 2 } },
    { ISD::FMAXNUM, MVT::v2f64, { 2 } },
    { ISD::FMAXNUM, MVT::v4f64, { 2 } },
    { ISD::FMAXNUM, MVT::v8f64, { 2 } },
    { ISD::FSQRT, MVT::f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSQRT, MVT::v4f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSQRT, MVT::v8f32, { 6, 12, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSQRT, MVT::v16f32, { 12, 20, 1, 3 } }, // Skylake from http://www.agner.org/
    { ISD::FSQRT, MVT::f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSQRT, MVT::v2f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSQRT, MVT::v4f64, { 12, 18, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSQRT, MVT::v8f64, { 24, 32, 1, 3 } }, // Skylake from http://www.agner.org/
  };
  // Costs when XOP (AMD Bulldozer-family) is available. VPPERM makes
  // bit-reversal cheap; VPROT gives native per-element rotates. 256-bit rows
  // cost roughly double the 128-bit ones since XOP is 128-bit only.
  static const CostKindTblEntry XOPCostTbl[] = {
    { ISD::BITREVERSE, MVT::v4i64, { 4 } },
    { ISD::BITREVERSE, MVT::v8i32, { 4 } },
    { ISD::BITREVERSE, MVT::v16i16, { 4 } },
    { ISD::BITREVERSE, MVT::v32i8, { 4 } },
    { ISD::BITREVERSE, MVT::v2i64, { 1 } },
    { ISD::BITREVERSE, MVT::v4i32, { 1 } },
    { ISD::BITREVERSE, MVT::v8i16, { 1 } },
    { ISD::BITREVERSE, MVT::v16i8, { 1 } },
    { ISD::BITREVERSE, MVT::i64, { 3 } },
    { ISD::BITREVERSE, MVT::i32, { 3 } },
    { ISD::BITREVERSE, MVT::i16, { 3 } },
    { ISD::BITREVERSE, MVT::i8, { 3 } },
    // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
    { ISD::ROTL, MVT::v4i64, { 4, 7, 5, 6 } },
    { ISD::ROTL, MVT::v8i32, { 4, 7, 5, 6 } },
    { ISD::ROTL, MVT::v16i16, { 4, 7, 5, 6 } },
    { ISD::ROTL, MVT::v32i8, { 4, 7, 5, 6 } },
    { ISD::ROTL, MVT::v2i64, { 1, 3, 1, 1 } },
    { ISD::ROTL, MVT::v4i32, { 1, 3, 1, 1 } },
    { ISD::ROTL, MVT::v8i16, { 1, 3, 1, 1 } },
    { ISD::ROTL, MVT::v16i8, { 1, 3, 1, 1 } },
    { ISD::ROTR, MVT::v4i64, { 4, 7, 8, 9 } },
    { ISD::ROTR, MVT::v8i32, { 4, 7, 8, 9 } },
    { ISD::ROTR, MVT::v16i16, { 4, 7, 8, 9 } },
    { ISD::ROTR, MVT::v32i8, { 4, 7, 8, 9 } },
    { ISD::ROTR, MVT::v2i64, { 1, 3, 3, 3 } },
    { ISD::ROTR, MVT::v4i32, { 1, 3, 3, 3 } },
    { ISD::ROTR, MVT::v8i16, { 1, 3, 3, 3 } },
    { ISD::ROTR, MVT::v16i8, { 1, 3, 3, 3 } }
  };
  // Costs when AVX2 is available (Haswell / Zen baseline): native 256-bit
  // integer ops, so most 256-bit rows match their 128-bit counterparts.
  static const CostKindTblEntry AVX2CostTbl[] = {
    { ISD::ABS, MVT::v2i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
    { ISD::ABS, MVT::v4i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
    { ISD::ABS, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::ABS, MVT::v8i32, { 1, 1, 1, 2 } },
    { ISD::ABS, MVT::v8i16, { 1, 1, 1, 1 } },
    { ISD::ABS, MVT::v16i16, { 1, 1, 1, 2 } },
    { ISD::ABS, MVT::v16i8, { 1, 1, 1, 1 } },
    { ISD::ABS, MVT::v32i8, { 1, 1, 1, 2 } },
    { ISD::BITREVERSE, MVT::v2i64, { 3 } },
    { ISD::BITREVERSE, MVT::v4i64, { 3 } },
    { ISD::BITREVERSE, MVT::v4i32, { 3 } },
    { ISD::BITREVERSE, MVT::v8i32, { 3 } },
    { ISD::BITREVERSE, MVT::v8i16, { 3 } },
    { ISD::BITREVERSE, MVT::v16i16, { 3 } },
    { ISD::BITREVERSE, MVT::v16i8, { 3 } },
    { ISD::BITREVERSE, MVT::v32i8, { 3 } },
    { ISD::BSWAP, MVT::v4i64, { 1 } },
    { ISD::BSWAP, MVT::v8i32, { 1 } },
    { ISD::BSWAP, MVT::v16i16, { 1 } },
    { ISD::CTLZ, MVT::v2i64, { 7, 18, 24, 25 } },
    { ISD::CTLZ, MVT::v4i64, { 14, 18, 24, 44 } },
    { ISD::CTLZ, MVT::v4i32, { 5, 16, 19, 20 } },
    { ISD::CTLZ, MVT::v8i32, { 10, 16, 19, 34 } },
    { ISD::CTLZ, MVT::v8i16, { 4, 13, 14, 15 } },
    { ISD::CTLZ, MVT::v16i16, { 6, 14, 14, 24 } },
    { ISD::CTLZ, MVT::v16i8, { 3, 12, 9, 10 } },
    { ISD::CTLZ, MVT::v32i8, { 4, 12, 9, 14 } },
    { ISD::CTPOP, MVT::v2i64, { 3, 9, 10, 10 } },
    { ISD::CTPOP, MVT::v4i64, { 4, 9, 10, 14 } },
    { ISD::CTPOP, MVT::v4i32, { 7, 12, 14, 14 } },
    { ISD::CTPOP, MVT::v8i32, { 7, 12, 14, 18 } },
    { ISD::CTPOP, MVT::v8i16, { 3, 7, 11, 11 } },
    { ISD::CTPOP, MVT::v16i16, { 6, 8, 11, 18 } },
    { ISD::CTPOP, MVT::v16i8, { 2, 5, 8, 8 } },
    { ISD::CTPOP, MVT::v32i8, { 3, 5, 8, 12 } },
    { ISD::CTTZ, MVT::v2i64, { 4, 11, 13, 13 } },
    { ISD::CTTZ, MVT::v4i64, { 5, 11, 13, 20 } },
    { ISD::CTTZ, MVT::v4i32, { 7, 14, 17, 17 } },
    { ISD::CTTZ, MVT::v8i32, { 7, 15, 17, 24 } },
    { ISD::CTTZ, MVT::v8i16, { 4, 9, 14, 14 } },
    { ISD::CTTZ, MVT::v16i16, { 6, 9, 14, 24 } },
    { ISD::CTTZ, MVT::v16i8, { 3, 7, 11, 11 } },
    { ISD::CTTZ, MVT::v32i8, { 5, 7, 11, 18 } },
    { ISD::SADDSAT, MVT::v16i16, { 1 } },
    { ISD::SADDSAT, MVT::v32i8, { 1 } },
    { ISD::SMAX, MVT::v2i64, { 2, 7, 2, 3 } },
    { ISD::SMAX, MVT::v4i64, { 2, 7, 2, 3 } },
    { ISD::SMAX, MVT::v8i32, { 1, 1, 1, 2 } },
    { ISD::SMAX, MVT::v16i16, { 1, 1, 1, 2 } },
    { ISD::SMAX, MVT::v32i8, { 1, 1, 1, 2 } },
    { ISD::SMIN, MVT::v2i64, { 2, 7, 2, 3 } },
    { ISD::SMIN, MVT::v4i64, { 2, 7, 2, 3 } },
    { ISD::SMIN, MVT::v8i32, { 1, 1, 1, 2 } },
    { ISD::SMIN, MVT::v16i16, { 1, 1, 1, 2 } },
    { ISD::SMIN, MVT::v32i8, { 1, 1, 1, 2 } },
    { ISD::SSUBSAT, MVT::v16i16, { 1 } },
    { ISD::SSUBSAT, MVT::v32i8, { 1 } },
    { ISD::UADDSAT, MVT::v16i16, { 1 } },
    { ISD::UADDSAT, MVT::v32i8, { 1 } },
    { ISD::UADDSAT, MVT::v8i32, { 3 } }, // not + pminud + paddd
    { ISD::UMAX, MVT::v2i64, { 2, 8, 5, 6 } },
    { ISD::UMAX, MVT::v4i64, { 2, 8, 5, 8 } },
    { ISD::UMAX, MVT::v8i32, { 1, 1, 1, 2 } },
    { ISD::UMAX, MVT::v16i16, { 1, 1, 1, 2 } },
    { ISD::UMAX, MVT::v32i8, { 1, 1, 1, 2 } },
    { ISD::UMIN, MVT::v2i64, { 2, 8, 5, 6 } },
    { ISD::UMIN, MVT::v4i64, { 2, 8, 5, 8 } },
    { ISD::UMIN, MVT::v8i32, { 1, 1, 1, 2 } },
    { ISD::UMIN, MVT::v16i16, { 1, 1, 1, 2 } },
    { ISD::UMIN, MVT::v32i8, { 1, 1, 1, 2 } },
    { ISD::USUBSAT, MVT::v16i16, { 1 } },
    { ISD::USUBSAT, MVT::v32i8, { 1 } },
    { ISD::USUBSAT, MVT::v8i32, { 2 } }, // pmaxud + psubd
    { ISD::FMAXNUM, MVT::v8f32, { 3 } }, // MAXPS + CMPUNORDPS + BLENDVPS
    { ISD::FMAXNUM, MVT::v4f64, { 3 } }, // MAXPD + CMPUNORDPD + BLENDVPD
    { ISD::FSQRT, MVT::f32, { 7, 15, 1, 1 } }, // vsqrtss
    { ISD::FSQRT, MVT::v4f32, { 7, 15, 1, 1 } }, // vsqrtps
    { ISD::FSQRT, MVT::v8f32, { 14, 21, 1, 3 } }, // vsqrtps
    { ISD::FSQRT, MVT::f64, { 14, 21, 1, 1 } }, // vsqrtsd
    { ISD::FSQRT, MVT::v2f64, { 14, 21, 1, 1 } }, // vsqrtpd
    { ISD::FSQRT, MVT::v4f64, { 28, 35, 1, 3 } }, // vsqrtpd
  };
  // Costs when only AVX1 is available (Sandy Bridge / Jaguar / Bulldozer):
  // 256-bit integer ops are not native, so most 256-bit rows are priced as
  // two 128-bit ops plus extract/insert (see per-row comments).
  static const CostKindTblEntry AVX1CostTbl[] = {
    { ISD::ABS, MVT::v4i64, { 6, 8, 6, 12 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
    { ISD::ABS, MVT::v8i32, { 3, 6, 4, 5 } },
    { ISD::ABS, MVT::v16i16, { 3, 6, 4, 5 } },
    { ISD::ABS, MVT::v32i8, { 3, 6, 4, 5 } },
    { ISD::BITREVERSE, MVT::v4i64, { 12 } }, // 2 x 128-bit Op + extract/insert
    { ISD::BITREVERSE, MVT::v8i32, { 12 } }, // 2 x 128-bit Op + extract/insert
    { ISD::BITREVERSE, MVT::v16i16, { 12 } }, // 2 x 128-bit Op + extract/insert
    { ISD::BITREVERSE, MVT::v32i8, { 12 } }, // 2 x 128-bit Op + extract/insert
    { ISD::BSWAP, MVT::v4i64, { 4 } },
    { ISD::BSWAP, MVT::v8i32, { 4 } },
    { ISD::BSWAP, MVT::v16i16, { 4 } },
    { ISD::CTLZ, MVT::v4i64, { 29, 33, 49, 58 } }, // 2 x 128-bit Op + extract/insert
    { ISD::CTLZ, MVT::v2i64, { 14, 24, 24, 28 } },
    { ISD::CTLZ, MVT::v8i32, { 24, 28, 39, 48 } }, // 2 x 128-bit Op + extract/insert
    { ISD::CTLZ, MVT::v4i32, { 12, 20, 19, 23 } },
    { ISD::CTLZ, MVT::v16i16, { 19, 22, 29, 38 } }, // 2 x 128-bit Op + extract/insert
    { ISD::CTLZ, MVT::v8i16, { 9, 16, 14, 18 } },
    { ISD::CTLZ, MVT::v32i8, { 14, 15, 19, 28 } }, // 2 x 128-bit Op + extract/insert
    { ISD::CTLZ, MVT::v16i8, { 7, 12, 9, 13 } },
    { ISD::CTPOP, MVT::v4i64, { 14, 18, 19, 28 } }, // 2 x 128-bit Op + extract/insert
    { ISD::CTPOP, MVT::v2i64, { 7, 14, 10, 14 } },
    { ISD::CTPOP, MVT::v8i32, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
    { ISD::CTPOP, MVT::v4i32, { 9, 20, 14, 18 } },
    { ISD::CTPOP, MVT::v16i16, { 16, 21, 22, 31 } }, // 2 x 128-bit Op + extract/insert
    { ISD::CTPOP, MVT::v8i16, { 8, 18, 11, 15 } },
    { ISD::CTPOP, MVT::v32i8, { 13, 15, 16, 25 } }, // 2 x 128-bit Op + extract/insert
    { ISD::CTPOP, MVT::v16i8, { 6, 12, 8, 12 } },
    { ISD::CTTZ, MVT::v4i64, { 17, 22, 24, 33 } }, // 2 x 128-bit Op + extract/insert
    { ISD::CTTZ, MVT::v2i64, { 9, 19, 13, 17 } },
    { ISD::CTTZ, MVT::v8i32, { 21, 27, 32, 41 } }, // 2 x 128-bit Op + extract/insert
    { ISD::CTTZ, MVT::v4i32, { 11, 24, 17, 21 } },
    { ISD::CTTZ, MVT::v16i16, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
    { ISD::CTTZ, MVT::v8i16, { 9, 21, 14, 18 } },
    { ISD::CTTZ, MVT::v32i8, { 15, 18, 21, 30 } }, // 2 x 128-bit Op + extract/insert
    { ISD::CTTZ, MVT::v16i8, { 8, 16, 11, 15 } },
    { ISD::SADDSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
    { ISD::SADDSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
    { ISD::SMAX, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
    { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 4 } },
    { ISD::SMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::SMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::SMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::SMIN, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
    { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
    { ISD::SMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::SMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::SMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::SSUBSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
    { ISD::SSUBSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
    { ISD::UADDSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
    { ISD::UADDSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
    { ISD::UADDSAT, MVT::v8i32, { 8 } }, // 2 x 128-bit Op + extract/insert
    { ISD::UMAX, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
    { ISD::UMAX, MVT::v2i64, { 4, 8, 5, 7 } },
    { ISD::UMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::UMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::UMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::UMIN, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
    { ISD::UMIN, MVT::v2i64, { 4, 8, 5, 7 } },
    { ISD::UMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::UMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::UMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::USUBSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
    { ISD::USUBSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
    { ISD::USUBSAT, MVT::v8i32, { 6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::FMAXNUM, MVT::f32, { 3 } }, // MAXSS + CMPUNORDSS + BLENDVPS
    { ISD::FMAXNUM, MVT::v4f32, { 3 } }, // MAXPS + CMPUNORDPS + BLENDVPS
    { ISD::FMAXNUM, MVT::v8f32, { 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS + ?
    { ISD::FMAXNUM, MVT::f64, { 3 } }, // MAXSD + CMPUNORDSD + BLENDVPD
    { ISD::FMAXNUM, MVT::v2f64, { 3 } }, // MAXPD + CMPUNORDPD + BLENDVPD
    { ISD::FMAXNUM, MVT::v4f64, { 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD + ?
    { ISD::FSQRT, MVT::f32, { 21, 21, 1, 1 } }, // vsqrtss
    { ISD::FSQRT, MVT::v4f32, { 21, 21, 1, 1 } }, // vsqrtps
    { ISD::FSQRT, MVT::v8f32, { 42, 42, 1, 3 } }, // vsqrtps
    { ISD::FSQRT, MVT::f64, { 27, 27, 1, 1 } }, // vsqrtsd
    { ISD::FSQRT, MVT::v2f64, { 27, 27, 1, 1 } }, // vsqrtpd
    { ISD::FSQRT, MVT::v4f64, { 54, 54, 1, 3 } }, // vsqrtpd
  };
  // CPU-specific overrides for Goldmont: its sqrt unit is slow, so FSQRT
  // costs are raised above the generic SSE numbers.
  static const CostKindTblEntry GLMCostTbl[] = {
    { ISD::FSQRT, MVT::f32, { 19, 20, 1, 1 } }, // sqrtss
    { ISD::FSQRT, MVT::v4f32, { 37, 41, 1, 5 } }, // sqrtps
    { ISD::FSQRT, MVT::f64, { 34, 35, 1, 1 } }, // sqrtsd
    { ISD::FSQRT, MVT::v2f64, { 67, 71, 1, 5 } }, // sqrtpd
  };
  // CPU-specific overrides for Silvermont: like Goldmont, sqrt is
  // substantially slower than on big cores.
  static const CostKindTblEntry SLMCostTbl[] = {
    { ISD::FSQRT, MVT::f32, { 20, 20, 1, 1 } }, // sqrtss
    { ISD::FSQRT, MVT::v4f32, { 40, 41, 1, 5 } }, // sqrtps
    { ISD::FSQRT, MVT::f64, { 35, 35, 1, 1 } }, // sqrtsd
    { ISD::FSQRT, MVT::v2f64, { 70, 71, 1, 5 } }, // sqrtpd
  };
  // Costs when SSE4.2 is available (Nehalem baseline).
  static const CostKindTblEntry SSE42CostTbl[] = {
    { ISD::USUBSAT, MVT::v4i32, { 2 } }, // pmaxud + psubd
    { ISD::UADDSAT, MVT::v4i32, { 3 } }, // not + pminud + paddd
    { ISD::FSQRT, MVT::f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
    { ISD::FSQRT, MVT::v4f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
  };
  // Costs when SSE4.1 is available (Penryn): adds the pmins*/pmaxs*/pminu*/
  // pmaxu* families for the element widths SSE2 lacked, making most
  // min/max single-instruction. v2i64 still has no native min/max.
  static const CostKindTblEntry SSE41CostTbl[] = {
    { ISD::ABS, MVT::v2i64, { 3, 4, 3, 5 } }, // BLENDVPD(X,PSUBQ(0,X),X)
    { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 3 } },
    { ISD::SMAX, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::SMAX, MVT::v16i8, { 1, 1, 1, 1 } },
    { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
    { ISD::SMIN, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::SMIN, MVT::v16i8, { 1, 1, 1, 1 } },
    { ISD::UMAX, MVT::v2i64, { 2, 11, 6, 7 } },
    { ISD::UMAX, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::UMAX, MVT::v8i16, { 1, 1, 1, 1 } },
    { ISD::UMIN, MVT::v2i64, { 2, 11, 6, 7 } },
    { ISD::UMIN, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::UMIN, MVT::v8i16, { 1, 1, 1, 1 } },
  };
  // Costs when SSSE3 is available: pabs* gives cheap ABS, and pshufb enables
  // table-lookup expansions of bswap/bitreverse/ctpop (and the bit-count
  // family built on ctpop).
  static const CostKindTblEntry SSSE3CostTbl[] = {
    { ISD::ABS, MVT::v4i32, { 1, 2, 1, 1 } },
    { ISD::ABS, MVT::v8i16, { 1, 2, 1, 1 } },
    { ISD::ABS, MVT::v16i8, { 1, 2, 1, 1 } },
    { ISD::BITREVERSE, MVT::v2i64, { 5 } },
    { ISD::BITREVERSE, MVT::v4i32, { 5 } },
    { ISD::BITREVERSE, MVT::v8i16, { 5 } },
    { ISD::BITREVERSE, MVT::v16i8, { 5 } },
    { ISD::BSWAP, MVT::v2i64, { 1 } },
    { ISD::BSWAP, MVT::v4i32, { 1 } },
    { ISD::BSWAP, MVT::v8i16, { 1 } },
    { ISD::CTLZ, MVT::v2i64, { 18, 28, 28, 35 } },
    { ISD::CTLZ, MVT::v4i32, { 15, 20, 22, 28 } },
    { ISD::CTLZ, MVT::v8i16, { 13, 17, 16, 22 } },
    { ISD::CTLZ, MVT::v16i8, { 11, 15, 10, 16 } },
    { ISD::CTPOP, MVT::v2i64, { 13, 19, 12, 18 } },
    { ISD::CTPOP, MVT::v4i32, { 18, 24, 16, 22 } },
    { ISD::CTPOP, MVT::v8i16, { 13, 18, 14, 20 } },
    { ISD::CTPOP, MVT::v16i8, { 11, 12, 10, 16 } },
    { ISD::CTTZ, MVT::v2i64, { 13, 25, 15, 22 } },
    { ISD::CTTZ, MVT::v4i32, { 18, 26, 19, 25 } },
    { ISD::CTTZ, MVT::v8i16, { 13, 20, 17, 23 } },
    { ISD::CTTZ, MVT::v16i8, { 11, 16, 13, 19 } }
  };
  // Baseline SSE2 costs: everything here must be synthesized from shifts,
  // compares, and logic ops, hence the large bit-manipulation numbers.
  static const CostKindTblEntry SSE2CostTbl[] = {
    { ISD::ABS, MVT::v2i64, { 3, 6, 5, 5 } },
    { ISD::ABS, MVT::v4i32, { 1, 4, 4, 4 } },
    { ISD::ABS, MVT::v8i16, { 1, 2, 3, 3 } },
    { ISD::ABS, MVT::v16i8, { 1, 2, 3, 3 } },
    { ISD::BITREVERSE, MVT::v2i64, { 29 } },
    { ISD::BITREVERSE, MVT::v4i32, { 27 } },
    { ISD::BITREVERSE, MVT::v8i16, { 27 } },
    { ISD::BITREVERSE, MVT::v16i8, { 20 } },
    { ISD::BSWAP, MVT::v2i64, { 7 } },
    { ISD::BSWAP, MVT::v4i32, { 7 } },
    { ISD::BSWAP, MVT::v8i16, { 7 } },
    { ISD::CTLZ, MVT::v2i64, { 10, 45, 36, 38 } },
    { ISD::CTLZ, MVT::v4i32, { 10, 45, 38, 40 } },
    { ISD::CTLZ, MVT::v8i16, { 9, 38, 32, 34 } },
    { ISD::CTLZ, MVT::v16i8, { 8, 39, 29, 32 } },
    { ISD::CTPOP, MVT::v2i64, { 12, 26, 16, 18 } },
    { ISD::CTPOP, MVT::v4i32, { 15, 29, 21, 23 } },
    { ISD::CTPOP, MVT::v8i16, { 13, 25, 18, 20 } },
    { ISD::CTPOP, MVT::v16i8, { 10, 21, 14, 16 } },
    { ISD::CTTZ, MVT::v2i64, { 14, 28, 19, 21 } },
    { ISD::CTTZ, MVT::v4i32, { 18, 31, 24, 26 } },
    { ISD::CTTZ, MVT::v8i16, { 16, 27, 21, 23 } },
    { ISD::CTTZ, MVT::v16i8, { 13, 23, 17, 19 } },
    { ISD::SADDSAT, MVT::v8i16, { 1 } },
    { ISD::SADDSAT, MVT::v16i8, { 1 } },
    // SSE2 only has native min/max for s16 and u8 elements; the other
    // widths below are emulated via compare+blend sequences.
    { ISD::SMAX, MVT::v2i64, { 4, 8, 15, 15 } },
    { ISD::SMAX, MVT::v4i32, { 2, 4, 5, 5 } },
    { ISD::SMAX, MVT::v8i16, { 1, 1, 1, 1 } },
    { ISD::SMAX, MVT::v16i8, { 2, 4, 5, 5 } },
    { ISD::SMIN, MVT::v2i64, { 4, 8, 15, 15 } },
    { ISD::SMIN, MVT::v4i32, { 2, 4, 5, 5 } },
    { ISD::SMIN, MVT::v8i16, { 1, 1, 1, 1 } },
    { ISD::SMIN, MVT::v16i8, { 2, 4, 5, 5 } },
    { ISD::SSUBSAT, MVT::v8i16, { 1 } },
    { ISD::SSUBSAT, MVT::v16i8, { 1 } },
    { ISD::UADDSAT, MVT::v8i16, { 1 } },
    { ISD::UADDSAT, MVT::v16i8, { 1 } },
    { ISD::UMAX, MVT::v2i64, { 4, 8, 15, 15 } },
    { ISD::UMAX, MVT::v4i32, { 2, 5, 8, 8 } },
    { ISD::UMAX, MVT::v8i16, { 1, 3, 3, 3 } },
    { ISD::UMAX, MVT::v16i8, { 1, 1, 1, 1 } },
    { ISD::UMIN, MVT::v2i64, { 4, 8, 15, 15 } },
    { ISD::UMIN, MVT::v4i32, { 2, 5, 8, 8 } },
    { ISD::UMIN, MVT::v8i16, { 1, 3, 3, 3 } },
    { ISD::UMIN, MVT::v16i8, { 1, 1, 1, 1 } },
    { ISD::USUBSAT, MVT::v8i16, { 1 } },
    { ISD::USUBSAT, MVT::v16i8, { 1 } },
    { ISD::FMAXNUM, MVT::f64, { 4 } },
    { ISD::FMAXNUM, MVT::v2f64, { 4 } },
    { ISD::FSQRT, MVT::f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
    { ISD::FSQRT, MVT::v2f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
  };
  // Baseline SSE1 (f32-only) costs.
  static const CostKindTblEntry SSE1CostTbl[] = {
    { ISD::FMAXNUM, MVT::f32, { 4 } },
    { ISD::FMAXNUM, MVT::v4f32, { 4 } },
    { ISD::FSQRT, MVT::f32, { 28, 30, 1, 2 } }, // Pentium III from http://www.agner.org/
    { ISD::FSQRT, MVT::v4f32, { 56, 56, 1, 2 } }, // Pentium III from http://www.agner.org/
  };
  // BMI (tzcnt) scalar costs, 64-bit registers.
  static const CostKindTblEntry BMI64CostTbl[] = { // 64-bit targets
    { ISD::CTTZ, MVT::i64, { 1 } },
  };
  // BMI (tzcnt) scalar costs for sub-64-bit widths.
  static const CostKindTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
    { ISD::CTTZ, MVT::i32, { 1 } },
    { ISD::CTTZ, MVT::i16, { 1 } },
    { ISD::CTTZ, MVT::i8, { 1 } },
  };
  // LZCNT scalar costs, 64-bit registers.
  static const CostKindTblEntry LZCNT64CostTbl[] = { // 64-bit targets
    { ISD::CTLZ, MVT::i64, { 1 } },
  };
  // LZCNT scalar costs for sub-64-bit widths (i16/i8 need extra work,
  // e.g. a zero-extend plus correction of the counted width).
  static const CostKindTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
    { ISD::CTLZ, MVT::i32, { 1 } },
    { ISD::CTLZ, MVT::i16, { 2 } },
    { ISD::CTLZ, MVT::i8, { 2 } },
  };
  // POPCNT scalar costs, 64-bit registers.
  static const CostKindTblEntry POPCNT64CostTbl[] = { // 64-bit targets
    { ISD::CTPOP, MVT::i64, { 1, 1, 1, 1 } }, // popcnt
  };
  // POPCNT scalar costs for sub-64-bit widths; i16/i8 zero-extend first.
  static const CostKindTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
    { ISD::CTPOP, MVT::i32, { 1, 1, 1, 1 } }, // popcnt
    { ISD::CTPOP, MVT::i16, { 1, 1, 2, 2 } }, // popcnt(zext())
    { ISD::CTPOP, MVT::i8, { 1, 1, 2, 2 } }, // popcnt(zext())
  };
  // Generic scalar costs for 64-bit-only (i64) operations, independent of
  // SIMD feature level. (Fixed: the CTTZ_ZERO_UNDEF row was annotated
  // "BSR" but count-trailing-zeros lowers to BSF, matching the i32 table.)
  static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
    { ISD::ABS, MVT::i64, { 1, 2, 3, 4 } }, // SUB+CMOV
    { ISD::BITREVERSE, MVT::i64, { 14 } },
    { ISD::BSWAP, MVT::i64, { 1 } },
    { ISD::CTLZ, MVT::i64, { 4 } }, // BSR+XOR or BSR+XOR+CMOV
    { ISD::CTLZ_ZERO_UNDEF, MVT::i64,{ 1, 1, 1, 1 } }, // BSR+XOR
    { ISD::CTTZ, MVT::i64, { 3 } }, // TEST+BSF+CMOV/BRANCH
    { ISD::CTTZ_ZERO_UNDEF, MVT::i64,{ 1, 1, 1, 1 } }, // BSF
    { ISD::CTPOP, MVT::i64, { 10, 6, 19, 19 } },
    { ISD::ROTL, MVT::i64, { 2, 3, 1, 3 } },
    { ISD::ROTR, MVT::i64, { 2, 3, 1, 3 } },
    { ISD::FSHL, MVT::i64, { 4, 4, 1, 4 } },
    { ISD::SMAX, MVT::i64, { 1, 3, 2, 3 } },
    { ISD::SMIN, MVT::i64, { 1, 3, 2, 3 } },
    { ISD::UMAX, MVT::i64, { 1, 3, 2, 3 } },
    { ISD::UMIN, MVT::i64, { 1, 3, 2, 3 } },
    { ISD::SADDO, MVT::i64, { 1 } },
    { ISD::UADDO, MVT::i64, { 1 } },
    { ISD::UMULO, MVT::i64, { 2 } }, // mulq + seto
  };
3900 | static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets | ||||
3901 | { ISD::ABS, MVT::i32, { 1, 2, 3, 4 } }, // SUB+XOR+SRA or SUB+CMOV | ||||
3902 | { ISD::ABS, MVT::i16, { 2, 2, 3, 4 } }, // SUB+XOR+SRA or SUB+CMOV | ||||
3903 | { ISD::ABS, MVT::i8, { 2, 4, 4, 4 } }, // SUB+XOR+SRA | ||||
3904 | { ISD::BITREVERSE, MVT::i32, { 14 } }, | ||||
3905 | { ISD::BITREVERSE, MVT::i16, { 14 } }, | ||||
3906 | { ISD::BITREVERSE, MVT::i8, { 11 } }, | ||||
3907 | { ISD::BSWAP, MVT::i32, { 1 } }, | ||||
3908 | { ISD::BSWAP, MVT::i16, { 1 } }, // ROL | ||||
3909 | { ISD::CTLZ, MVT::i32, { 4 } }, // BSR+XOR or BSR+XOR+CMOV | ||||
3910 | { ISD::CTLZ, MVT::i16, { 4 } }, // BSR+XOR or BSR+XOR+CMOV | ||||
3911 | { ISD::CTLZ, MVT::i8, { 4 } }, // BSR+XOR or BSR+XOR+CMOV | ||||
3912 | { ISD::CTLZ_ZERO_UNDEF, MVT::i32,{ 1, 1, 1, 1 } }, // BSR+XOR | ||||
3913 | { ISD::CTLZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 3, 3 } }, // BSR+XOR | ||||
3914 | { ISD::CTLZ_ZERO_UNDEF, MVT::i8, { 2, 2, 3, 3 } }, // BSR+XOR | ||||
3915 | { ISD::CTTZ, MVT::i32, { 3 } }, // TEST+BSF+CMOV/BRANCH | ||||
3916 | { ISD::CTTZ, MVT::i16, { 3 } }, // TEST+BSF+CMOV/BRANCH | ||||
3917 | { ISD::CTTZ, MVT::i8, { 3 } }, // TEST+BSF+CMOV/BRANCH | ||||
3918 | { ISD::CTTZ_ZERO_UNDEF, MVT::i32,{ 1, 1, 1, 1 } }, // BSF | ||||
3919 | { ISD::CTTZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 1, 1 } }, // BSF | ||||
3920 | { ISD::CTTZ_ZERO_UNDEF, MVT::i8, { 2, 2, 1, 1 } }, // BSF | ||||
3921 | { ISD::CTPOP, MVT::i32, { 8, 7, 15, 15 } }, | ||||
3922 | { ISD::CTPOP, MVT::i16, { 9, 8, 17, 17 } }, | ||||
3923 | { ISD::CTPOP, MVT::i8, { 7, 6, 13, 13 } }, | ||||
3924 | { ISD::ROTL, MVT::i32, { 2, 3, 1, 3 } }, | ||||
3925 | { ISD::ROTL, MVT::i16, { 2, 3, 1, 3 } }, | ||||
3926 | { ISD::ROTL, MVT::i8, { 2, 3, 1, 3 } }, | ||||
3927 | { ISD::ROTR, MVT::i32, { 2, 3, 1, 3 } }, | ||||
3928 | { ISD::ROTR, MVT::i16, { 2, 3, 1, 3 } }, | ||||
3929 | { ISD::ROTR, MVT::i8, { 2, 3, 1, 3 } }, | ||||
3930 | { ISD::FSHL, MVT::i32, { 4, 4, 1, 4 } }, | ||||
3931 | { ISD::FSHL, MVT::i16, { 4, 4, 2, 5 } }, | ||||
3932 | { ISD::FSHL, MVT::i8, { 4, 4, 2, 5 } }, | ||||
3933 | { ISD::SMAX, MVT::i32, { 1, 2, 2, 3 } }, | ||||
3934 | { ISD::SMAX, MVT::i16, { 1, 4, 2, 4 } }, | ||||
3935 | { ISD::SMAX, MVT::i8, { 1, 4, 2, 4 } }, | ||||
3936 | { ISD::SMIN, MVT::i32, { 1, 2, 2, 3 } }, | ||||
3937 | { ISD::SMIN, MVT::i16, { 1, 4, 2, 4 } }, | ||||
3938 | { ISD::SMIN, MVT::i8, { 1, 4, 2, 4 } }, | ||||
3939 | { ISD::UMAX, MVT::i32, { 1, 2, 2, 3 } }, | ||||
3940 | { ISD::UMAX, MVT::i16, { 1, 4, 2, 4 } }, | ||||
3941 | { ISD::UMAX, MVT::i8, { 1, 4, 2, 4 } }, | ||||
3942 | { ISD::UMIN, MVT::i32, { 1, 2, 2, 3 } }, | ||||
3943 | { ISD::UMIN, MVT::i16, { 1, 4, 2, 4 } }, | ||||
3944 | { ISD::UMIN, MVT::i8, { 1, 4, 2, 4 } }, | ||||
3945 | { ISD::SADDO, MVT::i32, { 1 } }, | ||||
3946 | { ISD::SADDO, MVT::i16, { 1 } }, | ||||
3947 | { ISD::SADDO, MVT::i8, { 1 } }, | ||||
3948 | { ISD::UADDO, MVT::i32, { 1 } }, | ||||
3949 | { ISD::UADDO, MVT::i16, { 1 } }, | ||||
3950 | { ISD::UADDO, MVT::i8, { 1 } }, | ||||
3951 | { ISD::UMULO, MVT::i32, { 2 } }, // mul + seto | ||||
3952 | { ISD::UMULO, MVT::i16, { 2 } }, | ||||
3953 | { ISD::UMULO, MVT::i8, { 2 } }, | ||||
3954 | }; | ||||
3955 | |||||
3956 | Type *RetTy = ICA.getReturnType(); | ||||
3957 | Type *OpTy = RetTy; | ||||
3958 | Intrinsic::ID IID = ICA.getID(); | ||||
3959 | unsigned ISD = ISD::DELETED_NODE; | ||||
3960 | switch (IID) { | ||||
3961 | default: | ||||
3962 | break; | ||||
3963 | case Intrinsic::abs: | ||||
3964 | ISD = ISD::ABS; | ||||
3965 | break; | ||||
3966 | case Intrinsic::bitreverse: | ||||
3967 | ISD = ISD::BITREVERSE; | ||||
3968 | break; | ||||
3969 | case Intrinsic::bswap: | ||||
3970 | ISD = ISD::BSWAP; | ||||
3971 | break; | ||||
3972 | case Intrinsic::ctlz: | ||||
3973 | ISD = ISD::CTLZ; | ||||
3974 | break; | ||||
3975 | case Intrinsic::ctpop: | ||||
3976 | ISD = ISD::CTPOP; | ||||
3977 | break; | ||||
3978 | case Intrinsic::cttz: | ||||
3979 | ISD = ISD::CTTZ; | ||||
3980 | break; | ||||
3981 | case Intrinsic::fshl: | ||||
3982 | ISD = ISD::FSHL; | ||||
3983 | if (!ICA.isTypeBasedOnly()) { | ||||
3984 | const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); | ||||
3985 | if (Args[0] == Args[1]) | ||||
3986 | ISD = ISD::ROTL; | ||||
3987 | } | ||||
3988 | break; | ||||
3989 | case Intrinsic::fshr: | ||||
3990 | // FSHR has same costs so don't duplicate. | ||||
3991 | ISD = ISD::FSHL; | ||||
3992 | if (!ICA.isTypeBasedOnly()) { | ||||
3993 | const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); | ||||
3994 | if (Args[0] == Args[1]) | ||||
3995 | ISD = ISD::ROTR; | ||||
3996 | } | ||||
3997 | break; | ||||
3998 | case Intrinsic::maxnum: | ||||
3999 | case Intrinsic::minnum: | ||||
4000 | // FMINNUM has same costs so don't duplicate. | ||||
4001 | ISD = ISD::FMAXNUM; | ||||
4002 | break; | ||||
4003 | case Intrinsic::sadd_sat: | ||||
4004 | ISD = ISD::SADDSAT; | ||||
4005 | break; | ||||
4006 | case Intrinsic::smax: | ||||
4007 | ISD = ISD::SMAX; | ||||
4008 | break; | ||||
4009 | case Intrinsic::smin: | ||||
4010 | ISD = ISD::SMIN; | ||||
4011 | break; | ||||
4012 | case Intrinsic::ssub_sat: | ||||
4013 | ISD = ISD::SSUBSAT; | ||||
4014 | break; | ||||
4015 | case Intrinsic::uadd_sat: | ||||
4016 | ISD = ISD::UADDSAT; | ||||
4017 | break; | ||||
4018 | case Intrinsic::umax: | ||||
4019 | ISD = ISD::UMAX; | ||||
4020 | break; | ||||
4021 | case Intrinsic::umin: | ||||
4022 | ISD = ISD::UMIN; | ||||
4023 | break; | ||||
4024 | case Intrinsic::usub_sat: | ||||
4025 | ISD = ISD::USUBSAT; | ||||
4026 | break; | ||||
4027 | case Intrinsic::sqrt: | ||||
4028 | ISD = ISD::FSQRT; | ||||
4029 | break; | ||||
4030 | case Intrinsic::sadd_with_overflow: | ||||
4031 | case Intrinsic::ssub_with_overflow: | ||||
4032 | // SSUBO has same costs so don't duplicate. | ||||
4033 | ISD = ISD::SADDO; | ||||
4034 | OpTy = RetTy->getContainedType(0); | ||||
4035 | break; | ||||
4036 | case Intrinsic::uadd_with_overflow: | ||||
4037 | case Intrinsic::usub_with_overflow: | ||||
4038 | // USUBO has same costs so don't duplicate. | ||||
4039 | ISD = ISD::UADDO; | ||||
4040 | OpTy = RetTy->getContainedType(0); | ||||
4041 | break; | ||||
4042 | case Intrinsic::umul_with_overflow: | ||||
4043 | case Intrinsic::smul_with_overflow: | ||||
4044 | // SMULO has same costs so don't duplicate. | ||||
4045 | ISD = ISD::UMULO; | ||||
4046 | OpTy = RetTy->getContainedType(0); | ||||
4047 | break; | ||||
4048 | } | ||||
4049 | |||||
4050 | if (ISD != ISD::DELETED_NODE) { | ||||
4051 | // Legalize the type. | ||||
4052 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(OpTy); | ||||
4053 | MVT MTy = LT.second; | ||||
4054 | |||||
4055 | // Attempt to lookup cost. | ||||
4056 | if (ISD == ISD::BITREVERSE && ST->hasGFNI() && ST->hasSSSE3() && | ||||
4057 | MTy.isVector()) { | ||||
4058 | // With PSHUFB the code is very similar for all types. If we have integer | ||||
4059 | // byte operations, we just need a GF2P8AFFINEQB for vXi8. For other types | ||||
4060 | // we also need a PSHUFB. | ||||
4061 | unsigned Cost = MTy.getVectorElementType() == MVT::i8 ? 1 : 2; | ||||
4062 | |||||
4063 | // Without byte operations, we need twice as many GF2P8AFFINEQB and PSHUFB | ||||
4064 | // instructions. We also need an extract and an insert. | ||||
4065 | if (!(MTy.is128BitVector() || (ST->hasAVX2() && MTy.is256BitVector()) || | ||||
4066 | (ST->hasBWI() && MTy.is512BitVector()))) | ||||
4067 | Cost = Cost * 2 + 2; | ||||
4068 | |||||
4069 | return LT.first * Cost; | ||||
4070 | } | ||||
4071 | |||||
4072 | // Without BMI/LZCNT see if we're only looking for a *_ZERO_UNDEF cost. | ||||
4073 | if (((ISD == ISD::CTTZ && !ST->hasBMI()) || | ||||
4074 | (ISD == ISD::CTLZ && !ST->hasLZCNT())) && | ||||
4075 | !MTy.isVector() && !ICA.isTypeBasedOnly()) { | ||||
4076 | const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); | ||||
4077 | if (auto *Cst = dyn_cast<ConstantInt>(Args[1])) | ||||
4078 | if (Cst->isAllOnesValue()) | ||||
4079 | ISD = ISD == ISD::CTTZ ? ISD::CTTZ_ZERO_UNDEF : ISD::CTLZ_ZERO_UNDEF; | ||||
4080 | } | ||||
4081 | |||||
4082 | // FSQRT is a single instruction. | ||||
4083 | if (ISD == ISD::FSQRT && CostKind == TTI::TCK_CodeSize) | ||||
4084 | return LT.first; | ||||
4085 | |||||
4086 | auto adjustTableCost = [](int ISD, unsigned Cost, | ||||
4087 | InstructionCost LegalizationCost, | ||||
4088 | FastMathFlags FMF) { | ||||
4089 | // If there are no NANs to deal with, then these are reduced to a | ||||
4090 | // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we | ||||
4091 | // assume is used in the non-fast case. | ||||
4092 | if (ISD == ISD::FMAXNUM || ISD == ISD::FMINNUM) { | ||||
4093 | if (FMF.noNaNs()) | ||||
4094 | return LegalizationCost * 1; | ||||
4095 | } | ||||
4096 | return LegalizationCost * (int)Cost; | ||||
4097 | }; | ||||
4098 | |||||
4099 | if (ST->useGLMDivSqrtCosts()) | ||||
4100 | if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy)) | ||||
4101 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
4102 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | ||||
4103 | ICA.getFlags()); | ||||
4104 | |||||
4105 | if (ST->useSLMArithCosts()) | ||||
4106 | if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy)) | ||||
4107 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
4108 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | ||||
4109 | ICA.getFlags()); | ||||
4110 | |||||
4111 | if (ST->hasVBMI2()) | ||||
4112 | if (const auto *Entry = CostTableLookup(AVX512VBMI2CostTbl, ISD, MTy)) | ||||
4113 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
4114 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | ||||
4115 | ICA.getFlags()); | ||||
4116 | |||||
4117 | if (ST->hasBITALG()) | ||||
4118 | if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy)) | ||||
4119 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
4120 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | ||||
4121 | ICA.getFlags()); | ||||
4122 | |||||
4123 | if (ST->hasVPOPCNTDQ()) | ||||
4124 | if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy)) | ||||
4125 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
4126 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | ||||
4127 | ICA.getFlags()); | ||||
4128 | |||||
4129 | if (ST->hasCDI()) | ||||
4130 | if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy)) | ||||
4131 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
4132 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | ||||
4133 | ICA.getFlags()); | ||||
4134 | |||||
4135 | if (ST->hasBWI()) | ||||
4136 | if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy)) | ||||
4137 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
4138 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | ||||
4139 | ICA.getFlags()); | ||||
4140 | |||||
4141 | if (ST->hasAVX512()) | ||||
4142 | if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) | ||||
4143 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
4144 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | ||||
4145 | ICA.getFlags()); | ||||
4146 | |||||
4147 | if (ST->hasXOP()) | ||||
4148 | if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy)) | ||||
4149 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
4150 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | ||||
4151 | ICA.getFlags()); | ||||
4152 | |||||
4153 | if (ST->hasAVX2()) | ||||
4154 | if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy)) | ||||
4155 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
4156 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | ||||
4157 | ICA.getFlags()); | ||||
4158 | |||||
4159 | if (ST->hasAVX()) | ||||
4160 | if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) | ||||
4161 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
4162 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | ||||
4163 | ICA.getFlags()); | ||||
4164 | |||||
4165 | if (ST->hasSSE42()) | ||||
4166 | if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy)) | ||||
4167 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
4168 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | ||||
4169 | ICA.getFlags()); | ||||
4170 | |||||
4171 | if (ST->hasSSE41()) | ||||
4172 | if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy)) | ||||
4173 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
4174 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | ||||
4175 | ICA.getFlags()); | ||||
4176 | |||||
4177 | if (ST->hasSSSE3()) | ||||
4178 | if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy)) | ||||
4179 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
4180 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | ||||
4181 | ICA.getFlags()); | ||||
4182 | |||||
4183 | if (ST->hasSSE2()) | ||||
4184 | if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy)) | ||||
4185 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
4186 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | ||||
4187 | ICA.getFlags()); | ||||
4188 | |||||
4189 | if (ST->hasSSE1()) | ||||
4190 | if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy)) | ||||
4191 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
4192 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | ||||
4193 | ICA.getFlags()); | ||||
4194 | |||||
4195 | if (ST->hasBMI()) { | ||||
4196 | if (ST->is64Bit()) | ||||
4197 | if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy)) | ||||
4198 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
4199 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | ||||
4200 | ICA.getFlags()); | ||||
4201 | |||||
4202 | if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy)) | ||||
4203 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
4204 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | ||||
4205 | ICA.getFlags()); | ||||
4206 | } | ||||
4207 | |||||
4208 | if (ST->hasLZCNT()) { | ||||
4209 | if (ST->is64Bit()) | ||||
4210 | if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy)) | ||||
4211 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
4212 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | ||||
4213 | ICA.getFlags()); | ||||
4214 | |||||
4215 | if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy)) | ||||
4216 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
4217 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | ||||
4218 | ICA.getFlags()); | ||||
4219 | } | ||||
4220 | |||||
4221 | if (ST->hasPOPCNT()) { | ||||
4222 | if (ST->is64Bit()) | ||||
4223 | if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy)) | ||||
4224 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
4225 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | ||||
4226 | ICA.getFlags()); | ||||
4227 | |||||
4228 | if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy)) | ||||
4229 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
4230 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | ||||
4231 | ICA.getFlags()); | ||||
4232 | } | ||||
4233 | |||||
4234 | if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) { | ||||
4235 | if (const Instruction *II = ICA.getInst()) { | ||||
4236 | if (II->hasOneUse() && isa<StoreInst>(II->user_back())) | ||||
4237 | return TTI::TCC_Free; | ||||
4238 | if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) { | ||||
4239 | if (LI->hasOneUse()) | ||||
4240 | return TTI::TCC_Free; | ||||
4241 | } | ||||
4242 | } | ||||
4243 | } | ||||
4244 | |||||
4245 | if (ST->is64Bit()) | ||||
4246 | if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy)) | ||||
4247 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
4248 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | ||||
4249 | ICA.getFlags()); | ||||
4250 | |||||
4251 | if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy)) | ||||
4252 | if (auto KindCost = Entry->Cost[CostKind]) | ||||
4253 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, ICA.getFlags()); | ||||
4254 | } | ||||
4255 | |||||
4256 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); | ||||
4257 | } | ||||
4258 | |||||
4259 | InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, | ||||
4260 | TTI::TargetCostKind CostKind, | ||||
4261 | unsigned Index, Value *Op0, | ||||
4262 | Value *Op1) { | ||||
4263 | static const CostTblEntry SLMCostTbl[] = { | ||||
4264 | { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 }, | ||||
4265 | { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 }, | ||||
4266 | { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 }, | ||||
4267 | { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 7 } | ||||
4268 | }; | ||||
4269 | |||||
4270 | assert(Val->isVectorTy() && "This must be a vector type")(static_cast <bool> (Val->isVectorTy() && "This must be a vector type" ) ? void (0) : __assert_fail ("Val->isVectorTy() && \"This must be a vector type\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4270, __extension__ __PRETTY_FUNCTION__)); | ||||
4271 | Type *ScalarType = Val->getScalarType(); | ||||
4272 | InstructionCost RegisterFileMoveCost = 0; | ||||
4273 | |||||
4274 | // Non-immediate extraction/insertion can be handled as a sequence of | ||||
4275 | // aliased loads+stores via the stack. | ||||
4276 | if (Index == -1U && (Opcode == Instruction::ExtractElement || | ||||
4277 | Opcode == Instruction::InsertElement)) { | ||||
4278 | // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns: | ||||
4279 | // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0. | ||||
4280 | |||||
4281 | // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling. | ||||
4282 | assert(isa<FixedVectorType>(Val) && "Fixed vector type expected")(static_cast <bool> (isa<FixedVectorType>(Val) && "Fixed vector type expected") ? void (0) : __assert_fail ("isa<FixedVectorType>(Val) && \"Fixed vector type expected\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4282, __extension__ __PRETTY_FUNCTION__)); | ||||
4283 | Align VecAlign = DL.getPrefTypeAlign(Val); | ||||
4284 | Align SclAlign = DL.getPrefTypeAlign(ScalarType); | ||||
4285 | |||||
4286 | // Extract - store vector to stack, load scalar. | ||||
4287 | if (Opcode == Instruction::ExtractElement) { | ||||
4288 | return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) + | ||||
4289 | getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0, | ||||
4290 | CostKind); | ||||
4291 | } | ||||
4292 | // Insert - store vector to stack, store scalar, load vector. | ||||
4293 | if (Opcode == Instruction::InsertElement) { | ||||
4294 | return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) + | ||||
4295 | getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0, | ||||
4296 | CostKind) + | ||||
4297 | getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind); | ||||
4298 | } | ||||
4299 | } | ||||
4300 | |||||
4301 | if (Index != -1U && (Opcode
| ||||
4302 | Opcode
| ||||
4303 | // Extraction of vXi1 elements are now efficiently handled by MOVMSK. | ||||
4304 | if (Opcode
| ||||
4305 | ScalarType->getScalarSizeInBits() == 1 && | ||||
4306 | cast<FixedVectorType>(Val)->getNumElements() > 1) | ||||
4307 | return 1; | ||||
4308 | |||||
4309 | // Legalize the type. | ||||
4310 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val); | ||||
4311 | |||||
4312 | // This type is legalized to a scalar type. | ||||
4313 | if (!LT.second.isVector()) | ||||
4314 | return 0; | ||||
4315 | |||||
4316 | // The type may be split. Normalize the index to the new type. | ||||
4317 | unsigned SizeInBits = LT.second.getSizeInBits(); | ||||
4318 | unsigned NumElts = LT.second.getVectorNumElements(); | ||||
4319 | unsigned SubNumElts = NumElts; | ||||
4320 | Index = Index % NumElts; | ||||
4321 | |||||
4322 | // For >128-bit vectors, we need to extract higher 128-bit subvectors. | ||||
4323 | // For inserts, we also need to insert the subvector back. | ||||
4324 | if (SizeInBits > 128) { | ||||
4325 | assert((SizeInBits % 128) == 0 && "Illegal vector")(static_cast <bool> ((SizeInBits % 128) == 0 && "Illegal vector") ? void (0) : __assert_fail ("(SizeInBits % 128) == 0 && \"Illegal vector\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4325, __extension__ __PRETTY_FUNCTION__)); | ||||
4326 | unsigned NumSubVecs = SizeInBits / 128; | ||||
4327 | SubNumElts = NumElts / NumSubVecs; | ||||
4328 | if (SubNumElts <= Index) { | ||||
4329 | RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1); | ||||
4330 | Index %= SubNumElts; | ||||
4331 | } | ||||
4332 | } | ||||
4333 | |||||
4334 | MVT MScalarTy = LT.second.getScalarType(); | ||||
4335 | auto IsCheapPInsrPExtrInsertPS = [&]() { | ||||
4336 | // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets. | ||||
4337 | // Also, assume insertps is relatively cheap on all >= SSE41 targets. | ||||
4338 | return (MScalarTy == MVT::i16 && ST->hasSSE2()) || | ||||
4339 | (MScalarTy.isInteger() && ST->hasSSE41()) || | ||||
4340 | (MScalarTy == MVT::f32 && ST->hasSSE41() && | ||||
4341 | Opcode == Instruction::InsertElement); | ||||
4342 | }; | ||||
4343 | |||||
4344 | if (Index
| ||||
4345 | // Floating point scalars are already located in index #0. | ||||
4346 | // Many insertions to #0 can fold away for scalar fp-ops, so let's assume | ||||
4347 | // true for all. | ||||
4348 | if (ScalarType->isFloatingPointTy()) | ||||
4349 | return RegisterFileMoveCost; | ||||
4350 | |||||
4351 | if (Opcode
| ||||
4352 | isa_and_nonnull<UndefValue>(Op0)) { | ||||
4353 | // Consider the gather cost to be cheap. | ||||
4354 | if (isa_and_nonnull<LoadInst>(Op1)) | ||||
4355 | return RegisterFileMoveCost; | ||||
4356 | if (!IsCheapPInsrPExtrInsertPS()) { | ||||
4357 | // mov constant-to-GPR + movd/movq GPR -> XMM. | ||||
4358 | if (isa_and_nonnull<Constant>(Op1) && Op1->getType()->isIntegerTy()) | ||||
| |||||
4359 | return 2 + RegisterFileMoveCost; | ||||
4360 | // Assume movd/movq GPR -> XMM is relatively cheap on all targets. | ||||
4361 | return 1 + RegisterFileMoveCost; | ||||
4362 | } | ||||
4363 | } | ||||
4364 | |||||
4365 | // Assume movd/movq XMM -> GPR is relatively cheap on all targets. | ||||
4366 | if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement) | ||||
4367 | return 1 + RegisterFileMoveCost; | ||||
4368 | } | ||||
4369 | |||||
4370 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | ||||
4371 | assert(ISD && "Unexpected vector opcode")(static_cast <bool> (ISD && "Unexpected vector opcode" ) ? void (0) : __assert_fail ("ISD && \"Unexpected vector opcode\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4371, __extension__ __PRETTY_FUNCTION__)); | ||||
4372 | if (ST->useSLMArithCosts()) | ||||
4373 | if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy)) | ||||
4374 | return Entry->Cost + RegisterFileMoveCost; | ||||
4375 | |||||
4376 | // Consider cheap cases. | ||||
4377 | if (IsCheapPInsrPExtrInsertPS()) | ||||
4378 | return 1 + RegisterFileMoveCost; | ||||
4379 | |||||
4380 | // For extractions we just need to shuffle the element to index 0, which | ||||
4381 | // should be very cheap (assume cost = 1). For insertions we need to shuffle | ||||
4382 | // the elements to its destination. In both cases we must handle the | ||||
4383 | // subvector move(s). | ||||
4384 | // If the vector type is already less than 128-bits then don't reduce it. | ||||
4385 | // TODO: Under what circumstances should we shuffle using the full width? | ||||
4386 | InstructionCost ShuffleCost = 1; | ||||
4387 | if (Opcode == Instruction::InsertElement) { | ||||
4388 | auto *SubTy = cast<VectorType>(Val); | ||||
4389 | EVT VT = TLI->getValueType(DL, Val); | ||||
4390 | if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128) | ||||
4391 | SubTy = FixedVectorType::get(ScalarType, SubNumElts); | ||||
4392 | ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, std::nullopt, | ||||
4393 | CostKind, 0, SubTy); | ||||
4394 | } | ||||
4395 | int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1; | ||||
4396 | return ShuffleCost + IntOrFpCost + RegisterFileMoveCost; | ||||
4397 | } | ||||
4398 | |||||
4399 | // Add to the base cost if we know that the extracted element of a vector is | ||||
4400 | // destined to be moved to and used in the integer register file. | ||||
4401 | if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy()) | ||||
4402 | RegisterFileMoveCost += 1; | ||||
4403 | |||||
4404 | return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1) + | ||||
4405 | RegisterFileMoveCost; | ||||
4406 | } | ||||
4407 | |||||
4408 | InstructionCost | ||||
4409 | X86TTIImpl::getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, | ||||
4410 | bool Insert, bool Extract, | ||||
4411 | TTI::TargetCostKind CostKind) { | ||||
4412 | assert(DemandedElts.getBitWidth() ==(static_cast <bool> (DemandedElts.getBitWidth() == cast <FixedVectorType>(Ty)->getNumElements() && "Vector size mismatch" ) ? void (0) : __assert_fail ("DemandedElts.getBitWidth() == cast<FixedVectorType>(Ty)->getNumElements() && \"Vector size mismatch\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4414, __extension__ __PRETTY_FUNCTION__)) | ||||
4413 | cast<FixedVectorType>(Ty)->getNumElements() &&(static_cast <bool> (DemandedElts.getBitWidth() == cast <FixedVectorType>(Ty)->getNumElements() && "Vector size mismatch" ) ? void (0) : __assert_fail ("DemandedElts.getBitWidth() == cast<FixedVectorType>(Ty)->getNumElements() && \"Vector size mismatch\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4414, __extension__ __PRETTY_FUNCTION__)) | ||||
4414 | "Vector size mismatch")(static_cast <bool> (DemandedElts.getBitWidth() == cast <FixedVectorType>(Ty)->getNumElements() && "Vector size mismatch" ) ? void (0) : __assert_fail ("DemandedElts.getBitWidth() == cast<FixedVectorType>(Ty)->getNumElements() && \"Vector size mismatch\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4414, __extension__ __PRETTY_FUNCTION__)); | ||||
4415 | |||||
4416 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); | ||||
4417 | MVT MScalarTy = LT.second.getScalarType(); | ||||
4418 | unsigned LegalVectorBitWidth = LT.second.getSizeInBits(); | ||||
4419 | InstructionCost Cost = 0; | ||||
4420 | |||||
4421 | constexpr unsigned LaneBitWidth = 128; | ||||
4422 | assert((LegalVectorBitWidth < LaneBitWidth ||(static_cast <bool> ((LegalVectorBitWidth < LaneBitWidth || (LegalVectorBitWidth % LaneBitWidth) == 0) && "Illegal vector" ) ? void (0) : __assert_fail ("(LegalVectorBitWidth < LaneBitWidth || (LegalVectorBitWidth % LaneBitWidth) == 0) && \"Illegal vector\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4424, __extension__ __PRETTY_FUNCTION__)) | ||||
4423 | (LegalVectorBitWidth % LaneBitWidth) == 0) &&(static_cast <bool> ((LegalVectorBitWidth < LaneBitWidth || (LegalVectorBitWidth % LaneBitWidth) == 0) && "Illegal vector" ) ? void (0) : __assert_fail ("(LegalVectorBitWidth < LaneBitWidth || (LegalVectorBitWidth % LaneBitWidth) == 0) && \"Illegal vector\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4424, __extension__ __PRETTY_FUNCTION__)) | ||||
4424 | "Illegal vector")(static_cast <bool> ((LegalVectorBitWidth < LaneBitWidth || (LegalVectorBitWidth % LaneBitWidth) == 0) && "Illegal vector" ) ? void (0) : __assert_fail ("(LegalVectorBitWidth < LaneBitWidth || (LegalVectorBitWidth % LaneBitWidth) == 0) && \"Illegal vector\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4424, __extension__ __PRETTY_FUNCTION__)); | ||||
4425 | |||||
4426 | const int NumLegalVectors = *LT.first.getValue(); | ||||
4427 | assert(NumLegalVectors >= 0 && "Negative cost!")(static_cast <bool> (NumLegalVectors >= 0 && "Negative cost!") ? void (0) : __assert_fail ("NumLegalVectors >= 0 && \"Negative cost!\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4427, __extension__ __PRETTY_FUNCTION__)); | ||||
4428 | |||||
4429 | // For insertions, a ISD::BUILD_VECTOR style vector initialization can be much | ||||
4430 | // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT. | ||||
4431 | if (Insert
| ||||
4432 | if ((MScalarTy == MVT::i16 && ST->hasSSE2()) || | ||||
4433 | (MScalarTy.isInteger() && ST->hasSSE41()) || | ||||
4434 | (MScalarTy == MVT::f32 && ST->hasSSE41())) { | ||||
4435 | // For types we can insert directly, insertion into 128-bit sub vectors is | ||||
4436 | // cheap, followed by a cheap chain of concatenations. | ||||
4437 | if (LegalVectorBitWidth <= LaneBitWidth) { | ||||
4438 | Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, | ||||
4439 | /*Extract*/ false, CostKind); | ||||
4440 | } else { | ||||
4441 | // In each 128-lane, if at least one index is demanded but not all | ||||
4442 | // indices are demanded and this 128-lane is not the first 128-lane of | ||||
4443 | // the legalized-vector, then this 128-lane needs a extracti128; If in | ||||
4444 | // each 128-lane, there is at least one demanded index, this 128-lane | ||||
4445 | // needs a inserti128. | ||||
4446 | |||||
4447 | // The following cases will help you build a better understanding: | ||||
4448 | // Assume we insert several elements into a v8i32 vector in avx2, | ||||
4449 | // Case#1: inserting into 1th index needs vpinsrd + inserti128. | ||||
4450 | // Case#2: inserting into 5th index needs extracti128 + vpinsrd + | ||||
4451 | // inserti128. | ||||
4452 | // Case#3: inserting into 4,5,6,7 index needs 4*vpinsrd + inserti128. | ||||
4453 | assert((LegalVectorBitWidth % LaneBitWidth) == 0 && "Illegal vector")(static_cast <bool> ((LegalVectorBitWidth % LaneBitWidth ) == 0 && "Illegal vector") ? void (0) : __assert_fail ("(LegalVectorBitWidth % LaneBitWidth) == 0 && \"Illegal vector\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4453, __extension__ __PRETTY_FUNCTION__)); | ||||
4454 | unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth; | ||||
4455 | unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors; | ||||
4456 | unsigned NumLegalElts = | ||||
4457 | LT.second.getVectorNumElements() * NumLegalVectors; | ||||
4458 | assert(NumLegalElts >= DemandedElts.getBitWidth() &&(static_cast <bool> (NumLegalElts >= DemandedElts.getBitWidth () && "Vector has been legalized to smaller element count" ) ? void (0) : __assert_fail ("NumLegalElts >= DemandedElts.getBitWidth() && \"Vector has been legalized to smaller element count\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4459, __extension__ __PRETTY_FUNCTION__)) | ||||
4459 | "Vector has been legalized to smaller element count")(static_cast <bool> (NumLegalElts >= DemandedElts.getBitWidth () && "Vector has been legalized to smaller element count" ) ? void (0) : __assert_fail ("NumLegalElts >= DemandedElts.getBitWidth() && \"Vector has been legalized to smaller element count\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4459, __extension__ __PRETTY_FUNCTION__)); | ||||
4460 | assert((NumLegalElts % NumLanesTotal) == 0 &&(static_cast <bool> ((NumLegalElts % NumLanesTotal) == 0 && "Unexpected elts per lane") ? void (0) : __assert_fail ("(NumLegalElts % NumLanesTotal) == 0 && \"Unexpected elts per lane\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4461, __extension__ __PRETTY_FUNCTION__)) | ||||
4461 | "Unexpected elts per lane")(static_cast <bool> ((NumLegalElts % NumLanesTotal) == 0 && "Unexpected elts per lane") ? void (0) : __assert_fail ("(NumLegalElts % NumLanesTotal) == 0 && \"Unexpected elts per lane\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4461, __extension__ __PRETTY_FUNCTION__)); | ||||
4462 | unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal; | ||||
4463 | |||||
4464 | APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts); | ||||
4465 | auto *LaneTy = | ||||
4466 | FixedVectorType::get(Ty->getElementType(), NumEltsPerLane); | ||||
4467 | |||||
4468 | for (unsigned I = 0; I != NumLanesTotal; ++I) { | ||||
4469 | APInt LaneEltMask = WidenedDemandedElts.extractBits( | ||||
4470 | NumEltsPerLane, NumEltsPerLane * I); | ||||
4471 | if (LaneEltMask.isNullValue()) | ||||
4472 | continue; | ||||
4473 | // FIXME: we don't need to extract if all non-demanded elements | ||||
4474 | // are legalization-inserted padding. | ||||
4475 | if (!LaneEltMask.isAllOnes()) | ||||
4476 | Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt, | ||||
4477 | CostKind, I * NumEltsPerLane, LaneTy); | ||||
4478 | Cost += BaseT::getScalarizationOverhead(LaneTy, LaneEltMask, Insert, | ||||
4479 | /*Extract*/ false, CostKind); | ||||
4480 | } | ||||
4481 | |||||
4482 | APInt AffectedLanes = | ||||
4483 | APIntOps::ScaleBitMask(WidenedDemandedElts, NumLanesTotal); | ||||
4484 | APInt FullyAffectedLegalVectors = APIntOps::ScaleBitMask( | ||||
4485 | AffectedLanes, NumLegalVectors, /*MatchAllBits=*/true); | ||||
4486 | for (int LegalVec = 0; LegalVec != NumLegalVectors; ++LegalVec) { | ||||
4487 | for (unsigned Lane = 0; Lane != NumLegalLanes; ++Lane) { | ||||
4488 | unsigned I = NumLegalLanes * LegalVec + Lane; | ||||
4489 | // No need to insert unaffected lane; or lane 0 of each legal vector | ||||
4490 | // iff ALL lanes of that vector were affected and will be inserted. | ||||
4491 | if (!AffectedLanes[I] || | ||||
4492 | (Lane == 0 && FullyAffectedLegalVectors[LegalVec])) | ||||
4493 | continue; | ||||
4494 | Cost += getShuffleCost(TTI::SK_InsertSubvector, Ty, std::nullopt, | ||||
4495 | CostKind, I * NumEltsPerLane, LaneTy); | ||||
4496 | } | ||||
4497 | } | ||||
4498 | } | ||||
4499 | } else if (LT.second.isVector()) { | ||||
4500 | // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded | ||||
4501 | // integer element as a SCALAR_TO_VECTOR, then we build the vector as a | ||||
4502 | // series of UNPCK followed by CONCAT_VECTORS - all of these can be | ||||
4503 | // considered cheap. | ||||
4504 | if (Ty->isIntOrIntVectorTy()) | ||||
4505 | Cost += DemandedElts.countPopulation(); | ||||
4506 | |||||
4507 | // Get the smaller of the legalized or original pow2-extended number of | ||||
4508 | // vector elements, which represents the number of unpacks we'll end up | ||||
4509 | // performing. | ||||
4510 | unsigned NumElts = LT.second.getVectorNumElements(); | ||||
4511 | unsigned Pow2Elts = | ||||
4512 | PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements()); | ||||
4513 | Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first; | ||||
4514 | } | ||||
4515 | } | ||||
4516 | |||||
4517 | if (Extract) { | ||||
4518 | // vXi1 can be efficiently extracted with MOVMSK. | ||||
4519 | // TODO: AVX512 predicate mask handling. | ||||
4520 | // NOTE: This doesn't work well for roundtrip scalarization. | ||||
4521 | if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) { | ||||
4522 | unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements(); | ||||
4523 | unsigned MaxElts = ST->hasAVX2() ? 32 : 16; | ||||
4524 | unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts; | ||||
4525 | return MOVMSKCost; | ||||
4526 | } | ||||
4527 | |||||
4528 | if (LT.second.isVector()) { | ||||
4529 | unsigned NumLegalElts = | ||||
4530 | LT.second.getVectorNumElements() * NumLegalVectors; | ||||
4531 | assert(NumLegalElts >= DemandedElts.getBitWidth() &&(static_cast <bool> (NumLegalElts >= DemandedElts.getBitWidth () && "Vector has been legalized to smaller element count" ) ? void (0) : __assert_fail ("NumLegalElts >= DemandedElts.getBitWidth() && \"Vector has been legalized to smaller element count\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4532, __extension__ __PRETTY_FUNCTION__)) | ||||
4532 | "Vector has been legalized to smaller element count")(static_cast <bool> (NumLegalElts >= DemandedElts.getBitWidth () && "Vector has been legalized to smaller element count" ) ? void (0) : __assert_fail ("NumLegalElts >= DemandedElts.getBitWidth() && \"Vector has been legalized to smaller element count\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4532, __extension__ __PRETTY_FUNCTION__)); | ||||
4533 | |||||
4534 | // If we're extracting elements from a 128-bit subvector lane, | ||||
4535 | // we only need to extract each lane once, not for every element. | ||||
4536 | if (LegalVectorBitWidth > LaneBitWidth) { | ||||
4537 | unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth; | ||||
4538 | unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors; | ||||
4539 | assert((NumLegalElts % NumLanesTotal) == 0 &&(static_cast <bool> ((NumLegalElts % NumLanesTotal) == 0 && "Unexpected elts per lane") ? void (0) : __assert_fail ("(NumLegalElts % NumLanesTotal) == 0 && \"Unexpected elts per lane\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4540, __extension__ __PRETTY_FUNCTION__)) | ||||
4540 | "Unexpected elts per lane")(static_cast <bool> ((NumLegalElts % NumLanesTotal) == 0 && "Unexpected elts per lane") ? void (0) : __assert_fail ("(NumLegalElts % NumLanesTotal) == 0 && \"Unexpected elts per lane\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4540, __extension__ __PRETTY_FUNCTION__)); | ||||
4541 | unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal; | ||||
4542 | |||||
4543 | // Add cost for each demanded 128-bit subvector extraction. | ||||
4544 | // Luckily this is a lot easier than for insertion. | ||||
4545 | APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts); | ||||
4546 | auto *LaneTy = | ||||
4547 | FixedVectorType::get(Ty->getElementType(), NumEltsPerLane); | ||||
4548 | |||||
4549 | for (unsigned I = 0; I != NumLanesTotal; ++I) { | ||||
4550 | APInt LaneEltMask = WidenedDemandedElts.extractBits( | ||||
4551 | NumEltsPerLane, I * NumEltsPerLane); | ||||
4552 | if (LaneEltMask.isNullValue()) | ||||
4553 | continue; | ||||
4554 | Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt, | ||||
4555 | CostKind, I * NumEltsPerLane, LaneTy); | ||||
4556 | Cost += BaseT::getScalarizationOverhead( | ||||
4557 | LaneTy, LaneEltMask, /*Insert*/ false, Extract, CostKind); | ||||
4558 | } | ||||
4559 | |||||
4560 | return Cost; | ||||
4561 | } | ||||
4562 | } | ||||
4563 | |||||
4564 | // Fallback to default extraction. | ||||
4565 | Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ false, | ||||
4566 | Extract, CostKind); | ||||
4567 | } | ||||
4568 | |||||
4569 | return Cost; | ||||
4570 | } | ||||
4571 | |||||
4572 | InstructionCost | ||||
4573 | X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, | ||||
4574 | int VF, const APInt &DemandedDstElts, | ||||
4575 | TTI::TargetCostKind CostKind) { | ||||
4576 | const unsigned EltTyBits = DL.getTypeSizeInBits(EltTy); | ||||
4577 | // We don't differentiate element types here, only element bit width. | ||||
4578 | EltTy = IntegerType::getIntNTy(EltTy->getContext(), EltTyBits); | ||||
4579 | |||||
4580 | auto bailout = [&]() { | ||||
4581 | return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF, | ||||
4582 | DemandedDstElts, CostKind); | ||||
4583 | }; | ||||
4584 | |||||
4585 | // For now, only deal with AVX512 cases. | ||||
4586 | if (!ST->hasAVX512()) | ||||
4587 | return bailout(); | ||||
4588 | |||||
4589 | // Do we have a native shuffle for this element type, or should we promote? | ||||
4590 | unsigned PromEltTyBits = EltTyBits; | ||||
4591 | switch (EltTyBits) { | ||||
4592 | case 32: | ||||
4593 | case 64: | ||||
4594 | break; // AVX512F. | ||||
4595 | case 16: | ||||
4596 | if (!ST->hasBWI()) | ||||
4597 | PromEltTyBits = 32; // promote to i32, AVX512F. | ||||
4598 | break; // AVX512BW | ||||
4599 | case 8: | ||||
4600 | if (!ST->hasVBMI()) | ||||
4601 | PromEltTyBits = 32; // promote to i32, AVX512F. | ||||
4602 | break; // AVX512VBMI | ||||
4603 | case 1: | ||||
4604 | // There is no support for shuffling i1 elements. We *must* promote. | ||||
4605 | if (ST->hasBWI()) { | ||||
4606 | if (ST->hasVBMI()) | ||||
4607 | PromEltTyBits = 8; // promote to i8, AVX512VBMI. | ||||
4608 | else | ||||
4609 | PromEltTyBits = 16; // promote to i16, AVX512BW. | ||||
4610 | break; | ||||
4611 | } | ||||
4612 | PromEltTyBits = 32; // promote to i32, AVX512F. | ||||
4613 | break; | ||||
4614 | default: | ||||
4615 | return bailout(); | ||||
4616 | } | ||||
4617 | auto *PromEltTy = IntegerType::getIntNTy(EltTy->getContext(), PromEltTyBits); | ||||
4618 | |||||
4619 | auto *SrcVecTy = FixedVectorType::get(EltTy, VF); | ||||
4620 | auto *PromSrcVecTy = FixedVectorType::get(PromEltTy, VF); | ||||
4621 | |||||
4622 | int NumDstElements = VF * ReplicationFactor; | ||||
4623 | auto *PromDstVecTy = FixedVectorType::get(PromEltTy, NumDstElements); | ||||
4624 | auto *DstVecTy = FixedVectorType::get(EltTy, NumDstElements); | ||||
4625 | |||||
4626 | // Legalize the types. | ||||
4627 | MVT LegalSrcVecTy = getTypeLegalizationCost(SrcVecTy).second; | ||||
4628 | MVT LegalPromSrcVecTy = getTypeLegalizationCost(PromSrcVecTy).second; | ||||
4629 | MVT LegalPromDstVecTy = getTypeLegalizationCost(PromDstVecTy).second; | ||||
4630 | MVT LegalDstVecTy = getTypeLegalizationCost(DstVecTy).second; | ||||
4631 | // They should have legalized into vector types. | ||||
4632 | if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() || | ||||
4633 | !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector()) | ||||
4634 | return bailout(); | ||||
4635 | |||||
4636 | if (PromEltTyBits != EltTyBits) { | ||||
4637 | // If we have to perform the shuffle with wider elt type than our data type, | ||||
4638 | // then we will first need to anyext (we don't care about the new bits) | ||||
4639 | // the source elements, and then truncate Dst elements. | ||||
4640 | InstructionCost PromotionCost; | ||||
4641 | PromotionCost += getCastInstrCost( | ||||
4642 | Instruction::SExt, /*Dst=*/PromSrcVecTy, /*Src=*/SrcVecTy, | ||||
4643 | TargetTransformInfo::CastContextHint::None, CostKind); | ||||
4644 | PromotionCost += | ||||
4645 | getCastInstrCost(Instruction::Trunc, /*Dst=*/DstVecTy, | ||||
4646 | /*Src=*/PromDstVecTy, | ||||
4647 | TargetTransformInfo::CastContextHint::None, CostKind); | ||||
4648 | return PromotionCost + getReplicationShuffleCost(PromEltTy, | ||||
4649 | ReplicationFactor, VF, | ||||
4650 | DemandedDstElts, CostKind); | ||||
4651 | } | ||||
4652 | |||||
4653 | assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits &&(static_cast <bool> (LegalSrcVecTy.getScalarSizeInBits( ) == EltTyBits && LegalSrcVecTy.getScalarType() == LegalDstVecTy .getScalarType() && "We expect that the legalization doesn't affect the element width, " "doesn't coalesce/split elements.") ? void (0) : __assert_fail ("LegalSrcVecTy.getScalarSizeInBits() == EltTyBits && LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() && \"We expect that the legalization doesn't affect the element width, \" \"doesn't coalesce/split elements.\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4656, __extension__ __PRETTY_FUNCTION__)) | ||||
4654 | LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() &&(static_cast <bool> (LegalSrcVecTy.getScalarSizeInBits( ) == EltTyBits && LegalSrcVecTy.getScalarType() == LegalDstVecTy .getScalarType() && "We expect that the legalization doesn't affect the element width, " "doesn't coalesce/split elements.") ? void (0) : __assert_fail ("LegalSrcVecTy.getScalarSizeInBits() == EltTyBits && LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() && \"We expect that the legalization doesn't affect the element width, \" \"doesn't coalesce/split elements.\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4656, __extension__ __PRETTY_FUNCTION__)) | ||||
4655 | "We expect that the legalization doesn't affect the element width, "(static_cast <bool> (LegalSrcVecTy.getScalarSizeInBits( ) == EltTyBits && LegalSrcVecTy.getScalarType() == LegalDstVecTy .getScalarType() && "We expect that the legalization doesn't affect the element width, " "doesn't coalesce/split elements.") ? void (0) : __assert_fail ("LegalSrcVecTy.getScalarSizeInBits() == EltTyBits && LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() && \"We expect that the legalization doesn't affect the element width, \" \"doesn't coalesce/split elements.\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4656, __extension__ __PRETTY_FUNCTION__)) | ||||
4656 | "doesn't coalesce/split elements.")(static_cast <bool> (LegalSrcVecTy.getScalarSizeInBits( ) == EltTyBits && LegalSrcVecTy.getScalarType() == LegalDstVecTy .getScalarType() && "We expect that the legalization doesn't affect the element width, " "doesn't coalesce/split elements.") ? void (0) : __assert_fail ("LegalSrcVecTy.getScalarSizeInBits() == EltTyBits && LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() && \"We expect that the legalization doesn't affect the element width, \" \"doesn't coalesce/split elements.\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4656, __extension__ __PRETTY_FUNCTION__)); | ||||
4657 | |||||
4658 | unsigned NumEltsPerDstVec = LegalDstVecTy.getVectorNumElements(); | ||||
4659 | unsigned NumDstVectors = | ||||
4660 | divideCeil(DstVecTy->getNumElements(), NumEltsPerDstVec); | ||||
4661 | |||||
4662 | auto *SingleDstVecTy = FixedVectorType::get(EltTy, NumEltsPerDstVec); | ||||
4663 | |||||
4664 | // Not all the produced Dst elements may be demanded. In our case, | ||||
4665 | // given that a single Dst vector is formed by a single shuffle, | ||||
4666 | // if all elements that will form a single Dst vector aren't demanded, | ||||
4667 | // then we won't need to do that shuffle, so adjust the cost accordingly. | ||||
4668 | APInt DemandedDstVectors = APIntOps::ScaleBitMask( | ||||
4669 | DemandedDstElts.zext(NumDstVectors * NumEltsPerDstVec), NumDstVectors); | ||||
4670 | unsigned NumDstVectorsDemanded = DemandedDstVectors.countPopulation(); | ||||
4671 | |||||
4672 | InstructionCost SingleShuffleCost = getShuffleCost( | ||||
4673 | TTI::SK_PermuteSingleSrc, SingleDstVecTy, /*Mask=*/std::nullopt, CostKind, | ||||
4674 | /*Index=*/0, /*SubTp=*/nullptr); | ||||
4675 | return NumDstVectorsDemanded * SingleShuffleCost; | ||||
4676 | } | ||||
4677 | |||||
InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                            MaybeAlign Alignment,
                                            unsigned AddressSpace,
                                            TTI::TargetCostKind CostKind,
                                            TTI::OperandValueInfo OpInfo,
                                            const Instruction *I) {
  // Cost of a plain (unmasked) load/store of type Src. For vectors that
  // legalize to multiple or partial registers, models the split into
  // progressively halved operation widths plus the subvector insert/extract
  // traffic that split implies.
  // TODO: Handle other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput) {
    if (auto *SI = dyn_cast_or_null<StoreInst>(I)) {
      // Store instruction with index and scale costs 2 Uops.
      // Check the preceding GEP to identify non-const indices.
      if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) {
        if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
          return TTI::TCC_Basic * 2;
      }
    }
    return TTI::TCC_Basic;
  }

  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
         "Invalid Opcode");
  // Type legalization can't handle structs
  if (TLI->getValueType(DL, Src, true) == MVT::Other)
    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                  CostKind);

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);

  auto *VTy = dyn_cast<FixedVectorType>(Src);

  InstructionCost Cost = 0;

  // Add a cost for constant load to vector.
  if (Opcode == Instruction::Store && OpInfo.isConstant())
    Cost += getMemoryOpCost(Instruction::Load, Src, DL.getABITypeAlign(Src),
                            /*AddressSpace=*/0, CostKind);

  // Handle the simple case of non-vectors.
  // NOTE: this assumes that legalization never creates vector from scalars!
  if (!VTy || !LT.second.isVector()) {
    // Each load/store unit costs 1.
    // NOTE(review): the constant-load surcharge above is only kept for
    // floating-point scalars here — presumably because integer constant
    // stores use an immediate; confirm against the cost tables.
    return (LT.second.isFloatingPoint() ? Cost : 0) + LT.first * 1;
  }

  bool IsLoad = Opcode == Instruction::Load;

  Type *EltTy = VTy->getElementType();

  const int EltTyBits = DL.getTypeSizeInBits(EltTy);

  // Source of truth: how many elements were there in the original IR vector?
  const unsigned SrcNumElt = VTy->getNumElements();

  // How far have we gotten?
  int NumEltRemaining = SrcNumElt;
  // Note that we intentionally capture by-reference, NumEltRemaining changes.
  auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; };

  const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8);

  // Note that even if we can store 64 bits of an XMM, we still operate on XMM.
  const unsigned XMMBits = 128;
  if (XMMBits % EltTyBits != 0)
    // Vector size must be a multiple of the element size. I.e. no padding.
    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                  CostKind);
  const int NumEltPerXMM = XMMBits / EltTyBits;

  auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM);

  // Outer loop: walk down through op widths (full legal width, then repeatedly
  // halved); inner loop: issue as many ops of the current width as fit.
  for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0;
       NumEltRemaining > 0; CurrOpSizeBytes /= 2) {
    // How many elements would a single op deal with at once?
    if ((8 * CurrOpSizeBytes) % EltTyBits != 0)
      // Vector size must be a multiple of the element size. I.e. no padding.
      return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                    CostKind);
    int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits;

    assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?");
    assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||
            (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&
           "Unless we haven't halved the op size yet, "
           "we have less than two op's sized units of work left.");

    // Sub-XMM ops still act on a whole XMM register.
    auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM
                          ? FixedVectorType::get(EltTy, CurrNumEltPerOp)
                          : XMMVecTy;

    assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&
           "After halving sizes, the vector elt count is no longer a multiple "
           "of number of elements per operation?");
    // View the register as a vector of op-sized chunks, so a sub-XMM memop is
    // a single (coalesced) element insert/extract.
    auto *CoalescedVecTy =
        CurrNumEltPerOp == 1
            ? CurrVecTy
            : FixedVectorType::get(
                  IntegerType::get(Src->getContext(),
                                   EltTyBits * CurrNumEltPerOp),
                  CurrVecTy->getNumElements() / CurrNumEltPerOp);
    assert(DL.getTypeSizeInBits(CoalescedVecTy) ==
               DL.getTypeSizeInBits(CurrVecTy) &&
           "coalesciing elements doesn't change vector width.");

    while (NumEltRemaining > 0) {
      assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumtion?");

      // Can we use this vector size, as per the remaining element count?
      // Iff the vector is naturally aligned, we can do a wide load regardless.
      if (NumEltRemaining < CurrNumEltPerOp &&
          (!IsLoad || Alignment.valueOrOne() < CurrOpSizeBytes) &&
          CurrOpSizeBytes != 1)
        break; // Try smaller vector size.

      bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;

      // If we have fully processed the previous reg, we need to replenish it.
      if (SubVecEltsLeft == 0) {
        SubVecEltsLeft += CurrVecTy->getNumElements();
        // And that's free only for the 0'th subvector of a legalized vector.
        if (!Is0thSubVec)
          Cost += getShuffleCost(IsLoad ? TTI::ShuffleKind::SK_InsertSubvector
                                        : TTI::ShuffleKind::SK_ExtractSubvector,
                                 VTy, std::nullopt, CostKind, NumEltDone(),
                                 CurrVecTy);
      }

      // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
      // for smaller widths (32/16/8) we have to insert/extract them separately.
      // Again, it's free for the 0'th subreg (if op is 32/64 bit wide,
      // but let's pretend that it is also true for 16/8 bit wide ops...)
      if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) {
        int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM;
        assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "");
        int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp;
        APInt DemandedElts =
            APInt::getBitsSet(CoalescedVecTy->getNumElements(),
                              CoalescedVecEltIdx, CoalescedVecEltIdx + 1);
        assert(DemandedElts.countPopulation() == 1 && "Inserting single value");
        Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad,
                                         !IsLoad, CostKind);
      }

      // This isn't exactly right. We're using slow unaligned 32-byte accesses
      // as a proxy for a double-pumped AVX memory interface such as on
      // Sandybridge.
      if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
        Cost += 2;
      else
        Cost += 1;

      SubVecEltsLeft -= CurrNumEltPerOp;
      NumEltRemaining -= CurrNumEltPerOp;
      // Later chunks are at least CurrOpSizeBytes-aligned relative to start.
      Alignment = commonAlignment(Alignment.valueOrOne(), CurrOpSizeBytes);
    }
  }

  assert(NumEltRemaining <= 0 && "Should have processed all the elements.");

  return Cost;
}
4839 | |||||
4840 | InstructionCost | ||||
4841 | X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment, | ||||
4842 | unsigned AddressSpace, | ||||
4843 | TTI::TargetCostKind CostKind) { | ||||
4844 | bool IsLoad = (Instruction::Load == Opcode); | ||||
4845 | bool IsStore = (Instruction::Store == Opcode); | ||||
4846 | |||||
4847 | auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy); | ||||
4848 | if (!SrcVTy) | ||||
4849 | // To calculate scalar take the regular cost, without mask | ||||
4850 | return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind); | ||||
4851 | |||||
4852 | unsigned NumElem = SrcVTy->getNumElements(); | ||||
4853 | auto *MaskTy = | ||||
4854 | FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem); | ||||
4855 | if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) || | ||||
4856 | (IsStore && !isLegalMaskedStore(SrcVTy, Alignment))) { | ||||
4857 | // Scalarization | ||||
4858 | APInt DemandedElts = APInt::getAllOnes(NumElem); | ||||
4859 | InstructionCost MaskSplitCost = getScalarizationOverhead( | ||||
4860 | MaskTy, DemandedElts, /*Insert*/ false, /*Extract*/ true, CostKind); | ||||
4861 | InstructionCost ScalarCompareCost = getCmpSelInstrCost( | ||||
4862 | Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr, | ||||
4863 | CmpInst::BAD_ICMP_PREDICATE, CostKind); | ||||
4864 | InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind); | ||||
4865 | InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost); | ||||
4866 | InstructionCost ValueSplitCost = getScalarizationOverhead( | ||||
4867 | SrcVTy, DemandedElts, IsLoad, IsStore, CostKind); | ||||
4868 | InstructionCost MemopCost = | ||||
4869 | NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(), | ||||
4870 | Alignment, AddressSpace, CostKind); | ||||
4871 | return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost; | ||||
4872 | } | ||||
4873 | |||||
4874 | // Legalize the type. | ||||
4875 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcVTy); | ||||
4876 | auto VT = TLI->getValueType(DL, SrcVTy); | ||||
4877 | InstructionCost Cost = 0; | ||||
4878 | if (VT.isSimple() && LT.second != VT.getSimpleVT() && | ||||
4879 | LT.second.getVectorNumElements() == NumElem) | ||||
4880 | // Promotion requires extend/truncate for data and a shuffle for mask. | ||||
4881 | Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, std::nullopt, | ||||
4882 | CostKind, 0, nullptr) + | ||||
4883 | getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, std::nullopt, | ||||
4884 | CostKind, 0, nullptr); | ||||
4885 | |||||
4886 | else if (LT.first * LT.second.getVectorNumElements() > NumElem) { | ||||
4887 | auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(), | ||||
4888 | LT.second.getVectorNumElements()); | ||||
4889 | // Expanding requires fill mask with zeroes | ||||
4890 | Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, std::nullopt, | ||||
4891 | CostKind, 0, MaskTy); | ||||
4892 | } | ||||
4893 | |||||
4894 | // Pre-AVX512 - each maskmov load costs 2 + store costs ~8. | ||||
4895 | if (!ST->hasAVX512()) | ||||
4896 | return Cost + LT.first * (IsLoad ? 2 : 8); | ||||
4897 | |||||
4898 | // AVX-512 masked load/store is cheaper | ||||
4899 | return Cost + LT.first; | ||||
4900 | } | ||||
4901 | |||||
4902 | InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty, | ||||
4903 | ScalarEvolution *SE, | ||||
4904 | const SCEV *Ptr) { | ||||
4905 | // Address computations in vectorized code with non-consecutive addresses will | ||||
4906 | // likely result in more instructions compared to scalar code where the | ||||
4907 | // computation can more often be merged into the index mode. The resulting | ||||
4908 | // extra micro-ops can significantly decrease throughput. | ||||
4909 | const unsigned NumVectorInstToHideOverhead = 10; | ||||
4910 | |||||
4911 | // Cost modeling of Strided Access Computation is hidden by the indexing | ||||
4912 | // modes of X86 regardless of the stride value. We dont believe that there | ||||
4913 | // is a difference between constant strided access in gerenal and constant | ||||
4914 | // strided value which is less than or equal to 64. | ||||
4915 | // Even in the case of (loop invariant) stride whose value is not known at | ||||
4916 | // compile time, the address computation will not incur more than one extra | ||||
4917 | // ADD instruction. | ||||
4918 | if (Ty->isVectorTy() && SE && !ST->hasAVX2()) { | ||||
4919 | // TODO: AVX2 is the current cut-off because we don't have correct | ||||
4920 | // interleaving costs for prior ISA's. | ||||
4921 | if (!BaseT::isStridedAccess(Ptr)) | ||||
4922 | return NumVectorInstToHideOverhead; | ||||
4923 | if (!BaseT::getConstantStrideStep(SE, Ptr)) | ||||
4924 | return 1; | ||||
4925 | } | ||||
4926 | |||||
4927 | return BaseT::getAddressComputationCost(Ty, SE, Ptr); | ||||
4928 | } | ||||
4929 | |||||
/// Cost of a horizontal (unordered) arithmetic reduction of \p ValTy using
/// \p Opcode. Ordered FP reductions are punted to the base implementation.
/// Costs come from per-ISA tables (measured with IACA) when available,
/// otherwise from an explicit extract/shuffle/op reduction tree.
InstructionCost
X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
                                       std::optional<FastMathFlags> FMF,
                                       TTI::TargetCostKind CostKind) {
  // In-order FP reductions cannot use the pairwise/tree lowering modeled
  // below; let the generic (scalarized) model handle them.
  if (TTI::requiresOrderedReduction(FMF))
    return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);

  // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
  // and make it as the cost.

  static const CostTblEntry SLMCostTblNoPairWise[] = {
    { ISD::FADD, MVT::v2f64, 3 },
    { ISD::ADD, MVT::v2i64, 5 },
  };

  static const CostTblEntry SSE2CostTblNoPairWise[] = {
    { ISD::FADD, MVT::v2f64, 2 },
    { ISD::FADD, MVT::v2f32, 2 },
    { ISD::FADD, MVT::v4f32, 4 },
    { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
    { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
    { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
    { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3".
    { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3".
    { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
    { ISD::ADD, MVT::v2i8, 2 },
    { ISD::ADD, MVT::v4i8, 2 },
    { ISD::ADD, MVT::v8i8, 2 },
    { ISD::ADD, MVT::v16i8, 3 },
  };

  static const CostTblEntry AVX1CostTblNoPairWise[] = {
    { ISD::FADD, MVT::v4f64, 3 },
    { ISD::FADD, MVT::v4f32, 3 },
    { ISD::FADD, MVT::v8f32, 4 },
    { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
    { ISD::ADD, MVT::v4i64, 3 },
    { ISD::ADD, MVT::v8i32, 5 },
    { ISD::ADD, MVT::v16i16, 5 },
    { ISD::ADD, MVT::v32i8, 4 },
  };

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // Before legalizing the type, give a chance to look up illegal narrow types
  // in the table.
  // FIXME: Is there a better way to do this?
  EVT VT = TLI->getValueType(DL, ValTy);
  if (VT.isSimple()) {
    MVT MTy = VT.getSimpleVT();
    // Lookup order = most specific subtarget first (SLM overrides generic).
    if (ST->useSLMArithCosts())
      if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy))
        return Entry->Cost;

    if (ST->hasAVX())
      if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
        return Entry->Cost;

    if (ST->hasSSE2())
      if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
        return Entry->Cost;
  }

  // Legalize the type; LT.first = split factor, LT.second = legal MVT.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);

  MVT MTy = LT.second;

  auto *ValVTy = cast<FixedVectorType>(ValTy);

  // Special case: vXi8 mul reductions are performed as vXi16.
  if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) {
    auto *WideSclTy = IntegerType::get(ValVTy->getContext(), 16);
    auto *WideVecTy = FixedVectorType::get(WideSclTy, ValVTy->getNumElements());
    // Cost = widening zext + recursive reduction on the i16 vector.
    return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy,
                            TargetTransformInfo::CastContextHint::None,
                            CostKind) +
           getArithmeticReductionCost(Opcode, WideVecTy, FMF, CostKind);
  }

  InstructionCost ArithmeticCost = 0;
  if (LT.first != 1 && MTy.isVector() &&
      MTy.getVectorNumElements() < ValVTy->getNumElements()) {
    // Type needs to be split. We need LT.first - 1 arithmetic ops.
    auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
                                            MTy.getVectorNumElements());
    ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
    ArithmeticCost *= LT.first - 1;
  }

  // Re-try the tables with the legalized type, adding the split cost on top.
  if (ST->useSLMArithCosts())
    if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy))
      return ArithmeticCost + Entry->Cost;

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
      return ArithmeticCost + Entry->Cost;

  if (ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
      return ArithmeticCost + Entry->Cost;

  // FIXME: These assume a naive kshift+binop lowering, which is probably
  // conservative in most cases.
  static const CostTblEntry AVX512BoolReduction[] = {
    { ISD::AND, MVT::v2i1, 3 },
    { ISD::AND, MVT::v4i1, 5 },
    { ISD::AND, MVT::v8i1, 7 },
    { ISD::AND, MVT::v16i1, 9 },
    { ISD::AND, MVT::v32i1, 11 },
    { ISD::AND, MVT::v64i1, 13 },
    { ISD::OR, MVT::v2i1, 3 },
    { ISD::OR, MVT::v4i1, 5 },
    { ISD::OR, MVT::v8i1, 7 },
    { ISD::OR, MVT::v16i1, 9 },
    { ISD::OR, MVT::v32i1, 11 },
    { ISD::OR, MVT::v64i1, 13 },
  };

  static const CostTblEntry AVX2BoolReduction[] = {
    { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp
    { ISD::AND, MVT::v32i8, 2 },  // vpmovmskb + cmp
    { ISD::OR, MVT::v16i16, 2 },  // vpmovmskb + cmp
    { ISD::OR, MVT::v32i8, 2 },   // vpmovmskb + cmp
  };

  static const CostTblEntry AVX1BoolReduction[] = {
    { ISD::AND, MVT::v4i64, 2 },  // vmovmskpd + cmp
    { ISD::AND, MVT::v8i32, 2 },  // vmovmskps + cmp
    { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
    { ISD::AND, MVT::v32i8, 4 },  // vextractf128 + vpand + vpmovmskb + cmp
    { ISD::OR, MVT::v4i64, 2 },   // vmovmskpd + cmp
    { ISD::OR, MVT::v8i32, 2 },   // vmovmskps + cmp
    { ISD::OR, MVT::v16i16, 4 },  // vextractf128 + vpor + vpmovmskb + cmp
    { ISD::OR, MVT::v32i8, 4 },   // vextractf128 + vpor + vpmovmskb + cmp
  };

  static const CostTblEntry SSE2BoolReduction[] = {
    { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp
    { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp
    { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp
    { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp
    { ISD::OR, MVT::v2i64, 2 },  // movmskpd + cmp
    { ISD::OR, MVT::v4i32, 2 },  // movmskps + cmp
    { ISD::OR, MVT::v8i16, 2 },  // pmovmskb + cmp
    { ISD::OR, MVT::v16i8, 2 },  // pmovmskb + cmp
  };

  // Handle bool allof/anyof patterns.
  if (ValVTy->getElementType()->isIntegerTy(1)) {
    InstructionCost ArithmeticCost = 0;
    if (LT.first != 1 && MTy.isVector() &&
        MTy.getVectorNumElements() < ValVTy->getNumElements()) {
      // Type needs to be split. We need LT.first - 1 arithmetic ops.
      auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
                                              MTy.getVectorNumElements());
      ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
      ArithmeticCost *= LT.first - 1;
    }

    if (ST->hasAVX512())
      if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy))
        return ArithmeticCost + Entry->Cost;
    if (ST->hasAVX2())
      if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
        return ArithmeticCost + Entry->Cost;
    if (ST->hasAVX())
      if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
        return ArithmeticCost + Entry->Cost;
    if (ST->hasSSE2())
      if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
        return ArithmeticCost + Entry->Cost;

    return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
  }

  unsigned NumVecElts = ValVTy->getNumElements();
  unsigned ScalarSize = ValVTy->getScalarSizeInBits();

  // Special case power of 2 reductions where the scalar type isn't changed
  // by type legalization.
  if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
    return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);

  InstructionCost ReductionCost = 0;

  auto *Ty = ValVTy;
  if (LT.first != 1 && MTy.isVector() &&
      MTy.getVectorNumElements() < ValVTy->getNumElements()) {
    // Type needs to be split. We need LT.first - 1 arithmetic ops.
    Ty = FixedVectorType::get(ValVTy->getElementType(),
                              MTy.getVectorNumElements());
    ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
    ReductionCost *= LT.first - 1;
    NumVecElts = MTy.getVectorNumElements();
  }

  // Now handle reduction with the legal type, taking into account size changes
  // at each level.
  while (NumVecElts > 1) {
    // Determine the size of the remaining vector we need to reduce.
    unsigned Size = NumVecElts * ScalarSize;
    NumVecElts /= 2;
    // If we're reducing from 256/512 bits, use an extract_subvector.
    if (Size > 128) {
      auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
      ReductionCost +=
          getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt, CostKind,
                         NumVecElts, SubTy);
      Ty = SubTy;
    } else if (Size == 128) {
      // Reducing from 128 bits is a permute of v2f64/v2i64.
      // NOTE(review): ValVTy is a vector, so isFloatingPointTy() is always
      // false and the integer ShufTy is always chosen — likely the element
      // type was intended; benign since the shuffle costs match. Confirm.
      FixedVectorType *ShufTy;
      if (ValVTy->isFloatingPointTy())
        ShufTy =
            FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2);
      else
        ShufTy =
            FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
      ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
                                      std::nullopt, CostKind, 0, nullptr);
    } else if (Size == 64) {
      // Reducing from 64 bits is a shuffle of v4f32/v4i32.
      FixedVectorType *ShufTy;
      if (ValVTy->isFloatingPointTy())
        ShufTy =
            FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4);
      else
        ShufTy =
            FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
      ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
                                      std::nullopt, CostKind, 0, nullptr);
    } else {
      // Reducing from smaller size is a shift by immediate.
      auto *ShiftTy = FixedVectorType::get(
          Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size);
      ReductionCost += getArithmeticInstrCost(
          Instruction::LShr, ShiftTy, CostKind,
          {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
          {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None});
    }

    // Add the arithmetic op for this level.
    ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind);
  }

  // Add the final extract element to the cost.
  return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
                                            CostKind, 0, nullptr, nullptr);
}
5180 | |||||
/// Cost of a single vector min/max operation on \p Ty. Only the MIN forms
/// (UMIN/SMIN/FMINNUM) are looked up — the tables are assumed symmetric with
/// the MAX forms. Falls back to an explicit cmp+select when no native
/// instruction exists for the (legalized) type.
InstructionCost X86TTIImpl::getMinMaxCost(Type *Ty, Type *CondTy,
                                          bool IsUnsigned) {
  // Legalize the type; LT.first = split factor, LT.second = legal MVT.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

  MVT MTy = LT.second;

  int ISD;
  if (Ty->isIntOrIntVectorTy()) {
    ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN;
  } else {
    assert(Ty->isFPOrFPVectorTy() &&
           "Expected float point or integer vector type.");
    ISD = ISD::FMINNUM;
  }

  // Per-ISA tables: cost of a native min/max for each legal type.
  static const CostTblEntry SSE1CostTbl[] = {
      {ISD::FMINNUM, MVT::v4f32, 1},
  };

  static const CostTblEntry SSE2CostTbl[] = {
      {ISD::FMINNUM, MVT::v2f64, 1},
      {ISD::SMIN, MVT::v8i16, 1},
      {ISD::UMIN, MVT::v16i8, 1},
  };

  static const CostTblEntry SSE41CostTbl[] = {
      {ISD::SMIN, MVT::v4i32, 1},
      {ISD::UMIN, MVT::v4i32, 1},
      {ISD::UMIN, MVT::v8i16, 1},
      {ISD::SMIN, MVT::v16i8, 1},
  };

  static const CostTblEntry SSE42CostTbl[] = {
      {ISD::UMIN, MVT::v2i64, 3}, // xor+pcmpgtq+blendvpd
  };

  static const CostTblEntry AVX1CostTbl[] = {
      {ISD::FMINNUM, MVT::v8f32, 1},
      {ISD::FMINNUM, MVT::v4f64, 1},
      {ISD::SMIN, MVT::v8i32, 3},
      {ISD::UMIN, MVT::v8i32, 3},
      {ISD::SMIN, MVT::v16i16, 3},
      {ISD::UMIN, MVT::v16i16, 3},
      {ISD::SMIN, MVT::v32i8, 3},
      {ISD::UMIN, MVT::v32i8, 3},
  };

  static const CostTblEntry AVX2CostTbl[] = {
      {ISD::SMIN, MVT::v8i32, 1},
      {ISD::UMIN, MVT::v8i32, 1},
      {ISD::SMIN, MVT::v16i16, 1},
      {ISD::UMIN, MVT::v16i16, 1},
      {ISD::SMIN, MVT::v32i8, 1},
      {ISD::UMIN, MVT::v32i8, 1},
  };

  static const CostTblEntry AVX512CostTbl[] = {
      {ISD::FMINNUM, MVT::v16f32, 1},
      {ISD::FMINNUM, MVT::v8f64, 1},
      {ISD::SMIN, MVT::v2i64, 1},
      {ISD::UMIN, MVT::v2i64, 1},
      {ISD::SMIN, MVT::v4i64, 1},
      {ISD::UMIN, MVT::v4i64, 1},
      {ISD::SMIN, MVT::v8i64, 1},
      {ISD::UMIN, MVT::v8i64, 1},
      {ISD::SMIN, MVT::v16i32, 1},
      {ISD::UMIN, MVT::v16i32, 1},
  };

  static const CostTblEntry AVX512BWCostTbl[] = {
      {ISD::SMIN, MVT::v32i16, 1},
      {ISD::UMIN, MVT::v32i16, 1},
      {ISD::SMIN, MVT::v64i8, 1},
      {ISD::UMIN, MVT::v64i8, 1},
  };

  // If we have a native MIN/MAX instruction for this type, use it.
  // Lookup order: newest/most capable ISA first so the cheapest entry wins.
  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasSSE42())
    if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasSSE41())
    if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasSSE1())
    if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  unsigned CmpOpcode;
  if (Ty->isFPOrFPVectorTy()) {
    CmpOpcode = Instruction::FCmp;
  } else {
    assert(Ty->isIntOrIntVectorTy() &&
           "expecting floating point or integer type for min/max reduction");
    CmpOpcode = Instruction::ICmp;
  }

  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  // Otherwise fall back to cmp+select.
  InstructionCost Result =
      getCmpSelInstrCost(CmpOpcode, Ty, CondTy, CmpInst::BAD_ICMP_PREDICATE,
                         CostKind) +
      getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
                         CmpInst::BAD_ICMP_PREDICATE, CostKind);
  return Result;
}
5308 | |||||
/// Cost of a horizontal min/max reduction of \p ValTy. Tables (IACA-measured)
/// cover the cases needing sign-flip tricks; otherwise the cost is built as a
/// log2(N) tree of shuffles/extracts plus one min/max (via getMinMaxCost) per
/// level, with a final element extract.
InstructionCost
X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy,
                                   bool IsUnsigned,
                                   TTI::TargetCostKind CostKind) {
  // Legalize the type; LT.first = split factor, LT.second = legal MVT.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);

  MVT MTy = LT.second;

  // Only MIN forms are looked up; the tables are assumed MIN/MAX symmetric.
  int ISD;
  if (ValTy->isIntOrIntVectorTy()) {
    ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN;
  } else {
    assert(ValTy->isFPOrFPVectorTy() &&
           "Expected float point or integer vector type.");
    ISD = ISD::FMINNUM;
  }

  // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
  // and make it as the cost.

  static const CostTblEntry SSE2CostTblNoPairWise[] = {
      {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw
      {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw
      {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw
  };

  static const CostTblEntry SSE41CostTblNoPairWise[] = {
      {ISD::SMIN, MVT::v2i16, 3}, // same as sse2
      {ISD::SMIN, MVT::v4i16, 5}, // same as sse2
      {ISD::UMIN, MVT::v2i16, 5}, // same as sse2
      {ISD::UMIN, MVT::v4i16, 7}, // same as sse2
      {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor
      {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax
      {ISD::SMIN, MVT::v2i8, 3},  // pminsb
      {ISD::SMIN, MVT::v4i8, 5},  // pminsb
      {ISD::SMIN, MVT::v8i8, 7},  // pminsb
      {ISD::SMIN, MVT::v16i8, 6},
      {ISD::UMIN, MVT::v2i8, 3},  // same as sse2
      {ISD::UMIN, MVT::v4i8, 5},  // same as sse2
      {ISD::UMIN, MVT::v8i8, 7},  // same as sse2
      {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax
  };

  static const CostTblEntry AVX1CostTblNoPairWise[] = {
      {ISD::SMIN, MVT::v16i16, 6},
      {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax
      {ISD::SMIN, MVT::v32i8, 8},
      {ISD::UMIN, MVT::v32i8, 8},
  };

  static const CostTblEntry AVX512BWCostTblNoPairWise[] = {
      {ISD::SMIN, MVT::v32i16, 8},
      {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax
      {ISD::SMIN, MVT::v64i8, 10},
      {ISD::UMIN, MVT::v64i8, 10},
  };

  // Before legalizing the type, give a chance to look up illegal narrow types
  // in the table.
  // FIXME: Is there a better way to do this?
  EVT VT = TLI->getValueType(DL, ValTy);
  if (VT.isSimple()) {
    MVT MTy = VT.getSimpleVT(); // NB: shadows the legalized MTy above.
    if (ST->hasBWI())
      if (const auto *Entry = CostTableLookup(AVX512BWCostTblNoPairWise, ISD, MTy))
        return Entry->Cost;

    if (ST->hasAVX())
      if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
        return Entry->Cost;

    if (ST->hasSSE41())
      if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy))
        return Entry->Cost;

    if (ST->hasSSE2())
      if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
        return Entry->Cost;
  }

  auto *ValVTy = cast<FixedVectorType>(ValTy);
  unsigned NumVecElts = ValVTy->getNumElements();

  auto *Ty = ValVTy;
  InstructionCost MinMaxCost = 0;
  if (LT.first != 1 && MTy.isVector() &&
      MTy.getVectorNumElements() < ValVTy->getNumElements()) {
    // Type needs to be split. We need LT.first - 1 operations ops.
    Ty = FixedVectorType::get(ValVTy->getElementType(),
                              MTy.getVectorNumElements());
    auto *SubCondTy = FixedVectorType::get(CondTy->getElementType(),
                                           MTy.getVectorNumElements());
    MinMaxCost = getMinMaxCost(Ty, SubCondTy, IsUnsigned);
    MinMaxCost *= LT.first - 1;
    NumVecElts = MTy.getVectorNumElements();
  }

  // Re-try the tables with the legalized type, adding the split cost on top.
  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTblNoPairWise, ISD, MTy))
      return MinMaxCost + Entry->Cost;

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
      return MinMaxCost + Entry->Cost;

  if (ST->hasSSE41())
    if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy))
      return MinMaxCost + Entry->Cost;

  if (ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
      return MinMaxCost + Entry->Cost;

  unsigned ScalarSize = ValTy->getScalarSizeInBits();

  // Special case power of 2 reductions where the scalar type isn't changed
  // by type legalization.
  if (!isPowerOf2_32(ValVTy->getNumElements()) ||
      ScalarSize != MTy.getScalarSizeInBits())
    return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsUnsigned, CostKind);

  // Now handle reduction with the legal type, taking into account size changes
  // at each level.
  while (NumVecElts > 1) {
    // Determine the size of the remaining vector we need to reduce.
    unsigned Size = NumVecElts * ScalarSize;
    NumVecElts /= 2;
    // If we're reducing from 256/512 bits, use an extract_subvector.
    if (Size > 128) {
      auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
      MinMaxCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
                                   CostKind, NumVecElts, SubTy);
      Ty = SubTy;
    } else if (Size == 128) {
      // Reducing from 128 bits is a permute of v2f64/v2i64.
      // NOTE(review): ValTy is a vector, so isFloatingPointTy() is always
      // false and the integer ShufTy is always chosen — likely the element
      // type was intended; benign since the shuffle costs match. Confirm.
      VectorType *ShufTy;
      if (ValTy->isFloatingPointTy())
        ShufTy =
            FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2);
      else
        ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
      MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
                                   std::nullopt, CostKind, 0, nullptr);
    } else if (Size == 64) {
      // Reducing from 64 bits is a shuffle of v4f32/v4i32.
      FixedVectorType *ShufTy;
      if (ValTy->isFloatingPointTy())
        ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4);
      else
        ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
      MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
                                   std::nullopt, CostKind, 0, nullptr);
    } else {
      // Reducing from smaller size is a shift by immediate.
      auto *ShiftTy = FixedVectorType::get(
          Type::getIntNTy(ValTy->getContext(), Size), 128 / Size);
      MinMaxCost += getArithmeticInstrCost(
          Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput,
          {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
          {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None});
    }

    // Add the arithmetic op for this level.
    auto *SubCondTy =
        FixedVectorType::get(CondTy->getElementType(), Ty->getNumElements());
    MinMaxCost += getMinMaxCost(Ty, SubCondTy, IsUnsigned);
  }

  // Add the final extract element to the cost.
  return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
                                         CostKind, 0, nullptr, nullptr);
}
5481 | |||||
5482 | /// Calculate the cost of materializing a 64-bit value. This helper | ||||
5483 | /// method might only calculate a fraction of a larger immediate. Therefore it | ||||
5484 | /// is valid to return a cost of ZERO. | ||||
5485 | InstructionCost X86TTIImpl::getIntImmCost(int64_t Val) { | ||||
5486 | if (Val == 0) | ||||
5487 | return TTI::TCC_Free; | ||||
5488 | |||||
5489 | if (isInt<32>(Val)) | ||||
5490 | return TTI::TCC_Basic; | ||||
5491 | |||||
5492 | return 2 * TTI::TCC_Basic; | ||||
5493 | } | ||||
5494 | |||||
/// Return the cost of materializing the integer immediate \p Imm of type
/// \p Ty, computed by splitting the constant into 64-bit chunks.
InstructionCost X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                          TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // Zero-width integers have no cost model; report a maximal cost.
  if (BitSize == 0)
    return ~0U;

  // Never hoist constants larger than 128bit, because this might lead to
  // incorrect code generation or assertions in codegen.
  // Fixme: Create a cost model for types larger than i128 once the codegen
  // issues have been fixed.
  if (BitSize > 128)
    return TTI::TCC_Free;

  if (Imm == 0)
    return TTI::TCC_Free;

  // Sign-extend all constants to a multiple of 64-bit.
  APInt ImmVal = Imm;
  if (BitSize % 64 != 0)
    ImmVal = Imm.sext(alignTo(BitSize, 64));

  // Split the constant into 64-bit chunks and calculate the cost for each
  // chunk.
  InstructionCost Cost = 0;
  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
    APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
    int64_t Val = Tmp.getSExtValue();
    Cost += getIntImmCost(Val);
  }
  // We need at least one instruction to materialize the constant.
  return std::max<InstructionCost>(1, Cost);
}
5529 | |||||
/// Return the cost of operand \p Idx of instruction \p Opcode holding the
/// constant \p Imm. Immediates the instruction can encode directly are
/// reported as TCC_Free so that constant hoisting leaves them in place.
InstructionCost X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                              const APInt &Imm, Type *Ty,
                                              TTI::TargetCostKind CostKind,
                                              Instruction *Inst) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  // ImmIdx records which operand slot (if any) of this opcode is modeled as
  // accepting an encodable immediate; ~0U means "none".
  unsigned ImmIdx = ~0U;
  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr. This prevents the
    // creation of new constants for every base constant that gets constant
    // folded with the offset.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::Store:
    ImmIdx = 0;
    break;
  case Instruction::ICmp:
    // This is an imperfect hack to prevent constant hoisting of
    // compares that might be trying to check if a 64-bit value fits in
    // 32-bits. The backend can optimize these cases using a right shift by 32.
    // Ideally we would check the compare predicate here. There also other
    // similar immediates the backend can use shifts for.
    if (Idx == 1 && Imm.getBitWidth() == 64) {
      uint64_t ImmVal = Imm.getZExtValue();
      if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
        return TTI::TCC_Free;
    }
    ImmIdx = 1;
    break;
  case Instruction::And:
    // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
    // by using a 32-bit operation with implicit zero extension. Detect such
    // immediates here as the normal path expects bit 31 to be sign extended.
    if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.isIntN(32))
      return TTI::TCC_Free;
    ImmIdx = 1;
    break;
  case Instruction::Add:
  case Instruction::Sub:
    // For add/sub, we can use the opposite instruction for INT32_MIN.
    if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.getZExtValue() == 0x80000000)
      return TTI::TCC_Free;
    ImmIdx = 1;
    break;
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
    // Division by constant is typically expanded later into a different
    // instruction sequence. This completely changes the constants.
    // Report them as "free" to stop ConstantHoist from marking them as opaque.
    return TTI::TCC_Free;
  case Instruction::Mul:
  case Instruction::Or:
  case Instruction::Xor:
    ImmIdx = 1;
    break;
  // Always return TCC_Free for the shift value of a shift instruction.
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    if (Idx == 1)
      return TTI::TCC_Free;
    break;
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    break;
  }

  if (Idx == ImmIdx) {
    // The immediate sits in an encodable operand slot: treat it as free when
    // materializing it would cost no more than one basic op per 64-bit chunk;
    // otherwise report the real materialization cost.
    int NumConstants = divideCeil(BitSize, 64);
    InstructionCost Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
    return (Cost <= NumConstants * TTI::TCC_Basic)
               ? static_cast<int>(TTI::TCC_Free)
               : Cost;
  }

  return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
5628 | |||||
/// Return the cost of operand \p Idx of intrinsic \p IID when it holds the
/// constant \p Imm. Immediates the intrinsic can encode directly are
/// reported as TCC_Free so that constant hoisting ignores them.
InstructionCost X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                                const APInt &Imm, Type *Ty,
                                                TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    // The RHS operand is free when it fits a signed 32-bit immediate.
    if ((Idx == 1) && Imm.getBitWidth() <= 64 && Imm.isSignedIntN(32))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_stackmap:
    // The first two operands (id, shadow bytes) are always free, as is any
    // live value that fits a signed 64-bit immediate.
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    // The first four operands are fixed patchpoint metadata and always free,
    // as is any argument that fits a signed 64-bit immediate.
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
      return TTI::TCC_Free;
    break;
  }
  return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
5664 | |||||
5665 | InstructionCost X86TTIImpl::getCFInstrCost(unsigned Opcode, | ||||
5666 | TTI::TargetCostKind CostKind, | ||||
5667 | const Instruction *I) { | ||||
5668 | if (CostKind != TTI::TCK_RecipThroughput) | ||||
5669 | return Opcode == Instruction::PHI ? 0 : 1; | ||||
5670 | // Branches are assumed to be predicted. | ||||
5671 | return 0; | ||||
5672 | } | ||||
5673 | |||||
5674 | int X86TTIImpl::getGatherOverhead() const { | ||||
5675 | // Some CPUs have more overhead for gather. The specified overhead is relative | ||||
5676 | // to the Load operation. "2" is the number provided by Intel architects. This | ||||
5677 | // parameter is used for cost estimation of Gather Op and comparison with | ||||
5678 | // other alternatives. | ||||
5679 | // TODO: Remove the explicit hasAVX512()?, That would mean we would only | ||||
5680 | // enable gather with a -march. | ||||
5681 | if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather())) | ||||
5682 | return 2; | ||||
5683 | |||||
5684 | return 1024; | ||||
5685 | } | ||||
5686 | |||||
5687 | int X86TTIImpl::getScatterOverhead() const { | ||||
5688 | if (ST->hasAVX512()) | ||||
5689 | return 2; | ||||
5690 | |||||
5691 | return 1024; | ||||
5692 | } | ||||
5693 | |||||
// Return an average cost of Gather / Scatter instruction, maybe improved later.
// FIXME: Add TargetCostKind support.
InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy,
                                            const Value *Ptr, Align Alignment,
                                            unsigned AddressSpace) {

  assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
  unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();

  // Try to reduce index size from 64 bit (default for GEP)
  // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
  // operation will use 16 x 64 indices which do not fit in a zmm and needs
  // to split. Also check that the base pointer is the same for all lanes,
  // and that there's at most one variable index.
  auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) {
    unsigned IndexSize = DL.getPointerSizeInBits();
    const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
    // Only GEP-based addresses on 64-bit targets are candidates for
    // narrowing.
    if (IndexSize < 64 || !GEP)
      return IndexSize;

    unsigned NumOfVarIndices = 0;
    const Value *Ptrs = GEP->getPointerOperand();
    // A vector of non-uniform base pointers cannot use a narrow index.
    if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
      return IndexSize;
    for (unsigned i = 1; i < GEP->getNumOperands(); ++i) {
      if (isa<Constant>(GEP->getOperand(i)))
        continue;
      Type *IndxTy = GEP->getOperand(i)->getType();
      if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy))
        IndxTy = IndexVTy->getElementType();
      // A genuine 64-bit index (not just a sign-extended 32-bit value), or
      // more than one variable index, forces the wide form.
      if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
           !isa<SExtInst>(GEP->getOperand(i))) ||
          ++NumOfVarIndices > 1)
        return IndexSize; // 64
    }
    return (unsigned)32;
  };

  // Trying to reduce IndexSize to 32 bits for vector 16.
  // By default the IndexSize is equal to pointer size.
  unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
                           ? getIndexSizeInBits(Ptr, DL)
                           : DL.getPointerSizeInBits();

  auto *IndexVTy = FixedVectorType::get(
      IntegerType::get(SrcVTy->getContext(), IndexSize), VF);
  std::pair<InstructionCost, MVT> IdxsLT = getTypeLegalizationCost(IndexVTy);
  std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(SrcVTy);
  // Split by the larger of the index-vector and data-vector legalization
  // factors, and recurse on the narrower per-piece type.
  InstructionCost::CostType SplitFactor =
      *std::max(IdxsLT.first, SrcLT.first).getValue();
  if (SplitFactor > 1) {
    // Handle splitting of vector of pointers
    auto *SplitSrcTy =
        FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
    return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment,
                                         AddressSpace);
  }

  // The gather / scatter cost is given by Intel architects. It is a rough
  // number since we are looking at one instruction in a time.
  const int GSOverhead = (Opcode == Instruction::Load)
                             ? getGatherOverhead()
                             : getScatterOverhead();
  return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
                                           MaybeAlign(Alignment), AddressSpace,
                                           TTI::TCK_RecipThroughput);
}
5761 | |||||
/// Return the cost of full scalarization of gather / scatter operation.
///
/// Opcode - Load or Store instruction.
/// SrcVTy - The type of the data vector that should be gathered or scattered.
/// VariableMask - The mask is non-constant at compile time.
/// Alignment - Alignment for one element.
/// AddressSpace - pointer[s] address space.
///
/// FIXME: Add TargetCostKind support.
InstructionCost X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
                                            bool VariableMask, Align Alignment,
                                            unsigned AddressSpace) {
  Type *ScalarTy = SrcVTy->getScalarType();
  unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
  APInt DemandedElts = APInt::getAllOnes(VF);
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  // A variable mask is scalarized: each mask bit is extracted, compared and
  // branched on.
  InstructionCost MaskUnpackCost = 0;
  if (VariableMask) {
    auto *MaskTy =
        FixedVectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
    MaskUnpackCost = getScalarizationOverhead(
        MaskTy, DemandedElts, /*Insert=*/false, /*Extract=*/true, CostKind);
    InstructionCost ScalarCompareCost = getCmpSelInstrCost(
        Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()), nullptr,
        CmpInst::BAD_ICMP_PREDICATE, CostKind);
    InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
    MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
  }

  // Each lane's address must be extracted from the vector of pointers.
  InstructionCost AddressUnpackCost = getScalarizationOverhead(
      FixedVectorType::get(ScalarTy->getPointerTo(), VF), DemandedElts,
      /*Insert=*/false, /*Extract=*/true, CostKind);

  // The cost of the scalar loads/stores.
  InstructionCost MemoryOpCost =
      VF * getMemoryOpCost(Opcode, ScalarTy, MaybeAlign(Alignment),
                           AddressSpace, CostKind);

  // The cost of forming the vector from loaded scalars/
  // scalarizing the vector to perform scalar stores.
  InstructionCost InsertExtractCost = getScalarizationOverhead(
      cast<FixedVectorType>(SrcVTy), DemandedElts,
      /*Insert=*/Opcode == Instruction::Load,
      /*Extract=*/Opcode == Instruction::Store, CostKind);

  return AddressUnpackCost + MemoryOpCost + MaskUnpackCost + InsertExtractCost;
}
5810 | |||||
/// Calculate the cost of Gather / Scatter operation
InstructionCost X86TTIImpl::getGatherScatterOpCost(
    unsigned Opcode, Type *SrcVTy, const Value *Ptr, bool VariableMask,
    Align Alignment, TTI::TargetCostKind CostKind,
    const Instruction *I = nullptr) {
  // For size/latency cost kinds: a legal, non-forced-scalar gather/scatter
  // counts as a single instruction; otherwise defer to the generic model.
  if (CostKind != TTI::TCK_RecipThroughput) {
    if ((Opcode == Instruction::Load &&
         isLegalMaskedGather(SrcVTy, Align(Alignment)) &&
         !forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
                                     Align(Alignment))) ||
        (Opcode == Instruction::Store &&
         isLegalMaskedScatter(SrcVTy, Align(Alignment)) &&
         !forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
                                      Align(Alignment))))
      return 1;
    return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask,
                                         Alignment, CostKind, I);
  }

  assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
  // Ptr is either a pointer or a vector of pointers; dig out the pointer
  // type to obtain its address space.
  PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
  if (!PtrTy && Ptr->getType()->isVectorTy())
    PtrTy = dyn_cast<PointerType>(
        cast<VectorType>(Ptr->getType())->getElementType());
  assert(PtrTy && "Unexpected type for Ptr argument");
  unsigned AddressSpace = PtrTy->getAddressSpace();

  // Fall back to the fully scalarized cost when the operation is illegal or
  // the target prefers scalarization at this width.
  if ((Opcode == Instruction::Load &&
       (!isLegalMaskedGather(SrcVTy, Align(Alignment)) ||
        forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
                                   Align(Alignment)))) ||
      (Opcode == Instruction::Store &&
       (!isLegalMaskedScatter(SrcVTy, Align(Alignment)) ||
        forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
                                    Align(Alignment)))))
    return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment,
                           AddressSpace);

  return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
}
5851 | |||||
5852 | bool X86TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1, | ||||
5853 | const TargetTransformInfo::LSRCost &C2) { | ||||
5854 | // X86 specific here are "instruction number 1st priority". | ||||
5855 | return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost, | ||||
5856 | C1.NumIVMuls, C1.NumBaseAdds, | ||||
5857 | C1.ScaleCost, C1.ImmCost, C1.SetupCost) < | ||||
5858 | std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost, | ||||
5859 | C2.NumIVMuls, C2.NumBaseAdds, | ||||
5860 | C2.ScaleCost, C2.ImmCost, C2.SetupCost); | ||||
5861 | } | ||||
5862 | |||||
5863 | bool X86TTIImpl::canMacroFuseCmp() { | ||||
5864 | return ST->hasMacroFusion() || ST->hasBranchFusion(); | ||||
5865 | } | ||||
5866 | |||||
5867 | bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) { | ||||
5868 | if (!ST->hasAVX()) | ||||
5869 | return false; | ||||
5870 | |||||
5871 | // The backend can't handle a single element vector. | ||||
5872 | if (isa<VectorType>(DataTy) && | ||||
5873 | cast<FixedVectorType>(DataTy)->getNumElements() == 1) | ||||
5874 | return false; | ||||
5875 | Type *ScalarTy = DataTy->getScalarType(); | ||||
5876 | |||||
5877 | if (ScalarTy->isPointerTy()) | ||||
5878 | return true; | ||||
5879 | |||||
5880 | if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) | ||||
5881 | return true; | ||||
5882 | |||||
5883 | if (ScalarTy->isHalfTy() && ST->hasBWI()) | ||||
5884 | return true; | ||||
5885 | |||||
5886 | if (!ScalarTy->isIntegerTy()) | ||||
5887 | return false; | ||||
5888 | |||||
5889 | unsigned IntWidth = ScalarTy->getIntegerBitWidth(); | ||||
5890 | return IntWidth == 32 || IntWidth == 64 || | ||||
5891 | ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI()); | ||||
5892 | } | ||||
5893 | |||||
5894 | bool X86TTIImpl::isLegalMaskedStore(Type *DataType, Align Alignment) { | ||||
5895 | return isLegalMaskedLoad(DataType, Alignment); | ||||
5896 | } | ||||
5897 | |||||
5898 | bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) { | ||||
5899 | unsigned DataSize = DL.getTypeStoreSize(DataType); | ||||
5900 | // The only supported nontemporal loads are for aligned vectors of 16 or 32 | ||||
5901 | // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2 | ||||
5902 | // (the equivalent stores only require AVX). | ||||
5903 | if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32)) | ||||
5904 | return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2(); | ||||
5905 | |||||
5906 | return false; | ||||
5907 | } | ||||
5908 | |||||
5909 | bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) { | ||||
5910 | unsigned DataSize = DL.getTypeStoreSize(DataType); | ||||
5911 | |||||
5912 | // SSE4A supports nontemporal stores of float and double at arbitrary | ||||
5913 | // alignment. | ||||
5914 | if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy())) | ||||
5915 | return true; | ||||
5916 | |||||
5917 | // Besides the SSE4A subtarget exception above, only aligned stores are | ||||
5918 | // available nontemporaly on any other subtarget. And only stores with a size | ||||
5919 | // of 4..32 bytes (powers of 2, only) are permitted. | ||||
5920 | if (Alignment < DataSize || DataSize < 4 || DataSize > 32 || | ||||
5921 | !isPowerOf2_32(DataSize)) | ||||
5922 | return false; | ||||
5923 | |||||
5924 | // 32-byte vector nontemporal stores are supported by AVX (the equivalent | ||||
5925 | // loads require AVX2). | ||||
5926 | if (DataSize == 32) | ||||
5927 | return ST->hasAVX(); | ||||
5928 | if (DataSize == 16) | ||||
5929 | return ST->hasSSE1(); | ||||
5930 | return true; | ||||
5931 | } | ||||
5932 | |||||
5933 | bool X86TTIImpl::isLegalBroadcastLoad(Type *ElementTy, | ||||
5934 | ElementCount NumElements) const { | ||||
5935 | // movddup | ||||
5936 | return ST->hasSSE3() && !NumElements.isScalable() && | ||||
5937 | NumElements.getFixedValue() == 2 && | ||||
5938 | ElementTy == Type::getDoubleTy(ElementTy->getContext()); | ||||
5939 | } | ||||
5940 | |||||
5941 | bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) { | ||||
5942 | if (!isa<VectorType>(DataTy)) | ||||
5943 | return false; | ||||
5944 | |||||
5945 | if (!ST->hasAVX512()) | ||||
5946 | return false; | ||||
5947 | |||||
5948 | // The backend can't handle a single element vector. | ||||
5949 | if (cast<FixedVectorType>(DataTy)->getNumElements() == 1) | ||||
5950 | return false; | ||||
5951 | |||||
5952 | Type *ScalarTy = cast<VectorType>(DataTy)->getElementType(); | ||||
5953 | |||||
5954 | if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) | ||||
5955 | return true; | ||||
5956 | |||||
5957 | if (!ScalarTy->isIntegerTy()) | ||||
5958 | return false; | ||||
5959 | |||||
5960 | unsigned IntWidth = ScalarTy->getIntegerBitWidth(); | ||||
5961 | return IntWidth == 32 || IntWidth == 64 || | ||||
5962 | ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2()); | ||||
5963 | } | ||||
5964 | |||||
5965 | bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy) { | ||||
5966 | return isLegalMaskedExpandLoad(DataTy); | ||||
5967 | } | ||||
5968 | |||||
5969 | bool X86TTIImpl::supportsGather() const { | ||||
5970 | // Some CPUs have better gather performance than others. | ||||
5971 | // TODO: Remove the explicit ST->hasAVX512()?, That would mean we would only | ||||
5972 | // enable gather with a -march. | ||||
5973 | return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2()); | ||||
5974 | } | ||||
5975 | |||||
5976 | bool X86TTIImpl::forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) { | ||||
5977 | // Gather / Scatter for vector 2 is not profitable on KNL / SKX | ||||
5978 | // Vector-4 of gather/scatter instruction does not exist on KNL. We can extend | ||||
5979 | // it to 8 elements, but zeroing upper bits of the mask vector will add more | ||||
5980 | // instructions. Right now we give the scalar cost of vector-4 for KNL. TODO: | ||||
5981 | // Check, maybe the gather/scatter instruction is better in the VariableMask | ||||
5982 | // case. | ||||
5983 | unsigned NumElts = cast<FixedVectorType>(VTy)->getNumElements(); | ||||
5984 | return NumElts == 1 || | ||||
5985 | (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX()))); | ||||
5986 | } | ||||
5987 | |||||
5988 | bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) { | ||||
5989 | if (!supportsGather()) | ||||
5990 | return false; | ||||
5991 | Type *ScalarTy = DataTy->getScalarType(); | ||||
5992 | if (ScalarTy->isPointerTy()) | ||||
5993 | return true; | ||||
5994 | |||||
5995 | if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) | ||||
5996 | return true; | ||||
5997 | |||||
5998 | if (!ScalarTy->isIntegerTy()) | ||||
5999 | return false; | ||||
6000 | |||||
6001 | unsigned IntWidth = ScalarTy->getIntegerBitWidth(); | ||||
6002 | return IntWidth == 32 || IntWidth == 64; | ||||
6003 | } | ||||
6004 | |||||
/// Return true when the alternating Opcode0/Opcode1 pattern described by
/// \p OpcodeMask maps onto a native (V)ADDSUB instruction for \p VecTy.
bool X86TTIImpl::isLegalAltInstr(VectorType *VecTy, unsigned Opcode0,
                                 unsigned Opcode1,
                                 const SmallBitVector &OpcodeMask) const {
  // Supported forms:
  // ADDSUBPS 4xf32 SSE3
  // VADDSUBPS 4xf32 AVX
  // VADDSUBPS 8xf32 AVX2
  // ADDSUBPD 2xf64 SSE3
  // VADDSUBPD 2xf64 AVX
  // VADDSUBPD 4xf64 AVX2

  unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
  assert(OpcodeMask.size() == NumElements && "Mask and VecTy are incompatible");
  if (!isPowerOf2_32(NumElements))
    return false;
  // Check the opcode pattern. We apply the mask on the opcode arguments and
  // then check if it is what we expect.
  for (int Lane : seq<int>(0, NumElements)) {
    unsigned Opc = OpcodeMask.test(Lane) ? Opcode1 : Opcode0;
    // We expect FSub for even lanes and FAdd for odd lanes.
    if (Lane % 2 == 0 && Opc != Instruction::FSub)
      return false;
    if (Lane % 2 == 1 && Opc != Instruction::FAdd)
      return false;
  }
  // Now check that the pattern is supported by the target ISA.
  Type *ElemTy = cast<VectorType>(VecTy)->getElementType();
  if (ElemTy->isFloatTy())
    return ST->hasSSE3() && NumElements % 4 == 0;
  if (ElemTy->isDoubleTy())
    return ST->hasSSE3() && NumElements % 2 == 0;
  return false;
}
6037 | |||||
6038 | bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) { | ||||
6039 | // AVX2 doesn't support scatter | ||||
6040 | if (!ST->hasAVX512()) | ||||
6041 | return false; | ||||
6042 | return isLegalMaskedGather(DataType, Alignment); | ||||
6043 | } | ||||
6044 | |||||
6045 | bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) { | ||||
6046 | EVT VT = TLI->getValueType(DL, DataType); | ||||
6047 | return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT); | ||||
6048 | } | ||||
6049 | |||||
6050 | bool X86TTIImpl::isExpensiveToSpeculativelyExecute(const Instruction* I) { | ||||
6051 | // FDIV is always expensive, even if it has a very low uop count. | ||||
6052 | // TODO: Still necessary for recent CPUs with low latency/throughput fdiv? | ||||
6053 | if (I->getOpcode() == Instruction::FDiv) | ||||
6054 | return true; | ||||
6055 | |||||
6056 | return BaseT::isExpensiveToSpeculativelyExecute(I); | ||||
6057 | } | ||||
6058 | |||||
6059 | bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) { | ||||
6060 | return false; | ||||
6061 | } | ||||
6062 | |||||
/// Decide whether \p Callee may be inlined into \p Caller based on their
/// subtarget feature sets and the ABI of calls inside the callee.
bool X86TTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();

  // Work this as a subsetting of subtarget features.
  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();

  // Check whether features are the same (apart from the ignore list).
  FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
  FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
  if (RealCallerBits == RealCalleeBits)
    return true;

  // If the features are a subset, we need to additionally check for calls
  // that may become ABI-incompatible as a result of inlining.
  if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
    return false;

  // The callee uses a strict subset of the caller's features: inlining is
  // only safe if no call inside the callee could change ABI when re-homed
  // into the caller.
  for (const Instruction &I : instructions(Callee)) {
    if (const auto *CB = dyn_cast<CallBase>(&I)) {
      // Collect the argument and (non-void) return types involved.
      SmallVector<Type *, 8> Types;
      for (Value *Arg : CB->args())
        Types.push_back(Arg->getType());
      if (!CB->getType()->isVoidTy())
        Types.push_back(CB->getType());

      // Simple types are always ABI compatible.
      auto IsSimpleTy = [](Type *Ty) {
        return !Ty->isVectorTy() && !Ty->isAggregateType();
      };
      if (all_of(Types, IsSimpleTy))
        continue;

      if (Function *NestedCallee = CB->getCalledFunction()) {
        // Assume that intrinsics are always ABI compatible.
        if (NestedCallee->isIntrinsic())
          continue;

        // Do a precise compatibility check.
        if (!areTypesABICompatible(Caller, NestedCallee, Types))
          return false;
      } else {
        // We don't know the target features of the callee,
        // assume it is incompatible.
        return false;
      }
    }
  }
  return true;
}
6116 | |||||
6117 | bool X86TTIImpl::areTypesABICompatible(const Function *Caller, | ||||
6118 | const Function *Callee, | ||||
6119 | const ArrayRef<Type *> &Types) const { | ||||
6120 | if (!BaseT::areTypesABICompatible(Caller, Callee, Types)) | ||||
6121 | return false; | ||||
6122 | |||||
6123 | // If we get here, we know the target features match. If one function | ||||
6124 | // considers 512-bit vectors legal and the other does not, consider them | ||||
6125 | // incompatible. | ||||
6126 | const TargetMachine &TM = getTLI()->getTargetMachine(); | ||||
6127 | |||||
6128 | if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() == | ||||
6129 | TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs()) | ||||
6130 | return true; | ||||
6131 | |||||
6132 | // Consider the arguments compatible if they aren't vectors or aggregates. | ||||
6133 | // FIXME: Look at the size of vectors. | ||||
6134 | // FIXME: Look at the element types of aggregates to see if there are vectors. | ||||
6135 | return llvm::none_of(Types, | ||||
6136 | [](Type *T) { return T->isVectorTy() || T->isAggregateType(); }); | ||||
6137 | } | ||||
6138 | |||||
6139 | X86TTIImpl::TTI::MemCmpExpansionOptions | ||||
6140 | X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { | ||||
6141 | TTI::MemCmpExpansionOptions Options; | ||||
6142 | Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); | ||||
6143 | Options.NumLoadsPerBlock = 2; | ||||
6144 | // All GPR and vector loads can be unaligned. | ||||
6145 | Options.AllowOverlappingLoads = true; | ||||
6146 | if (IsZeroCmp) { | ||||
6147 | // Only enable vector loads for equality comparison. Right now the vector | ||||
6148 | // version is not as fast for three way compare (see #33329). | ||||
6149 | const unsigned PreferredWidth = ST->getPreferVectorWidth(); | ||||
6150 | if (PreferredWidth >= 512 && ST->hasAVX512()) Options.LoadSizes.push_back(64); | ||||
6151 | if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32); | ||||
6152 | if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16); | ||||
6153 | } | ||||
6154 | if (ST->is64Bit()) { | ||||
6155 | Options.LoadSizes.push_back(8); | ||||
6156 | } | ||||
6157 | Options.LoadSizes.push_back(4); | ||||
6158 | Options.LoadSizes.push_back(2); | ||||
6159 | Options.LoadSizes.push_back(1); | ||||
6160 | return Options; | ||||
6161 | } | ||||
6162 | |||||
// Prefer vector-of-pointers addressing only when the subtarget can actually
// lower it to gather instructions (see supportsGather()).
bool X86TTIImpl::prefersVectorizedAddressing() const {
  return supportsGather();
}
6166 | |||||
// X86 does not report efficient per-element vector load/store support, so the
// generic cost model will account for scalarization of such accesses.
bool X86TTIImpl::supportsEfficientVectorElementLoadStore() const {
  return false;
}
6170 | |||||
6171 | bool X86TTIImpl::enableInterleavedAccessVectorization() { | ||||
6172 | // TODO: We expect this to be beneficial regardless of arch, | ||||
6173 | // but there are currently some unexplained performance artifacts on Atom. | ||||
6174 | // As a temporary solution, disable on Atom. | ||||
6175 | return !(ST->isAtom()); | ||||
6176 | } | ||||
6177 | |||||
// Get estimation for interleaved load/store operations and strided load.
// \p Indices contains indices for strided load.
// \p Factor - the factor of interleaving.
// AVX-512 provides 3-src shuffles that significantly reduces the cost.
// The returned cost is: memory-op cost + mask cost (if masked) + the shuffle
// sequence cost, either from a lookup table (sequences that
// X86InterleavedAccess emits) or from a generic shuffle-count formula.
InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
    unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
    ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
    TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
  // VecTy for interleave memop is <VF*Factor x Elt>.
  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
  // VecTy = <12 x i32>.

  // Calculate the number of memory operations (NumOfMemOps), required
  // for load/store the VecTy.
  MVT LegalVT = getTypeLegalizationCost(VecTy).second;
  unsigned VecTySize = DL.getTypeStoreSize(VecTy);
  unsigned LegalVTSize = LegalVT.getStoreSize();
  // Ceiling division: legal-width ops needed to cover the whole vector.
  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;

  // Get the cost of one memory operation.
  auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
                                             LegalVT.getVectorNumElements());
  InstructionCost MemOpCost;
  // A conditional mask or a gaps mask both force the masked-memory-op cost
  // path; otherwise use the plain load/store cost.
  bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps;
  if (UseMaskedMemOp)
    MemOpCost = getMaskedMemoryOpCost(Opcode, SingleMemOpTy, Alignment,
                                      AddressSpace, CostKind);
  else
    MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, MaybeAlign(Alignment),
                                AddressSpace, CostKind);

  // VF = lanes per interleaved member; VT = the MVT of a single member.
  unsigned VF = VecTy->getNumElements() / Factor;
  MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);

  InstructionCost MaskCost;
  if (UseMaskedMemOp) {
    // Mark the lanes of the wide vector that the requested members touch.
    APInt DemandedLoadStoreElts = APInt::getZero(VecTy->getNumElements());
    for (unsigned Index : Indices) {
      assert(Index < Factor && "Invalid index for interleaved memory op")(static_cast <bool> (Index < Factor && "Invalid index for interleaved memory op" ) ? void (0) : __assert_fail ("Index < Factor && \"Invalid index for interleaved memory op\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 6216, __extension__ __PRETTY_FUNCTION__));
      for (unsigned Elm = 0; Elm < VF; Elm++)
        DemandedLoadStoreElts.setBit(Index + Elm * Factor);
    }

    Type *I1Type = Type::getInt1Ty(VecTy->getContext());

    // Cost of replicating the per-member i1 mask across Factor positions.
    MaskCost = getReplicationShuffleCost(
        I1Type, Factor, VF,
        UseMaskForGaps ? DemandedLoadStoreElts
                       : APInt::getAllOnes(VecTy->getNumElements()),
        CostKind);

    // The Gaps mask is invariant and created outside the loop, therefore the
    // cost of creating it is not accounted for here. However if we have both
    // a MaskForGaps and some other mask that guards the execution of the
    // memory access, we need to account for the cost of And-ing the two masks
    // inside the loop.
    if (UseMaskForGaps) {
      auto *MaskVT = FixedVectorType::get(I1Type, VecTy->getNumElements());
      MaskCost += getArithmeticInstrCost(BinaryOperator::And, MaskVT, CostKind);
    }
  }

  if (Opcode == Instruction::Load) {
    // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
    // contain the cost of the optimized shuffle sequence that the
    // X86InterleavedAccess pass will generate.
    // The cost of loads and stores are computed separately from the table.

    // X86InterleavedAccess support only the following interleaved-access group.
    static const CostTblEntry AVX512InterleavedLoadTbl[] = {
        {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8
        {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8
        {3, MVT::v64i8, 22}, //(load 96i8 and) deinterleave into 3 x 32i8
    };

    if (const auto *Entry =
            CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
      return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
    //If an entry does not exist, fallback to the default implementation.

    // Kind of shuffle depends on number of loaded values.
    // If we load the entire data in one register, we can use a 1-src shuffle.
    // Otherwise, we'll merge 2 sources in each operation.
    TTI::ShuffleKind ShuffleKind =
        (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;

    InstructionCost ShuffleCost = getShuffleCost(
        ShuffleKind, SingleMemOpTy, std::nullopt, CostKind, 0, nullptr);

    // If Indices is empty, every member of the group is requested.
    unsigned NumOfLoadsInInterleaveGrp =
        Indices.size() ? Indices.size() : Factor;
    auto *ResultTy = FixedVectorType::get(VecTy->getElementType(),
                                          VecTy->getNumElements() / Factor);
    InstructionCost NumOfResults =
        getTypeLegalizationCost(ResultTy).first * NumOfLoadsInInterleaveGrp;

    // About a half of the loads may be folded in shuffles when we have only
    // one result. If we have more than one result, or the loads are masked,
    // we do not fold loads at all.
    unsigned NumOfUnfoldedLoads =
        UseMaskedMemOp || NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;

    // Get a number of shuffle operations per result.
    unsigned NumOfShufflesPerResult =
        std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));

    // The SK_MergeTwoSrc shuffle clobbers one of src operands.
    // When we have more than one destination, we need additional instructions
    // to keep sources.
    InstructionCost NumOfMoves = 0;
    if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
      NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;

    InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
                           MaskCost + NumOfUnfoldedLoads * MemOpCost +
                           NumOfMoves;

    return Cost;
  }

  // Store.
  assert(Opcode == Instruction::Store &&(static_cast <bool> (Opcode == Instruction::Store && "Expected Store Instruction at this point") ? void (0) : __assert_fail ("Opcode == Instruction::Store && \"Expected Store Instruction at this point\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 6300, __extension__ __PRETTY_FUNCTION__))
         "Expected Store Instruction at this point")(static_cast <bool> (Opcode == Instruction::Store && "Expected Store Instruction at this point") ? void (0) : __assert_fail ("Opcode == Instruction::Store && \"Expected Store Instruction at this point\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 6300, __extension__ __PRETTY_FUNCTION__));
  // X86InterleavedAccess support only the following interleaved-access group.
  static const CostTblEntry AVX512InterleavedStoreTbl[] = {
      {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
      {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
      {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 96i8 (and store)

      {4, MVT::v8i8, 10},  // interleave 4 x 8i8  into 32i8  (and store)
      {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8  (and store)
      {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
      {4, MVT::v64i8, 24}  // interleave 4 x 32i8 into 256i8 (and store)
  };

  if (const auto *Entry =
          CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
    return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
  //If an entry does not exist, fallback to the default implementation.

  // There is no strided stores meanwhile. And store can't be folded in
  // shuffle.
  unsigned NumOfSources = Factor; // The number of values to be merged.
  InstructionCost ShuffleCost = getShuffleCost(
      TTI::SK_PermuteTwoSrc, SingleMemOpTy, std::nullopt, CostKind, 0, nullptr);
  unsigned NumOfShufflesPerStore = NumOfSources - 1;

  // The SK_MergeTwoSrc shuffle clobbers one of src operands.
  // We need additional instructions to keep sources.
  unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
  InstructionCost Cost =
      MaskCost +
      NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
      NumOfMoves;
  return Cost;
}
6334 | |||||
// Cost of an interleaved memory access group of \p Factor members over the
// wide vector \p BaseTy (<VF*Factor x Elt>). Dispatches to the AVX-512
// formula when that ISA level supports the element type, and otherwise uses
// per-subtarget lookup tables (AVX2/SSSE3/SSE2) that mirror the shuffle
// sequences codegen currently emits; any case not covered falls back to the
// generic BaseT implementation.
InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *BaseTy, unsigned Factor, ArrayRef<unsigned> Indices,
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) {
  auto *VecTy = cast<FixedVectorType>(BaseTy);


  // AVX-512 handles f32/f64/i64/i32/pointer elements directly; 16-bit and
  // 8-bit elements (incl. half) additionally require BWI.
  auto isSupportedOnAVX512 = [&](Type *VecTy, bool HasBW) {
    Type *EltTy = cast<VectorType>(VecTy)->getElementType();
    if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
        EltTy->isIntegerTy(32) || EltTy->isPointerTy())
      return true;
    if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy())
      return HasBW;
    return false;
  };
  if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI()))
    return getInterleavedMemoryOpCostAVX512(
        Opcode, VecTy, Factor, Indices, Alignment,
        AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);

  // The pre-AVX-512 tables below do not model masked accesses.
  if (UseMaskForCond || UseMaskForGaps)
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace, CostKind,
                                             UseMaskForCond, UseMaskForGaps);

  // Get estimation for interleaved load/store operations for SSE-AVX2.
  // As opposed to AVX-512, SSE-AVX2 do not have generic shuffles that allow
  // computing the cost using a generic formula as a function of generic
  // shuffles. We therefore use a lookup table instead, filled according to
  // the instruction sequences that codegen currently generates.

  // VecTy for interleave memop is <VF*Factor x Elt>.
  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
  // VecTy = <12 x i32>.
  MVT LegalVT = getTypeLegalizationCost(VecTy).second;

  // This function can be called with VecTy=<6xi128>, Factor=3, in which case
  // the VF=2, while v2i128 is an unsupported MVT vector type
  // (see MachineValueType.h::getVectorVT()).
  if (!LegalVT.isVector())
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace, CostKind);

  unsigned VF = VecTy->getNumElements() / Factor;
  Type *ScalarTy = VecTy->getElementType();
  // Deduplicate entries, model floats/pointers as appropriately-sized integers.
  if (!ScalarTy->isIntegerTy())
    ScalarTy =
        Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy));

  // Get the cost of all the memory operations.
  // FIXME: discount dead loads.
  InstructionCost MemOpCosts = getMemoryOpCost(
      Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, CostKind);

  // VT is the per-member vector type used as the table lookup key.
  auto *VT = FixedVectorType::get(ScalarTy, VF);
  EVT ETy = TLI->getValueType(DL, VT);
  if (!ETy.isSimple())
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace, CostKind);

  // TODO: Complete for other data-types and strides.
  // Each combination of Stride, element bit width and VF results in a different
  // sequence; The cost tables are therefore accessed with:
  // Factor (stride) and VectorType=VFxiN.
  // The Cost accounts only for the shuffle sequence;
  // The cost of the loads/stores is accounted for separately.
  //
  static const CostTblEntry AVX2InterleavedLoadTbl[] = {
      {2, MVT::v2i8, 2},  // (load 4i8 and) deinterleave into 2 x 2i8
      {2, MVT::v4i8, 2},  // (load 8i8 and) deinterleave into 2 x 4i8
      {2, MVT::v8i8, 2},  // (load 16i8 and) deinterleave into 2 x 8i8
      {2, MVT::v16i8, 4}, // (load 32i8 and) deinterleave into 2 x 16i8
      {2, MVT::v32i8, 6}, // (load 64i8 and) deinterleave into 2 x 32i8

      {2, MVT::v8i16, 6},   // (load 16i16 and) deinterleave into 2 x 8i16
      {2, MVT::v16i16, 9},  // (load 32i16 and) deinterleave into 2 x 16i16
      {2, MVT::v32i16, 18}, // (load 64i16 and) deinterleave into 2 x 32i16

      {2, MVT::v8i32, 4},   // (load 16i32 and) deinterleave into 2 x 8i32
      {2, MVT::v16i32, 8},  // (load 32i32 and) deinterleave into 2 x 16i32
      {2, MVT::v32i32, 16}, // (load 64i32 and) deinterleave into 2 x 32i32

      {2, MVT::v4i64, 4},   // (load 8i64 and) deinterleave into 2 x 4i64
      {2, MVT::v8i64, 8},   // (load 16i64 and) deinterleave into 2 x 8i64
      {2, MVT::v16i64, 16}, // (load 32i64 and) deinterleave into 2 x 16i64
      {2, MVT::v32i64, 32}, // (load 64i64 and) deinterleave into 2 x 32i64

      {3, MVT::v2i8, 3},   // (load 6i8 and) deinterleave into 3 x 2i8
      {3, MVT::v4i8, 3},   // (load 12i8 and) deinterleave into 3 x 4i8
      {3, MVT::v8i8, 6},   // (load 24i8 and) deinterleave into 3 x 8i8
      {3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8
      {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8

      {3, MVT::v2i16, 5},   // (load 6i16 and) deinterleave into 3 x 2i16
      {3, MVT::v4i16, 7},   // (load 12i16 and) deinterleave into 3 x 4i16
      {3, MVT::v8i16, 9},   // (load 24i16 and) deinterleave into 3 x 8i16
      {3, MVT::v16i16, 28}, // (load 48i16 and) deinterleave into 3 x 16i16
      {3, MVT::v32i16, 56}, // (load 96i16 and) deinterleave into 3 x 32i16

      {3, MVT::v2i32, 3},   // (load 6i32 and) deinterleave into 3 x 2i32
      {3, MVT::v4i32, 3},   // (load 12i32 and) deinterleave into 3 x 4i32
      {3, MVT::v8i32, 7},   // (load 24i32 and) deinterleave into 3 x 8i32
      {3, MVT::v16i32, 14}, // (load 48i32 and) deinterleave into 3 x 16i32
      {3, MVT::v32i32, 32}, // (load 96i32 and) deinterleave into 3 x 32i32

      {3, MVT::v2i64, 1},   // (load 6i64 and) deinterleave into 3 x 2i64
      {3, MVT::v4i64, 5},   // (load 12i64 and) deinterleave into 3 x 4i64
      {3, MVT::v8i64, 10},  // (load 24i64 and) deinterleave into 3 x 8i64
      {3, MVT::v16i64, 20}, // (load 48i64 and) deinterleave into 3 x 16i64

      {4, MVT::v2i8, 4},   // (load 8i8 and) deinterleave into 4 x 2i8
      {4, MVT::v4i8, 4},   // (load 16i8 and) deinterleave into 4 x 4i8
      {4, MVT::v8i8, 12},  // (load 32i8 and) deinterleave into 4 x 8i8
      {4, MVT::v16i8, 24}, // (load 64i8 and) deinterleave into 4 x 16i8
      {4, MVT::v32i8, 56}, // (load 128i8 and) deinterleave into 4 x 32i8

      {4, MVT::v2i16, 6},    // (load 8i16 and) deinterleave into 4 x 2i16
      {4, MVT::v4i16, 17},   // (load 16i16 and) deinterleave into 4 x 4i16
      {4, MVT::v8i16, 33},   // (load 32i16 and) deinterleave into 4 x 8i16
      {4, MVT::v16i16, 75},  // (load 64i16 and) deinterleave into 4 x 16i16
      {4, MVT::v32i16, 150}, // (load 128i16 and) deinterleave into 4 x 32i16

      {4, MVT::v2i32, 4},   // (load 8i32 and) deinterleave into 4 x 2i32
      {4, MVT::v4i32, 8},   // (load 16i32 and) deinterleave into 4 x 4i32
      {4, MVT::v8i32, 16},  // (load 32i32 and) deinterleave into 4 x 8i32
      {4, MVT::v16i32, 32}, // (load 64i32 and) deinterleave into 4 x 16i32
      {4, MVT::v32i32, 68}, // (load 128i32 and) deinterleave into 4 x 32i32

      {4, MVT::v2i64, 6},   // (load 8i64 and) deinterleave into 4 x 2i64
      {4, MVT::v4i64, 8},   // (load 16i64 and) deinterleave into 4 x 4i64
      {4, MVT::v8i64, 20},  // (load 32i64 and) deinterleave into 4 x 8i64
      {4, MVT::v16i64, 40}, // (load 64i64 and) deinterleave into 4 x 16i64

      {6, MVT::v2i8, 6},   // (load 12i8 and) deinterleave into 6 x 2i8
      {6, MVT::v4i8, 14},  // (load 24i8 and) deinterleave into 6 x 4i8
      {6, MVT::v8i8, 18},  // (load 48i8 and) deinterleave into 6 x 8i8
      {6, MVT::v16i8, 43}, // (load 96i8 and) deinterleave into 6 x 16i8
      {6, MVT::v32i8, 82}, // (load 192i8 and) deinterleave into 6 x 32i8

      {6, MVT::v2i16, 13},   // (load 12i16 and) deinterleave into 6 x 2i16
      {6, MVT::v4i16, 9},    // (load 24i16 and) deinterleave into 6 x 4i16
      {6, MVT::v8i16, 39},   // (load 48i16 and) deinterleave into 6 x 8i16
      {6, MVT::v16i16, 106}, // (load 96i16 and) deinterleave into 6 x 16i16
      {6, MVT::v32i16, 212}, // (load 192i16 and) deinterleave into 6 x 32i16

      {6, MVT::v2i32, 6},   // (load 12i32 and) deinterleave into 6 x 2i32
      {6, MVT::v4i32, 15},  // (load 24i32 and) deinterleave into 6 x 4i32
      {6, MVT::v8i32, 31},  // (load 48i32 and) deinterleave into 6 x 8i32
      {6, MVT::v16i32, 64}, // (load 96i32 and) deinterleave into 6 x 16i32

      {6, MVT::v2i64, 6},  // (load 12i64 and) deinterleave into 6 x 2i64
      {6, MVT::v4i64, 18}, // (load 24i64 and) deinterleave into 6 x 4i64
      {6, MVT::v8i64, 36}, // (load 48i64 and) deinterleave into 6 x 8i64

      {8, MVT::v8i32, 40} // (load 64i32 and) deinterleave into 8 x 8i32
  };

  static const CostTblEntry SSSE3InterleavedLoadTbl[] = {
      {2, MVT::v4i16, 2}, // (load 8i16 and) deinterleave into 2 x 4i16
  };

  static const CostTblEntry SSE2InterleavedLoadTbl[] = {
      {2, MVT::v2i16, 2}, // (load 4i16 and) deinterleave into 2 x 2i16
      {2, MVT::v4i16, 7}, // (load 8i16 and) deinterleave into 2 x 4i16

      {2, MVT::v2i32, 2}, // (load 4i32 and) deinterleave into 2 x 2i32
      {2, MVT::v4i32, 2}, // (load 8i32 and) deinterleave into 2 x 4i32

      {2, MVT::v2i64, 2}, // (load 4i64 and) deinterleave into 2 x 2i64
  };

  static const CostTblEntry AVX2InterleavedStoreTbl[] = {
      {2, MVT::v16i8, 3}, // interleave 2 x 16i8 into 32i8 (and store)
      {2, MVT::v32i8, 4}, // interleave 2 x 32i8 into 64i8 (and store)

      {2, MVT::v8i16, 3},  // interleave 2 x 8i16 into 16i16 (and store)
      {2, MVT::v16i16, 4}, // interleave 2 x 16i16 into 32i16 (and store)
      {2, MVT::v32i16, 8}, // interleave 2 x 32i16 into 64i16 (and store)

      {2, MVT::v4i32, 2},   // interleave 2 x 4i32 into 8i32 (and store)
      {2, MVT::v8i32, 4},   // interleave 2 x 8i32 into 16i32 (and store)
      {2, MVT::v16i32, 8},  // interleave 2 x 16i32 into 32i32 (and store)
      {2, MVT::v32i32, 16}, // interleave 2 x 32i32 into 64i32 (and store)

      {2, MVT::v2i64, 2},   // interleave 2 x 2i64 into 4i64 (and store)
      {2, MVT::v4i64, 4},   // interleave 2 x 4i64 into 8i64 (and store)
      {2, MVT::v8i64, 8},   // interleave 2 x 8i64 into 16i64 (and store)
      {2, MVT::v16i64, 16}, // interleave 2 x 16i64 into 32i64 (and store)
      {2, MVT::v32i64, 32}, // interleave 2 x 32i64 into 64i64 (and store)

      {3, MVT::v2i8, 4},   // interleave 3 x 2i8 into 6i8 (and store)
      {3, MVT::v4i8, 4},   // interleave 3 x 4i8 into 12i8 (and store)
      {3, MVT::v8i8, 6},   // interleave 3 x 8i8 into 24i8 (and store)
      {3, MVT::v16i8, 11}, // interleave 3 x 16i8 into 48i8 (and store)
      {3, MVT::v32i8, 13}, // interleave 3 x 32i8 into 96i8 (and store)

      {3, MVT::v2i16, 4},   // interleave 3 x 2i16 into 6i16 (and store)
      {3, MVT::v4i16, 6},   // interleave 3 x 4i16 into 12i16 (and store)
      {3, MVT::v8i16, 12},  // interleave 3 x 8i16 into 24i16 (and store)
      {3, MVT::v16i16, 27}, // interleave 3 x 16i16 into 48i16 (and store)
      {3, MVT::v32i16, 54}, // interleave 3 x 32i16 into 96i16 (and store)

      {3, MVT::v2i32, 4},   // interleave 3 x 2i32 into 6i32 (and store)
      {3, MVT::v4i32, 5},   // interleave 3 x 4i32 into 12i32 (and store)
      {3, MVT::v8i32, 11},  // interleave 3 x 8i32 into 24i32 (and store)
      {3, MVT::v16i32, 22}, // interleave 3 x 16i32 into 48i32 (and store)
      {3, MVT::v32i32, 48}, // interleave 3 x 32i32 into 96i32 (and store)

      {3, MVT::v2i64, 4},   // interleave 3 x 2i64 into 6i64 (and store)
      {3, MVT::v4i64, 6},   // interleave 3 x 4i64 into 12i64 (and store)
      {3, MVT::v8i64, 12},  // interleave 3 x 8i64 into 24i64 (and store)
      {3, MVT::v16i64, 24}, // interleave 3 x 16i64 into 48i64 (and store)

      {4, MVT::v2i8, 4},   // interleave 4 x 2i8 into 8i8 (and store)
      {4, MVT::v4i8, 4},   // interleave 4 x 4i8 into 16i8 (and store)
      {4, MVT::v8i8, 4},   // interleave 4 x 8i8 into 32i8 (and store)
      {4, MVT::v16i8, 8},  // interleave 4 x 16i8 into 64i8 (and store)
      {4, MVT::v32i8, 12}, // interleave 4 x 32i8 into 128i8 (and store)

      {4, MVT::v2i16, 2},   // interleave 4 x 2i16 into 8i16 (and store)
      {4, MVT::v4i16, 6},   // interleave 4 x 4i16 into 16i16 (and store)
      {4, MVT::v8i16, 10},  // interleave 4 x 8i16 into 32i16 (and store)
      {4, MVT::v16i16, 32}, // interleave 4 x 16i16 into 64i16 (and store)
      {4, MVT::v32i16, 64}, // interleave 4 x 32i16 into 128i16 (and store)

      {4, MVT::v2i32, 5},   // interleave 4 x 2i32 into 8i32 (and store)
      {4, MVT::v4i32, 6},   // interleave 4 x 4i32 into 16i32 (and store)
      {4, MVT::v8i32, 16},  // interleave 4 x 8i32 into 32i32 (and store)
      {4, MVT::v16i32, 32}, // interleave 4 x 16i32 into 64i32 (and store)
      {4, MVT::v32i32, 64}, // interleave 4 x 32i32 into 128i32 (and store)

      {4, MVT::v2i64, 6},   // interleave 4 x 2i64 into 8i64 (and store)
      {4, MVT::v4i64, 8},   // interleave 4 x 4i64 into 16i64 (and store)
      {4, MVT::v8i64, 20},  // interleave 4 x 8i64 into 32i64 (and store)
      {4, MVT::v16i64, 40}, // interleave 4 x 16i64 into 64i64 (and store)

      {6, MVT::v2i8, 7},   // interleave 6 x 2i8 into 12i8 (and store)
      {6, MVT::v4i8, 9},   // interleave 6 x 4i8 into 24i8 (and store)
      {6, MVT::v8i8, 16},  // interleave 6 x 8i8 into 48i8 (and store)
      {6, MVT::v16i8, 27}, // interleave 6 x 16i8 into 96i8 (and store)
      {6, MVT::v32i8, 90}, // interleave 6 x 32i8 into 192i8 (and store)

      {6, MVT::v2i16, 10},  // interleave 6 x 2i16 into 12i16 (and store)
      {6, MVT::v4i16, 15},  // interleave 6 x 4i16 into 24i16 (and store)
      {6, MVT::v8i16, 21},  // interleave 6 x 8i16 into 48i16 (and store)
      {6, MVT::v16i16, 58}, // interleave 6 x 16i16 into 96i16 (and store)
      {6, MVT::v32i16, 90}, // interleave 6 x 32i16 into 192i16 (and store)

      {6, MVT::v2i32, 9},   // interleave 6 x 2i32 into 12i32 (and store)
      {6, MVT::v4i32, 12},  // interleave 6 x 4i32 into 24i32 (and store)
      {6, MVT::v8i32, 33},  // interleave 6 x 8i32 into 48i32 (and store)
      {6, MVT::v16i32, 66}, // interleave 6 x 16i32 into 96i32 (and store)

      {6, MVT::v2i64, 8},  // interleave 6 x 2i64 into 12i64 (and store)
      {6, MVT::v4i64, 15}, // interleave 6 x 4i64 into 24i64 (and store)
      {6, MVT::v8i64, 30}, // interleave 6 x 8i64 into 48i64 (and store)
  };

  static const CostTblEntry SSE2InterleavedStoreTbl[] = {
      {2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8 (and store)
      {2, MVT::v4i8, 1}, // interleave 2 x 4i8 into 8i8 (and store)
      {2, MVT::v8i8, 1}, // interleave 2 x 8i8 into 16i8 (and store)

      {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16 (and store)
      {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16 (and store)

      {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32 (and store)
  };

  if (Opcode == Instruction::Load) {
    // Scale the table cost by the fraction of members actually requested.
    auto GetDiscountedCost = [Factor, NumMembers = Indices.size(),
                              MemOpCosts](const CostTblEntry *Entry) {
      // NOTE: this is just an approximation!
      // It can over/under -estimate the cost!
      return MemOpCosts + divideCeil(NumMembers * Entry->Cost, Factor);
    };

    if (ST->hasAVX2())
      if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor,
                                              ETy.getSimpleVT()))
        return GetDiscountedCost(Entry);

    if (ST->hasSSSE3())
      if (const auto *Entry = CostTableLookup(SSSE3InterleavedLoadTbl, Factor,
                                              ETy.getSimpleVT()))
        return GetDiscountedCost(Entry);

    if (ST->hasSSE2())
      if (const auto *Entry = CostTableLookup(SSE2InterleavedLoadTbl, Factor,
                                              ETy.getSimpleVT()))
        return GetDiscountedCost(Entry);
  } else {
    assert(Opcode == Instruction::Store &&(static_cast <bool> (Opcode == Instruction::Store && "Expected Store Instruction at this point") ? void (0) : __assert_fail ("Opcode == Instruction::Store && \"Expected Store Instruction at this point\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 6629, __extension__ __PRETTY_FUNCTION__))
           "Expected Store Instruction at this point")(static_cast <bool> (Opcode == Instruction::Store && "Expected Store Instruction at this point") ? void (0) : __assert_fail ("Opcode == Instruction::Store && \"Expected Store Instruction at this point\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 6629, __extension__ __PRETTY_FUNCTION__));
    assert((!Indices.size() || Indices.size() == Factor) &&(static_cast <bool> ((!Indices.size() || Indices.size() == Factor) && "Interleaved store only supports fully-interleaved groups." ) ? void (0) : __assert_fail ("(!Indices.size() || Indices.size() == Factor) && \"Interleaved store only supports fully-interleaved groups.\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 6631, __extension__ __PRETTY_FUNCTION__))
           "Interleaved store only supports fully-interleaved groups.")(static_cast <bool> ((!Indices.size() || Indices.size() == Factor) && "Interleaved store only supports fully-interleaved groups." ) ? void (0) : __assert_fail ("(!Indices.size() || Indices.size() == Factor) && \"Interleaved store only supports fully-interleaved groups.\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 6631, __extension__ __PRETTY_FUNCTION__));
    if (ST->hasAVX2())
      if (const auto *Entry = CostTableLookup(AVX2InterleavedStoreTbl, Factor,
                                              ETy.getSimpleVT()))
        return MemOpCosts + Entry->Cost;

    if (ST->hasSSE2())
      if (const auto *Entry = CostTableLookup(SSE2InterleavedStoreTbl, Factor,
                                              ETy.getSimpleVT()))
        return MemOpCosts + Entry->Cost;
  }

  // No table entry matched; use the generic scalarization-based estimate.
  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                           Alignment, AddressSpace, CostKind,
                                           UseMaskForCond, UseMaskForGaps);
}
6647 | |||||
/// Return the cost of materializing the given addressing mode on X86:
/// 0 if the mode is legal and uses only a base register, 1 if it is legal
/// but needs a second (index/scale) register, and -1 if it is not legal.
InstructionCost X86TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
                                                 int64_t BaseOffset,
                                                 bool HasBaseReg, int64_t Scale,
                                                 unsigned AddrSpace) const {
  // Scaling factors are not free at all.
  // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
  // will take 2 allocations in the out of order engine instead of 1
  // for plain addressing mode, i.e. inst (reg1).
  // E.g.,
  // vaddps (%rsi,%rdx), %ymm0, %ymm1
  // Requires two allocations (one for the load, one for the computation)
  // whereas:
  // vaddps (%rsi), %ymm0, %ymm1
  // Requires just 1 allocation, i.e., freeing allocations for other operations
  // and having less micro operations to execute.
  //
  // For some X86 architectures, this is even worse because for instance for
  // stores, the complex addressing mode forces the instruction to use the
  // "load" ports instead of the dedicated "store" port.
  // E.g., on Haswell:
  // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
  // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
  TargetLoweringBase::AddrMode AM;
  AM.BaseGV = BaseGV;
  AM.BaseOffs = BaseOffset;
  AM.HasBaseReg = HasBaseReg;
  AM.Scale = Scale;
  if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
    // Scale represents reg2 * scale, thus account for 1
    // as soon as we use a second register.
    return AM.Scale != 0;
  // Not a legal addressing mode at all.
  return -1;
}
1 | //===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===// | ||||||||
2 | // | ||||||||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||||||||
4 | // See https://llvm.org/LICENSE.txt for license information. | ||||||||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||||||||
6 | // | ||||||||
7 | //===----------------------------------------------------------------------===// | ||||||||
8 | // | ||||||||
9 | /// \file | ||||||||
10 | /// This file provides a helper that implements much of the TTI interface in | ||||||||
11 | /// terms of the target-independent code generator and TargetLowering | ||||||||
12 | /// interfaces. | ||||||||
13 | // | ||||||||
14 | //===----------------------------------------------------------------------===// | ||||||||
15 | |||||||||
16 | #ifndef LLVM_CODEGEN_BASICTTIIMPL_H | ||||||||
17 | #define LLVM_CODEGEN_BASICTTIIMPL_H | ||||||||
18 | |||||||||
19 | #include "llvm/ADT/APInt.h" | ||||||||
20 | #include "llvm/ADT/ArrayRef.h" | ||||||||
21 | #include "llvm/ADT/BitVector.h" | ||||||||
22 | #include "llvm/ADT/SmallPtrSet.h" | ||||||||
23 | #include "llvm/ADT/SmallVector.h" | ||||||||
24 | #include "llvm/Analysis/LoopInfo.h" | ||||||||
25 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" | ||||||||
26 | #include "llvm/Analysis/TargetTransformInfo.h" | ||||||||
27 | #include "llvm/Analysis/TargetTransformInfoImpl.h" | ||||||||
28 | #include "llvm/CodeGen/ISDOpcodes.h" | ||||||||
29 | #include "llvm/CodeGen/TargetLowering.h" | ||||||||
30 | #include "llvm/CodeGen/TargetSubtargetInfo.h" | ||||||||
31 | #include "llvm/CodeGen/ValueTypes.h" | ||||||||
32 | #include "llvm/IR/BasicBlock.h" | ||||||||
33 | #include "llvm/IR/Constant.h" | ||||||||
34 | #include "llvm/IR/Constants.h" | ||||||||
35 | #include "llvm/IR/DataLayout.h" | ||||||||
36 | #include "llvm/IR/DerivedTypes.h" | ||||||||
37 | #include "llvm/IR/InstrTypes.h" | ||||||||
38 | #include "llvm/IR/Instruction.h" | ||||||||
39 | #include "llvm/IR/Instructions.h" | ||||||||
40 | #include "llvm/IR/Intrinsics.h" | ||||||||
41 | #include "llvm/IR/Operator.h" | ||||||||
42 | #include "llvm/IR/Type.h" | ||||||||
43 | #include "llvm/IR/Value.h" | ||||||||
44 | #include "llvm/Support/Casting.h" | ||||||||
45 | #include "llvm/Support/CommandLine.h" | ||||||||
46 | #include "llvm/Support/ErrorHandling.h" | ||||||||
47 | #include "llvm/Support/MachineValueType.h" | ||||||||
48 | #include "llvm/Support/MathExtras.h" | ||||||||
49 | #include "llvm/Target/TargetMachine.h" | ||||||||
50 | #include "llvm/Target/TargetOptions.h" | ||||||||
51 | #include <algorithm> | ||||||||
52 | #include <cassert> | ||||||||
53 | #include <cstdint> | ||||||||
54 | #include <limits> | ||||||||
55 | #include <optional> | ||||||||
56 | #include <utility> | ||||||||
57 | |||||||||
58 | namespace llvm { | ||||||||
59 | |||||||||
60 | class Function; | ||||||||
61 | class GlobalValue; | ||||||||
62 | class LLVMContext; | ||||||||
63 | class ScalarEvolution; | ||||||||
64 | class SCEV; | ||||||||
65 | class TargetMachine; | ||||||||
66 | |||||||||
67 | extern cl::opt<unsigned> PartialUnrollingThreshold; | ||||||||
68 | |||||||||
69 | /// Base class which can be used to help build a TTI implementation. | ||||||||
70 | /// | ||||||||
71 | /// This class provides as much implementation of the TTI interface as is | ||||||||
72 | /// possible using the target independent parts of the code generator. | ||||||||
73 | /// | ||||||||
74 | /// In order to subclass it, your class must implement a getST() method to | ||||||||
75 | /// return the subtarget, and a getTLI() method to return the target lowering. | ||||||||
76 | /// We need these methods implemented in the derived class so that this class | ||||||||
77 | /// doesn't have to duplicate storage for them. | ||||||||
78 | template <typename T> | ||||||||
79 | class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { | ||||||||
80 | private: | ||||||||
81 | using BaseT = TargetTransformInfoImplCRTPBase<T>; | ||||||||
82 | using TTI = TargetTransformInfo; | ||||||||
83 | |||||||||
84 | /// Helper function to access this as a T. | ||||||||
85 | T *thisT() { return static_cast<T *>(this); } | ||||||||
86 | |||||||||
87 | /// Estimate a cost of Broadcast as an extract and sequence of insert | ||||||||
88 | /// operations. | ||||||||
89 | InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy, | ||||||||
90 | TTI::TargetCostKind CostKind) { | ||||||||
91 | InstructionCost Cost = 0; | ||||||||
92 | // Broadcast cost is equal to the cost of extracting the zero'th element | ||||||||
93 | // plus the cost of inserting it into every element of the result vector. | ||||||||
94 | Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, | ||||||||
95 | CostKind, 0, nullptr, nullptr); | ||||||||
96 | |||||||||
97 | for (int i = 0, e = VTy->getNumElements(); i < e; ++i) { | ||||||||
98 | Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, | ||||||||
99 | CostKind, i, nullptr, nullptr); | ||||||||
100 | } | ||||||||
101 | return Cost; | ||||||||
102 | } | ||||||||
103 | |||||||||
104 | /// Estimate a cost of shuffle as a sequence of extract and insert | ||||||||
105 | /// operations. | ||||||||
106 | InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy, | ||||||||
107 | TTI::TargetCostKind CostKind) { | ||||||||
108 | InstructionCost Cost = 0; | ||||||||
109 | // Shuffle cost is equal to the cost of extracting element from its argument | ||||||||
110 | // plus the cost of inserting them onto the result vector. | ||||||||
111 | |||||||||
112 | // e.g. <4 x float> has a mask of <0,5,2,7> i.e we need to extract from | ||||||||
113 | // index 0 of first vector, index 1 of second vector,index 2 of first | ||||||||
114 | // vector and finally index 3 of second vector and insert them at index | ||||||||
115 | // <0,1,2,3> of result vector. | ||||||||
116 | for (int i = 0, e = VTy->getNumElements(); i < e; ++i) { | ||||||||
117 | Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, | ||||||||
118 | CostKind, i, nullptr, nullptr); | ||||||||
119 | Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, | ||||||||
120 | CostKind, i, nullptr, nullptr); | ||||||||
121 | } | ||||||||
122 | return Cost; | ||||||||
123 | } | ||||||||
124 | |||||||||
125 | /// Estimate a cost of subvector extraction as a sequence of extract and | ||||||||
126 | /// insert operations. | ||||||||
127 | InstructionCost getExtractSubvectorOverhead(VectorType *VTy, | ||||||||
128 | TTI::TargetCostKind CostKind, | ||||||||
129 | int Index, | ||||||||
130 | FixedVectorType *SubVTy) { | ||||||||
131 | assert(VTy && SubVTy &&(static_cast <bool> (VTy && SubVTy && "Can only extract subvectors from vectors" ) ? void (0) : __assert_fail ("VTy && SubVTy && \"Can only extract subvectors from vectors\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 132, __extension__ __PRETTY_FUNCTION__)) | ||||||||
132 | "Can only extract subvectors from vectors")(static_cast <bool> (VTy && SubVTy && "Can only extract subvectors from vectors" ) ? void (0) : __assert_fail ("VTy && SubVTy && \"Can only extract subvectors from vectors\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 132, __extension__ __PRETTY_FUNCTION__)); | ||||||||
133 | int NumSubElts = SubVTy->getNumElements(); | ||||||||
134 | assert((!isa<FixedVectorType>(VTy) ||(static_cast <bool> ((!isa<FixedVectorType>(VTy) || (Index + NumSubElts) <= (int)cast<FixedVectorType>( VTy)->getNumElements()) && "SK_ExtractSubvector index out of range" ) ? void (0) : __assert_fail ("(!isa<FixedVectorType>(VTy) || (Index + NumSubElts) <= (int)cast<FixedVectorType>(VTy)->getNumElements()) && \"SK_ExtractSubvector index out of range\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 137, __extension__ __PRETTY_FUNCTION__)) | ||||||||
135 | (Index + NumSubElts) <=(static_cast <bool> ((!isa<FixedVectorType>(VTy) || (Index + NumSubElts) <= (int)cast<FixedVectorType>( VTy)->getNumElements()) && "SK_ExtractSubvector index out of range" ) ? void (0) : __assert_fail ("(!isa<FixedVectorType>(VTy) || (Index + NumSubElts) <= (int)cast<FixedVectorType>(VTy)->getNumElements()) && \"SK_ExtractSubvector index out of range\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 137, __extension__ __PRETTY_FUNCTION__)) | ||||||||
136 | (int)cast<FixedVectorType>(VTy)->getNumElements()) &&(static_cast <bool> ((!isa<FixedVectorType>(VTy) || (Index + NumSubElts) <= (int)cast<FixedVectorType>( VTy)->getNumElements()) && "SK_ExtractSubvector index out of range" ) ? void (0) : __assert_fail ("(!isa<FixedVectorType>(VTy) || (Index + NumSubElts) <= (int)cast<FixedVectorType>(VTy)->getNumElements()) && \"SK_ExtractSubvector index out of range\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 137, __extension__ __PRETTY_FUNCTION__)) | ||||||||
137 | "SK_ExtractSubvector index out of range")(static_cast <bool> ((!isa<FixedVectorType>(VTy) || (Index + NumSubElts) <= (int)cast<FixedVectorType>( VTy)->getNumElements()) && "SK_ExtractSubvector index out of range" ) ? void (0) : __assert_fail ("(!isa<FixedVectorType>(VTy) || (Index + NumSubElts) <= (int)cast<FixedVectorType>(VTy)->getNumElements()) && \"SK_ExtractSubvector index out of range\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 137, __extension__ __PRETTY_FUNCTION__)); | ||||||||
138 | |||||||||
139 | InstructionCost Cost = 0; | ||||||||
140 | // Subvector extraction cost is equal to the cost of extracting element from | ||||||||
141 | // the source type plus the cost of inserting them into the result vector | ||||||||
142 | // type. | ||||||||
143 | for (int i = 0; i != NumSubElts; ++i) { | ||||||||
144 | Cost += | ||||||||
145 | thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, | ||||||||
146 | CostKind, i + Index, nullptr, nullptr); | ||||||||
147 | Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy, | ||||||||
148 | CostKind, i, nullptr, nullptr); | ||||||||
149 | } | ||||||||
150 | return Cost; | ||||||||
151 | } | ||||||||
152 | |||||||||
153 | /// Estimate a cost of subvector insertion as a sequence of extract and | ||||||||
154 | /// insert operations. | ||||||||
155 | InstructionCost getInsertSubvectorOverhead(VectorType *VTy, | ||||||||
156 | TTI::TargetCostKind CostKind, | ||||||||
157 | int Index, | ||||||||
158 | FixedVectorType *SubVTy) { | ||||||||
159 | assert(VTy && SubVTy &&(static_cast <bool> (VTy && SubVTy && "Can only insert subvectors into vectors" ) ? void (0) : __assert_fail ("VTy && SubVTy && \"Can only insert subvectors into vectors\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 160, __extension__ __PRETTY_FUNCTION__)) | ||||||||
160 | "Can only insert subvectors into vectors")(static_cast <bool> (VTy && SubVTy && "Can only insert subvectors into vectors" ) ? void (0) : __assert_fail ("VTy && SubVTy && \"Can only insert subvectors into vectors\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 160, __extension__ __PRETTY_FUNCTION__)); | ||||||||
161 | int NumSubElts = SubVTy->getNumElements(); | ||||||||
162 | assert((!isa<FixedVectorType>(VTy) ||(static_cast <bool> ((!isa<FixedVectorType>(VTy) || (Index + NumSubElts) <= (int)cast<FixedVectorType>( VTy)->getNumElements()) && "SK_InsertSubvector index out of range" ) ? void (0) : __assert_fail ("(!isa<FixedVectorType>(VTy) || (Index + NumSubElts) <= (int)cast<FixedVectorType>(VTy)->getNumElements()) && \"SK_InsertSubvector index out of range\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 165, __extension__ __PRETTY_FUNCTION__)) | ||||||||
163 | (Index + NumSubElts) <=(static_cast <bool> ((!isa<FixedVectorType>(VTy) || (Index + NumSubElts) <= (int)cast<FixedVectorType>( VTy)->getNumElements()) && "SK_InsertSubvector index out of range" ) ? void (0) : __assert_fail ("(!isa<FixedVectorType>(VTy) || (Index + NumSubElts) <= (int)cast<FixedVectorType>(VTy)->getNumElements()) && \"SK_InsertSubvector index out of range\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 165, __extension__ __PRETTY_FUNCTION__)) | ||||||||
164 | (int)cast<FixedVectorType>(VTy)->getNumElements()) &&(static_cast <bool> ((!isa<FixedVectorType>(VTy) || (Index + NumSubElts) <= (int)cast<FixedVectorType>( VTy)->getNumElements()) && "SK_InsertSubvector index out of range" ) ? void (0) : __assert_fail ("(!isa<FixedVectorType>(VTy) || (Index + NumSubElts) <= (int)cast<FixedVectorType>(VTy)->getNumElements()) && \"SK_InsertSubvector index out of range\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 165, __extension__ __PRETTY_FUNCTION__)) | ||||||||
165 | "SK_InsertSubvector index out of range")(static_cast <bool> ((!isa<FixedVectorType>(VTy) || (Index + NumSubElts) <= (int)cast<FixedVectorType>( VTy)->getNumElements()) && "SK_InsertSubvector index out of range" ) ? void (0) : __assert_fail ("(!isa<FixedVectorType>(VTy) || (Index + NumSubElts) <= (int)cast<FixedVectorType>(VTy)->getNumElements()) && \"SK_InsertSubvector index out of range\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 165, __extension__ __PRETTY_FUNCTION__)); | ||||||||
166 | |||||||||
167 | InstructionCost Cost = 0; | ||||||||
168 | // Subvector insertion cost is equal to the cost of extracting element from | ||||||||
169 | // the source type plus the cost of inserting them into the result vector | ||||||||
170 | // type. | ||||||||
171 | for (int i = 0; i != NumSubElts; ++i) { | ||||||||
172 | Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy, | ||||||||
173 | CostKind, i, nullptr, nullptr); | ||||||||
174 | Cost += | ||||||||
175 | thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, CostKind, | ||||||||
176 | i + Index, nullptr, nullptr); | ||||||||
177 | } | ||||||||
178 | return Cost; | ||||||||
179 | } | ||||||||
180 | |||||||||
  /// Local query method delegates up to T which *must* implement this!
  // CRTP hook: the derived target TTI owns the subtarget storage.
  const TargetSubtargetInfo *getST() const {
    return static_cast<const T *>(this)->getST();
  }
185 | |||||||||
  /// Local query method delegates up to T which *must* implement this!
  // CRTP hook: the derived target TTI owns the TargetLowering storage.
  const TargetLoweringBase *getTLI() const {
    return static_cast<const T *>(this)->getTLI();
  }
190 | |||||||||
191 | static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) { | ||||||||
192 | switch (M) { | ||||||||
193 | case TTI::MIM_Unindexed: | ||||||||
194 | return ISD::UNINDEXED; | ||||||||
195 | case TTI::MIM_PreInc: | ||||||||
196 | return ISD::PRE_INC; | ||||||||
197 | case TTI::MIM_PreDec: | ||||||||
198 | return ISD::PRE_DEC; | ||||||||
199 | case TTI::MIM_PostInc: | ||||||||
200 | return ISD::POST_INC; | ||||||||
201 | case TTI::MIM_PostDec: | ||||||||
202 | return ISD::POST_DEC; | ||||||||
203 | } | ||||||||
204 | llvm_unreachable("Unexpected MemIndexedMode")::llvm::llvm_unreachable_internal("Unexpected MemIndexedMode" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 204); | ||||||||
205 | } | ||||||||
206 | |||||||||
  /// Shared cost model for masked load/store and gather/scatter, assuming the
  /// target has no native support and must fully scalarize the operation.
  /// \returns Invalid for scalable vectors (they cannot be scalarized).
  InstructionCost getCommonMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
                                              Align Alignment,
                                              bool VariableMask,
                                              bool IsGatherScatter,
                                              TTI::TargetCostKind CostKind) {
    // We cannot scalarize scalable vectors, so return Invalid.
    if (isa<ScalableVectorType>(DataTy))
      return InstructionCost::getInvalid();

    auto *VT = cast<FixedVectorType>(DataTy);
    // Assume the target does not have support for gather/scatter operations
    // and provide a rough estimate.
    //
    // First, compute the cost of the individual memory operations.
    // Gather/scatter additionally pays one extract per lane to obtain that
    // lane's pointer from the vector of addresses.
    InstructionCost AddrExtractCost =
        IsGatherScatter
            ? getVectorInstrCost(Instruction::ExtractElement,
                                 FixedVectorType::get(
                                     PointerType::get(VT->getElementType(), 0),
                                     VT->getNumElements()),
                                 CostKind, -1, nullptr, nullptr)
            : 0;
    // One scalar memory op (plus any address extract) per vector lane.
    InstructionCost LoadCost =
        VT->getNumElements() *
        (AddrExtractCost +
         getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind));

    // Next, compute the cost of packing the result in a vector.
    // Loads insert results into a vector; stores extract the values to write.
    InstructionCost PackingCost =
        getScalarizationOverhead(VT, Opcode != Instruction::Store,
                                 Opcode == Instruction::Store, CostKind);

    InstructionCost ConditionalCost = 0;
    if (VariableMask) {
      // Compute the cost of conditionally executing the memory operations with
      // variable masks. This includes extracting the individual conditions, a
      // branches and PHIs to combine the results.
      // NOTE: Estimating the cost of conditionally executing the memory
      // operations accurately is quite difficult and the current solution
      // provides a very rough estimate only.
      ConditionalCost =
          VT->getNumElements() *
          (getVectorInstrCost(
               Instruction::ExtractElement,
               FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()),
                                    VT->getNumElements()),
               CostKind, -1, nullptr, nullptr) +
           getCFInstrCost(Instruction::Br, CostKind) +
           getCFInstrCost(Instruction::PHI, CostKind));
    }

    return LoadCost + PackingCost + ConditionalCost;
  }
260 | |||||||||
261 | protected: | ||||||||
  // NOTE(review): TM is accepted for signature symmetry with derived target
  // TTI implementations but is not stored here; only DL reaches the base.
  explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
      : BaseT(DL) {}
  virtual ~BasicTTIImplBase() = default;
265 | |||||||||
266 | using TargetTransformInfoImplBase::DL; | ||||||||
267 | |||||||||
268 | public: | ||||||||
269 | /// \name Scalar TTI Implementations | ||||||||
270 | /// @{ | ||||||||
271 | bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth, | ||||||||
272 | unsigned AddressSpace, Align Alignment, | ||||||||
273 | unsigned *Fast) const { | ||||||||
274 | EVT E = EVT::getIntegerVT(Context, BitWidth); | ||||||||
275 | return getTLI()->allowsMisalignedMemoryAccesses( | ||||||||
276 | E, AddressSpace, Alignment, MachineMemOperand::MONone, Fast); | ||||||||
277 | } | ||||||||
278 | |||||||||
  // Generic (CPU-style) targets model no branch divergence.
  bool hasBranchDivergence() { return false; }

  // No GPU-specific divergence analysis by default.
  bool useGPUDivergenceAnalysis() { return false; }

  // With no divergence model, no value is a source of divergence.
  bool isSourceOfDivergence(const Value *V) { return false; }

  // Conservatively, nothing is known to be uniform either.
  bool isAlwaysUniform(const Value *V) { return false; }
286 | |||||||||
  // Targets without a flat address space report an invalid one.
  unsigned getFlatAddressSpace() {
    // Return an invalid address space.
    return -1;
  }
291 | |||||||||
  // No intrinsics with rewritable flat-address operands by default.
  bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                  Intrinsic::ID IID) const {
    return false;
  }
296 | |||||||||
  // Delegate the no-op addrspacecast query to the target machine.
  bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
    return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS);
  }
300 | |||||||||
  // Delegate the assumed-address-space query to the target machine.
  unsigned getAssumedAddrSpace(const Value *V) const {
    return getTLI()->getTargetMachine().getAssumedAddrSpace(V);
  }
304 | |||||||||
  // True when the module is compiled under the single-thread model.
  bool isSingleThreaded() const {
    return getTLI()->getTargetMachine().Options.ThreadModel ==
           ThreadModel::Single;
  }
309 | |||||||||
  // Delegate the predicated-address-space query to the target machine.
  std::pair<const Value *, unsigned>
  getPredicatedAddrSpace(const Value *V) const {
    return getTLI()->getTargetMachine().getPredicatedAddrSpace(V);
  }
314 | |||||||||
  // No address-space rewriting of intrinsics by default (nullptr = no change).
  Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
                                          Value *NewV) const {
    return nullptr;
  }
319 | |||||||||
  // Delegate add-immediate legality to the target lowering.
  bool isLegalAddImmediate(int64_t imm) {
    return getTLI()->isLegalAddImmediate(imm);
  }
323 | |||||||||
  // Delegate icmp-immediate legality to the target lowering.
  bool isLegalICmpImmediate(int64_t imm) {
    return getTLI()->isLegalICmpImmediate(imm);
  }
327 | |||||||||
328 | bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, | ||||||||
329 | bool HasBaseReg, int64_t Scale, | ||||||||
330 | unsigned AddrSpace, Instruction *I = nullptr) { | ||||||||
331 | TargetLoweringBase::AddrMode AM; | ||||||||
332 | AM.BaseGV = BaseGV; | ||||||||
333 | AM.BaseOffs = BaseOffset; | ||||||||
334 | AM.HasBaseReg = HasBaseReg; | ||||||||
335 | AM.Scale = Scale; | ||||||||
336 | return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I); | ||||||||
337 | } | ||||||||
338 | |||||||||
  /// Starting from \p VF, repeatedly halve the vectorization factor while the
  /// target can still store the *half-width* vector (legal/custom store, or a
  /// legal truncating store from the value type). Returns the smallest VF
  /// reached; note the lambda deliberately tests VF / 2, so the loop stops
  /// one halving before an unsupported width.
  unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
                             Type *ScalarValTy) const {
    auto &&IsSupportedByTarget = [this, ScalarMemTy, ScalarValTy](unsigned VF) {
      auto *SrcTy = FixedVectorType::get(ScalarMemTy, VF / 2);
      EVT VT = getTLI()->getValueType(DL, SrcTy);
      if (getTLI()->isOperationLegal(ISD::STORE, VT) ||
          getTLI()->isOperationCustom(ISD::STORE, VT))
        return true;

      // Otherwise check whether a truncating store of the legalized memory
      // type from the value type is available.
      EVT ValVT =
          getTLI()->getValueType(DL, FixedVectorType::get(ScalarValTy, VF / 2));
      EVT LegalizedVT =
          getTLI()->getTypeToTransformTo(ScalarMemTy->getContext(), VT);
      return getTLI()->isTruncStoreLegal(LegalizedVT, ValVT);
    };
    while (VF > 2 && IsSupportedByTarget(VF))
      VF /= 2;
    return VF;
  }
358 | |||||||||
  // Delegate indexed-load legality to the target lowering after translating
  // the TTI indexing mode to the ISD equivalent.
  bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty,
                          const DataLayout &DL) const {
    EVT VT = getTLI()->getValueType(DL, Ty);
    return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT);
  }
364 | |||||||||
  // Delegate indexed-store legality to the target lowering after translating
  // the TTI indexing mode to the ISD equivalent.
  bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty,
                           const DataLayout &DL) const {
    EVT VT = getTLI()->getValueType(DL, Ty);
    return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT);
  }
370 | |||||||||
  // Use the target-independent LSR cost comparison.
  bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2) {
    return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
  }
374 | |||||||||
  // Use the target-independent default for LSR register-pressure weighting.
  bool isNumRegsMajorCostOfLSR() {
    return TargetTransformInfoImplBase::isNumRegsMajorCostOfLSR();
  }
378 | |||||||||
  // Use the target-independent default for LSR chain-element profitability.
  bool isProfitableLSRChainElement(Instruction *I) {
    return TargetTransformInfoImplBase::isProfitableLSRChainElement(I);
  }
382 | |||||||||
383 | InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, | ||||||||
384 | int64_t BaseOffset, bool HasBaseReg, | ||||||||
385 | int64_t Scale, unsigned AddrSpace) { | ||||||||
386 | TargetLoweringBase::AddrMode AM; | ||||||||
387 | AM.BaseGV = BaseGV; | ||||||||
388 | AM.BaseOffs = BaseOffset; | ||||||||
389 | AM.HasBaseReg = HasBaseReg; | ||||||||
390 | AM.Scale = Scale; | ||||||||
391 | if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace)) | ||||||||
392 | return 0; | ||||||||
393 | return -1; | ||||||||
394 | } | ||||||||
395 | |||||||||
  // Delegate truncate-free query to the target lowering.
  bool isTruncateFree(Type *Ty1, Type *Ty2) {
    return getTLI()->isTruncateFree(Ty1, Ty2);
  }
399 | |||||||||
  // Delegate hoisting profitability to the target lowering.
  bool isProfitableToHoist(Instruction *I) {
    return getTLI()->isProfitableToHoist(I);
  }
403 | |||||||||
  // Whether codegen should use alias analysis, per the subtarget.
  bool useAA() const { return getST()->useAA(); }
405 | |||||||||
  // True when the IR type maps to a legal machine value type.
  bool isTypeLegal(Type *Ty) {
    EVT VT = getTLI()->getValueType(DL, Ty);
    return getTLI()->isTypeLegal(VT);
  }
410 | |||||||||
  // Number of registers needed to hold a value of the given IR type.
  unsigned getRegUsageForType(Type *Ty) {
    EVT ETy = getTLI()->getValueType(DL, Ty);
    return getTLI()->getNumRegisters(Ty->getContext(), ETy);
  }
415 | |||||||||
  // Fall through to the target-independent GEP cost model.
  InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr,
                             ArrayRef<const Value *> Operands,
                             TTI::TargetCostKind CostKind) {
    return BaseT::getGEPCost(PointeeType, Ptr, Operands, CostKind);
  }
421 | |||||||||
  /// Estimate how many clusters the switch \p SI would lower to: returns 1
  /// when the whole switch fits a single bit test or jump table (setting
  /// \p JumpTableSize for the latter), otherwise the raw case count.
  unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
                                            unsigned &JumpTableSize,
                                            ProfileSummaryInfo *PSI,
                                            BlockFrequencyInfo *BFI) {
    /// Try to find the estimated number of clusters. Note that the number of
    /// clusters identified in this function could be different from the actual
    /// numbers found in lowering. This function ignore switches that are
    /// lowered with a mix of jump table / bit test / BTree. This function was
    /// initially intended to be used when estimating the cost of switch in
    /// inline cost heuristic, but it's a generic cost model to be used in other
    /// places (e.g., in loop unrolling).
    unsigned N = SI.getNumCases();
    const TargetLoweringBase *TLI = getTLI();
    const DataLayout &DL = this->getDataLayout();

    JumpTableSize = 0;
    bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());

    // Early exit if both a jump table and bit test are not allowed.
    if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N))
      return N;

    // Compute the signed min/max case values in one pass.
    APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
    APInt MinCaseVal = MaxCaseVal;
    for (auto CI : SI.cases()) {
      const APInt &CaseVal = CI.getCaseValue()->getValue();
      if (CaseVal.sgt(MaxCaseVal))
        MaxCaseVal = CaseVal;
      if (CaseVal.slt(MinCaseVal))
        MinCaseVal = CaseVal;
    }

    // Check if suitable for a bit test
    if (N <= DL.getIndexSizeInBits(0u)) {
      SmallPtrSet<const BasicBlock *, 4> Dests;
      for (auto I : SI.cases())
        Dests.insert(I.getCaseSuccessor());

      if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
                                     DL))
        return 1;
    }

    // Check if suitable for a jump table.
    if (IsJTAllowed) {
      if (N < 2 || N < TLI->getMinimumJumpTableEntries())
        return N;
      // Clamp before +1 so the range arithmetic cannot wrap.
      uint64_t Range =
          (MaxCaseVal - MinCaseVal)
              .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
      // Check whether a range of clusters is dense enough for a jump table
      if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) {
        JumpTableSize = Range;
        return 1;
      }
    }
    return N;
  }
480 | |||||||||
  // Lookup tables are worthwhile only if the target can lower an indirect
  // branch or a jump table.
  bool shouldBuildLookupTables() {
    const TargetLoweringBase *TLI = getTLI();
    return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
           TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
  }
486 | |||||||||
487 | bool shouldBuildRelLookupTables() const { | ||||||||
488 | const TargetMachine &TM = getTLI()->getTargetMachine(); | ||||||||
489 | // If non-PIC mode, do not generate a relative lookup table. | ||||||||
490 | if (!TM.isPositionIndependent()) | ||||||||
491 | return false; | ||||||||
492 | |||||||||
493 | /// Relative lookup table entries consist of 32-bit offsets. | ||||||||
494 | /// Do not generate relative lookup tables for large code models | ||||||||
495 | /// in 64-bit achitectures where 32-bit offsets might not be enough. | ||||||||
496 | if (TM.getCodeModel() == CodeModel::Medium || | ||||||||
497 | TM.getCodeModel() == CodeModel::Large) | ||||||||
498 | return false; | ||||||||
499 | |||||||||
500 | Triple TargetTriple = TM.getTargetTriple(); | ||||||||
501 | if (!TargetTriple.isArch64Bit()) | ||||||||
502 | return false; | ||||||||
503 | |||||||||
504 | // TODO: Triggers issues on aarch64 on darwin, so temporarily disable it | ||||||||
505 | // there. | ||||||||
506 | if (TargetTriple.getArch() == Triple::aarch64 && TargetTriple.isOSDarwin()) | ||||||||
507 | return false; | ||||||||
508 | |||||||||
509 | return true; | ||||||||
510 | } | ||||||||
511 | |||||||||
  // A "fast" sqrt requires the type to be legal and FSQRT to be lowered
  // natively (legal or custom) rather than expanded to a libcall.
  bool haveFastSqrt(Type *Ty) {
    const TargetLoweringBase *TLI = getTLI();
    EVT VT = TLI->getValueType(DL, Ty);
    return TLI->isTypeLegal(VT) &&
           TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
  }
518 | |||||||||
  // By default assume an ordered compare is no more expensive than a
  // compare-against-zero.
  bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
    return true;
  }
522 | |||||||||
523 | InstructionCost getFPOpCost(Type *Ty) { | ||||||||
524 | // Check whether FADD is available, as a proxy for floating-point in | ||||||||
525 | // general. | ||||||||
526 | const TargetLoweringBase *TLI = getTLI(); | ||||||||
527 | EVT VT = TLI->getValueType(DL, Ty); | ||||||||
528 | if (TLI->isOperationLegalOrCustomOrPromote(ISD::FADD, VT)) | ||||||||
529 | return TargetTransformInfo::TCC_Basic; | ||||||||
530 | return TargetTransformInfo::TCC_Expensive; | ||||||||
531 | } | ||||||||
532 | |||||||||
  // Default inliner tuning: no threshold multiplier, no per-call adjustment.
  unsigned getInliningThresholdMultiplier() { return 1; }
  unsigned adjustInliningThreshold(const CallBase *CB) { return 0; }

  // Default bonus (percent) applied to the inline threshold for vector code.
  int getInlinerVectorBonusPercent() { return 150; }
537 | |||||||||
  /// Populate \p UP with partial/runtime unrolling preferences for loop \p L,
  /// sized by the subtarget's loop micro-op buffer. Emits a remark via \p ORE
  /// (if non-null) when unrolling is advised against because the loop
  /// contains a call.
  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP,
                               OptimizationRemarkEmitter *ORE) {
    // This unrolling functionality is target independent, but to provide some
    // motivation for its intended use, for x86:

    // According to the Intel 64 and IA-32 Architectures Optimization Reference
    // Manual, Intel Core models and later have a loop stream detector (and
    // associated uop queue) that can benefit from partial unrolling.
    // The relevant requirements are:
    //  - The loop must have no more than 4 (8 for Nehalem and later) branches
    //    taken, and none of them may be calls.
    //  - The loop can have no more than 18 (28 for Nehalem and later) uops.

    // According to the Software Optimization Guide for AMD Family 15h
    // Processors, models 30h-4fh (Steamroller and later) have a loop predictor
    // and loop buffer which can benefit from partial unrolling.
    // The relevant requirements are:
    //  - The loop must have fewer than 16 branches
    //  - The loop must have less than 40 uops in all executed loop branches

    // The number of taken branches in a loop is hard to estimate here, and
    // benchmarking has revealed that it is better not to be conservative when
    // estimating the branch count. As a result, we'll ignore the branch limits
    // until someone finds a case where it matters in practice.

    unsigned MaxOps;
    const TargetSubtargetInfo *ST = getST();
    if (PartialUnrollingThreshold.getNumOccurrences() > 0)
      // Explicit command-line override wins over the scheduling model.
      MaxOps = PartialUnrollingThreshold;
    else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
      MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
    else
      // No buffer information and no override: leave preferences untouched.
      return;

    // Scan the loop: don't unroll loops with calls.
    for (BasicBlock *BB : L->blocks()) {
      for (Instruction &I : *BB) {
        if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
          if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
            // Callees that are not lowered to actual calls (e.g. certain
            // intrinsics) do not inhibit unrolling.
            if (!thisT()->isLoweredToCall(F))
              continue;
          }

          if (ORE) {
            ORE->emit([&]() {
              return OptimizationRemark("TTI", "DontUnroll", L->getStartLoc(),
                                        L->getHeader())
                     << "advising against unrolling the loop because it "
                        "contains a "
                     << ore::NV("Call", &I);
            });
          }
          return;
        }
      }
    }

    // Enable runtime and partial unrolling up to the specified size.
    // Enable using trip count upper bound to unroll loops.
    UP.Partial = UP.Runtime = UP.UpperBound = true;
    UP.PartialThreshold = MaxOps;

    // Avoid unrolling when optimizing for size.
    UP.OptSizeThreshold = 0;
    UP.PartialOptSizeThreshold = 0;

    // Set number of instructions optimized when "back edge"
    // becomes "fall through" to default value of 2.
    UP.BEInsns = 2;
  }
609 | |||||||||
610 | void getPeelingPreferences(Loop *L, ScalarEvolution &SE, | ||||||||
611 | TTI::PeelingPreferences &PP) { | ||||||||
612 | PP.PeelCount = 0; | ||||||||
613 | PP.AllowPeeling = true; | ||||||||
614 | PP.AllowLoopNestsPeeling = false; | ||||||||
615 | PP.PeelProfiledIterations = true; | ||||||||
616 | } | ||||||||
617 | |||||||||
  // Delegate the hardware-loop profitability decision to the base
  // implementation; targets with hardware loops override this.
  bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
                                AssumptionCache &AC,
                                TargetLibraryInfo *LibInfo,
                                HardwareLoopInfo &HWLoopInfo) {
    return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
  }

  // Delegate the predication-vs-epilogue vectorization decision to the base
  // implementation.
  bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
                                   AssumptionCache &AC, TargetLibraryInfo *TLI,
                                   DominatorTree *DT,
                                   LoopVectorizationLegality *LVL,
                                   InterleavedAccessInfo *IAI) {
    return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LVL, IAI);
  }

  // Delegate the preferred tail-folding style to the base implementation.
  TailFoldingStyle getPreferredTailFoldingStyle() {
    return BaseT::getPreferredTailFoldingStyle();
  }
636 | |||||||||
  // InstCombine hooks for target intrinsics; the base implementations decline
  // to simplify. Targets override these to fold their own intrinsics.
  std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
                                                    IntrinsicInst &II) {
    return BaseT::instCombineIntrinsic(IC, II);
  }

  std::optional<Value *>
  simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II,
                                   APInt DemandedMask, KnownBits &Known,
                                   bool &KnownBitsComputed) {
    return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known,
                                                   KnownBitsComputed);
  }

  std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
      InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
      APInt &UndefElts2, APInt &UndefElts3,
      std::function<void(Instruction *, unsigned, APInt, APInt &)>
          SimplifyAndSetOp) {
    return BaseT::simplifyDemandedVectorEltsIntrinsic(
        IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3,
        SimplifyAndSetOp);
  }
659 | |||||||||
  // Cache and prefetch queries are forwarded to the subtarget, which carries
  // the per-CPU tuning parameters.
  virtual std::optional<unsigned>
  getCacheSize(TargetTransformInfo::CacheLevel Level) const {
    return std::optional<unsigned>(
        getST()->getCacheSize(static_cast<unsigned>(Level)));
  }

  virtual std::optional<unsigned>
  getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const {
    std::optional<unsigned> TargetResult =
        getST()->getCacheAssociativity(static_cast<unsigned>(Level));

    // Prefer the subtarget's answer; fall back to the generic default only
    // when the subtarget has no associativity information for this level.
    if (TargetResult)
      return TargetResult;

    return BaseT::getCacheAssociativity(Level);
  }

  virtual unsigned getCacheLineSize() const {
    return getST()->getCacheLineSize();
  }

  virtual unsigned getPrefetchDistance() const {
    return getST()->getPrefetchDistance();
  }

  virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses,
                                        unsigned NumStridedMemAccesses,
                                        unsigned NumPrefetches,
                                        bool HasCall) const {
    return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
                                         NumPrefetches, HasCall);
  }

  virtual unsigned getMaxPrefetchIterationsAhead() const {
    return getST()->getMaxPrefetchIterationsAhead();
  }

  virtual bool enableWritePrefetching() const {
    return getST()->enableWritePrefetching();
  }

  virtual bool shouldPrefetchAddressSpace(unsigned AS) const {
    return getST()->shouldPrefetchAddressSpace(AS);
  }
704 | |||||||||
705 | /// @} | ||||||||
706 | |||||||||
707 | /// \name Vector TTI Implementations | ||||||||
708 | /// @{ | ||||||||
709 | |||||||||
  // Conservative default of a single 32-bit register; targets override with
  // their actual register widths.
  TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
    return TypeSize::getFixed(32);
  }

  // No scalable-vector vscale information by default.
  std::optional<unsigned> getMaxVScale() const { return std::nullopt; }
  std::optional<unsigned> getVScaleForTuning() const { return std::nullopt; }
716 | |||||||||
717 | /// Estimate the overhead of scalarizing an instruction. Insert and Extract | ||||||||
718 | /// are set if the demanded result elements need to be inserted and/or | ||||||||
719 | /// extracted from vectors. | ||||||||
720 | InstructionCost getScalarizationOverhead(VectorType *InTy, | ||||||||
721 | const APInt &DemandedElts, | ||||||||
722 | bool Insert, bool Extract, | ||||||||
723 | TTI::TargetCostKind CostKind) { | ||||||||
724 | /// FIXME: a bitfield is not a reasonable abstraction for talking about | ||||||||
725 | /// which elements are needed from a scalable vector | ||||||||
726 | if (isa<ScalableVectorType>(InTy)) | ||||||||
727 | return InstructionCost::getInvalid(); | ||||||||
728 | auto *Ty = cast<FixedVectorType>(InTy); | ||||||||
729 | |||||||||
730 | assert(DemandedElts.getBitWidth() == Ty->getNumElements() &&(static_cast <bool> (DemandedElts.getBitWidth() == Ty-> getNumElements() && "Vector size mismatch") ? void (0 ) : __assert_fail ("DemandedElts.getBitWidth() == Ty->getNumElements() && \"Vector size mismatch\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 731, __extension__ __PRETTY_FUNCTION__)) | ||||||||
731 | "Vector size mismatch")(static_cast <bool> (DemandedElts.getBitWidth() == Ty-> getNumElements() && "Vector size mismatch") ? void (0 ) : __assert_fail ("DemandedElts.getBitWidth() == Ty->getNumElements() && \"Vector size mismatch\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 731, __extension__ __PRETTY_FUNCTION__)); | ||||||||
732 | |||||||||
733 | InstructionCost Cost = 0; | ||||||||
734 | |||||||||
735 | for (int i = 0, e = Ty->getNumElements(); i < e; ++i) { | ||||||||
736 | if (!DemandedElts[i]) | ||||||||
737 | continue; | ||||||||
738 | if (Insert
| ||||||||
739 | Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty, | ||||||||
740 | CostKind, i, nullptr, nullptr); | ||||||||
741 | if (Extract) | ||||||||
742 | Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, | ||||||||
743 | CostKind, i, nullptr, nullptr); | ||||||||
744 | } | ||||||||
745 | |||||||||
746 | return Cost; | ||||||||
747 | } | ||||||||
748 | |||||||||
749 | /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead. | ||||||||
750 | InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert, | ||||||||
751 | bool Extract, | ||||||||
752 | TTI::TargetCostKind CostKind) { | ||||||||
753 | if (isa<ScalableVectorType>(InTy)) | ||||||||
754 | return InstructionCost::getInvalid(); | ||||||||
755 | auto *Ty = cast<FixedVectorType>(InTy); | ||||||||
756 | |||||||||
757 | APInt DemandedElts = APInt::getAllOnes(Ty->getNumElements()); | ||||||||
758 | return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract, | ||||||||
759 | CostKind); | ||||||||
760 | } | ||||||||
761 | |||||||||
762 | /// Estimate the overhead of scalarizing an instructions unique | ||||||||
763 | /// non-constant operands. The (potentially vector) types to use for each of | ||||||||
764 | /// argument are passes via Tys. | ||||||||
765 | InstructionCost | ||||||||
766 | getOperandsScalarizationOverhead(ArrayRef<const Value *> Args, | ||||||||
767 | ArrayRef<Type *> Tys, | ||||||||
768 | TTI::TargetCostKind CostKind) { | ||||||||
769 | assert(Args.size() == Tys.size() && "Expected matching Args and Tys")(static_cast <bool> (Args.size() == Tys.size() && "Expected matching Args and Tys") ? void (0) : __assert_fail ("Args.size() == Tys.size() && \"Expected matching Args and Tys\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 769, __extension__ __PRETTY_FUNCTION__)); | ||||||||
770 | |||||||||
771 | InstructionCost Cost = 0; | ||||||||
772 | SmallPtrSet<const Value*, 4> UniqueOperands; | ||||||||
773 | for (int I = 0, E = Args.size(); I != E; I++) { | ||||||||
774 | // Disregard things like metadata arguments. | ||||||||
775 | const Value *A = Args[I]; | ||||||||
776 | Type *Ty = Tys[I]; | ||||||||
777 | if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() && | ||||||||
778 | !Ty->isPtrOrPtrVectorTy()) | ||||||||
779 | continue; | ||||||||
780 | |||||||||
781 | if (!isa<Constant>(A) && UniqueOperands.insert(A).second) { | ||||||||
782 | if (auto *VecTy = dyn_cast<VectorType>(Ty)) | ||||||||
783 | Cost += getScalarizationOverhead(VecTy, /*Insert*/ false, | ||||||||
784 | /*Extract*/ true, CostKind); | ||||||||
785 | } | ||||||||
786 | } | ||||||||
787 | |||||||||
788 | return Cost; | ||||||||
789 | } | ||||||||
790 | |||||||||
791 | /// Estimate the overhead of scalarizing the inputs and outputs of an | ||||||||
792 | /// instruction, with return type RetTy and arguments Args of type Tys. If | ||||||||
793 | /// Args are unknown (empty), then the cost associated with one argument is | ||||||||
794 | /// added as a heuristic. | ||||||||
795 | InstructionCost getScalarizationOverhead(VectorType *RetTy, | ||||||||
796 | ArrayRef<const Value *> Args, | ||||||||
797 | ArrayRef<Type *> Tys, | ||||||||
798 | TTI::TargetCostKind CostKind) { | ||||||||
799 | InstructionCost Cost = getScalarizationOverhead( | ||||||||
800 | RetTy, /*Insert*/ true, /*Extract*/ false, CostKind); | ||||||||
801 | if (!Args.empty()) | ||||||||
802 | Cost += getOperandsScalarizationOverhead(Args, Tys, CostKind); | ||||||||
803 | else | ||||||||
804 | // When no information on arguments is provided, we add the cost | ||||||||
805 | // associated with one argument as a heuristic. | ||||||||
806 | Cost += getScalarizationOverhead(RetTy, /*Insert*/ false, | ||||||||
807 | /*Extract*/ true, CostKind); | ||||||||
808 | |||||||||
809 | return Cost; | ||||||||
810 | } | ||||||||
811 | |||||||||
  /// Estimate the cost of type-legalization and the legalized type.
  /// Returns a pair of (cost multiplier, legalized MVT); the multiplier
  /// doubles for every split/expand step the type goes through.
  std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const {
    LLVMContext &C = Ty->getContext();
    EVT MTy = getTLI()->getValueType(DL, Ty);

    InstructionCost Cost = 1;
    // We keep legalizing the type until we find a legal kind. We assume that
    // the only operation that costs anything is the split. After splitting
    // we need to handle two types.
    while (true) {
      TargetLoweringBase::LegalizeKind LK = getTLI()->getTypeConversion(C, MTy);

      if (LK.first == TargetLoweringBase::TypeScalarizeScalableVector) {
        // Ensure we return a sensible simple VT here, since many callers of
        // this function require it.
        MVT VT = MTy.isSimple() ? MTy.getSimpleVT() : MVT::i64;
        return std::make_pair(InstructionCost::getInvalid(), VT);
      }

      if (LK.first == TargetLoweringBase::TypeLegal)
        return std::make_pair(Cost, MTy.getSimpleVT());

      // Splitting/expanding doubles the number of pieces to handle.
      if (LK.first == TargetLoweringBase::TypeSplitVector ||
          LK.first == TargetLoweringBase::TypeExpandInteger)
        Cost *= 2;

      // Do not loop with f128 type.
      if (MTy == LK.second)
        return std::make_pair(Cost, MTy.getSimpleVT());

      // Keep legalizing the type.
      MTy = LK.second;
    }
  }
846 | |||||||||
  // Default: no interleaving; targets override to allow wider interleave
  // factors.
  unsigned getMaxInterleaveFactor(unsigned VF) { return 1; }
848 | |||||||||
  /// Cost of arithmetic instruction \p Opcode on (possibly vector) type
  /// \p Ty. Legal/promoted operations cost one op per legalized piece;
  /// custom-lowered operations twice that; expanded rem may be rewritten via
  /// div+mul+sub; anything else on fixed vectors is scalarized.
  InstructionCost getArithmeticInstrCost(
      unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
      TTI::OperandValueInfo Opd1Info = {TTI::OK_AnyValue, TTI::OP_None},
      TTI::OperandValueInfo Opd2Info = {TTI::OK_AnyValue, TTI::OP_None},
      ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
      const Instruction *CxtI = nullptr) {
    // Check if any of the operands are vector operands.
    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");

    // TODO: Handle more cost kinds.
    if (CostKind != TTI::TCK_RecipThroughput)
      return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind,
                                           Opd1Info, Opd2Info,
                                           Args, CxtI);

    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

    bool IsFloat = Ty->isFPOrFPVectorTy();
    // Assume that floating point arithmetic operations cost twice as much as
    // integer operations.
    InstructionCost OpCost = (IsFloat ? 2 : 1);

    if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
      // The operation is legal. Assume it costs 1.
      // TODO: Once we have extract/insert subvector cost we need to use them.
      return LT.first * OpCost;
    }

    if (!TLI->isOperationExpand(ISD, LT.second)) {
      // If the operation is custom lowered, then assume that the code is twice
      // as expensive.
      return LT.first * 2 * OpCost;
    }

    // An 'Expand' of URem and SRem is special because it may default
    // to expanding the operation into a sequence of sub-operations
    // i.e. X % Y -> X-(X/Y)*Y.
    if (ISD == ISD::UREM || ISD == ISD::SREM) {
      bool IsSigned = ISD == ISD::SREM;
      if (TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIVREM : ISD::UDIVREM,
                                        LT.second) ||
          TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIV : ISD::UDIV,
                                        LT.second)) {
        // Cost the rem as the equivalent div, mul and sub sequence.
        unsigned DivOpc = IsSigned ? Instruction::SDiv : Instruction::UDiv;
        InstructionCost DivCost = thisT()->getArithmeticInstrCost(
            DivOpc, Ty, CostKind, Opd1Info, Opd2Info);
        InstructionCost MulCost =
            thisT()->getArithmeticInstrCost(Instruction::Mul, Ty, CostKind);
        InstructionCost SubCost =
            thisT()->getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
        return DivCost + MulCost + SubCost;
      }
    }

    // We cannot scalarize scalable vectors, so return Invalid.
    if (isa<ScalableVectorType>(Ty))
      return InstructionCost::getInvalid();

    // Else, assume that we need to scalarize this op.
    // TODO: If one of the types get legalized by splitting, handle this
    // similarly to what getCastInstrCost() does.
    if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
      InstructionCost Cost = thisT()->getArithmeticInstrCost(
          Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
          Args, CxtI);
      // Return the cost of multiple scalar invocation plus the cost of
      // inserting and extracting the values.
      SmallVector<Type *> Tys(Args.size(), Ty);
      return getScalarizationOverhead(VTy, Args, Tys, CostKind) +
             VTy->getNumElements() * Cost;
    }

    // We don't know anything about this scalar instruction.
    return OpCost;
  }
926 | |||||||||
  /// Try to refine \p Kind to a more specific shuffle kind that the concrete
  /// \p Mask actually performs (e.g. a single-source permute whose mask is a
  /// reverse). Returns \p Kind unchanged when no refinement applies or the
  /// mask contains out-of-range indices.
  TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind,
                                              ArrayRef<int> Mask) const {
    int Limit = Mask.size() * 2;
    if (Mask.empty() ||
        // Extra check required by isSingleSourceMaskImpl function (called by
        // ShuffleVectorInst::isSingleSourceMask).
        any_of(Mask, [Limit](int I) { return I >= Limit; }))
      return Kind;
    int Index;
    switch (Kind) {
    case TTI::SK_PermuteSingleSrc:
      if (ShuffleVectorInst::isReverseMask(Mask))
        return TTI::SK_Reverse;
      if (ShuffleVectorInst::isZeroEltSplatMask(Mask))
        return TTI::SK_Broadcast;
      break;
    case TTI::SK_PermuteTwoSrc:
      if (ShuffleVectorInst::isSelectMask(Mask))
        return TTI::SK_Select;
      if (ShuffleVectorInst::isTransposeMask(Mask))
        return TTI::SK_Transpose;
      if (ShuffleVectorInst::isSpliceMask(Mask, Index))
        return TTI::SK_Splice;
      break;
    // These kinds are already as specific as they get.
    case TTI::SK_Select:
    case TTI::SK_Reverse:
    case TTI::SK_Broadcast:
    case TTI::SK_Transpose:
    case TTI::SK_InsertSubvector:
    case TTI::SK_ExtractSubvector:
    case TTI::SK_Splice:
      break;
    }
    return Kind;
  }
962 | |||||||||
  /// Cost of a shuffle of kind \p Kind on vector type \p Tp, modeled via the
  /// per-element insert/extract overhead helpers. The kind is first refined
  /// against the concrete \p Mask.
  InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
                                 ArrayRef<int> Mask,
                                 TTI::TargetCostKind CostKind, int Index,
                                 VectorType *SubTp,
                                 ArrayRef<const Value *> Args = std::nullopt) {

    switch (improveShuffleKindFromMask(Kind, Mask)) {
    case TTI::SK_Broadcast:
      // Scalable vectors cannot be costed per element.
      if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
        return getBroadcastShuffleOverhead(FVT, CostKind);
      return InstructionCost::getInvalid();
    case TTI::SK_Select:
    case TTI::SK_Splice:
    case TTI::SK_Reverse:
    case TTI::SK_Transpose:
    case TTI::SK_PermuteSingleSrc:
    case TTI::SK_PermuteTwoSrc:
      if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
        return getPermuteShuffleOverhead(FVT, CostKind);
      return InstructionCost::getInvalid();
    case TTI::SK_ExtractSubvector:
      return getExtractSubvectorOverhead(Tp, CostKind, Index,
                                         cast<FixedVectorType>(SubTp));
    case TTI::SK_InsertSubvector:
      return getInsertSubvectorOverhead(Tp, CostKind, Index,
                                        cast<FixedVectorType>(SubTp));
    }
    llvm_unreachable("Unknown TTI::ShuffleKind");
  }
992 | |||||||||
  /// Cost of cast instruction \p Opcode from \p Src to \p Dst. Recognizes
  /// free casts (no-op truncs/bitcasts, free extends, folded extending
  /// loads), then handles scalar, vector-to-vector (including split
  /// legalization and scalarization), and vector<->scalar bitcasts.
  InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                   TTI::CastContextHint CCH,
                                   TTI::TargetCostKind CostKind,
                                   const Instruction *I = nullptr) {
    // If the base implementation already considers the cast free, so do we.
    if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0)
      return 0;

    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");
    std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
    std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);

    TypeSize SrcSize = SrcLT.second.getSizeInBits();
    TypeSize DstSize = DstLT.second.getSizeInBits();
    bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy();
    bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy();

    switch (Opcode) {
    default:
      break;
    case Instruction::Trunc:
      // Check for NOOP conversions.
      if (TLI->isTruncateFree(SrcLT.second, DstLT.second))
        return 0;
      [[fallthrough]];
    case Instruction::BitCast:
      // Bitcast between types that are legalized to the same type are free and
      // assume int to/from ptr of the same size is also free.
      if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst &&
          SrcSize == DstSize)
        return 0;
      break;
    case Instruction::FPExt:
      if (I && getTLI()->isExtFree(I))
        return 0;
      break;
    case Instruction::ZExt:
      if (TLI->isZExtFree(SrcLT.second, DstLT.second))
        return 0;
      [[fallthrough]];
    case Instruction::SExt:
      if (I && getTLI()->isExtFree(I))
        return 0;

      // If this is a zext/sext of a load, return 0 if the corresponding
      // extending load exists on target and the result type is legal.
      if (CCH == TTI::CastContextHint::Normal) {
        EVT ExtVT = EVT::getEVT(Dst);
        EVT LoadVT = EVT::getEVT(Src);
        unsigned LType =
            ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
        if (DstLT.first == SrcLT.first &&
            TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
          return 0;
      }
      break;
    case Instruction::AddrSpaceCast:
      if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(),
                                   Dst->getPointerAddressSpace()))
        return 0;
      break;
    }

    auto *SrcVTy = dyn_cast<VectorType>(Src);
    auto *DstVTy = dyn_cast<VectorType>(Dst);

    // If the cast is marked as legal (or promote) then assume low cost.
    if (SrcLT.first == DstLT.first &&
        TLI->isOperationLegalOrPromote(ISD, DstLT.second))
      return SrcLT.first;

    // Handle scalar conversions.
    if (!SrcVTy && !DstVTy) {
      // Just check the op cost. If the operation is legal then assume it costs
      // 1.
      if (!TLI->isOperationExpand(ISD, DstLT.second))
        return 1;

      // Assume that illegal scalar instruction are expensive.
      return 4;
    }

    // Check vector-to-vector casts.
    if (DstVTy && SrcVTy) {
      // If the cast is between same-sized registers, then the check is simple.
      if (SrcLT.first == DstLT.first && SrcSize == DstSize) {

        // Assume that Zext is done using AND.
        if (Opcode == Instruction::ZExt)
          return SrcLT.first;

        // Assume that sext is done using SHL and SRA.
        if (Opcode == Instruction::SExt)
          return SrcLT.first * 2;

        // Just check the op cost. If the operation is legal then assume it
        // costs
        // 1 and multiply by the type-legalization overhead.
        if (!TLI->isOperationExpand(ISD, DstLT.second))
          return SrcLT.first * 1;
      }

      // If we are legalizing by splitting, query the concrete TTI for the cost
      // of casting the original vector twice. We also need to factor in the
      // cost of the split itself. Count that as 1, to be consistent with
      // getTypeLegalizationCost().
      bool SplitSrc =
          TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
          TargetLowering::TypeSplitVector;
      bool SplitDst =
          TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
          TargetLowering::TypeSplitVector;
      if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isVector() &&
          DstVTy->getElementCount().isVector()) {
        Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy);
        Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy);
        T *TTI = static_cast<T *>(this);
        // If both types need to be split then the split is free.
        InstructionCost SplitCost =
            (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0;
        return SplitCost +
               (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH,
                                          CostKind, I));
      }

      // Scalarization cost is Invalid, can't assume any num elements.
      if (isa<ScalableVectorType>(DstVTy))
        return InstructionCost::getInvalid();

      // In other cases where the source or destination are illegal, assume
      // the operation will get scalarized.
      unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements();
      InstructionCost Cost = thisT()->getCastInstrCost(
          Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I);

      // Return the cost of multiple scalar invocation plus the cost of
      // inserting and extracting the values.
      return getScalarizationOverhead(DstVTy, /*Insert*/ true, /*Extract*/ true,
                                      CostKind) +
             Num * Cost;
    }

    // We already handled vector-to-vector and scalar-to-scalar conversions.
    // This
    // is where we handle bitcast between vectors and scalars. We need to assume
    // that the conversion is scalarized in one way or another.
    if (Opcode == Instruction::BitCast) {
      // Illegal bitcasts are done by storing and loading from a stack slot.
      return (SrcVTy ? getScalarizationOverhead(SrcVTy, /*Insert*/ false,
                                                /*Extract*/ true, CostKind)
                     : 0) +
             (DstVTy ? getScalarizationOverhead(DstVTy, /*Insert*/ true,
                                                /*Extract*/ false, CostKind)
                     : 0);
    }

    llvm_unreachable("Unhandled cast");
  }
1152 | |||||||||
1153 | InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, | ||||||||
1154 | VectorType *VecTy, unsigned Index) { | ||||||||
1155 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; | ||||||||
1156 | return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy, | ||||||||
1157 | CostKind, Index, nullptr, nullptr) + | ||||||||
1158 | thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(), | ||||||||
1159 | TTI::CastContextHint::None, CostKind); | ||||||||
1160 | } | ||||||||
1161 | |||||||||
  // Cost of a control-flow instruction; delegated to the base implementation.
  InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
                                 const Instruction *I = nullptr) {
    return BaseT::getCFInstrCost(Opcode, CostKind, I);
  }
1166 | |||||||||
1167 | InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, | ||||||||
1168 | CmpInst::Predicate VecPred, | ||||||||
1169 | TTI::TargetCostKind CostKind, | ||||||||
1170 | const Instruction *I = nullptr) { | ||||||||
1171 | const TargetLoweringBase *TLI = getTLI(); | ||||||||
1172 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | ||||||||
1173 | assert(ISD && "Invalid opcode")(static_cast <bool> (ISD && "Invalid opcode") ? void (0) : __assert_fail ("ISD && \"Invalid opcode\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 1173, __extension__ __PRETTY_FUNCTION__)); | ||||||||
1174 | |||||||||
1175 | // TODO: Handle other cost kinds. | ||||||||
1176 | if (CostKind != TTI::TCK_RecipThroughput) | ||||||||
1177 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, | ||||||||
1178 | I); | ||||||||
1179 | |||||||||
1180 | // Selects on vectors are actually vector selects. | ||||||||
1181 | if (ISD == ISD::SELECT) { | ||||||||
1182 | assert(CondTy && "CondTy must exist")(static_cast <bool> (CondTy && "CondTy must exist" ) ? void (0) : __assert_fail ("CondTy && \"CondTy must exist\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 1182, __extension__ __PRETTY_FUNCTION__)); | ||||||||
1183 | if (CondTy->isVectorTy()) | ||||||||
1184 | ISD = ISD::VSELECT; | ||||||||
1185 | } | ||||||||
1186 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy); | ||||||||
1187 | |||||||||
1188 | if (!(ValTy->isVectorTy() && !LT.second.isVector()) && | ||||||||
1189 | !TLI->isOperationExpand(ISD, LT.second)) { | ||||||||
1190 | // The operation is legal. Assume it costs 1. Multiply | ||||||||
1191 | // by the type-legalization overhead. | ||||||||
1192 | return LT.first * 1; | ||||||||
1193 | } | ||||||||
1194 | |||||||||
1195 | // Otherwise, assume that the cast is scalarized. | ||||||||
1196 | // TODO: If one of the types get legalized by splitting, handle this | ||||||||
1197 | // similarly to what getCastInstrCost() does. | ||||||||
1198 | if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) { | ||||||||
1199 | if (isa<ScalableVectorType>(ValTy)) | ||||||||
1200 | return InstructionCost::getInvalid(); | ||||||||
1201 | |||||||||
1202 | unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements(); | ||||||||
1203 | if (CondTy) | ||||||||
1204 | CondTy = CondTy->getScalarType(); | ||||||||
1205 | InstructionCost Cost = thisT()->getCmpSelInstrCost( | ||||||||
1206 | Opcode, ValVTy->getScalarType(), CondTy, VecPred, CostKind, I); | ||||||||
1207 | |||||||||
1208 | // Return the cost of multiple scalar invocation plus the cost of | ||||||||
1209 | // inserting and extracting the values. | ||||||||
1210 | return getScalarizationOverhead(ValVTy, /*Insert*/ true, | ||||||||
1211 | /*Extract*/ false, CostKind) + | ||||||||
1212 | Num * Cost; | ||||||||
1213 | } | ||||||||
1214 | |||||||||
1215 | // Unknown scalar opcode. | ||||||||
1216 | return 1; | ||||||||
1217 | } | ||||||||
1218 | |||||||||
1219 | InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, | ||||||||
1220 | TTI::TargetCostKind CostKind, | ||||||||
1221 | unsigned Index, Value *Op0, Value *Op1) { | ||||||||
1222 | return getRegUsageForType(Val->getScalarType()); | ||||||||
1223 | } | ||||||||
1224 | |||||||||
1225 | InstructionCost getVectorInstrCost(const Instruction &I, Type *Val, | ||||||||
1226 | TTI::TargetCostKind CostKind, | ||||||||
1227 | unsigned Index) { | ||||||||
1228 | Value *Op0 = nullptr; | ||||||||
1229 | Value *Op1 = nullptr; | ||||||||
1230 | if (auto *IE = dyn_cast<InsertElementInst>(&I)) { | ||||||||
1231 | Op0 = IE->getOperand(0); | ||||||||
1232 | Op1 = IE->getOperand(1); | ||||||||
1233 | } | ||||||||
1234 | return thisT()->getVectorInstrCost(I.getOpcode(), Val, CostKind, Index, Op0, | ||||||||
1235 | Op1); | ||||||||
1236 | } | ||||||||
1237 | |||||||||
1238 | InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, | ||||||||
1239 | int VF, | ||||||||
1240 | const APInt &DemandedDstElts, | ||||||||
1241 | TTI::TargetCostKind CostKind) { | ||||||||
1242 | assert(DemandedDstElts.getBitWidth() == (unsigned)VF * ReplicationFactor &&(static_cast <bool> (DemandedDstElts.getBitWidth() == ( unsigned)VF * ReplicationFactor && "Unexpected size of DemandedDstElts." ) ? void (0) : __assert_fail ("DemandedDstElts.getBitWidth() == (unsigned)VF * ReplicationFactor && \"Unexpected size of DemandedDstElts.\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 1243, __extension__ __PRETTY_FUNCTION__)) | ||||||||
1243 | "Unexpected size of DemandedDstElts.")(static_cast <bool> (DemandedDstElts.getBitWidth() == ( unsigned)VF * ReplicationFactor && "Unexpected size of DemandedDstElts." ) ? void (0) : __assert_fail ("DemandedDstElts.getBitWidth() == (unsigned)VF * ReplicationFactor && \"Unexpected size of DemandedDstElts.\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 1243, __extension__ __PRETTY_FUNCTION__)); | ||||||||
1244 | |||||||||
1245 | InstructionCost Cost; | ||||||||
1246 | |||||||||
1247 | auto *SrcVT = FixedVectorType::get(EltTy, VF); | ||||||||
1248 | auto *ReplicatedVT = FixedVectorType::get(EltTy, VF * ReplicationFactor); | ||||||||
1249 | |||||||||
1250 | // The Mask shuffling cost is extract all the elements of the Mask | ||||||||
1251 | // and insert each of them Factor times into the wide vector: | ||||||||
1252 | // | ||||||||
1253 | // E.g. an interleaved group with factor 3: | ||||||||
1254 | // %mask = icmp ult <8 x i32> %vec1, %vec2 | ||||||||
1255 | // %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef, | ||||||||
1256 | // <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7> | ||||||||
1257 | // The cost is estimated as extract all mask elements from the <8xi1> mask | ||||||||
1258 | // vector and insert them factor times into the <24xi1> shuffled mask | ||||||||
1259 | // vector. | ||||||||
1260 | APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedDstElts, VF); | ||||||||
1261 | Cost += thisT()->getScalarizationOverhead(SrcVT, DemandedSrcElts, | ||||||||
1262 | /*Insert*/ false, | ||||||||
1263 | /*Extract*/ true, CostKind); | ||||||||
1264 | Cost += thisT()->getScalarizationOverhead(ReplicatedVT, DemandedDstElts, | ||||||||
1265 | /*Insert*/ true, | ||||||||
1266 | /*Extract*/ false, CostKind); | ||||||||
1267 | |||||||||
1268 | return Cost; | ||||||||
1269 | } | ||||||||
1270 | |||||||||
  /// Cost of a plain load/store of \p Src.
  /// Baseline: one unit per legalized register (LT.first); extra scalarization
  /// cost is added when the vector legalizes to a wider type but the matching
  /// extending-load / truncating-store is not legal.
  InstructionCost
  getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
                  unsigned AddressSpace, TTI::TargetCostKind CostKind,
                  TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
                  const Instruction *I = nullptr) {
    assert(!Src->isVoidTy() && "Invalid type");
    // Assume types, such as structs, are expensive.
    // NOTE: this `DL` use precedes the local `DL` declaration below, so it
    // refers to the enclosing scope's DataLayout.
    if (getTLI()->getValueType(DL, Src, true) == MVT::Other)
      return 4;
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);

    // Assuming that all loads of legal types cost 1.
    InstructionCost Cost = LT.first;
    if (CostKind != TTI::TCK_RecipThroughput)
      return Cost;

    const DataLayout &DL = this->getDataLayout();
    if (Src->isVectorTy() &&
        // In practice it's not currently possible to have a change in lane
        // length for extending loads or truncating stores so both types should
        // have the same scalable property.
        TypeSize::isKnownLT(DL.getTypeStoreSizeInBits(Src),
                            LT.second.getSizeInBits())) {
      // This is a vector load that legalizes to a larger type than the vector
      // itself. Unless the corresponding extending load or truncating store is
      // legal, then this will scalarize.
      TargetLowering::LegalizeAction LA = TargetLowering::Expand;
      EVT MemVT = getTLI()->getValueType(DL, Src);
      if (Opcode == Instruction::Store)
        LA = getTLI()->getTruncStoreAction(LT.second, MemVT);
      else
        LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT);

      if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
        // This is a vector load/store for some illegal type that is scalarized.
        // We must account for the cost of building or decomposing the vector.
        // Loads insert the scalars back into a vector; stores extract them.
        Cost += getScalarizationOverhead(
            cast<VectorType>(Src), Opcode != Instruction::Store,
            Opcode == Instruction::Store, CostKind);
      }
    }

    return Cost;
  }
1315 | |||||||||
1316 | InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, | ||||||||
1317 | Align Alignment, unsigned AddressSpace, | ||||||||
1318 | TTI::TargetCostKind CostKind) { | ||||||||
1319 | return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false, | ||||||||
1320 | CostKind); | ||||||||
1321 | } | ||||||||
1322 | |||||||||
1323 | InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, | ||||||||
1324 | const Value *Ptr, bool VariableMask, | ||||||||
1325 | Align Alignment, | ||||||||
1326 | TTI::TargetCostKind CostKind, | ||||||||
1327 | const Instruction *I = nullptr) { | ||||||||
1328 | return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask, | ||||||||
1329 | true, CostKind); | ||||||||
1330 | } | ||||||||
1331 | |||||||||
  /// Cost of an interleaved memory operation of \p Factor sub-vectors, for
  /// the member sub-vectors listed in \p Indices. The total is the (possibly
  /// masked) wide load/store cost, scaled by the fraction of legalized
  /// instructions actually used, plus the shuffle (scalarization) cost of
  /// (de)interleaving, plus mask-replication/And costs when masked.
  InstructionCost getInterleavedMemoryOpCost(
      unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
      Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
      bool UseMaskForCond = false, bool UseMaskForGaps = false) {

    // We cannot scalarize scalable vectors, so return Invalid.
    if (isa<ScalableVectorType>(VecTy))
      return InstructionCost::getInvalid();

    auto *VT = cast<FixedVectorType>(VecTy);

    unsigned NumElts = VT->getNumElements();
    assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");

    unsigned NumSubElts = NumElts / Factor;
    auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts);

    // Firstly, the cost of load/store operation.
    InstructionCost Cost;
    if (UseMaskForCond || UseMaskForGaps)
      Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment,
                                            AddressSpace, CostKind);
    else
      Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace,
                                      CostKind);

    // Legalize the vector type, and get the legalized and unlegalized type
    // sizes.
    MVT VecTyLT = getTypeLegalizationCost(VecTy).second;
    unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy);
    unsigned VecTyLTSize = VecTyLT.getStoreSize();

    // Scale the cost of the memory operation by the fraction of legalized
    // instructions that will actually be used. We shouldn't account for the
    // cost of dead instructions since they will be removed.
    //
    // E.g., An interleaved load of factor 8:
    //       %vec = load <16 x i64>, <16 x i64>* %ptr
    //       %v0 = shufflevector %vec, undef, <0, 8>
    //
    // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be
    // used (those corresponding to elements [0:1] and [8:9] of the unlegalized
    // type). The other loads are unused.
    //
    // TODO: Note that legalization can turn masked loads/stores into unmasked
    // (legalized) loads/stores. This can be reflected in the cost.
    if (Cost.isValid() && VecTySize > VecTyLTSize) {
      // The number of loads of a legal type it will take to represent a load
      // of the unlegalized vector type.
      unsigned NumLegalInsts = divideCeil(VecTySize, VecTyLTSize);

      // The number of elements of the unlegalized type that correspond to a
      // single legal instruction.
      unsigned NumEltsPerLegalInst = divideCeil(NumElts, NumLegalInsts);

      // Determine which legal instructions will be used.
      BitVector UsedInsts(NumLegalInsts, false);
      for (unsigned Index : Indices)
        for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
          UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);

      // Scale the cost of the load by the fraction of legal instructions that
      // will be used.
      Cost = divideCeil(UsedInsts.count() * *Cost.getValue(), NumLegalInsts);
    }

    // Then plus the cost of interleave operation.
    assert(Indices.size() <= Factor &&
           "Interleaved memory op has too many members");

    const APInt DemandedAllSubElts = APInt::getAllOnes(NumSubElts);
    const APInt DemandedAllResultElts = APInt::getAllOnes(NumElts);

    // Bitmask of wide-vector lanes touched by the requested members.
    APInt DemandedLoadStoreElts = APInt::getZero(NumElts);
    for (unsigned Index : Indices) {
      assert(Index < Factor && "Invalid index for interleaved memory op");
      for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
        DemandedLoadStoreElts.setBit(Index + Elm * Factor);
    }

    if (Opcode == Instruction::Load) {
      // The interleave cost is similar to extract sub vectors' elements
      // from the wide vector, and insert them into sub vectors.
      //
      // E.g. An interleaved load of factor 2 (with one member of index 0):
      //      %vec = load <8 x i32>, <8 x i32>* %ptr
      //      %v0 = shuffle %vec, undef, <0, 2, 4, 6>         ; Index 0
      // The cost is estimated as extract elements at 0, 2, 4, 6 from the
      // <8 x i32> vector and insert them into a <4 x i32> vector.
      InstructionCost InsSubCost = thisT()->getScalarizationOverhead(
          SubVT, DemandedAllSubElts,
          /*Insert*/ true, /*Extract*/ false, CostKind);
      Cost += Indices.size() * InsSubCost;
      Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
                                                /*Insert*/ false,
                                                /*Extract*/ true, CostKind);
    } else {
      // The interleave cost is extract elements from sub vectors, and
      // insert them into the wide vector.
      //
      // E.g. An interleaved store of factor 3 with 2 members at indices 0,1:
      // (using VF=4):
      //  %v0_v1 = shuffle %v0, %v1, <0,4,undef,1,5,undef,2,6,undef,3,7,undef>
      //  %gaps.mask = <true, true, false, true, true, false,
      //                true, true, false, true, true, false>
      //  call llvm.masked.store <12 x i32> %v0_v1, <12 x i32>* %ptr,
      //                         i32 Align, <12 x i1> %gaps.mask
      // The cost is estimated as extract all elements (of actual members,
      // excluding gaps) from both <4 x i32> vectors and insert into the <12 x
      // i32> vector.
      InstructionCost ExtSubCost = thisT()->getScalarizationOverhead(
          SubVT, DemandedAllSubElts,
          /*Insert*/ false, /*Extract*/ true, CostKind);
      Cost += ExtSubCost * Indices.size();
      Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
                                                /*Insert*/ true,
                                                /*Extract*/ false, CostKind);
    }

    if (!UseMaskForCond)
      return Cost;

    // Masked variant: the i1 member mask must be replicated across the wide
    // vector's lanes (modeled on i8 elements).
    Type *I8Type = Type::getInt8Ty(VT->getContext());

    Cost += thisT()->getReplicationShuffleCost(
        I8Type, Factor, NumSubElts,
        UseMaskForGaps ? DemandedLoadStoreElts : DemandedAllResultElts,
        CostKind);

    // The Gaps mask is invariant and created outside the loop, therefore the
    // cost of creating it is not accounted for here. However if we have both
    // a MaskForGaps and some other mask that guards the execution of the
    // memory access, we need to account for the cost of And-ing the two masks
    // inside the loop.
    if (UseMaskForGaps) {
      auto *MaskVT = FixedVectorType::get(I8Type, NumElts);
      Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT,
                                              CostKind);
    }

    return Cost;
  }
1474 | |||||||||
1475 | /// Get intrinsic cost based on arguments. | ||||||||
1476 | InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, | ||||||||
1477 | TTI::TargetCostKind CostKind) { | ||||||||
1478 | // Check for generically free intrinsics. | ||||||||
1479 | if (BaseT::getIntrinsicInstrCost(ICA, CostKind) == 0) | ||||||||
1480 | return 0; | ||||||||
1481 | |||||||||
1482 | // Assume that target intrinsics are cheap. | ||||||||
1483 | Intrinsic::ID IID = ICA.getID(); | ||||||||
1484 | if (Function::isTargetIntrinsic(IID)) | ||||||||
1485 | return TargetTransformInfo::TCC_Basic; | ||||||||
1486 | |||||||||
1487 | if (ICA.isTypeBasedOnly()) | ||||||||
1488 | return getTypeBasedIntrinsicInstrCost(ICA, CostKind); | ||||||||
1489 | |||||||||
1490 | Type *RetTy = ICA.getReturnType(); | ||||||||
1491 | |||||||||
1492 | ElementCount RetVF = | ||||||||
1493 | (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount() | ||||||||
1494 | : ElementCount::getFixed(1)); | ||||||||
1495 | const IntrinsicInst *I = ICA.getInst(); | ||||||||
1496 | const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); | ||||||||
1497 | FastMathFlags FMF = ICA.getFlags(); | ||||||||
1498 | switch (IID) { | ||||||||
1499 | default: | ||||||||
1500 | break; | ||||||||
1501 | |||||||||
1502 | case Intrinsic::powi: | ||||||||
1503 | if (auto *RHSC = dyn_cast<ConstantInt>(Args[1])) { | ||||||||
1504 | bool ShouldOptForSize = I->getParent()->getParent()->hasOptSize(); | ||||||||
1505 | if (getTLI()->isBeneficialToExpandPowI(RHSC->getSExtValue(), | ||||||||
1506 | ShouldOptForSize)) { | ||||||||
1507 | // The cost is modeled on the expansion performed by ExpandPowI in | ||||||||
1508 | // SelectionDAGBuilder. | ||||||||
1509 | APInt Exponent = RHSC->getValue().abs(); | ||||||||
1510 | unsigned ActiveBits = Exponent.getActiveBits(); | ||||||||
1511 | unsigned PopCount = Exponent.countPopulation(); | ||||||||
1512 | InstructionCost Cost = (ActiveBits + PopCount - 2) * | ||||||||
1513 | thisT()->getArithmeticInstrCost( | ||||||||
1514 | Instruction::FMul, RetTy, CostKind); | ||||||||
1515 | if (RHSC->getSExtValue() < 0) | ||||||||
1516 | Cost += thisT()->getArithmeticInstrCost(Instruction::FDiv, RetTy, | ||||||||
1517 | CostKind); | ||||||||
1518 | return Cost; | ||||||||
1519 | } | ||||||||
1520 | } | ||||||||
1521 | break; | ||||||||
1522 | case Intrinsic::cttz: | ||||||||
1523 | // FIXME: If necessary, this should go in target-specific overrides. | ||||||||
1524 | if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz(RetTy)) | ||||||||
1525 | return TargetTransformInfo::TCC_Basic; | ||||||||
1526 | break; | ||||||||
1527 | |||||||||
1528 | case Intrinsic::ctlz: | ||||||||
1529 | // FIXME: If necessary, this should go in target-specific overrides. | ||||||||
1530 | if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz(RetTy)) | ||||||||
1531 | return TargetTransformInfo::TCC_Basic; | ||||||||
1532 | break; | ||||||||
1533 | |||||||||
1534 | case Intrinsic::memcpy: | ||||||||
1535 | return thisT()->getMemcpyCost(ICA.getInst()); | ||||||||
1536 | |||||||||
1537 | case Intrinsic::masked_scatter: { | ||||||||
1538 | const Value *Mask = Args[3]; | ||||||||
1539 | bool VarMask = !isa<Constant>(Mask); | ||||||||
1540 | Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue(); | ||||||||
1541 | return thisT()->getGatherScatterOpCost(Instruction::Store, | ||||||||
1542 | ICA.getArgTypes()[0], Args[1], | ||||||||
1543 | VarMask, Alignment, CostKind, I); | ||||||||
1544 | } | ||||||||
1545 | case Intrinsic::masked_gather: { | ||||||||
1546 | const Value *Mask = Args[2]; | ||||||||
1547 | bool VarMask = !isa<Constant>(Mask); | ||||||||
1548 | Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue(); | ||||||||
1549 | return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0], | ||||||||
1550 | VarMask, Alignment, CostKind, I); | ||||||||
1551 | } | ||||||||
1552 | case Intrinsic::experimental_stepvector: { | ||||||||
1553 | if (isa<ScalableVectorType>(RetTy)) | ||||||||
1554 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); | ||||||||
1555 | // The cost of materialising a constant integer vector. | ||||||||
1556 | return TargetTransformInfo::TCC_Basic; | ||||||||
1557 | } | ||||||||
1558 | case Intrinsic::vector_extract: { | ||||||||
1559 | // FIXME: Handle case where a scalable vector is extracted from a scalable | ||||||||
1560 | // vector | ||||||||
1561 | if (isa<ScalableVectorType>(RetTy)) | ||||||||
1562 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); | ||||||||
1563 | unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue(); | ||||||||
1564 | return thisT()->getShuffleCost( | ||||||||
1565 | TTI::SK_ExtractSubvector, cast<VectorType>(Args[0]->getType()), | ||||||||
1566 | std::nullopt, CostKind, Index, cast<VectorType>(RetTy)); | ||||||||
1567 | } | ||||||||
1568 | case Intrinsic::vector_insert: { | ||||||||
1569 | // FIXME: Handle case where a scalable vector is inserted into a scalable | ||||||||
1570 | // vector | ||||||||
1571 | if (isa<ScalableVectorType>(Args[1]->getType())) | ||||||||
1572 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); | ||||||||
1573 | unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue(); | ||||||||
1574 | return thisT()->getShuffleCost( | ||||||||
1575 | TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()), | ||||||||
1576 | std::nullopt, CostKind, Index, cast<VectorType>(Args[1]->getType())); | ||||||||
1577 | } | ||||||||
1578 | case Intrinsic::experimental_vector_reverse: { | ||||||||
1579 | return thisT()->getShuffleCost( | ||||||||
1580 | TTI::SK_Reverse, cast<VectorType>(Args[0]->getType()), std::nullopt, | ||||||||
1581 | CostKind, 0, cast<VectorType>(RetTy)); | ||||||||
1582 | } | ||||||||
1583 | case Intrinsic::experimental_vector_splice: { | ||||||||
1584 | unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue(); | ||||||||
1585 | return thisT()->getShuffleCost( | ||||||||
1586 | TTI::SK_Splice, cast<VectorType>(Args[0]->getType()), std::nullopt, | ||||||||
1587 | CostKind, Index, cast<VectorType>(RetTy)); | ||||||||
1588 | } | ||||||||
1589 | case Intrinsic::vector_reduce_add: | ||||||||
1590 | case Intrinsic::vector_reduce_mul: | ||||||||
1591 | case Intrinsic::vector_reduce_and: | ||||||||
1592 | case Intrinsic::vector_reduce_or: | ||||||||
1593 | case Intrinsic::vector_reduce_xor: | ||||||||
1594 | case Intrinsic::vector_reduce_smax: | ||||||||
1595 | case Intrinsic::vector_reduce_smin: | ||||||||
1596 | case Intrinsic::vector_reduce_fmax: | ||||||||
1597 | case Intrinsic::vector_reduce_fmin: | ||||||||
1598 | case Intrinsic::vector_reduce_umax: | ||||||||
1599 | case Intrinsic::vector_reduce_umin: { | ||||||||
1600 | IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1); | ||||||||
1601 | return getTypeBasedIntrinsicInstrCost(Attrs, CostKind); | ||||||||
1602 | } | ||||||||
1603 | case Intrinsic::vector_reduce_fadd: | ||||||||
1604 | case Intrinsic::vector_reduce_fmul: { | ||||||||
1605 | IntrinsicCostAttributes Attrs( | ||||||||
1606 | IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1); | ||||||||
1607 | return getTypeBasedIntrinsicInstrCost(Attrs, CostKind); | ||||||||
1608 | } | ||||||||
1609 | case Intrinsic::fshl: | ||||||||
1610 | case Intrinsic::fshr: { | ||||||||
1611 | const Value *X = Args[0]; | ||||||||
1612 | const Value *Y = Args[1]; | ||||||||
1613 | const Value *Z = Args[2]; | ||||||||
1614 | const TTI::OperandValueInfo OpInfoX = TTI::getOperandInfo(X); | ||||||||
1615 | const TTI::OperandValueInfo OpInfoY = TTI::getOperandInfo(Y); | ||||||||
1616 | const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(Z); | ||||||||
1617 | const TTI::OperandValueInfo OpInfoBW = | ||||||||
1618 | {TTI::OK_UniformConstantValue, | ||||||||
1619 | isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2 | ||||||||
1620 | : TTI::OP_None}; | ||||||||
1621 | |||||||||
1622 | // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW))) | ||||||||
1623 | // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW)) | ||||||||
1624 | InstructionCost Cost = 0; | ||||||||
1625 | Cost += | ||||||||
1626 | thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind); | ||||||||
1627 | Cost += | ||||||||
1628 | thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind); | ||||||||
1629 | Cost += thisT()->getArithmeticInstrCost( | ||||||||
1630 | BinaryOperator::Shl, RetTy, CostKind, OpInfoX, | ||||||||
1631 | {OpInfoZ.Kind, TTI::OP_None}); | ||||||||
1632 | Cost += thisT()->getArithmeticInstrCost( | ||||||||
1633 | BinaryOperator::LShr, RetTy, CostKind, OpInfoY, | ||||||||
1634 | {OpInfoZ.Kind, TTI::OP_None}); | ||||||||
1635 | // Non-constant shift amounts requires a modulo. | ||||||||
1636 | if (!OpInfoZ.isConstant()) | ||||||||
1637 | Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy, | ||||||||
1638 | CostKind, OpInfoZ, OpInfoBW); | ||||||||
1639 | // For non-rotates (X != Y) we must add shift-by-zero handling costs. | ||||||||
1640 | if (X != Y) { | ||||||||
1641 | Type *CondTy = RetTy->getWithNewBitWidth(1); | ||||||||
1642 | Cost += | ||||||||
1643 | thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, | ||||||||
1644 | CmpInst::ICMP_EQ, CostKind); | ||||||||
1645 | Cost += | ||||||||
1646 | thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, | ||||||||
1647 | CmpInst::ICMP_EQ, CostKind); | ||||||||
1648 | } | ||||||||
1649 | return Cost; | ||||||||
1650 | } | ||||||||
1651 | case Intrinsic::get_active_lane_mask: { | ||||||||
1652 | EVT ResVT = getTLI()->getValueType(DL, RetTy, true); | ||||||||
1653 | EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true); | ||||||||
1654 | |||||||||
1655 | // If we're not expanding the intrinsic then we assume this is cheap | ||||||||
1656 | // to implement. | ||||||||
1657 | if (!getTLI()->shouldExpandGetActiveLaneMask(ResVT, ArgType)) { | ||||||||
1658 | return getTypeLegalizationCost(RetTy).first; | ||||||||
1659 | } | ||||||||
1660 | |||||||||
1661 | // Create the expanded types that will be used to calculate the uadd_sat | ||||||||
1662 | // operation. | ||||||||
1663 | Type *ExpRetTy = VectorType::get( | ||||||||
1664 | ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount()); | ||||||||
1665 | IntrinsicCostAttributes Attrs(Intrinsic::uadd_sat, ExpRetTy, {}, FMF); | ||||||||
1666 | InstructionCost Cost = | ||||||||
1667 | thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind); | ||||||||
1668 | Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, ExpRetTy, RetTy, | ||||||||
1669 | CmpInst::ICMP_ULT, CostKind); | ||||||||
1670 | return Cost; | ||||||||
1671 | } | ||||||||
1672 | } | ||||||||
1673 | |||||||||
1674 | // Assume that we need to scalarize this intrinsic. | ||||||||
1675 | // Compute the scalarization overhead based on Args for a vector | ||||||||
1676 | // intrinsic. | ||||||||
1677 | InstructionCost ScalarizationCost = InstructionCost::getInvalid(); | ||||||||
1678 | if (RetVF.isVector() && !RetVF.isScalable()) { | ||||||||
1679 | ScalarizationCost = 0; | ||||||||
1680 | if (!RetTy->isVoidTy()) | ||||||||
1681 | ScalarizationCost += getScalarizationOverhead( | ||||||||
1682 | cast<VectorType>(RetTy), | ||||||||
1683 | /*Insert*/ true, /*Extract*/ false, CostKind); | ||||||||
1684 | ScalarizationCost += | ||||||||
1685 | getOperandsScalarizationOverhead(Args, ICA.getArgTypes(), CostKind); | ||||||||
1686 | } | ||||||||
1687 | |||||||||
1688 | IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I, | ||||||||
1689 | ScalarizationCost); | ||||||||
1690 | return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind); | ||||||||
1691 | } | ||||||||
1692 | |||||||||
1693 | /// Get intrinsic cost based on argument types. | ||||||||
1694 | /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the | ||||||||
1695 | /// cost of scalarizing the arguments and the return value will be computed | ||||||||
1696 | /// based on types. | ||||||||
1697 | InstructionCost | ||||||||
1698 | getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, | ||||||||
1699 | TTI::TargetCostKind CostKind) { | ||||||||
1700 | Intrinsic::ID IID = ICA.getID(); | ||||||||
1701 | Type *RetTy = ICA.getReturnType(); | ||||||||
1702 | const SmallVectorImpl<Type *> &Tys = ICA.getArgTypes(); | ||||||||
1703 | FastMathFlags FMF = ICA.getFlags(); | ||||||||
1704 | InstructionCost ScalarizationCostPassed = ICA.getScalarizationCost(); | ||||||||
1705 | bool SkipScalarizationCost = ICA.skipScalarizationCost(); | ||||||||
1706 | |||||||||
1707 | VectorType *VecOpTy = nullptr; | ||||||||
1708 | if (!Tys.empty()) { | ||||||||
1709 | // The vector reduction operand is operand 0 except for fadd/fmul. | ||||||||
1710 | // Their operand 0 is a scalar start value, so the vector op is operand 1. | ||||||||
1711 | unsigned VecTyIndex = 0; | ||||||||
1712 | if (IID == Intrinsic::vector_reduce_fadd || | ||||||||
1713 | IID == Intrinsic::vector_reduce_fmul) | ||||||||
1714 | VecTyIndex = 1; | ||||||||
1715 | assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes")(static_cast <bool> (Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes") ? void (0) : __assert_fail ("Tys.size() > VecTyIndex && \"Unexpected IntrinsicCostAttributes\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 1715, __extension__ __PRETTY_FUNCTION__)); | ||||||||
1716 | VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]); | ||||||||
1717 | } | ||||||||
1718 | |||||||||
1719 | // Library call cost - other than size, make it expensive. | ||||||||
1720 | unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10; | ||||||||
1721 | unsigned ISD = 0; | ||||||||
1722 | switch (IID) { | ||||||||
1723 | default: { | ||||||||
1724 | // Scalable vectors cannot be scalarized, so return Invalid. | ||||||||
1725 | if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) { | ||||||||
1726 | return isa<ScalableVectorType>(Ty); | ||||||||
1727 | })) | ||||||||
1728 | return InstructionCost::getInvalid(); | ||||||||
1729 | |||||||||
1730 | // Assume that we need to scalarize this intrinsic. | ||||||||
1731 | InstructionCost ScalarizationCost = | ||||||||
1732 | SkipScalarizationCost ? ScalarizationCostPassed : 0; | ||||||||
1733 | unsigned ScalarCalls = 1; | ||||||||
1734 | Type *ScalarRetTy = RetTy; | ||||||||
1735 | if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) { | ||||||||
1736 | if (!SkipScalarizationCost) | ||||||||
1737 | ScalarizationCost = getScalarizationOverhead( | ||||||||
1738 | RetVTy, /*Insert*/ true, /*Extract*/ false, CostKind); | ||||||||
1739 | ScalarCalls = std::max(ScalarCalls, | ||||||||
1740 | cast<FixedVectorType>(RetVTy)->getNumElements()); | ||||||||
1741 | ScalarRetTy = RetTy->getScalarType(); | ||||||||
1742 | } | ||||||||
1743 | SmallVector<Type *, 4> ScalarTys; | ||||||||
1744 | for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { | ||||||||
1745 | Type *Ty = Tys[i]; | ||||||||
1746 | if (auto *VTy = dyn_cast<VectorType>(Ty)) { | ||||||||
1747 | if (!SkipScalarizationCost) | ||||||||
1748 | ScalarizationCost += getScalarizationOverhead( | ||||||||
1749 | VTy, /*Insert*/ false, /*Extract*/ true, CostKind); | ||||||||
1750 | ScalarCalls = std::max(ScalarCalls, | ||||||||
1751 | cast<FixedVectorType>(VTy)->getNumElements()); | ||||||||
1752 | Ty = Ty->getScalarType(); | ||||||||
1753 | } | ||||||||
1754 | ScalarTys.push_back(Ty); | ||||||||
1755 | } | ||||||||
1756 | if (ScalarCalls == 1) | ||||||||
1757 | return 1; // Return cost of a scalar intrinsic. Assume it to be cheap. | ||||||||
1758 | |||||||||
1759 | IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF); | ||||||||
1760 | InstructionCost ScalarCost = | ||||||||
1761 | thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind); | ||||||||
1762 | |||||||||
1763 | return ScalarCalls * ScalarCost + ScalarizationCost; | ||||||||
1764 | } | ||||||||
1765 | // Look for intrinsics that can be lowered directly or turned into a scalar | ||||||||
1766 | // intrinsic call. | ||||||||
1767 | case Intrinsic::sqrt: | ||||||||
1768 | ISD = ISD::FSQRT; | ||||||||
1769 | break; | ||||||||
1770 | case Intrinsic::sin: | ||||||||
1771 | ISD = ISD::FSIN; | ||||||||
1772 | break; | ||||||||
1773 | case Intrinsic::cos: | ||||||||
1774 | ISD = ISD::FCOS; | ||||||||
1775 | break; | ||||||||
1776 | case Intrinsic::exp: | ||||||||
1777 | ISD = ISD::FEXP; | ||||||||
1778 | break; | ||||||||
1779 | case Intrinsic::exp2: | ||||||||
1780 | ISD = ISD::FEXP2; | ||||||||
1781 | break; | ||||||||
1782 | case Intrinsic::log: | ||||||||
1783 | ISD = ISD::FLOG; | ||||||||
1784 | break; | ||||||||
1785 | case Intrinsic::log10: | ||||||||
1786 | ISD = ISD::FLOG10; | ||||||||
1787 | break; | ||||||||
1788 | case Intrinsic::log2: | ||||||||
1789 | ISD = ISD::FLOG2; | ||||||||
1790 | break; | ||||||||
1791 | case Intrinsic::fabs: | ||||||||
1792 | ISD = ISD::FABS; | ||||||||
1793 | break; | ||||||||
1794 | case Intrinsic::canonicalize: | ||||||||
1795 | ISD = ISD::FCANONICALIZE; | ||||||||
1796 | break; | ||||||||
1797 | case Intrinsic::minnum: | ||||||||
1798 | ISD = ISD::FMINNUM; | ||||||||
1799 | break; | ||||||||
1800 | case Intrinsic::maxnum: | ||||||||
1801 | ISD = ISD::FMAXNUM; | ||||||||
1802 | break; | ||||||||
1803 | case Intrinsic::minimum: | ||||||||
1804 | ISD = ISD::FMINIMUM; | ||||||||
1805 | break; | ||||||||
1806 | case Intrinsic::maximum: | ||||||||
1807 | ISD = ISD::FMAXIMUM; | ||||||||
1808 | break; | ||||||||
1809 | case Intrinsic::copysign: | ||||||||
1810 | ISD = ISD::FCOPYSIGN; | ||||||||
1811 | break; | ||||||||
1812 | case Intrinsic::floor: | ||||||||
1813 | ISD = ISD::FFLOOR; | ||||||||
1814 | break; | ||||||||
1815 | case Intrinsic::ceil: | ||||||||
1816 | ISD = ISD::FCEIL; | ||||||||
1817 | break; | ||||||||
1818 | case Intrinsic::trunc: | ||||||||
1819 | ISD = ISD::FTRUNC; | ||||||||
1820 | break; | ||||||||
1821 | case Intrinsic::nearbyint: | ||||||||
1822 | ISD = ISD::FNEARBYINT; | ||||||||
1823 | break; | ||||||||
1824 | case Intrinsic::rint: | ||||||||
1825 | ISD = ISD::FRINT; | ||||||||
1826 | break; | ||||||||
1827 | case Intrinsic::round: | ||||||||
1828 | ISD = ISD::FROUND; | ||||||||
1829 | break; | ||||||||
1830 | case Intrinsic::roundeven: | ||||||||
1831 | ISD = ISD::FROUNDEVEN; | ||||||||
1832 | break; | ||||||||
1833 | case Intrinsic::pow: | ||||||||
1834 | ISD = ISD::FPOW; | ||||||||
1835 | break; | ||||||||
1836 | case Intrinsic::fma: | ||||||||
1837 | ISD = ISD::FMA; | ||||||||
1838 | break; | ||||||||
1839 | case Intrinsic::fmuladd: | ||||||||
1840 | ISD = ISD::FMA; | ||||||||
1841 | break; | ||||||||
1842 | case Intrinsic::experimental_constrained_fmuladd: | ||||||||
1843 | ISD = ISD::STRICT_FMA; | ||||||||
1844 | break; | ||||||||
1845 | // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free. | ||||||||
1846 | case Intrinsic::lifetime_start: | ||||||||
1847 | case Intrinsic::lifetime_end: | ||||||||
1848 | case Intrinsic::sideeffect: | ||||||||
1849 | case Intrinsic::pseudoprobe: | ||||||||
1850 | case Intrinsic::arithmetic_fence: | ||||||||
1851 | return 0; | ||||||||
1852 | case Intrinsic::masked_store: { | ||||||||
1853 | Type *Ty = Tys[0]; | ||||||||
1854 | Align TyAlign = thisT()->DL.getABITypeAlign(Ty); | ||||||||
1855 | return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0, | ||||||||
1856 | CostKind); | ||||||||
1857 | } | ||||||||
1858 | case Intrinsic::masked_load: { | ||||||||
1859 | Type *Ty = RetTy; | ||||||||
1860 | Align TyAlign = thisT()->DL.getABITypeAlign(Ty); | ||||||||
1861 | return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0, | ||||||||
1862 | CostKind); | ||||||||
1863 | } | ||||||||
1864 | case Intrinsic::vector_reduce_add: | ||||||||
1865 | return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy, | ||||||||
1866 | std::nullopt, CostKind); | ||||||||
1867 | case Intrinsic::vector_reduce_mul: | ||||||||
1868 | return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy, | ||||||||
1869 | std::nullopt, CostKind); | ||||||||
1870 | case Intrinsic::vector_reduce_and: | ||||||||
1871 | return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy, | ||||||||
1872 | std::nullopt, CostKind); | ||||||||
1873 | case Intrinsic::vector_reduce_or: | ||||||||
1874 | return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy, | ||||||||
1875 | std::nullopt, CostKind); | ||||||||
1876 | case Intrinsic::vector_reduce_xor: | ||||||||
1877 | return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy, | ||||||||
1878 | std::nullopt, CostKind); | ||||||||
1879 | case Intrinsic::vector_reduce_fadd: | ||||||||
1880 | return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy, | ||||||||
1881 | FMF, CostKind); | ||||||||
1882 | case Intrinsic::vector_reduce_fmul: | ||||||||
1883 | return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy, | ||||||||
1884 | FMF, CostKind); | ||||||||
1885 | case Intrinsic::vector_reduce_smax: | ||||||||
1886 | case Intrinsic::vector_reduce_smin: | ||||||||
1887 | case Intrinsic::vector_reduce_fmax: | ||||||||
1888 | case Intrinsic::vector_reduce_fmin: | ||||||||
1889 | return thisT()->getMinMaxReductionCost( | ||||||||
1890 | VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)), | ||||||||
1891 | /*IsUnsigned=*/false, CostKind); | ||||||||
1892 | case Intrinsic::vector_reduce_umax: | ||||||||
1893 | case Intrinsic::vector_reduce_umin: | ||||||||
1894 | return thisT()->getMinMaxReductionCost( | ||||||||
1895 | VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)), | ||||||||
1896 | /*IsUnsigned=*/true, CostKind); | ||||||||
1897 | case Intrinsic::abs: { | ||||||||
1898 | // abs(X) = select(icmp(X,0),X,sub(0,X)) | ||||||||
1899 | Type *CondTy = RetTy->getWithNewBitWidth(1); | ||||||||
1900 | CmpInst::Predicate Pred = CmpInst::ICMP_SGT; | ||||||||
1901 | InstructionCost Cost = 0; | ||||||||
1902 | Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, | ||||||||
1903 | Pred, CostKind); | ||||||||
1904 | Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, | ||||||||
1905 | Pred, CostKind); | ||||||||
1906 | // TODO: Should we add an OperandValueProperties::OP_Zero property? | ||||||||
1907 | Cost += thisT()->getArithmeticInstrCost( | ||||||||
1908 | BinaryOperator::Sub, RetTy, CostKind, {TTI::OK_UniformConstantValue, TTI::OP_None}); | ||||||||
1909 | return Cost; | ||||||||
1910 | } | ||||||||
1911 | case Intrinsic::smax: | ||||||||
1912 | case Intrinsic::smin: | ||||||||
1913 | case Intrinsic::umax: | ||||||||
1914 | case Intrinsic::umin: { | ||||||||
1915 | // minmax(X,Y) = select(icmp(X,Y),X,Y) | ||||||||
1916 | Type *CondTy = RetTy->getWithNewBitWidth(1); | ||||||||
1917 | bool IsUnsigned = IID == Intrinsic::umax || IID == Intrinsic::umin; | ||||||||
1918 | CmpInst::Predicate Pred = | ||||||||
1919 | IsUnsigned ? CmpInst::ICMP_UGT : CmpInst::ICMP_SGT; | ||||||||
1920 | InstructionCost Cost = 0; | ||||||||
1921 | Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, | ||||||||
1922 | Pred, CostKind); | ||||||||
1923 | Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, | ||||||||
1924 | Pred, CostKind); | ||||||||
1925 | return Cost; | ||||||||
1926 | } | ||||||||
1927 | case Intrinsic::sadd_sat: | ||||||||
1928 | case Intrinsic::ssub_sat: { | ||||||||
1929 | Type *CondTy = RetTy->getWithNewBitWidth(1); | ||||||||
1930 | |||||||||
1931 | Type *OpTy = StructType::create({RetTy, CondTy}); | ||||||||
1932 | Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat | ||||||||
1933 | ? Intrinsic::sadd_with_overflow | ||||||||
1934 | : Intrinsic::ssub_with_overflow; | ||||||||
1935 | CmpInst::Predicate Pred = CmpInst::ICMP_SGT; | ||||||||
1936 | |||||||||
1937 | // SatMax -> Overflow && SumDiff < 0 | ||||||||
1938 | // SatMin -> Overflow && SumDiff >= 0 | ||||||||
1939 | InstructionCost Cost = 0; | ||||||||
1940 | IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF, | ||||||||
1941 | nullptr, ScalarizationCostPassed); | ||||||||
1942 | Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind); | ||||||||
1943 | Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, | ||||||||
1944 | Pred, CostKind); | ||||||||
1945 | Cost += 2 * thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, | ||||||||
1946 | CondTy, Pred, CostKind); | ||||||||
1947 | return Cost; | ||||||||
1948 | } | ||||||||
1949 | case Intrinsic::uadd_sat: | ||||||||
1950 | case Intrinsic::usub_sat: { | ||||||||
1951 | Type *CondTy = RetTy->getWithNewBitWidth(1); | ||||||||
1952 | |||||||||
1953 | Type *OpTy = StructType::create({RetTy, CondTy}); | ||||||||
1954 | Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat | ||||||||
1955 | ? Intrinsic::uadd_with_overflow | ||||||||
1956 | : Intrinsic::usub_with_overflow; | ||||||||
1957 | |||||||||
1958 | InstructionCost Cost = 0; | ||||||||
1959 | IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF, | ||||||||
1960 | nullptr, ScalarizationCostPassed); | ||||||||
1961 | Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind); | ||||||||
1962 | Cost += | ||||||||
1963 | thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, | ||||||||
1964 | CmpInst::BAD_ICMP_PREDICATE, CostKind); | ||||||||
1965 | return Cost; | ||||||||
1966 | } | ||||||||
1967 | case Intrinsic::smul_fix: | ||||||||
1968 | case Intrinsic::umul_fix: { | ||||||||
1969 | unsigned ExtSize = RetTy->getScalarSizeInBits() * 2; | ||||||||
1970 | Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize); | ||||||||
1971 | |||||||||
1972 | unsigned ExtOp = | ||||||||
1973 | IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt; | ||||||||
1974 | TTI::CastContextHint CCH = TTI::CastContextHint::None; | ||||||||
1975 | |||||||||
1976 | InstructionCost Cost = 0; | ||||||||
1977 | Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind); | ||||||||
1978 | Cost += | ||||||||
1979 | thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); | ||||||||
1980 | Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy, | ||||||||
1981 | CCH, CostKind); | ||||||||
1982 | Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, RetTy, | ||||||||
1983 | CostKind, | ||||||||
1984 | {TTI::OK_AnyValue, TTI::OP_None}, | ||||||||
1985 | {TTI::OK_UniformConstantValue, TTI::OP_None}); | ||||||||
1986 | Cost += thisT()->getArithmeticInstrCost(Instruction::Shl, RetTy, CostKind, | ||||||||
1987 | {TTI::OK_AnyValue, TTI::OP_None}, | ||||||||
1988 | {TTI::OK_UniformConstantValue, TTI::OP_None}); | ||||||||
1989 | Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind); | ||||||||
1990 | return Cost; | ||||||||
1991 | } | ||||||||
1992 | case Intrinsic::sadd_with_overflow: | ||||||||
1993 | case Intrinsic::ssub_with_overflow: { | ||||||||
1994 | Type *SumTy = RetTy->getContainedType(0); | ||||||||
1995 | Type *OverflowTy = RetTy->getContainedType(1); | ||||||||
1996 | unsigned Opcode = IID == Intrinsic::sadd_with_overflow | ||||||||
1997 | ? BinaryOperator::Add | ||||||||
1998 | : BinaryOperator::Sub; | ||||||||
1999 | |||||||||
2000 | // Add: | ||||||||
2001 | // Overflow -> (Result < LHS) ^ (RHS < 0) | ||||||||
2002 | // Sub: | ||||||||
2003 | // Overflow -> (Result < LHS) ^ (RHS > 0) | ||||||||
2004 | InstructionCost Cost = 0; | ||||||||
2005 | Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind); | ||||||||
2006 | Cost += 2 * thisT()->getCmpSelInstrCost( | ||||||||
2007 | Instruction::ICmp, SumTy, OverflowTy, | ||||||||
2008 | CmpInst::ICMP_SGT, CostKind); | ||||||||
2009 | Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Xor, OverflowTy, | ||||||||
2010 | CostKind); | ||||||||
2011 | return Cost; | ||||||||
2012 | } | ||||||||
2013 | case Intrinsic::uadd_with_overflow: | ||||||||
2014 | case Intrinsic::usub_with_overflow: { | ||||||||
2015 | Type *SumTy = RetTy->getContainedType(0); | ||||||||
2016 | Type *OverflowTy = RetTy->getContainedType(1); | ||||||||
2017 | unsigned Opcode = IID == Intrinsic::uadd_with_overflow | ||||||||
2018 | ? BinaryOperator::Add | ||||||||
2019 | : BinaryOperator::Sub; | ||||||||
2020 | CmpInst::Predicate Pred = IID == Intrinsic::uadd_with_overflow | ||||||||
2021 | ? CmpInst::ICMP_ULT | ||||||||
2022 | : CmpInst::ICMP_UGT; | ||||||||
2023 | |||||||||
2024 | InstructionCost Cost = 0; | ||||||||
2025 | Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind); | ||||||||
2026 | Cost += | ||||||||
2027 | thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, OverflowTy, | ||||||||
2028 | Pred, CostKind); | ||||||||
2029 | return Cost; | ||||||||
2030 | } | ||||||||
2031 | case Intrinsic::smul_with_overflow: | ||||||||
2032 | case Intrinsic::umul_with_overflow: { | ||||||||
2033 | Type *MulTy = RetTy->getContainedType(0); | ||||||||
2034 | Type *OverflowTy = RetTy->getContainedType(1); | ||||||||
2035 | unsigned ExtSize = MulTy->getScalarSizeInBits() * 2; | ||||||||
2036 | Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize); | ||||||||
2037 | bool IsSigned = IID == Intrinsic::smul_with_overflow; | ||||||||
2038 | |||||||||
2039 | unsigned ExtOp = IsSigned ? Instruction::SExt : Instruction::ZExt; | ||||||||
2040 | TTI::CastContextHint CCH = TTI::CastContextHint::None; | ||||||||
2041 | |||||||||
2042 | InstructionCost Cost = 0; | ||||||||
2043 | Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind); | ||||||||
2044 | Cost += | ||||||||
2045 | thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); | ||||||||
2046 | Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy, | ||||||||
2047 | CCH, CostKind); | ||||||||
2048 | Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, ExtTy, | ||||||||
2049 | CostKind, | ||||||||
2050 | {TTI::OK_AnyValue, TTI::OP_None}, | ||||||||
2051 | {TTI::OK_UniformConstantValue, TTI::OP_None}); | ||||||||
2052 | |||||||||
2053 | if (IsSigned) | ||||||||
2054 | Cost += thisT()->getArithmeticInstrCost(Instruction::AShr, MulTy, | ||||||||
2055 | CostKind, | ||||||||
2056 | {TTI::OK_AnyValue, TTI::OP_None}, | ||||||||
2057 | {TTI::OK_UniformConstantValue, TTI::OP_None}); | ||||||||
2058 | |||||||||
2059 | Cost += thisT()->getCmpSelInstrCost( | ||||||||
2060 | BinaryOperator::ICmp, MulTy, OverflowTy, CmpInst::ICMP_NE, CostKind); | ||||||||
2061 | return Cost; | ||||||||
2062 | } | ||||||||
2063 | case Intrinsic::fptosi_sat: | ||||||||
2064 | case Intrinsic::fptoui_sat: { | ||||||||
2065 | if (Tys.empty()) | ||||||||
2066 | break; | ||||||||
2067 | Type *FromTy = Tys[0]; | ||||||||
2068 | bool IsSigned = IID == Intrinsic::fptosi_sat; | ||||||||
2069 | |||||||||
2070 | InstructionCost Cost = 0; | ||||||||
2071 | IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FromTy, | ||||||||
2072 | {FromTy, FromTy}); | ||||||||
2073 | Cost += thisT()->getIntrinsicInstrCost(Attrs1, CostKind); | ||||||||
2074 | IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FromTy, | ||||||||
2075 | {FromTy, FromTy}); | ||||||||
2076 | Cost += thisT()->getIntrinsicInstrCost(Attrs2, CostKind); | ||||||||
2077 | Cost += thisT()->getCastInstrCost( | ||||||||
2078 | IsSigned ? Instruction::FPToSI : Instruction::FPToUI, RetTy, FromTy, | ||||||||
2079 | TTI::CastContextHint::None, CostKind); | ||||||||
2080 | if (IsSigned) { | ||||||||
2081 | Type *CondTy = RetTy->getWithNewBitWidth(1); | ||||||||
2082 | Cost += thisT()->getCmpSelInstrCost( | ||||||||
2083 | BinaryOperator::FCmp, FromTy, CondTy, CmpInst::FCMP_UNO, CostKind); | ||||||||
2084 | Cost += thisT()->getCmpSelInstrCost( | ||||||||
2085 | BinaryOperator::Select, RetTy, CondTy, CmpInst::FCMP_UNO, CostKind); | ||||||||
2086 | } | ||||||||
2087 | return Cost; | ||||||||
2088 | } | ||||||||
2089 | case Intrinsic::ctpop: | ||||||||
2090 | ISD = ISD::CTPOP; | ||||||||
2091 | // In case of legalization use TCC_Expensive. This is cheaper than a | ||||||||
2092 | // library call but still not a cheap instruction. | ||||||||
2093 | SingleCallCost = TargetTransformInfo::TCC_Expensive; | ||||||||
2094 | break; | ||||||||
2095 | case Intrinsic::ctlz: | ||||||||
2096 | ISD = ISD::CTLZ; | ||||||||
2097 | break; | ||||||||
2098 | case Intrinsic::cttz: | ||||||||
2099 | ISD = ISD::CTTZ; | ||||||||
2100 | break; | ||||||||
2101 | case Intrinsic::bswap: | ||||||||
2102 | ISD = ISD::BSWAP; | ||||||||
2103 | break; | ||||||||
2104 | case Intrinsic::bitreverse: | ||||||||
2105 | ISD = ISD::BITREVERSE; | ||||||||
2106 | break; | ||||||||
2107 | } | ||||||||
2108 | |||||||||
2109 | const TargetLoweringBase *TLI = getTLI(); | ||||||||
2110 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy); | ||||||||
2111 | |||||||||
2112 | if (TLI->isOperationLegalOrPromote(ISD, LT.second)) { | ||||||||
2113 | if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() && | ||||||||
2114 | TLI->isFAbsFree(LT.second)) { | ||||||||
2115 | return 0; | ||||||||
2116 | } | ||||||||
2117 | |||||||||
2118 | // The operation is legal. Assume it costs 1. | ||||||||
2119 | // If the type is split to multiple registers, assume that there is some | ||||||||
2120 | // overhead to this. | ||||||||
2121 | // TODO: Once we have extract/insert subvector cost we need to use them. | ||||||||
2122 | if (LT.first > 1) | ||||||||
2123 | return (LT.first * 2); | ||||||||
2124 | else | ||||||||
2125 | return (LT.first * 1); | ||||||||
2126 | } else if (!TLI->isOperationExpand(ISD, LT.second)) { | ||||||||
2127 | // If the operation is custom lowered then assume | ||||||||
2128 | // that the code is twice as expensive. | ||||||||
2129 | return (LT.first * 2); | ||||||||
2130 | } | ||||||||
2131 | |||||||||
2132 | // If we can't lower fmuladd into an FMA estimate the cost as a floating | ||||||||
2133 | // point mul followed by an add. | ||||||||
2134 | if (IID == Intrinsic::fmuladd) | ||||||||
2135 | return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy, | ||||||||
2136 | CostKind) + | ||||||||
2137 | thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy, | ||||||||
2138 | CostKind); | ||||||||
2139 | if (IID == Intrinsic::experimental_constrained_fmuladd) { | ||||||||
2140 | IntrinsicCostAttributes FMulAttrs( | ||||||||
2141 | Intrinsic::experimental_constrained_fmul, RetTy, Tys); | ||||||||
2142 | IntrinsicCostAttributes FAddAttrs( | ||||||||
2143 | Intrinsic::experimental_constrained_fadd, RetTy, Tys); | ||||||||
2144 | return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) + | ||||||||
2145 | thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind); | ||||||||
2146 | } | ||||||||
2147 | |||||||||
2148 | // Else, assume that we need to scalarize this intrinsic. For math builtins | ||||||||
2149 | // this will emit a costly libcall, adding call overhead and spills. Make it | ||||||||
2150 | // very expensive. | ||||||||
2151 | if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) { | ||||||||
2152 | // Scalable vectors cannot be scalarized, so return Invalid. | ||||||||
2153 | if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) { | ||||||||
2154 | return isa<ScalableVectorType>(Ty); | ||||||||
2155 | })) | ||||||||
2156 | return InstructionCost::getInvalid(); | ||||||||
2157 | |||||||||
2158 | InstructionCost ScalarizationCost = | ||||||||
2159 | SkipScalarizationCost | ||||||||
2160 | ? ScalarizationCostPassed | ||||||||
2161 | : getScalarizationOverhead(RetVTy, /*Insert*/ true, | ||||||||
2162 | /*Extract*/ false, CostKind); | ||||||||
2163 | |||||||||
2164 | unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements(); | ||||||||
2165 | SmallVector<Type *, 4> ScalarTys; | ||||||||
2166 | for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { | ||||||||
2167 | Type *Ty = Tys[i]; | ||||||||
2168 | if (Ty->isVectorTy()) | ||||||||
2169 | Ty = Ty->getScalarType(); | ||||||||
2170 | ScalarTys.push_back(Ty); | ||||||||
2171 | } | ||||||||
2172 | IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF); | ||||||||
2173 | InstructionCost ScalarCost = | ||||||||
2174 | thisT()->getIntrinsicInstrCost(Attrs, CostKind); | ||||||||
2175 | for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { | ||||||||
2176 | if (auto *VTy = dyn_cast<VectorType>(Tys[i])) { | ||||||||
2177 | if (!ICA.skipScalarizationCost()) | ||||||||
2178 | ScalarizationCost += getScalarizationOverhead( | ||||||||
2179 | VTy, /*Insert*/ false, /*Extract*/ true, CostKind); | ||||||||
2180 | ScalarCalls = std::max(ScalarCalls, | ||||||||
2181 | cast<FixedVectorType>(VTy)->getNumElements()); | ||||||||
2182 | } | ||||||||
2183 | } | ||||||||
2184 | return ScalarCalls * ScalarCost + ScalarizationCost; | ||||||||
2185 | } | ||||||||
2186 | |||||||||
2187 | // This is going to be turned into a library call, make it expensive. | ||||||||
2188 | return SingleCallCost; | ||||||||
2189 | } | ||||||||
2190 | |||||||||
2191 | /// Compute a cost of the given call instruction. | ||||||||
2192 | /// | ||||||||
2193 | /// Compute the cost of calling function F with return type RetTy and | ||||||||
2194 | /// argument types Tys. F might be nullptr, in this case the cost of an | ||||||||
2195 | /// arbitrary call with the specified signature will be returned. | ||||||||
2196 | /// This is used, for instance, when we estimate call of a vector | ||||||||
2197 | /// counterpart of the given function. | ||||||||
2198 | /// \param F Called function, might be nullptr. | ||||||||
2199 | /// \param RetTy Return value types. | ||||||||
2200 | /// \param Tys Argument types. | ||||||||
2201 | /// \returns The cost of Call instruction. | ||||||||
2202 | InstructionCost getCallInstrCost(Function *F, Type *RetTy, | ||||||||
2203 | ArrayRef<Type *> Tys, | ||||||||
2204 | TTI::TargetCostKind CostKind) { | ||||||||
2205 | return 10; | ||||||||
2206 | } | ||||||||
2207 | |||||||||
2208 | unsigned getNumberOfParts(Type *Tp) { | ||||||||
2209 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp); | ||||||||
2210 | return LT.first.isValid() ? *LT.first.getValue() : 0; | ||||||||
2211 | } | ||||||||
2212 | |||||||||
2213 | InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *, | ||||||||
2214 | const SCEV *) { | ||||||||
2215 | return 0; | ||||||||
2216 | } | ||||||||
2217 | |||||||||
2218 | /// Try to calculate arithmetic and shuffle op costs for reduction intrinsics. | ||||||||
2219 | /// We're assuming that reduction operation are performing the following way: | ||||||||
2220 | /// | ||||||||
2221 | /// %val1 = shufflevector<n x t> %val, <n x t> %undef, | ||||||||
2222 | /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n, i32 undef, ..., i32 undef> | ||||||||
2223 | /// \----------------v-------------/ \----------v------------/ | ||||||||
2224 | /// n/2 elements n/2 elements | ||||||||
2225 | /// %red1 = op <n x t> %val, <n x t> val1 | ||||||||
2226 | /// After this operation we have a vector %red1 where only the first n/2 | ||||||||
2227 | /// elements are meaningful, the second n/2 elements are undefined and can be | ||||||||
2228 | /// dropped. All other operations are actually working with the vector of | ||||||||
2229 | /// length n/2, not n, though the real vector length is still n. | ||||||||
2230 | /// %val2 = shufflevector<n x t> %red1, <n x t> %undef, | ||||||||
2231 | /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2, i32 undef, ..., i32 undef> | ||||||||
2232 | /// \----------------v-------------/ \----------v------------/ | ||||||||
2233 | /// n/4 elements 3*n/4 elements | ||||||||
2234 | /// %red2 = op <n x t> %red1, <n x t> val2 - working with the vector of | ||||||||
2235 | /// length n/2, the resulting vector has length n/4 etc. | ||||||||
2236 | /// | ||||||||
2237 | /// The cost model should take into account that the actual length of the | ||||||||
2238 | /// vector is reduced on each iteration. | ||||||||
  InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty,
                                       TTI::TargetCostKind CostKind) {
    // Targets must implement a default value for the scalable case, since
    // we don't know how many lanes the vector has.
    if (isa<ScalableVectorType>(Ty))
      return InstructionCost::getInvalid();

    Type *ScalarTy = Ty->getElementType();
    unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
    // Special case: i1 and/or reductions lower to a single wide compare, not
    // a shuffle tree.
    if ((Opcode == Instruction::Or || Opcode == Instruction::And) &&
        ScalarTy == IntegerType::getInt1Ty(Ty->getContext()) &&
        NumVecElts >= 2) {
      // Or reduction for i1 is represented as:
      // %val = bitcast <ReduxWidth x i1> to iReduxWidth
      // %res = cmp ne iReduxWidth %val, 0
      // And reduction for i1 is represented as:
      // %val = bitcast <ReduxWidth x i1> to iReduxWidth
      // %res = cmp eq iReduxWidth %val, 11111
      Type *ValTy = IntegerType::get(Ty->getContext(), NumVecElts);
      return thisT()->getCastInstrCost(Instruction::BitCast, ValTy, Ty,
                                       TTI::CastContextHint::None, CostKind) +
             thisT()->getCmpSelInstrCost(Instruction::ICmp, ValTy,
                                         CmpInst::makeCmpResultType(ValTy),
                                         CmpInst::BAD_ICMP_PREDICATE, CostKind);
    }
    // log2(#elts) halving steps total (see the doc comment above for the
    // shuffle/op sequence each step performs).
    unsigned NumReduxLevels = Log2_32(NumVecElts);
    InstructionCost ArithCost = 0;
    InstructionCost ShuffleCost = 0;
    std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty);
    unsigned LongVectorCount = 0;
    // MVTLen is the lane count of one legal register for this type (1 if the
    // legalized type is scalar).
    unsigned MVTLen =
        LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
    // While the working vector is wider than a legal register, each level
    // costs an extract-subvector shuffle plus one vector op on the narrower
    // half; the working type shrinks as we go.
    while (NumVecElts > MVTLen) {
      NumVecElts /= 2;
      VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
      ShuffleCost +=
          thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
                                  CostKind, NumVecElts, SubTy);
      ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind);
      Ty = SubTy;
      ++LongVectorCount;
    }

    NumReduxLevels -= LongVectorCount;

    // The minimal length of the vector is limited by the real length of vector
    // operations performed on the current platform. That's why several final
    // reduction operations are performed on the vectors with the same
    // architecture-dependent length.

    // By default reductions need one shuffle per reduction level.
    ShuffleCost +=
        NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
                                                 std::nullopt, CostKind, 0, Ty);
    ArithCost +=
        NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty, CostKind);
    // The scalar result ends up in lane 0; charge one extract to read it out.
    return ShuffleCost + ArithCost +
           thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
                                       CostKind, 0, nullptr, nullptr);
  }
2299 | |||||||||
2300 | /// Try to calculate the cost of performing strict (in-order) reductions, | ||||||||
2301 | /// which involves doing a sequence of floating point additions in lane | ||||||||
2302 | /// order, starting with an initial value. For example, consider a scalar | ||||||||
2303 | /// initial value 'InitVal' of type float and a vector of type <4 x float>: | ||||||||
2304 | /// | ||||||||
2305 | /// Vector = <float %v0, float %v1, float %v2, float %v3> | ||||||||
2306 | /// | ||||||||
2307 | /// %add1 = %InitVal + %v0 | ||||||||
2308 | /// %add2 = %add1 + %v1 | ||||||||
2309 | /// %add3 = %add2 + %v2 | ||||||||
2310 | /// %add4 = %add3 + %v3 | ||||||||
2311 | /// | ||||||||
2312 | /// As a simple estimate we can say the cost of such a reduction is 4 times | ||||||||
2313 | /// the cost of a scalar FP addition. We can only estimate the costs for | ||||||||
2314 | /// fixed-width vectors here because for scalable vectors we do not know the | ||||||||
2315 | /// runtime number of operations. | ||||||||
2316 | InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty, | ||||||||
2317 | TTI::TargetCostKind CostKind) { | ||||||||
2318 | // Targets must implement a default value for the scalable case, since | ||||||||
2319 | // we don't know how many lanes the vector has. | ||||||||
2320 | if (isa<ScalableVectorType>(Ty)) | ||||||||
2321 | return InstructionCost::getInvalid(); | ||||||||
2322 | |||||||||
2323 | auto *VTy = cast<FixedVectorType>(Ty); | ||||||||
2324 | InstructionCost ExtractCost = getScalarizationOverhead( | ||||||||
2325 | VTy, /*Insert=*/false, /*Extract=*/true, CostKind); | ||||||||
2326 | InstructionCost ArithCost = thisT()->getArithmeticInstrCost( | ||||||||
2327 | Opcode, VTy->getElementType(), CostKind); | ||||||||
2328 | ArithCost *= VTy->getNumElements(); | ||||||||
2329 | |||||||||
2330 | return ExtractCost + ArithCost; | ||||||||
2331 | } | ||||||||
2332 | |||||||||
2333 | InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, | ||||||||
2334 | std::optional<FastMathFlags> FMF, | ||||||||
2335 | TTI::TargetCostKind CostKind) { | ||||||||
2336 | if (TTI::requiresOrderedReduction(FMF)) | ||||||||
2337 | return getOrderedReductionCost(Opcode, Ty, CostKind); | ||||||||
2338 | return getTreeReductionCost(Opcode, Ty, CostKind); | ||||||||
2339 | } | ||||||||
2340 | |||||||||
2341 | /// Try to calculate op costs for min/max reduction operations. | ||||||||
2342 | /// \param CondTy Conditional type for the Select instruction. | ||||||||
2343 | InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, | ||||||||
2344 | bool IsUnsigned, | ||||||||
2345 | TTI::TargetCostKind CostKind) { | ||||||||
2346 | // Targets must implement a default value for the scalable case, since | ||||||||
2347 | // we don't know how many lanes the vector has. | ||||||||
2348 | if (isa<ScalableVectorType>(Ty)) | ||||||||
2349 | return InstructionCost::getInvalid(); | ||||||||
2350 | |||||||||
2351 | Type *ScalarTy = Ty->getElementType(); | ||||||||
2352 | Type *ScalarCondTy = CondTy->getElementType(); | ||||||||
2353 | unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements(); | ||||||||
2354 | unsigned NumReduxLevels = Log2_32(NumVecElts); | ||||||||
2355 | unsigned CmpOpcode; | ||||||||
2356 | if (Ty->isFPOrFPVectorTy()) { | ||||||||
2357 | CmpOpcode = Instruction::FCmp; | ||||||||
2358 | } else { | ||||||||
2359 | assert(Ty->isIntOrIntVectorTy() &&(static_cast <bool> (Ty->isIntOrIntVectorTy() && "expecting floating point or integer type for min/max reduction" ) ? void (0) : __assert_fail ("Ty->isIntOrIntVectorTy() && \"expecting floating point or integer type for min/max reduction\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 2360, __extension__ __PRETTY_FUNCTION__)) | ||||||||
2360 | "expecting floating point or integer type for min/max reduction")(static_cast <bool> (Ty->isIntOrIntVectorTy() && "expecting floating point or integer type for min/max reduction" ) ? void (0) : __assert_fail ("Ty->isIntOrIntVectorTy() && \"expecting floating point or integer type for min/max reduction\"" , "llvm/include/llvm/CodeGen/BasicTTIImpl.h", 2360, __extension__ __PRETTY_FUNCTION__)); | ||||||||
2361 | CmpOpcode = Instruction::ICmp; | ||||||||
2362 | } | ||||||||
2363 | InstructionCost MinMaxCost = 0; | ||||||||
2364 | InstructionCost ShuffleCost = 0; | ||||||||
2365 | std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty); | ||||||||
2366 | unsigned LongVectorCount = 0; | ||||||||
2367 | unsigned MVTLen = | ||||||||
2368 | LT.second.isVector() ? LT.second.getVectorNumElements() : 1; | ||||||||
2369 | while (NumVecElts > MVTLen) { | ||||||||
2370 | NumVecElts /= 2; | ||||||||
2371 | auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts); | ||||||||
2372 | CondTy = FixedVectorType::get(ScalarCondTy, NumVecElts); | ||||||||
2373 | |||||||||
2374 | ShuffleCost += | ||||||||
2375 | thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt, | ||||||||
2376 | CostKind, NumVecElts, SubTy); | ||||||||
2377 | MinMaxCost += | ||||||||
2378 | thisT()->getCmpSelInstrCost(CmpOpcode, SubTy, CondTy, | ||||||||
2379 | CmpInst::BAD_ICMP_PREDICATE, CostKind) + | ||||||||
2380 | thisT()->getCmpSelInstrCost(Instruction::Select, SubTy, CondTy, | ||||||||
2381 | CmpInst::BAD_ICMP_PREDICATE, CostKind); | ||||||||
2382 | Ty = SubTy; | ||||||||
2383 | ++LongVectorCount; | ||||||||
2384 | } | ||||||||
2385 | |||||||||
2386 | NumReduxLevels -= LongVectorCount; | ||||||||
2387 | |||||||||
2388 | // The minimal length of the vector is limited by the real length of vector | ||||||||
2389 | // operations performed on the current platform. That's why several final | ||||||||
2390 | // reduction opertions are perfomed on the vectors with the same | ||||||||
2391 | // architecture-dependent length. | ||||||||
2392 | ShuffleCost += | ||||||||
2393 | NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, | ||||||||
2394 | std::nullopt, CostKind, 0, Ty); | ||||||||
2395 | MinMaxCost += | ||||||||
2396 | NumReduxLevels * | ||||||||
2397 | (thisT()->getCmpSelInstrCost(CmpOpcode, Ty, CondTy, | ||||||||
2398 | CmpInst::BAD_ICMP_PREDICATE, CostKind) + | ||||||||
2399 | thisT()->getCmpSelInstrCost(Instruction::Select, Ty, CondTy, | ||||||||
2400 | CmpInst::BAD_ICMP_PREDICATE, CostKind)); | ||||||||
2401 | // The last min/max should be in vector registers and we counted it above. | ||||||||
2402 | // So just need a single extractelement. | ||||||||
2403 | return ShuffleCost + MinMaxCost + | ||||||||
2404 | thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, | ||||||||
2405 | CostKind, 0, nullptr, nullptr); | ||||||||
2406 | } | ||||||||
2407 | |||||||||
2408 | InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, | ||||||||
2409 | Type *ResTy, VectorType *Ty, | ||||||||
2410 | std::optional<FastMathFlags> FMF, | ||||||||
2411 | TTI::TargetCostKind CostKind) { | ||||||||
2412 | // Without any native support, this is equivalent to the cost of | ||||||||
2413 | // vecreduce.opcode(ext(Ty A)). | ||||||||
2414 | VectorType *ExtTy = VectorType::get(ResTy, Ty); | ||||||||
2415 | InstructionCost RedCost = | ||||||||
2416 | thisT()->getArithmeticReductionCost(Opcode, ExtTy, FMF, CostKind); | ||||||||
2417 | InstructionCost ExtCost = thisT()->getCastInstrCost( | ||||||||
2418 | IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty, | ||||||||
2419 | TTI::CastContextHint::None, CostKind); | ||||||||
2420 | |||||||||
2421 | return RedCost + ExtCost; | ||||||||
2422 | } | ||||||||
2423 | |||||||||
2424 | InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, | ||||||||
2425 | VectorType *Ty, | ||||||||
2426 | TTI::TargetCostKind CostKind) { | ||||||||
2427 | // Without any native support, this is equivalent to the cost of | ||||||||
2428 | // vecreduce.add(mul(ext(Ty A), ext(Ty B))) or | ||||||||
2429 | // vecreduce.add(mul(A, B)). | ||||||||
2430 | VectorType *ExtTy = VectorType::get(ResTy, Ty); | ||||||||
2431 | InstructionCost RedCost = thisT()->getArithmeticReductionCost( | ||||||||
2432 | Instruction::Add, ExtTy, std::nullopt, CostKind); | ||||||||
2433 | InstructionCost ExtCost = thisT()->getCastInstrCost( | ||||||||
2434 | IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty, | ||||||||
2435 | TTI::CastContextHint::None, CostKind); | ||||||||
2436 | |||||||||
2437 | InstructionCost MulCost = | ||||||||
2438 | thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); | ||||||||
2439 | |||||||||
2440 | return RedCost + MulCost + 2 * ExtCost; | ||||||||
2441 | } | ||||||||
2442 | |||||||||
2443 | InstructionCost getVectorSplitCost() { return 1; } | ||||||||
2444 | |||||||||
2445 | /// @} | ||||||||
2446 | }; | ||||||||
2447 | |||||||||
/// Concrete BasicTTIImpl that can be used if no further customization
/// is needed.
class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
  using BaseT = BasicTTIImplBase<BasicTTIImpl>;

  // The CRTP base calls getST()/getTLI() below, so it needs friend access.
  friend class BasicTTIImplBase<BasicTTIImpl>;

  // Non-owning pointers cached for the accessors below; presumably owned by
  // the TargetMachine and set in the out-of-line constructor — confirm there.
  const TargetSubtargetInfo *ST;
  const TargetLoweringBase *TLI;

  // Hooks consumed by BasicTTIImplBase via CRTP.
  const TargetSubtargetInfo *getST() const { return ST; }
  const TargetLoweringBase *getTLI() const { return TLI; }

public:
  // Defined out of line (not visible in this chunk).
  explicit BasicTTIImpl(const TargetMachine *TM, const Function &F);
};
2464 | |||||||||
2465 | } // end namespace llvm | ||||||||
2466 | |||||||||
2467 | #endif // LLVM_CODEGEN_BASICTTIIMPL_H |