File: llvm/lib/Target/X86/X86TargetTransformInfo.cpp
Warning: line 3192, column 20: Called C++ object pointer is null
//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//
/// A note about the Cost Model numbers used below: the numbers correspond to
/// some "generic" X86 CPU rather than to a concrete CPU model. Usually the
/// numbers correspond to the CPU where the feature first appeared. For
/// example, if we do Subtarget.hasSSE42() in the lookups below, the cost is
/// based on Nehalem as that was the first CPU to support that feature level
/// and thus most likely has the worst-case cost.
/// Some examples of other technologies/CPUs:
///   SSE 3   - Pentium4 / Athlon64
///   SSE 4.1 - Penryn
///   SSE 4.2 - Nehalem
///   AVX     - Sandy Bridge
///   AVX2    - Haswell
///   AVX-512 - Xeon Phi / Skylake
/// And some examples of instruction target dependent costs (latency):
///                   divss   sqrtss   rsqrtss
///   AMD K7          11-16   19       3
///   Piledriver      9-24    13-15    5
///   Jaguar          14      16       2
///   Pentium II,III  18      30       2
///   Nehalem         7-14    7-18     3
///   Haswell         10-13   11       5
/// TODO: Develop and implement the target dependent cost model and
/// specialize cost numbers for different Cost Model Targets such as throughput,
/// code size, latency and uop count.
//===----------------------------------------------------------------------===//
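
// A worked sketch (added for illustration; not part of the original file) of
// the lookup pattern nearly every hook below follows: legalize the IR type,
// map the IR opcode to an ISD node, probe a per-feature-level static table,
// and scale the per-op cost by the number of ops the legalized type needs.
// The table contents here are made up; the helper names match the real code.
//
//   static const CostTblEntry ToyCostTable[] = {
//     { ISD::FDIV, MVT::v4f32, 14 }, // hypothetical divps cost
//   };
//   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
//   int ISD = TLI->InstructionOpcodeToISD(Opcode);
//   if (const auto *Entry = CostTableLookup(ToyCostTable, ISD, LT.second))
//     return LT.first * Entry->Cost; // LT.first = #legal-type ops after split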

#include "X86TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"

using namespace llvm;

#define DEBUG_TYPE "x86tti"

//===----------------------------------------------------------------------===//
//
// X86 cost model.
//
//===----------------------------------------------------------------------===//

TargetTransformInfo::PopcntSupportKind
X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  // TODO: Currently the __builtin_popcount() implementation using SSE3
  // instructions is inefficient. Once the problem is fixed, we should
  // call ST->hasSSE3() instead of ST->hasPOPCNT().
  return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
}
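
// Illustrative usage (an assumption, not from this file): a client pass asks
// whether a population count of a given width is cheap before keeping an
// llvm.ctpop intrinsic as a hardware instruction.
//
//   if (TTI.getPopcntSupport(32) == TargetTransformInfo::PSK_FastHardware)
//     ; // keep llvm.ctpop.i32; otherwise prefer a software expansion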

llvm::Optional<unsigned> X86TTIImpl::getCacheSize(
    TargetTransformInfo::CacheLevel Level) const {
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 32 * 1024; // 32 KByte
  case TargetTransformInfo::CacheLevel::L2D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 256 * 1024; // 256 KByte
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

llvm::Optional<unsigned> X86TTIImpl::getCacheAssociativity(
    TargetTransformInfo::CacheLevel Level) const {
  //   - Penryn
  //   - Nehalem
  //   - Westmere
  //   - Sandy Bridge
  //   - Ivy Bridge
  //   - Haswell
  //   - Broadwell
  //   - Skylake
  //   - Kabylake
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    LLVM_FALLTHROUGH;
  case TargetTransformInfo::CacheLevel::L2D:
    return 8;
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}
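
// Illustrative usage (an assumption, not from this file): both cache hooks
// return an Optional, so callers guard the dereference. A cache-blocking
// heuristic might size its working set from the 256 KByte L2 reported above.
//
//   if (llvm::Optional<unsigned> L2 =
//           TTI.getCacheSize(TargetTransformInfo::CacheLevel::L2D))
//     unsigned WorkingSet = *L2 / 2; // e.g. target half of the L2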

unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
  bool Vector = (ClassID == 1);
  if (Vector && !ST->hasSSE1())
    return 0;

  if (ST->is64Bit()) {
    if (Vector && ST->hasAVX512())
      return 32;
    return 16;
  }
  return 8;
}
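
// Worked example (added for illustration): with ClassID 1 selecting the vector
// class, a 64-bit AVX-512 subtarget reports 32 (ZMM0-ZMM31) while plain x86-64
// reports 16 (XMM0-XMM15); any 32-bit target reports 8 for either class.
//
//   unsigned NumVecRegs = TTI.getNumberOfRegisters(/*ClassID=*/1);
//   unsigned NumGPRs    = TTI.getNumberOfRegisters(/*ClassID=*/0);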

unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) const {
  unsigned PreferVectorWidth = ST->getPreferVectorWidth();
  if (Vector) {
    if (ST->hasAVX512() && PreferVectorWidth >= 512)
      return 512;
    if (ST->hasAVX() && PreferVectorWidth >= 256)
      return 256;
    if (ST->hasSSE1() && PreferVectorWidth >= 128)
      return 128;
    return 0;
  }

  if (ST->is64Bit())
    return 64;

  return 32;
}
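
// Worked example (added for illustration): the result is the widest vector
// register the subtarget has, clamped by its preferred width. An AVX-512
// machine built with -mprefer-vector-width=256 reports 256 here: hasAVX512()
// is true but PreferVectorWidth is only 256, so the 512-bit case is skipped
// and the AVX case fires.
//
//   unsigned Bits = TTI.getRegisterBitWidth(/*Vector=*/true); // 256 above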

unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
  return getRegisterBitWidth(true);
}

unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // If the loop will not be vectorized, don't interleave the loop.
  // Let the regular unroller handle the loop, which saves the overflow
  // check and memory check cost.
  if (VF == 1)
    return 1;

  if (ST->isAtom())
    return 1;

  // Sandybridge and Haswell have multiple execution ports and pipelined
  // vector units.
  if (ST->hasAVX())
    return 4;

  return 2;
}
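
// Worked example (added for illustration): for a loop vectorized at VF=4 on a
// Haswell-class (AVX2) subtarget this returns 4, so the vectorizer may run up
// to four copies of the vector body in parallel to keep the multiple vector
// ports busy; on Atom, or when VF == 1, it returns 1 and no interleaving is
// done.
//
//   unsigned IC = TTI.getMaxInterleaveFactor(/*VF=*/4); // 4 with AVX, else 2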

int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
                                       TTI::TargetCostKind CostKind,
                                       TTI::OperandValueKind Op1Info,
                                       TTI::OperandValueKind Op2Info,
                                       TTI::OperandValueProperties Opd1PropInfo,
                                       TTI::OperandValueProperties Opd2PropInfo,
                                       ArrayRef<const Value *> Args,
                                       const Instruction *CxtI) {
  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info, Opd1PropInfo,
                                         Opd2PropInfo, Args, CxtI);
  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  static const CostTblEntry GLMCostTable[] = {
    { ISD::FDIV,  MVT::f32,   18 }, // divss
    { ISD::FDIV,  MVT::v4f32, 35 }, // divps
    { ISD::FDIV,  MVT::f64,   33 }, // divsd
    { ISD::FDIV,  MVT::v2f64, 65 }, // divpd
  };

  if (ST->useGLMDivSqrtCosts())
    if (const auto *Entry = CostTableLookup(GLMCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SLMCostTable[] = {
    { ISD::MUL,   MVT::v4i32, 11 }, // pmulld
    { ISD::MUL,   MVT::v8i16, 2  }, // pmullw
    { ISD::MUL,   MVT::v16i8, 14 }, // extend/pmullw/trunc sequence.
    { ISD::FMUL,  MVT::f64,   2  }, // mulsd
    { ISD::FMUL,  MVT::v2f64, 4  }, // mulpd
    { ISD::FMUL,  MVT::v4f32, 2  }, // mulps
    { ISD::FDIV,  MVT::f32,   17 }, // divss
    { ISD::FDIV,  MVT::v4f32, 39 }, // divps
    { ISD::FDIV,  MVT::f64,   32 }, // divsd
    { ISD::FDIV,  MVT::v2f64, 69 }, // divpd
    { ISD::FADD,  MVT::v2f64, 2  }, // addpd
    { ISD::FSUB,  MVT::v2f64, 2  }, // subpd
    // v2i64/v4i64 mul is custom lowered as a series of long:
    // multiplies(3), shifts(3) and adds(2)
    // slm muldq version throughput is 2 and addq throughput 4
    // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
    // 3X4 (addq throughput) = 17
    { ISD::MUL,   MVT::v2i64, 17 },
    // slm addq\subq throughput is 4
    { ISD::ADD,   MVT::v2i64, 4  },
    { ISD::SUB,   MVT::v2i64, 4  },
  };

  if (ST->isSLM()) {
    if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) {
      // Check if the operands can be shrunk into a smaller datatype.
      bool Op1Signed = false;
      unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
      bool Op2Signed = false;
      unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);

      bool SignedMode = Op1Signed || Op2Signed;
      unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);

      if (OpMinSize <= 7)
        return LT.first * 3; // pmullw/sext
      if (!SignedMode && OpMinSize <= 8)
        return LT.first * 3; // pmullw/zext
      if (OpMinSize <= 15)
        return LT.first * 5; // pmullw/pmulhw/pshuf
      if (!SignedMode && OpMinSize <= 16)
        return LT.first * 5; // pmullw/pmulhw/pshuf
    }

    if (const auto *Entry = CostTableLookup(SLMCostTable, ISD,
                                            LT.second)) {
      return LT.first * Entry->Cost;
    }
  }
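
  // Worked example (added for illustration): for an SLM v4i32 multiply whose
  // operands minRequiredElementSize() proves fit in 15 or fewer signed bits,
  // the early-outs above charge LT.first * 5 for the pmullw/pmulhw/pshuf
  // lowering instead of the table's pmulld cost of 11; operands fitting in 7
  // bits drop further to LT.first * 3 (pmullw plus a sign extension).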

  if ((ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
       ISD == ISD::UREM) &&
      (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
    if (ISD == ISD::SDIV || ISD == ISD::SREM) {
      // On X86, vector signed division by a power-of-two constant is
      // normally expanded to the sequence SRA + SRL + ADD + SRA.
      // The OperandValue properties may not be the same as that of the previous
      // operation; conservatively assume OP_None.
      int Cost =
          2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, Op1Info,
                                     Op2Info,
                                     TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);
      Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info,
                                     Op2Info,
                                     TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);
      Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind, Op1Info,
                                     Op2Info,
                                     TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);

      if (ISD == ISD::SREM) {
        // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
        Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info,
                                       Op2Info);
        Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info,
                                       Op2Info);
      }

      return Cost;
    }

    // Vector unsigned division/remainder will be simplified to shifts/masks.
    if (ISD == ISD::UDIV)
      return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
                                    Op1Info, Op2Info,
                                    TargetTransformInfo::OP_None,
                                    TargetTransformInfo::OP_None);

    else // UREM
      return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
                                    Op1Info, Op2Info,
                                    TargetTransformInfo::OP_None,
                                    TargetTransformInfo::OP_None);
  }
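
  // Worked example (added for illustration): for an SSE2 v4i32 sdiv by a
  // power-of-two splat constant, the expansion costed above is 2*ashr + lshr +
  // add; if each of those uniform-shift/add ops costs 1 at this feature level,
  // the returned cost is 2*1 + 1 + 1 = 4. An srem of the same shape adds a mul
  // and a sub on top, per the identity X % C == X - (X / C) * C.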

  static const CostTblEntry AVX512BWUniformConstCostTable[] = {
    { ISD::SHL,  MVT::v64i8,   2 }, // psllw + pand.
    { ISD::SRL,  MVT::v64i8,   2 }, // psrlw + pand.
    { ISD::SRA,  MVT::v64i8,   4 }, // psrlw, pand, pxor, psubb.
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasBWI()) {
    if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512UniformConstCostTable[] = {
    { ISD::SRA,  MVT::v2i64,   1 },
    { ISD::SRA,  MVT::v4i64,   1 },
    { ISD::SRA,  MVT::v8i64,   1 },

    { ISD::SHL,  MVT::v64i8,   4 }, // psllw + pand.
    { ISD::SRL,  MVT::v64i8,   4 }, // psrlw + pand.
    { ISD::SRA,  MVT::v64i8,   8 }, // psrlw, pand, pxor, psubb.

    { ISD::SDIV, MVT::v16i32,  6 }, // pmuludq sequence
    { ISD::SREM, MVT::v16i32,  8 }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32,  5 }, // pmuludq sequence
    { ISD::UREM, MVT::v16i32,  7 }, // pmuludq+mul+sub sequence
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasAVX512()) {
    if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX2UniformConstCostTable[] = {
    { ISD::SHL,  MVT::v32i8,   2 }, // psllw + pand.
    { ISD::SRL,  MVT::v32i8,   2 }, // psrlw + pand.
    { ISD::SRA,  MVT::v32i8,   4 }, // psrlw, pand, pxor, psubb.

    { ISD::SRA,  MVT::v4i64,   4 }, // 2 x psrad + shuffle.

    { ISD::SDIV, MVT::v8i32,   6 }, // pmuludq sequence
    { ISD::SREM, MVT::v8i32,   8 }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,   5 }, // pmuludq sequence
    { ISD::UREM, MVT::v8i32,   7 }, // pmuludq+mul+sub sequence
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasAVX2()) {
    if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2UniformConstCostTable[] = {
    { ISD::SHL,  MVT::v16i8,     2 }, // psllw + pand.
    { ISD::SRL,  MVT::v16i8,     2 }, // psrlw + pand.
    { ISD::SRA,  MVT::v16i8,     4 }, // psrlw, pand, pxor, psubb.

    { ISD::SHL,  MVT::v32i8,   4+2 }, // 2*(psllw + pand) + split.
    { ISD::SRL,  MVT::v32i8,   4+2 }, // 2*(psrlw + pand) + split.
    { ISD::SRA,  MVT::v32i8,   8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.

    { ISD::SDIV, MVT::v8i32,  12+2 }, // 2*pmuludq sequence + split.
    { ISD::SREM, MVT::v8i32,  16+2 }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::SDIV, MVT::v4i32,     6 }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32,     8 }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,  10+2 }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32,  14+2 }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::UDIV, MVT::v4i32,     5 }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32,     7 }, // pmuludq+mul+sub sequence
  };

  // XOP has faster vXi8 shifts.
  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasSSE2() && !ST->hasXOP()) {
    if (const auto *Entry =
            CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512BWConstCostTable[] = {
    { ISD::SDIV, MVT::v64i8,  14 }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v64i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8,  14 }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v64i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::SDIV, MVT::v32i16,  6 }, // vpmulhw sequence
    { ISD::SREM, MVT::v32i16,  8 }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16,  6 }, // vpmulhuw sequence
    { ISD::UREM, MVT::v32i16,  8 }, // vpmulhuw+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasBWI()) {
    if (const auto *Entry =
            CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512ConstCostTable[] = {
    { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
    { ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
    { ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence
    { ISD::SDIV, MVT::v64i8,  28 }, // 4*ext+4*pmulhw sequence
    { ISD::SREM, MVT::v64i8,  32 }, // 4*ext+4*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8,  28 }, // 4*ext+4*pmulhw sequence
    { ISD::UREM, MVT::v64i8,  32 }, // 4*ext+4*pmulhw+mul+sub sequence
    { ISD::SDIV, MVT::v32i16, 12 }, // 2*vpmulhw sequence
    { ISD::SREM, MVT::v32i16, 16 }, // 2*vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16, 12 }, // 2*vpmulhuw sequence
    { ISD::UREM, MVT::v32i16, 16 }, // 2*vpmulhuw+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasAVX512()) {
    if (const auto *Entry =
            CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX2ConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8,  14 }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v32i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i8,  14 }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v32i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::SDIV, MVT::v16i16,  6 }, // vpmulhw sequence
    { ISD::SREM, MVT::v16i16,  8 }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i16,  6 }, // vpmulhuw sequence
    { ISD::UREM, MVT::v16i16,  8 }, // vpmulhuw+mul+sub sequence
    { ISD::SDIV, MVT::v8i32,  15 }, // vpmuldq sequence
    { ISD::SREM, MVT::v8i32,  19 }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,  15 }, // vpmuludq sequence
    { ISD::UREM, MVT::v8i32,  19 }, // vpmuludq+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasAVX2()) {
    if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2ConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8,  28+2 }, // 4*ext+4*pmulhw sequence + split.
    { ISD::SREM, MVT::v32i8,  32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
    { ISD::SDIV, MVT::v16i8,    14 }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v16i8,    16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i8,  28+2 }, // 4*ext+4*pmulhw sequence + split.
    { ISD::UREM, MVT::v32i8,  32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v16i8,    14 }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v16i8,    16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
    { ISD::SREM, MVT::v16i16, 16+2 }, // 2*pmulhw+mul+sub sequence + split.
    { ISD::SDIV, MVT::v8i16,     6 }, // pmulhw sequence
    { ISD::SREM, MVT::v8i16,     8 }, // pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split.
    { ISD::UREM, MVT::v16i16, 16+2 }, // 2*pmulhuw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v8i16,     6 }, // pmulhuw sequence
    { ISD::UREM, MVT::v8i16,     8 }, // pmulhuw+mul+sub sequence
    { ISD::SDIV, MVT::v8i32,  38+2 }, // 2*pmuludq sequence + split.
    { ISD::SREM, MVT::v8i32,  48+2 }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::SDIV, MVT::v4i32,    19 }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32,    24 }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,  30+2 }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32,  40+2 }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::UDIV, MVT::v4i32,    15 }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32,    20 }, // pmuludq+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasSSE2()) {
    // pmuldq sequence.
    if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
      return LT.first * 32;
    if (ISD == ISD::SREM && LT.second == MVT::v8i32 && ST->hasAVX())
      return LT.first * 38;
    if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
      return LT.first * 15;
    if (ISD == ISD::SREM && LT.second == MVT::v4i32 && ST->hasSSE41())
      return LT.first * 20;

    if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512BWShiftCostTable[] = {
    { ISD::SHL,   MVT::v8i16,  1 }, // vpsllvw
    { ISD::SRL,   MVT::v8i16,  1 }, // vpsrlvw
    { ISD::SRA,   MVT::v8i16,  1 }, // vpsravw

    { ISD::SHL,   MVT::v16i16, 1 }, // vpsllvw
    { ISD::SRL,   MVT::v16i16, 1 }, // vpsrlvw
    { ISD::SRA,   MVT::v16i16, 1 }, // vpsravw

    { ISD::SHL,   MVT::v32i16, 1 }, // vpsllvw
    { ISD::SRL,   MVT::v32i16, 1 }, // vpsrlvw
    { ISD::SRA,   MVT::v32i16, 1 }, // vpsravw
  };

  if (ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWShiftCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i16, 1 }, // psllw.
    { ISD::SRL, MVT::v16i16, 1 }, // psrlw.
    { ISD::SRA, MVT::v16i16, 1 }, // psraw.
    { ISD::SHL, MVT::v32i16, 2 }, // 2*psllw.
    { ISD::SRL, MVT::v32i16, 2 }, // 2*psrlw.
    { ISD::SRA, MVT::v32i16, 2 }, // 2*psraw.
  };

  if (ST->hasAVX2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {
    if (const auto *Entry =
            CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v8i16, 1 }, // psllw.
    { ISD::SHL, MVT::v4i32, 1 }, // pslld
    { ISD::SHL, MVT::v2i64, 1 }, // psllq.

    { ISD::SRL, MVT::v8i16, 1 }, // psrlw.
    { ISD::SRL, MVT::v4i32, 1 }, // psrld.
    { ISD::SRL, MVT::v2i64, 1 }, // psrlq.

    { ISD::SRA, MVT::v8i16, 1 }, // psraw.
    { ISD::SRA, MVT::v4i32, 1 }, // psrad.
  };

  if (ST->hasSSE2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {
    if (const auto *Entry =
            CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512DQCostTable[] = {
    { ISD::MUL,  MVT::v2i64, 1 },
    { ISD::MUL,  MVT::v4i64, 1 },
    { ISD::MUL,  MVT::v8i64, 1 }
  };

  // Look for AVX512DQ lowering tricks for custom cases.
  if (ST->hasDQI())
    if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512BWCostTable[] = {
    { ISD::SHL,   MVT::v64i8,  11 }, // vpblendvb sequence.
    { ISD::SRL,   MVT::v64i8,  11 }, // vpblendvb sequence.
    { ISD::SRA,   MVT::v64i8,  24 }, // vpblendvb sequence.

    { ISD::MUL,   MVT::v64i8,  11 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,   MVT::v32i8,   4 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,   MVT::v16i8,   4 }, // extend/pmullw/trunc sequence.
  };

  // Look for AVX512BW lowering tricks for custom cases.
  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512CostTable[] = {
    { ISD::SHL,  MVT::v16i32,  1 },
    { ISD::SRL,  MVT::v16i32,  1 },
    { ISD::SRA,  MVT::v16i32,  1 },

    { ISD::SHL,  MVT::v8i64,   1 },
    { ISD::SRL,  MVT::v8i64,   1 },

    { ISD::SRA,  MVT::v2i64,   1 },
    { ISD::SRA,  MVT::v4i64,   1 },
    { ISD::SRA,  MVT::v8i64,   1 },

    { ISD::MUL,  MVT::v64i8,  26 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,  MVT::v32i8,  13 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,  MVT::v16i8,   5 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,  MVT::v16i32,  1 }, // pmulld (Skylake from agner.org)
    { ISD::MUL,  MVT::v8i32,   1 }, // pmulld (Skylake from agner.org)
    { ISD::MUL,  MVT::v4i32,   1 }, // pmulld (Skylake from agner.org)
    { ISD::MUL,  MVT::v8i64,   8 }, // 3*pmuludq/3*shift/2*add

    { ISD::FADD, MVT::v8f64,   1 }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v8f64,   1 }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v8f64,   1 }, // Skylake from http://www.agner.org/

    { ISD::FADD, MVT::v16f32,  1 }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v16f32,  1 }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v16f32,  1 }, // Skylake from http://www.agner.org/
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX2ShiftCostTable[] = {
    // Shifts on v4i64/v8i32 on AVX2 are legal even though we declare them as
    // custom so we can detect the cases where the shift amount is a scalar.
    { ISD::SHL,  MVT::v4i32, 1 },
    { ISD::SRL,  MVT::v4i32, 1 },
    { ISD::SRA,  MVT::v4i32, 1 },
    { ISD::SHL,  MVT::v8i32, 1 },
    { ISD::SRL,  MVT::v8i32, 1 },
    { ISD::SRA,  MVT::v8i32, 1 },
    { ISD::SHL,  MVT::v2i64, 1 },
    { ISD::SRL,  MVT::v2i64, 1 },
    { ISD::SHL,  MVT::v4i64, 1 },
    { ISD::SRL,  MVT::v4i64, 1 },
  };

  if (ST->hasAVX512()) {
    if (ISD == ISD::SHL && LT.second == MVT::v32i16 &&
        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
      // On AVX512, a packed v32i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info, Op2Info,
                                    TargetTransformInfo::OP_None,
                                    TargetTransformInfo::OP_None);
  }

  // Look for AVX2 lowering tricks.
  if (ST->hasAVX2()) {
    if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
      // On AVX2, a packed v16i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info, Op2Info,
                                    TargetTransformInfo::OP_None,
                                    TargetTransformInfo::OP_None);

    if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry XOPShiftCostTable[] = {
    // 128bit shifts take 1cy, but right shifts require negation beforehand.
    { ISD::SHL,  MVT::v16i8,    1 },
    { ISD::SRL,  MVT::v16i8,    2 },
    { ISD::SRA,  MVT::v16i8,    2 },
    { ISD::SHL,  MVT::v8i16,    1 },
    { ISD::SRL,  MVT::v8i16,    2 },
    { ISD::SRA,  MVT::v8i16,    2 },
    { ISD::SHL,  MVT::v4i32,    1 },
    { ISD::SRL,  MVT::v4i32,    2 },
    { ISD::SRA,  MVT::v4i32,    2 },
    { ISD::SHL,  MVT::v2i64,    1 },
    { ISD::SRL,  MVT::v2i64,    2 },
    { ISD::SRA,  MVT::v2i64,    2 },
    // 256bit shifts require splitting if AVX2 didn't catch them above.
    { ISD::SHL,  MVT::v32i8,  2+2 },
    { ISD::SRL,  MVT::v32i8,  4+2 },
    { ISD::SRA,  MVT::v32i8,  4+2 },
    { ISD::SHL,  MVT::v16i16, 2+2 },
    { ISD::SRL,  MVT::v16i16, 4+2 },
    { ISD::SRA,  MVT::v16i16, 4+2 },
    { ISD::SHL,  MVT::v8i32,  2+2 },
    { ISD::SRL,  MVT::v8i32,  4+2 },
    { ISD::SRA,  MVT::v8i32,  4+2 },
    { ISD::SHL,  MVT::v4i64,  2+2 },
    { ISD::SRL,  MVT::v4i64,  4+2 },
    { ISD::SRA,  MVT::v4i64,  4+2 },
  };

  // Look for XOP lowering tricks.
  if (ST->hasXOP()) {
    // If the right shift is constant then we'll fold the negation so
    // it's as cheap as a left shift.
    int ShiftISD = ISD;
    if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) &&
        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
      ShiftISD = ISD::SHL;
    if (const auto *Entry =
            CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2UniformShiftCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL,  MVT::v16i16, 2+2 }, // 2*psllw + split.
    { ISD::SHL,  MVT::v8i32,  2+2 }, // 2*pslld + split.
    { ISD::SHL,  MVT::v4i64,  2+2 }, // 2*psllq + split.

    { ISD::SRL,  MVT::v16i16, 2+2 }, // 2*psrlw + split.
    { ISD::SRL,  MVT::v8i32,  2+2 }, // 2*psrld + split.
    { ISD::SRL,  MVT::v4i64,  2+2 }, // 2*psrlq + split.

    { ISD::SRA,  MVT::v16i16, 2+2 }, // 2*psraw + split.
    { ISD::SRA,  MVT::v8i32,  2+2 }, // 2*psrad + split.
    { ISD::SRA,  MVT::v2i64,    4 }, // 2*psrad + shuffle.
    { ISD::SRA,  MVT::v4i64,  8+2 }, // 2*(2*psrad + shuffle) + split.
  };

  if (ST->hasSSE2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {

    // Handle AVX2 uniform v4i64 ISD::SRA, it's not worth a table.
    if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2())
      return LT.first * 4; // 2*psrad + shuffle.

    if (const auto *Entry =
            CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  if (ISD == ISD::SHL &&
      Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
    MVT VT = LT.second;
    // A vector shift left by a non-uniform constant can be lowered into a
    // vector multiply.
    if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
        ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
      ISD = ISD::MUL;
  }
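
  // Worked example (added for illustration): after the reclassification above,
  // an SSE2 v8i16 shl by the constant vector <1, 2, 3, ..., 8> is costed as
  // ISD::MUL, so the SSE2 table further below charges the single-pmullw cost
  // of 1 instead of a far more expensive variable-shift blend sequence.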

  static const CostTblEntry AVX2CostTable[] = {
    { ISD::SHL,  MVT::v32i8,  11 }, // vpblendvb sequence.
    { ISD::SHL,  MVT::v64i8,  22 }, // 2*vpblendvb sequence.
    { ISD::SHL,  MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
    { ISD::SHL,  MVT::v32i16, 20 }, // 2*extend/vpsrlvd/pack sequence.

    { ISD::SRL,  MVT::v32i8,  11 }, // vpblendvb sequence.
    { ISD::SRL,  MVT::v64i8,  22 }, // 2*vpblendvb sequence.
    { ISD::SRL,  MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
    { ISD::SRL,  MVT::v32i16, 20 }, // 2*extend/vpsrlvd/pack sequence.

    { ISD::SRA,  MVT::v32i8,  24 }, // vpblendvb sequence.
    { ISD::SRA,  MVT::v64i8,  48 }, // 2*vpblendvb sequence.
    { ISD::SRA,  MVT::v16i16, 10 }, // extend/vpsravd/pack sequence.
    { ISD::SRA,  MVT::v32i16, 20 }, // 2*extend/vpsravd/pack sequence.
    { ISD::SRA,  MVT::v2i64,   4 }, // srl/xor/sub sequence.
    { ISD::SRA,  MVT::v4i64,   4 }, // srl/xor/sub sequence.

    { ISD::SUB,  MVT::v32i8,   1 }, // psubb
    { ISD::ADD,  MVT::v32i8,   1 }, // paddb
    { ISD::SUB,  MVT::v16i16,  1 }, // psubw
    { ISD::ADD,  MVT::v16i16,  1 }, // paddw
    { ISD::SUB,  MVT::v8i32,   1 }, // psubd
    { ISD::ADD,  MVT::v8i32,   1 }, // paddd
    { ISD::SUB,  MVT::v4i64,   1 }, // psubq
    { ISD::ADD,  MVT::v4i64,   1 }, // paddq

    { ISD::MUL,  MVT::v32i8,  17 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,  MVT::v16i8,   7 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,  MVT::v16i16,  1 }, // pmullw
    { ISD::MUL,  MVT::v8i32,   2 }, // pmulld (Haswell from agner.org)
    { ISD::MUL,  MVT::v4i64,   8 }, // 3*pmuludq/3*shift/2*add

    { ISD::FADD, MVT::v4f64,   1 }, // Haswell from http://www.agner.org/
    { ISD::FADD, MVT::v8f32,   1 }, // Haswell from http://www.agner.org/
    { ISD::FSUB, MVT::v4f64,   1 }, // Haswell from http://www.agner.org/
    { ISD::FSUB, MVT::v8f32,   1 }, // Haswell from http://www.agner.org/
    { ISD::FMUL, MVT::v4f64,   1 }, // Haswell from http://www.agner.org/
    { ISD::FMUL, MVT::v8f32,   1 }, // Haswell from http://www.agner.org/

    { ISD::FDIV, MVT::f32,     7 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,   7 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32,  14 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::f64,    14 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,  14 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64,  28 }, // Haswell from http://www.agner.org/
  };

  // Look for AVX2 lowering tricks for custom cases.
  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX1CostTable[] = {
    // We don't have to scalarize unsupported ops. We can issue two half-sized
    // operations and we only need to extract the upper YMM half.
    // Two ops + 1 extract + 1 insert = 4.
    { ISD::MUL,  MVT::v16i16,  4 },
    { ISD::MUL,  MVT::v8i32,   4 },
    { ISD::SUB,  MVT::v32i8,   4 },
    { ISD::ADD,  MVT::v32i8,   4 },
    { ISD::SUB,  MVT::v16i16,  4 },
    { ISD::ADD,  MVT::v16i16,  4 },
    { ISD::SUB,  MVT::v8i32,   4 },
    { ISD::ADD,  MVT::v8i32,   4 },
    { ISD::SUB,  MVT::v4i64,   4 },
    { ISD::ADD,  MVT::v4i64,   4 },

    // A v4i64 multiply is custom lowered as two split v2i64 vectors that then
    // are lowered as a series of long multiplies(3), shifts(3) and adds(2)
    // Because we believe v4i64 to be a legal type, we must also include the
    // extract+insert in the cost table. Therefore, the cost here is 18
    // instead of 8.
    { ISD::MUL,  MVT::v4i64,  18 },

    { ISD::MUL,  MVT::v32i8,  26 }, // extend/pmullw/trunc sequence.

    { ISD::FDIV, MVT::f32,    14 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,  14 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32,  28 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::f64,    22 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,  22 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64,  44 }, // SNB from http://www.agner.org/
  };

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE42CostTable[] = {
    { ISD::FADD, MVT::f64,     1 }, // Nehalem from http://www.agner.org/
    { ISD::FADD, MVT::f32,     1 }, // Nehalem from http://www.agner.org/
    { ISD::FADD, MVT::v2f64,   1 }, // Nehalem from http://www.agner.org/
    { ISD::FADD, MVT::v4f32,   1 }, // Nehalem from http://www.agner.org/

    { ISD::FSUB, MVT::f64,     1 }, // Nehalem from http://www.agner.org/
    { ISD::FSUB, MVT::f32,     1 }, // Nehalem from http://www.agner.org/
    { ISD::FSUB, MVT::v2f64,   1 }, // Nehalem from http://www.agner.org/
    { ISD::FSUB, MVT::v4f32,   1 }, // Nehalem from http://www.agner.org/

    { ISD::FMUL, MVT::f64,     1 }, // Nehalem from http://www.agner.org/
    { ISD::FMUL, MVT::f32,     1 }, // Nehalem from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64,   1 }, // Nehalem from http://www.agner.org/
    { ISD::FMUL, MVT::v4f32,   1 }, // Nehalem from http://www.agner.org/

    { ISD::FDIV, MVT::f32,    14 }, // Nehalem from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,  14 }, // Nehalem from http://www.agner.org/
    { ISD::FDIV, MVT::f64,    22 }, // Nehalem from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,  22 }, // Nehalem from http://www.agner.org/
  };

  if (ST->hasSSE42())
    if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE41CostTable[] = {
    { ISD::SHL,  MVT::v16i8,      11 }, // pblendvb sequence.
    { ISD::SHL,  MVT::v32i8,  2*11+2 }, // pblendvb sequence + split.
    { ISD::SHL,  MVT::v8i16,      14 }, // pblendvb sequence.
    { ISD::SHL,  MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
    { ISD::SHL,  MVT::v4i32,       4 }, // pslld/paddd/cvttps2dq/pmulld
    { ISD::SHL,  MVT::v8i32,   2*4+2 }, // pslld/paddd/cvttps2dq/pmulld + split

    { ISD::SRL,  MVT::v16i8,      12 }, // pblendvb sequence.
    { ISD::SRL,  MVT::v32i8,  2*12+2 }, // pblendvb sequence + split.
    { ISD::SRL,  MVT::v8i16,      14 }, // pblendvb sequence.
    { ISD::SRL,  MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
    { ISD::SRL,  MVT::v4i32,      11 }, // Shift each lane + blend.
    { ISD::SRL,  MVT::v8i32,  2*11+2 }, // Shift each lane + blend + split.

    { ISD::SRA,  MVT::v16i8,      24 }, // pblendvb sequence.
    { ISD::SRA,  MVT::v32i8,  2*24+2 }, // pblendvb sequence + split.
    { ISD::SRA,  MVT::v8i16,      14 }, // pblendvb sequence.
    { ISD::SRA,  MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
    { ISD::SRA,  MVT::v4i32,      12 }, // Shift each lane + blend.
    { ISD::SRA,  MVT::v8i32,  2*12+2 }, // Shift each lane + blend + split.

    { ISD::MUL,  MVT::v4i32,       2 }  // pmulld (Nehalem from agner.org)
  };

  if (ST->hasSSE41())
    if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE2CostTable[] = {
    // We don't correctly identify costs of casts because they are marked as
    // custom.
    { ISD::SHL,  MVT::v16i8,      26 }, // cmpgtb sequence.
    { ISD::SHL,  MVT::v8i16,      32 }, // cmpgtb sequence.
    { ISD::SHL,  MVT::v4i32,     2*5 }, // We optimized this using mul.
    { ISD::SHL,  MVT::v2i64,       4 }, // splat+shuffle sequence.
    { ISD::SHL,  MVT::v4i64,   2*4+2 }, // splat+shuffle sequence + split.

    { ISD::SRL,  MVT::v16i8,      26 }, // cmpgtb sequence.
    { ISD::SRL,  MVT::v8i16,      32 }, // cmpgtb sequence.
    { ISD::SRL,  MVT::v4i32,      16 }, // Shift each lane + blend.
    { ISD::SRL,  MVT::v2i64,       4 }, // splat+shuffle sequence.
    { ISD::SRL,  MVT::v4i64,   2*4+2 }, // splat+shuffle sequence + split.

    { ISD::SRA,  MVT::v16i8,      54 }, // unpacked cmpgtb sequence.
    { ISD::SRA,  MVT::v8i16,      32 }, // cmpgtb sequence.
    { ISD::SRA,  MVT::v4i32,      16 }, // Shift each lane + blend.
    { ISD::SRA,  MVT::v2i64,      12 }, // srl/xor/sub sequence.
    { ISD::SRA,  MVT::v4i64,  2*12+2 }, // srl/xor/sub sequence+split.

    { ISD::MUL,  MVT::v16i8,      12 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,  MVT::v8i16,       1 }, // pmullw
    { ISD::MUL,  MVT::v4i32,       6 }, // 3*pmuludq/4*shuffle
    { ISD::MUL,  MVT::v2i64,       8 }, // 3*pmuludq/3*shift/2*add

    { ISD::FDIV, MVT::f32,        23 }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,      39 }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::f64,        38 }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,      69 }, // Pentium IV from http://www.agner.org/

    { ISD::FADD, MVT::f32,         2 }, // Pentium IV from http://www.agner.org/
    { ISD::FADD, MVT::f64,         2 }, // Pentium IV from http://www.agner.org/

    { ISD::FSUB, MVT::f32,         2 }, // Pentium IV from http://www.agner.org/
    { ISD::FSUB, MVT::f64,         2 }, // Pentium IV from http://www.agner.org/
  };

  if (ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE1CostTable[] = {
    { ISD::FDIV, MVT::f32,    17 }, // Pentium III from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,  34 }, // Pentium III from http://www.agner.org/

    { ISD::FADD, MVT::f32,     1 }, // Pentium III from http://www.agner.org/
    { ISD::FADD, MVT::v4f32,   2 }, // Pentium III from http://www.agner.org/

    { ISD::FSUB, MVT::f32,     1 }, // Pentium III from http://www.agner.org/
    { ISD::FSUB, MVT::v4f32,   2 }, // Pentium III from http://www.agner.org/

    { ISD::ADD,  MVT::i8,      1 }, // Pentium III from http://www.agner.org/
    { ISD::ADD,  MVT::i16,     1 }, // Pentium III from http://www.agner.org/
    { ISD::ADD,  MVT::i32,     1 }, // Pentium III from http://www.agner.org/

    { ISD::SUB,  MVT::i8,      1 }, // Pentium III from http://www.agner.org/
    { ISD::SUB,  MVT::i16,     1 }, // Pentium III from http://www.agner.org/
    { ISD::SUB,  MVT::i32,     1 }, // Pentium III from http://www.agner.org/
  };

  if (ST->hasSSE1())
    if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  // It is not a good idea to vectorize division. We have to scalarize it and
  // in the process we will often end up spilling regular registers. The
  // overhead of division is going to dominate most kernels anyway, so try
  // hard to prevent vectorization of division - it is generally a bad idea.
  // Assume somewhat arbitrarily that we have to be able to hide "20 cycles"
  // for each lane.
  if (LT.second.isVector() && (ISD == ISD::SDIV || ISD == ISD::SREM ||
                               ISD == ISD::UDIV || ISD == ISD::UREM)) {
    int ScalarCost = getArithmeticInstrCost(
        Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info,
        TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
    return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
  }
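
  // Worked example (added for illustration): a v4i32 udiv by a non-constant
  // divisor reaches the clause above on any SSE2 target. With a hypothetical
  // scalar udiv cost of 1 the vector form is charged
  // 20 * LT.first * 4 * 1 = 80, which all but guarantees the vectorizer keeps
  // the division scalar.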

  // Fallback to the default implementation.
  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info);
}

int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *BaseTp,
                               int Index, VectorType *SubTp) {
  // 64-bit packed float vectors (v2f32) are widened to type v4f32.
  // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, BaseTp);

  // Treat Transpose as 2-op shuffles - there's no difference in lowering.
  if (Kind == TTI::SK_Transpose)
    Kind = TTI::SK_PermuteTwoSrc;

  // For Broadcasts we are splatting the first element from the first input
  // register, so we only need to reference that input and all the output
  // registers are the same.
  if (Kind == TTI::SK_Broadcast)
    LT.first = 1;

  // Subvector extractions are free if they start at the beginning of a
  // vector and cheap if the subvectors are aligned.
  if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
    int NumElts = LT.second.getVectorNumElements();
    if ((Index % NumElts) == 0)
      return 0;
    std::pair<int, MVT> SubLT = TLI->getTypeLegalizationCost(DL, SubTp);
    if (SubLT.second.isVector()) {
      int NumSubElts = SubLT.second.getVectorNumElements();
      if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
        return SubLT.first;
      // Handle some cases for widening legalization. For now we only handle
      // cases where the original subvector was naturally aligned and evenly
      // fit in its legalized subvector type.
      // FIXME: Remove some of the alignment restrictions.
      // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
      // vectors.
      int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
      if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
          (NumSubElts % OrigSubElts) == 0 &&
          LT.second.getVectorElementType() ==
              SubLT.second.getVectorElementType() &&
          LT.second.getVectorElementType().getSizeInBits() ==
              BaseTp->getElementType()->getPrimitiveSizeInBits()) {
        assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
               "Unexpected number of elements!");
        auto *VecTy = FixedVectorType::get(BaseTp->getElementType(),
                                           LT.second.getVectorNumElements());
        auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
                                           SubLT.second.getVectorNumElements());
        int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
        int ExtractCost = getShuffleCost(TTI::SK_ExtractSubvector, VecTy,
                                         ExtractIndex, SubTy);

        // If the original size is 32-bits or more, we can use pshufd. Otherwise
        // if we have SSSE3 we can use pshufb.
        if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
          return ExtractCost + 1; // pshufd or pshufb

        assert(SubTp->getPrimitiveSizeInBits() == 16 &&
               "Unexpected vector size");

        return ExtractCost + 2; // worst case pshufhw + pshufd
      }
    }
  }
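
  // Worked example (added for illustration): extracting the high <4 x float>
  // half of an <8 x float> on AVX gives NumElts == 8 and Index == 4, so the
  // free case (Index % NumElts == 0) is missed, but Index is a multiple of the
  // legal subvector's NumSubElts == 4, so the cost is SubLT.first (a single
  // vextractf128-style operation); extracting at Index 0 would be free.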

  // Handle some common (illegal) sub-vector types as they are often very cheap
  // to shuffle even on targets without PSHUFB.
  EVT VT = TLI->getValueType(DL, BaseTp);
  if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
      !ST->hasSSSE3()) {
    static const CostTblEntry SSE2SubVectorShuffleTbl[] = {
      {TTI::SK_Broadcast,        MVT::v4i16, 1}, // pshuflw
      {TTI::SK_Broadcast,        MVT::v2i16, 1}, // pshuflw
      {TTI::SK_Broadcast,        MVT::v8i8,  2}, // punpck/pshuflw
      {TTI::SK_Broadcast,        MVT::v4i8,  2}, // punpck/pshuflw
      {TTI::SK_Broadcast,        MVT::v2i8,  1}, // punpck

      {TTI::SK_Reverse,          MVT::v4i16, 1}, // pshuflw
      {TTI::SK_Reverse,          MVT::v2i16, 1}, // pshuflw
      {TTI::SK_Reverse,          MVT::v4i8,  3}, // punpck/pshuflw/packus
      {TTI::SK_Reverse,          MVT::v2i8,  1}, // punpck

      {TTI::SK_PermuteTwoSrc,    MVT::v4i16, 2}, // punpck/pshuflw
      {TTI::SK_PermuteTwoSrc,    MVT::v2i16, 2}, // punpck/pshuflw
      {TTI::SK_PermuteTwoSrc,    MVT::v8i8,  7}, // punpck/pshuflw
      {TTI::SK_PermuteTwoSrc,    MVT::v4i8,  4}, // punpck/pshuflw
      {TTI::SK_PermuteTwoSrc,    MVT::v2i8,  2}, // punpck

      {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw
      {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw
      {TTI::SK_PermuteSingleSrc, MVT::v8i8,  5}, // punpck/pshuflw
      {TTI::SK_PermuteSingleSrc, MVT::v4i8,  3}, // punpck/pshuflw
      {TTI::SK_PermuteSingleSrc, MVT::v2i8,  1}, // punpck
    };

    if (ST->hasSSE2())
      if (const auto *Entry =
              CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
        return Entry->Cost;
  }

  // We are going to permute multiple sources and the result will be in
  // multiple destinations. We provide an accurate cost only for splits where
  // the element type remains the same.
  if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
    MVT LegalVT = LT.second;
    if (LegalVT.isVector() &&
        LegalVT.getVectorElementType().getSizeInBits() ==
            BaseTp->getElementType()->getPrimitiveSizeInBits() &&
        LegalVT.getVectorNumElements() <
            cast<FixedVectorType>(BaseTp)->getNumElements()) {

      unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
      unsigned LegalVTSize = LegalVT.getStoreSize();
      // Number of source vectors after legalization:
      unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
      // Number of destination vectors after legalization:
      unsigned NumOfDests = LT.first;

      auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
                                              LegalVT.getVectorNumElements());

      unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
      return NumOfShuffles *
             getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr);
    }

    return BaseT::getShuffleCost(Kind, BaseTp, Index, SubTp);
  }

  // For 2-input shuffles, we must account for splitting the 2 inputs into many.
  if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
    // We assume that source and destination have the same vector type.
    int NumOfDests = LT.first;
    int NumOfShufflesPerDest = LT.first * 2 - 1;
    LT.first = NumOfDests * NumOfShufflesPerDest;
  }
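
  // Worked example (added for illustration): a two-input shuffle of <32 x i8>
  // on an SSE-only target legalizes v32i8 into two v16i8 registers, so
  // LT.first == 2 and the adjustment above charges
  // NumOfDests * NumOfShufflesPerDest = 2 * (2*2 - 1) = 6 table-entry shuffles.
  // The single-source split case just above instead charges
  // (NumOfSrcs - 1) * NumOfDests two-input shuffles on the legal type.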

  static const CostTblEntry AVX512VBMIShuffleTbl[] = {
    {TTI::SK_Reverse,          MVT::v64i8,  1}, // vpermb
    {TTI::SK_Reverse,          MVT::v32i8,  1}, // vpermb

    {TTI::SK_PermuteSingleSrc, MVT::v64i8,  1}, // vpermb
    {TTI::SK_PermuteSingleSrc, MVT::v32i8,  1}, // vpermb

    {TTI::SK_PermuteTwoSrc,    MVT::v64i8,  2}, // vpermt2b
    {TTI::SK_PermuteTwoSrc,    MVT::v32i8,  2}, // vpermt2b
    {TTI::SK_PermuteTwoSrc,    MVT::v16i8,  2}  // vpermt2b
  };

  if (ST->hasVBMI())
    if (const auto *Entry =
            CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512BWShuffleTbl[] = {
    {TTI::SK_Broadcast,        MVT::v32i16, 1}, // vpbroadcastw
    {TTI::SK_Broadcast,        MVT::v64i8,  1}, // vpbroadcastb

    {TTI::SK_Reverse,          MVT::v32i16, 2}, // vpermw
    {TTI::SK_Reverse,          MVT::v16i16, 2}, // vpermw
    {TTI::SK_Reverse,          MVT::v64i8,  2}, // pshufb + vshufi64x2

    {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw
    {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw
    {TTI::SK_PermuteSingleSrc, MVT::v64i8,  8}, // extend to v32i16

    {TTI::SK_PermuteTwoSrc,    MVT::v32i16, 2}, // vpermt2w
    {TTI::SK_PermuteTwoSrc,    MVT::v16i16, 2}, // vpermt2w
    {TTI::SK_PermuteTwoSrc,    MVT::v8i16,  2}, // vpermt2w
    {TTI::SK_PermuteTwoSrc,    MVT::v64i8, 19}, // 6 * v32i8 + 1

    {TTI::SK_Select,           MVT::v32i16, 1}, // vblendmw
    {TTI::SK_Select,           MVT::v64i8,  1}, // vblendmb
  };

  if (ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512ShuffleTbl[] = {
    {TTI::SK_Broadcast,        MVT::v8f64,  1}, // vbroadcastpd
    {TTI::SK_Broadcast,        MVT::v16f32, 1}, // vbroadcastps
    {TTI::SK_Broadcast,        MVT::v8i64,  1}, // vpbroadcastq
    {TTI::SK_Broadcast,        MVT::v16i32, 1}, // vpbroadcastd
    {TTI::SK_Broadcast,        MVT::v32i16, 1}, // vpbroadcastw
    {TTI::SK_Broadcast,        MVT::v64i8,  1}, // vpbroadcastb

    {TTI::SK_Reverse,          MVT::v8f64,  1}, // vpermpd
    {TTI::SK_Reverse,          MVT::v16f32, 1}, // vpermps
    {TTI::SK_Reverse,          MVT::v8i64,  1}, // vpermq
    {TTI::SK_Reverse,          MVT::v16i32, 1}, // vpermd

    {TTI::SK_PermuteSingleSrc, MVT::v8f64,  1}, // vpermpd
    {TTI::SK_PermuteSingleSrc, MVT::v4f64,  1}, // vpermpd
    {TTI::SK_PermuteSingleSrc, MVT::v2f64,  1}, // vpermpd
    {TTI::SK_PermuteSingleSrc, MVT::v16f32, 1}, // vpermps
    {TTI::SK_PermuteSingleSrc, MVT::v8f32,  1}, // vpermps
    {TTI::SK_PermuteSingleSrc, MVT::v4f32,  1}, // vpermps
    {TTI::SK_PermuteSingleSrc, MVT::v8i64,  1}, // vpermq
    {TTI::SK_PermuteSingleSrc, MVT::v4i64,  1}, // vpermq
    {TTI::SK_PermuteSingleSrc, MVT::v2i64,  1}, // vpermq
    {TTI::SK_PermuteSingleSrc, MVT::v16i32, 1}, // vpermd
    {TTI::SK_PermuteSingleSrc, MVT::v8i32,  1}, // vpermd
    {TTI::SK_PermuteSingleSrc, MVT::v4i32,  1}, // vpermd
    {TTI::SK_PermuteSingleSrc, MVT::v16i8,  1}, // pshufb

    {TTI::SK_PermuteTwoSrc,    MVT::v8f64,  1}, // vpermt2pd
    {TTI::SK_PermuteTwoSrc,    MVT::v16f32, 1}, // vpermt2ps
1170 | {TTI::SK_PermuteTwoSrc, MVT::v8i64, 1}, // vpermt2q | |||
1171 | {TTI::SK_PermuteTwoSrc, MVT::v16i32, 1}, // vpermt2d | |||
1172 | {TTI::SK_PermuteTwoSrc, MVT::v4f64, 1}, // vpermt2pd | |||
1173 | {TTI::SK_PermuteTwoSrc, MVT::v8f32, 1}, // vpermt2ps | |||
1174 | {TTI::SK_PermuteTwoSrc, MVT::v4i64, 1}, // vpermt2q | |||
1175 | {TTI::SK_PermuteTwoSrc, MVT::v8i32, 1}, // vpermt2d | |||
1176 | {TTI::SK_PermuteTwoSrc, MVT::v2f64, 1}, // vpermt2pd | |||
1177 | {TTI::SK_PermuteTwoSrc, MVT::v4f32, 1}, // vpermt2ps | |||
1178 | {TTI::SK_PermuteTwoSrc, MVT::v2i64, 1}, // vpermt2q | |||
1179 | {TTI::SK_PermuteTwoSrc, MVT::v4i32, 1}, // vpermt2d | |||
1180 | ||||
1181 | // FIXME: This just applies the type legalization cost rules above, | |||
1182 | // assuming these types completely split. | |||
1183 | {TTI::SK_PermuteSingleSrc, MVT::v32i16, 14}, | |||
1184 | {TTI::SK_PermuteSingleSrc, MVT::v64i8, 14}, | |||
1185 | {TTI::SK_PermuteTwoSrc, MVT::v32i16, 42}, | |||
1186 | {TTI::SK_PermuteTwoSrc, MVT::v64i8, 42}, | |||
1187 | ||||
1188 | {TTI::SK_Select, MVT::v32i16, 1}, // vpternlogq | |||
1189 | {TTI::SK_Select, MVT::v64i8, 1}, // vpternlogq | |||
1190 | {TTI::SK_Select, MVT::v8f64, 1}, // vblendmpd | |||
1191 | {TTI::SK_Select, MVT::v16f32, 1}, // vblendmps | |||
1192 | {TTI::SK_Select, MVT::v8i64, 1}, // vblendmq | |||
1193 | {TTI::SK_Select, MVT::v16i32, 1}, // vblendmd | |||
1194 | }; | |||
1195 | ||||
1196 | if (ST->hasAVX512()) | |||
1197 | if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second)) | |||
1198 | return LT.first * Entry->Cost; | |||
1199 | ||||
1200 | static const CostTblEntry AVX2ShuffleTbl[] = { | |||
1201 | {TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastpd | |||
1202 | {TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastps | |||
1203 | {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq | |||
1204 | {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd | |||
1205 | {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw | |||
1206 | {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb | |||
1207 | ||||
1208 | {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd | |||
1209 | {TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps | |||
1210 | {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq | |||
1211 | {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd | |||
1212 | {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb | |||
1213 | {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb | |||
1214 | ||||
1215 | {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb | |||
1216 | {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb | |||
1217 | ||||
1218 | {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd | |||
1219 | {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps | |||
1220 | {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq | |||
1221 | {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd | |||
1222 | {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb | |||
1223 | // + vpblendvb | |||
1224 | {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb | |||
1225 | // + vpblendvb | |||
1226 | ||||
1227 | {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd | |||
1228 | {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3}, // 2*vpermps + vblendps | |||
1229 | {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd | |||
1230 | {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd | |||
1231 | {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb | |||
1232 | // + vpblendvb | |||
1233 | {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb | |||
1234 | // + vpblendvb | |||
1235 | }; | |||
1236 | ||||
1237 | if (ST->hasAVX2()) | |||
1238 | if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second)) | |||
1239 | return LT.first * Entry->Cost; | |||
1240 | ||||
1241 | static const CostTblEntry XOPShuffleTbl[] = { | |||
1242 | {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd | |||
1243 | {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps | |||
1244 | {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd | |||
1245 | {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps | |||
1246 | {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm | |||
1247 | // + vinsertf128 | |||
1248 | {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm | |||
1249 | // + vinsertf128 | |||
1250 | ||||
1251 | {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm | |||
1252 | // + vinsertf128 | |||
1253 | {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm | |||
1254 | {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm | |||
1255 | // + vinsertf128 | |||
1256 | {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm | |||
1257 | }; | |||
1258 | ||||
1259 | if (ST->hasXOP()) | |||
1260 | if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second)) | |||
1261 | return LT.first * Entry->Cost; | |||
1262 | ||||
1263 | static const CostTblEntry AVX1ShuffleTbl[] = { | |||
1264 | {TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd | |||
1265 | {TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps | |||
1266 | {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd | |||
1267 | {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps | |||
1268 | {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128 | |||
1269 | {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128 | |||
1270 | ||||
1271 | {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd | |||
1272 | {TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps | |||
1273 | {TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd | |||
1274 | {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps | |||
1275 | {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb | |||
1276 | // + vinsertf128 | |||
1277 | {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb | |||
1278 | // + vinsertf128 | |||
1279 | ||||
1280 | {TTI::SK_Select, MVT::v4i64, 1}, // vblendpd | |||
1281 | {TTI::SK_Select, MVT::v4f64, 1}, // vblendpd | |||
1282 | {TTI::SK_Select, MVT::v8i32, 1}, // vblendps | |||
1283 | {TTI::SK_Select, MVT::v8f32, 1}, // vblendps | |||
1284 | {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor | |||
1285 | {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor | |||
1286 | ||||
1287 | {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd | |||
1288 | {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd | |||
1289 | {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps | |||
1290 | {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps | |||
1291 | {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb | |||
1292 | // + 2*por + vinsertf128 | |||
1293 | {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb | |||
1294 | // + 2*por + vinsertf128 | |||
1295 | ||||
1296 | {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd | |||
1297 | {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd | |||
1298 | {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps | |||
1299 | {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps | |||
1300 | {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb | |||
1301 | // + 4*por + vinsertf128 | |||
1302 | {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb | |||
1303 | // + 4*por + vinsertf128 | |||
1304 | }; | |||
1305 | ||||
1306 | if (ST->hasAVX()) | |||
1307 | if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second)) | |||
1308 | return LT.first * Entry->Cost; | |||
1309 | ||||
1310 | static const CostTblEntry SSE41ShuffleTbl[] = { | |||
1311 | {TTI::SK_Select, MVT::v2i64, 1}, // pblendw | |||
1312 | {TTI::SK_Select, MVT::v2f64, 1}, // movsd | |||
1313 | {TTI::SK_Select, MVT::v4i32, 1}, // pblendw | |||
1314 | {TTI::SK_Select, MVT::v4f32, 1}, // blendps | |||
1315 | {TTI::SK_Select, MVT::v8i16, 1}, // pblendw | |||
1316 | {TTI::SK_Select, MVT::v16i8, 1} // pblendvb | |||
1317 | }; | |||
1318 | ||||
1319 | if (ST->hasSSE41()) | |||
1320 | if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second)) | |||
1321 | return LT.first * Entry->Cost; | |||
1322 | ||||
1323 | static const CostTblEntry SSSE3ShuffleTbl[] = { | |||
1324 | {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb | |||
1325 | {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb | |||
1326 | ||||
1327 | {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb | |||
1328 | {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb | |||
1329 | ||||
1330 | {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por | |||
1331 | {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por | |||
1332 | ||||
1333 | {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb | |||
1334 | {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb | |||
1335 | ||||
1336 | {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por | |||
1337 | {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por | |||
1338 | }; | |||
1339 | ||||
1340 | if (ST->hasSSSE3()) | |||
1341 | if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second)) | |||
1342 | return LT.first * Entry->Cost; | |||
1343 | ||||
1344 | static const CostTblEntry SSE2ShuffleTbl[] = { | |||
1345 | {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd | |||
1346 | {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd | |||
1347 | {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd | |||
1348 | {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd | |||
1349 | {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd | |||
1350 | ||||
1351 | {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd | |||
1352 | {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd | |||
1353 | {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd | |||
1354 | {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd | |||
1355 | {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw | |||
1356 | // + 2*pshufd + 2*unpck + packus | |||
1357 | ||||
1358 | {TTI::SK_Select, MVT::v2i64, 1}, // movsd | |||
1359 | {TTI::SK_Select, MVT::v2f64, 1}, // movsd | |||
1360 | {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps | |||
1361 | {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por | |||
1362 | {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por | |||
1363 | ||||
1364 | {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd | |||
1365 | {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd | |||
1366 | {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd | |||
1367 | {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw | |||
1368 | // + pshufd/unpck | |||
1369 | {TTI::SK_PermuteSingleSrc, MVT::v16i8, 10}, // 2*pshuflw + 2*pshufhw | |||
1370 | // + 2*pshufd + 2*unpck + 2*packus | |||
1371 | ||||
1372 | {TTI::SK_PermuteTwoSrc, MVT::v2f64, 1}, // shufpd | |||
1373 | {TTI::SK_PermuteTwoSrc, MVT::v2i64, 1}, // shufpd | |||
1374 | {TTI::SK_PermuteTwoSrc, MVT::v4i32, 2}, // 2*{unpck,movsd,pshufd} | |||
1375 | {TTI::SK_PermuteTwoSrc, MVT::v8i16, 8}, // blend+permute | |||
1376 | {TTI::SK_PermuteTwoSrc, MVT::v16i8, 13}, // blend+permute | |||
1377 | }; | |||
1378 | ||||
1379 | if (ST->hasSSE2()) | |||
1380 | if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second)) | |||
1381 | return LT.first * Entry->Cost; | |||
1382 | ||||
1383 | static const CostTblEntry SSE1ShuffleTbl[] = { | |||
1384 | { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps | |||
1385 | { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps | |||
1386 | { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps | |||
1387 | { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps | |||
1388 | { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps | |||
1389 | }; | |||
1390 | ||||
1391 | if (ST->hasSSE1()) | |||
1392 | if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second)) | |||
1393 | return LT.first * Entry->Cost; | |||
1394 | ||||
1395 | return BaseT::getShuffleCost(Kind, BaseTp, Index, SubTp); | |||
1396 | } | |||
1397 | ||||
1398 | int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, | |||
1399 | TTI::CastContextHint CCH, | |||
1400 | TTI::TargetCostKind CostKind, | |||
1401 | const Instruction *I) { | |||
1402 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | |||
1403 | assert(ISD && "Invalid opcode"); | |||
1404 | ||||
1405 | // TODO: Allow non-throughput costs that aren't binary. | |||
1406 | auto AdjustCost = [&CostKind](int Cost) { | |||
1407 | if (CostKind != TTI::TCK_RecipThroughput) | |||
1408 | return Cost == 0 ? 0 : 1; | |||
1409 | return Cost; | |||
1410 | }; | |||
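// Illustration (ours, not from the source): for any cost kind other than
// TCK_RecipThroughput the lambda is binary, e.g. AdjustCost(0) == 0 but
// AdjustCost(7) == 1 -- every non-free cast collapses to a unit cost until
// the TODO above is addressed.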
1411 | ||||
1412 | // FIXME: Need a better cost-table design to handle non-simple types and the | |||
1413 | // potentially massive number of combinations (elem_num x src_type x dst_type). | |||
1414 | ||||
1415 | static const TypeConversionCostTblEntry AVX512BWConversionTbl[] { | |||
1416 | { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 1 }, | |||
1417 | { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 1 }, | |||
1418 | ||||
1419 | // Mask sign extend has an instruction. | |||
1420 | { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 }, | |||
1421 | { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 }, | |||
1422 | { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 }, | |||
1423 | { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 }, | |||
1424 | { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 }, | |||
1425 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 }, | |||
1426 | { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 }, | |||
1427 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 }, | |||
1428 | { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 }, | |||
1429 | { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, 1 }, | |||
1430 | { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, 1 }, | |||
1431 | ||||
1432 | // Mask zero extend is a sext + shift. | |||
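// (Hence each entry below costs 2: 1 for the sign extend + 1 for the
// shift.)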
1433 | { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 }, | |||
1434 | { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 }, | |||
1435 | { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 }, | |||
1436 | { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 }, | |||
1437 | { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 }, | |||
1438 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 }, | |||
1439 | { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 }, | |||
1440 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 }, | |||
1441 | { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 }, | |||
1442 | { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, 2 }, | |||
1443 | { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, 2 }, | |||
1444 | ||||
1445 | { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 2 }, | |||
1446 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // widen to zmm | |||
1447 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, // widen to zmm | |||
1448 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // widen to zmm | |||
1449 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // widen to zmm | |||
1450 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, // widen to zmm | |||
1451 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, // widen to zmm | |||
1452 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, // widen to zmm | |||
1453 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, // widen to zmm | |||
1454 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, // widen to zmm | |||
1455 | { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, // widen to zmm | |||
1456 | { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, 2 }, | |||
1457 | { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, 2 }, | |||
1458 | }; | |||
1459 | ||||
1460 | static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = { | |||
1461 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 }, | |||
1462 | { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 }, | |||
1463 | ||||
1464 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 }, | |||
1465 | { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 }, | |||
1466 | ||||
1467 | { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, 1 }, | |||
1468 | { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, 1 }, | |||
1469 | ||||
1470 | { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 }, | |||
1471 | { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 }, | |||
1472 | }; | |||
1473 | ||||
1474 | // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and | |||
1475 | // 256-bit wide vectors. | |||
1476 | ||||
1477 | static const TypeConversionCostTblEntry AVX512FConversionTbl[] = { | |||
1478 | { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 }, | |||
1479 | { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 }, | |||
1480 | { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 }, | |||
1481 | ||||
1482 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd | |||
1483 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd | |||
1484 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd | |||
1485 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 3 }, // sext+vpslld+vptestmd | |||
1486 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq | |||
1487 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq | |||
1488 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq | |||
1489 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 3 }, // sext+vpslld+vptestmd | |||
1490 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // zmm vpslld+vptestmd | |||
1491 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // zmm vpslld+vptestmd | |||
1492 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // zmm vpslld+vptestmd | |||
1493 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 }, // vpslld+vptestmd | |||
1494 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // zmm vpsllq+vptestmq | |||
1495 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // zmm vpsllq+vptestmq | |||
1496 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, // vpsllq+vptestmq | |||
1497 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 2 }, | |||
1498 | { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2 }, | |||
1499 | { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 2 }, | |||
1500 | { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 2 }, | |||
1501 | { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 }, | |||
1502 | { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // zmm vpmovqd | |||
1503 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 5 }, // 2*vpmovqd+concat+vpmovdb | |||
1504 | ||||
1505 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, // extend to v16i32 | |||
1506 | { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 8 }, | |||
1507 | ||||
1508 | // Sign extend is zmm vpternlogd+vptruncdb. | |||
1509 | // Zero extend is zmm broadcast load+vptruncdw. | |||
1510 | { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 3 }, | |||
1511 | { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 4 }, | |||
1512 | { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 3 }, | |||
1513 | { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 4 }, | |||
1514 | { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 3 }, | |||
1515 | { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 4 }, | |||
1516 | { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 3 }, | |||
1517 | { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 4 }, | |||
1518 | ||||
1519 | // Sign extend is zmm vpternlogd+vptruncdw. | |||
1520 | // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw. | |||
1521 | { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 3 }, | |||
1522 | { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 4 }, | |||
1523 | { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 3 }, | |||
1524 | { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 4 }, | |||
1525 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 3 }, | |||
1526 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 4 }, | |||
1527 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 3 }, | |||
1528 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 }, | |||
1529 | ||||
1530 | { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // zmm vpternlogd | |||
1531 | { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // zmm vpternlogd+psrld | |||
1532 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // zmm vpternlogd | |||
1533 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // zmm vpternlogd+psrld | |||
1534 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // zmm vpternlogd | |||
1535 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // zmm vpternlogd+psrld | |||
1536 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // zmm vpternlogq | |||
1537 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // zmm vpternlogq+psrlq | |||
1538 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // zmm vpternlogq | |||
1539 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // zmm vpternlogq+psrlq | |||
1540 | ||||
1541 | { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, // vpternlogd | |||
1542 | { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, // vpternlogd+psrld | |||
1543 | { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 }, // vpternlogq | |||
1544 | { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 }, // vpternlogq+psrlq | |||
1545 | ||||
1546 | { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 1 }, | |||
1547 | { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 }, | |||
1548 | { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, | |||
1549 | { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, | |||
1550 | { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 1 }, | |||
1551 | { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 1 }, | |||
1552 | { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, | |||
1553 | { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, | |||
1554 | { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, | |||
1555 | { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, | |||
1556 | ||||
1557 | { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right | |||
1558 | { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right | |||
1559 | ||||
1560 | { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 }, | |||
1561 | { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 }, | |||
1562 | { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 }, | |||
1563 | { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 }, | |||
1564 | { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 }, | |||
1565 | { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 }, | |||
1566 | { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 }, | |||
1567 | { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 }, | |||
1568 | ||||
1569 | { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 }, | |||
1570 | { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 }, | |||
1571 | { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 }, | |||
1572 | { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 }, | |||
1573 | { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 }, | |||
1574 | { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 }, | |||
1575 | { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 }, | |||
1576 | { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 }, | |||
1577 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 }, | |||
1578 | { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 5 }, | |||
1579 | ||||
1580 | { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f64, 3 }, | |||
1581 | { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, 3 }, | |||
1582 | { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 3 }, | |||
1583 | { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f32, 3 }, | |||
1584 | ||||
1585 | { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 }, | |||
1586 | { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, 3 }, | |||
1587 | { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, 3 }, | |||
1588 | { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 }, | |||
1589 | { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 3 }, | |||
1590 | { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, 3 }, | |||
1591 | }; | |||
1592 | ||||
1593 | static const TypeConversionCostTblEntry AVX512BWVLConversionTbl[] { | |||
1594 | // Mask sign extend has an instruction. | |||
1595 | { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 }, | |||
1596 | { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 }, | |||
1597 | { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 }, | |||
1598 | { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 }, | |||
1599 | { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 }, | |||
1600 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 }, | |||
1601 | { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 }, | |||
1602 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 }, | |||
1603 | { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 }, | |||
1604 | ||||
1605 | // Mask zero extend is a sext + shift. | |||
1606 | { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 }, | |||
1607 | { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 }, | |||
1608 | { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 }, | |||
1609 | { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 }, | |||
1610 | { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 }, | |||
1611 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 }, | |||
1612 | { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 }, | |||
1613 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 }, | |||
1614 | { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 }, | |||
1615 | ||||
1616 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, | |||
1617 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, // vpsllw+vptestmb | |||
1618 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // vpsllw+vptestmw | |||
1619 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // vpsllw+vptestmb | |||
1620 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, // vpsllw+vptestmw | |||
1621 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, // vpsllw+vptestmb | |||
1622 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, // vpsllw+vptestmw | |||
1623 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, // vpsllw+vptestmb | |||
1624 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, // vpsllw+vptestmw | |||
1625 | { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, // vpsllw+vptestmb | |||
1626 | }; | |||
1627 | ||||
1628 | static const TypeConversionCostTblEntry AVX512DQVLConversionTbl[] = { | |||
1629 | { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 }, | |||
1630 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, | |||
1631 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 }, | |||
1632 | { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 }, | |||
1633 | ||||
1634 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 }, | |||
1635 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, | |||
1636 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 }, | |||
1637 | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 }, | |||
1638 | ||||
1639 | { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 1 }, | |||
1640 | { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, 1 }, | |||
1641 | { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 }, | |||
1642 | { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, 1 }, | |||
1643 | ||||
1644 | { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 1 }, | |||
1645 | { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 }, | |||
1646 | { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 }, | |||
1647 | { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 }, | |||
1648 | }; | |||
1649 | ||||
1650 | static const TypeConversionCostTblEntry AVX512VLConversionTbl[] = { | |||
1651 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd | |||
1652 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd | |||
1653 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd | |||
1654 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 8 }, // split+2*v8i8 | |||
1655 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq | |||
1656 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq | |||
1657 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq | |||
1658 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 8 }, // split+2*v8i16 | |||
1659 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // vpslld+vptestmd | |||
1660 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // vpslld+vptestmd | |||
1661 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // vpslld+vptestmd | |||
1662 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // vpsllq+vptestmq | |||
1663 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // vpsllq+vptestmq | |||
1664 | { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // vpmovqd | |||
1665 | ||||
1666 | // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb | |||
1667 | // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb | |||
1668 | { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 5 }, | |||
1669 | { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 6 }, | |||
1670 | { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 5 }, | |||
1671 | { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 6 }, | |||
1672 | { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 5 }, | |||
1673 | { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 6 }, | |||
1674 | { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 10 }, | |||
1675 | { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 12 }, | |||
1676 | ||||
1677 | // sign extend is vpcmpeq+maskedmove+vpmovdw | |||
1678 | // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw | |||
1679 | { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 4 }, | |||
1680 | { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 5 }, | |||
1681 | { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 4 }, | |||
1682 | { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 5 }, | |||
1683 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 4 }, | |||
1684 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 5 }, | |||
1685 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 10 }, | |||
1686 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 12 }, | |||
1687 | ||||
1688 | { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // vpternlogd | |||
1689 | { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // vpternlogd+psrld | |||
1690 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // vpternlogd | |||
1691 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // vpternlogd+psrld | |||
1692 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // vpternlogd | |||
1693 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // vpternlogd+psrld | |||
1694 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // vpternlogq | |||
1695 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // vpternlogq+psrlq | |||
1696 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // vpternlogq | |||
1697 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // vpternlogq+psrlq | |||
1698 | ||||
1699 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 2 }, | |||
1700 | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 }, | |||
1701 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 2 }, | |||
1702 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 5 }, | |||
1703 | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 }, | |||
1704 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 }, | |||
1705 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 }, | |||
1706 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 }, | |||
1707 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, | |||
1708 | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 }, | |||
1709 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 }, | |||
1710 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 5 }, | |||
1711 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 }, | |||
1712 | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 5 }, | |||
1713 | ||||
1714 | { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 1 }, | |||
1715 | { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 1 }, | |||
1716 | ||||
1717 | { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 3 }, | |||
1718 | { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f32, 3 }, | |||
1719 | ||||
1720 | { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 1 }, | |||
1721 | { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 1 }, | |||
1722 | ||||
1723 | { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 }, | |||
1724 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, | |||
1725 | { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 1 }, | |||
1726 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 1 }, | |||
1727 | { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 }, | |||
1728 | }; | |||
1729 | ||||
1730 | static const TypeConversionCostTblEntry AVX2ConversionTbl[] = { | |||
1731 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 3 }, | |||
1732 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 }, | |||
1733 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 }, | |||
1734 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 }, | |||
1735 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 1 }, | |||
1736 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 1 }, | |||
1737 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 1 }, | |||
1738 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 1 }, | |||
1739 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 }, | |||
1740 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 1 }, | |||
1741 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 }, | |||
1742 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 }, | |||
1743 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 1 }, | |||
1744 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 1 }, | |||
1745 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, | |||
1746 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, | |||
1747 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, | |||
1748 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, | |||
1749 | { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 3 }, | |||
1750 | { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 3 }, | |||
1751 | ||||
1752 | { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 }, | |||
1753 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, | |||
1754 | ||||
1755 | { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 }, | |||
1756 | { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 }, | |||
1757 | { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 }, | |||
1758 | { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2 }, | |||
1759 | ||||
1760 | { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 3 }, | |||
1761 | { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 3 }, | |||
1762 | ||||
1763 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 }, | |||
1764 | }; | |||
1765 | ||||
1766 | static const TypeConversionCostTblEntry AVXConversionTbl[] = { | |||
1767 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 6 }, | |||
1768 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 }, | |||
1769 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 }, | |||
1770 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 }, | |||
1771 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 4 }, | |||
1772 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 }, | |||
1773 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 4 }, | |||
1774 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 4 }, | |||
1775 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 4 }, | |||
1776 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 }, | |||
1777 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 }, | |||
1778 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 }, | |||
1779 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 4 }, | |||
1780 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, | |||
1781 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 }, | |||
1782 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 4 }, | |||
1783 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 4 }, | |||
1784 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 4 }, | |||
1785 | ||||
1786 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 4 }, | |||
1787 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 5 }, | |||
1788 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 4 }, | |||
1789 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 9 }, | |||
1790 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, 11 }, | |||
1791 | ||||
1792 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 4 }, | |||
1793 | { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 }, | |||
1794 | { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 }, | |||
1795 | { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 4 }, | |||
1796 | { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 4 }, | |||
1797 | { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 }, | |||
1798 | { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 11 }, | |||
1799 | { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 9 }, | |||
1800 | { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 3 }, | |||
1801 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 11 }, | |||
1802 | ||||
1803 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 }, | |||
1804 | { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 }, | |||
1805 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 }, | |||
1806 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 }, | |||
1807 | { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i8, 3 }, | |||
1808 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 8 }, | |||
1809 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 3 }, | |||
1810 | { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i16, 3 }, | |||
1811 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 }, | |||
1812 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, | |||
1813 | { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 }, | |||
1814 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 }, | |||
1815 | ||||
1816 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 7 }, | |||
1817 | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, 7 }, | |||
1818 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, 6 }, | |||
1819 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 2 }, | |||
1820 | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 }, | |||
1821 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 5 }, | |||
1822 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, | |||
1823 | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 }, | |||
1824 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 }, | |||
1825 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 6 }, | |||
1826 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 6 }, | |||
1827 | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 }, | |||
1828 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 9 }, | |||
1829 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 }, | |||
1830 | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 6 }, | |||
1831 | // The generic code to compute the scalar overhead is currently broken. | |||
1832 | // Work around this limitation by estimating the scalarization overhead | |||
1833 | // here: roughly 10 instructions per scalar element, multiplied by the | |||
1834 | // vector width. | |||
1835 | // FIXME: remove this workaround when PR19268 is fixed. | |||
1836 | { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 13 }, | |||
1838 | ||||
1839 | { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 4 }, | |||
1840 | { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f64, 3 }, | |||
1841 | { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f64, 2 }, | |||
1842 | { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 3 }, | |||
1843 | ||||
1844 | { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f64, 3 }, | |||
1845 | { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f64, 2 }, | |||
1846 | { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f32, 4 }, | |||
1847 | { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 3 }, | |||
1848 | // This node is expanded into scalarized operations, but BasicTTI is overly | |||
1849 | // optimistic when estimating its cost. It computes 3 per element (one | |||
1850 | // vector-extract, one scalar conversion and one vector-insert). The | |||
1851 | // problem is that the inserts form a read-modify-write chain, so latency | |||
1852 | // should be factored in too. We therefore inflate the cost per element by 1. | |||
1853 | { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 8*4 }, | |||
1854 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4*4 }, | |||
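// (Ours, not from the source: the factored costs above read as
// element count * (3 + 1), e.g. v8i32 <- v8f32 is 8 * 4 = 32.)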
1855 | ||||
1856 | { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 1 }, | |||
1857 | { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 1 }, | |||
1858 | }; | |||
1859 | ||||
1860 | static const TypeConversionCostTblEntry SSE41ConversionTbl[] = { | |||
1861 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 2 }, | |||
1862 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 2 }, | |||
1863 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 2 }, | |||
1864 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 2 }, | |||
1865 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, | |||
1866 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, | |||
1867 | ||||
1868 | { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 }, | |||
1869 | { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 2 }, | |||
1870 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 1 }, | |||
1871 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 1 }, | |||
1872 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, | |||
1873 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, | |||
1874 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 2 }, | |||
1875 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 2 }, | |||
1876 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, | |||
1877 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, | |||
1878 | { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 4 }, | |||
1879 | { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 4 }, | |||
1880 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, | |||
1881 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, | |||
1882 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, | |||
1883 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, | |||
1884 | { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 4 }, | |||
1885 | { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 4 }, | |||
1886 | ||||
1887 | // These truncates end up widening elements. | |||
1888 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 1 }, // PMOVZXBQ | |||
1889 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 1 }, // PMOVZXWQ | |||
1890 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 1 }, // PMOVZXBD | |||
1891 | ||||
1892 | { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 1 }, | |||
1893 | { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 1 }, | |||
1894 | { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1 }, | |||
1895 | { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1 }, | |||
1896 | { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 }, | |||
1897 | { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 }, | |||
1898 | { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 3 }, | |||
1899 | { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 }, | |||
1900 | { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1 }, // PSHUFB | |||
1901 | ||||
1902 | { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 4 }, | |||
1903 | { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 4 }, | |||
1904 | ||||
1905 | { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 3 }, | |||
1906 | { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 3 }, | |||
1907 | ||||
1908 | { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 3 }, | |||
1909 | { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 3 }, | |||
1910 | { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 }, | |||
1911 | }; | |||
1912 | ||||
1913 | static const TypeConversionCostTblEntry SSE2ConversionTbl[] = { | |||
1914 | // These are somewhat magic numbers, justified by looking at the output of | |||
1915 | // Intel's IACA, running some kernels, and making sure that, once | |||
1916 | // legalization is taken into account, the throughput is overestimated. | |||
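// (Ours, not from the source: the N*10 entries below appear to encode
// roughly 10 instructions per scalarized element, e.g. 16*10 for a v16i8
// source.)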
1917 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 }, | |||
1918 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 }, | |||
1919 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 }, | |||
1920 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 }, | |||
1921 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 }, | |||
1922 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 2*10 }, | |||
1923 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2*10 }, | |||
1924 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 }, | |||
1925 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 }, | |||
1926 | ||||
1927 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 }, | |||
1928 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 }, | |||
1929 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 }, | |||
1930 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 }, | |||
1931 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 }, | |||
1932 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 8 }, | |||
1933 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 6 }, | |||
1934 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 }, | |||
1935 | ||||
1936 | { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 4 }, | |||
1937 | { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 2 }, | |||
1938 | { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 3 }, | |||
1939 | { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 }, | |||
1940 | { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 }, | |||
1941 | { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 4 }, | |||
1942 | ||||
1943 | { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 1 }, | |||
1944 | ||||
1945 | { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 6 }, | |||
1946 | { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 6 }, | |||
1947 | ||||
1948 | { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 }, | |||
1949 | { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 4 }, | |||
1950 | { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 4 }, | |||
1951 | { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 4 }, | |||
1952 | { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 3 }, | |||
1953 | { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 2 }, | |||
1954 | { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 }, | |||
1955 | { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 4 }, | |||
1956 | ||||
1957 | { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 }, | |||
1958 | { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 6 }, | |||
1959 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 }, | |||
1960 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 3 }, | |||
1961 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 }, | |||
1962 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 8 }, | |||
1963 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, | |||
1964 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 2 }, | |||
1965 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 6 }, | |||
1966 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 6 }, | |||
1967 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 }, | |||
1968 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 }, | |||
1969 | { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 9 }, | |||
1970 | { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 12 }, | |||
1971 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, | |||
1972 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 2 }, | |||
1973 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, | |||
1974 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 10 }, | |||
1975 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 }, | |||
1976 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 }, | |||
1977 | { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 6 }, | |||
1978 | { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 8 }, | |||
1979 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 3 }, | |||
1980 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 5 }, | |||
1981 | ||||
1982 | // These truncates are really widening elements. | |||
1983 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 1 }, // PSHUFD | |||
1984 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // PUNPCKLWD+DQ | |||
1985 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // PUNPCKLBW+WD+PSHUFD | |||
1986 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 1 }, // PUNPCKLWD | |||
1987 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // PUNPCKLBW+WD | |||
1988 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 1 }, // PUNPCKLBW | |||
1989 | ||||
1990 | { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // PAND+PACKUSWB | |||
1991 | { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // PAND+PACKUSWB | |||
1992 | { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // PAND+PACKUSWB | |||
1993 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, | |||
1994 | { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 3 }, // PAND+2*PACKUSWB | |||
1995 | { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 1 }, | |||
1996 | { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 3 }, | |||
1997 | { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 3 }, | |||
1998 | { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 }, | |||
1999 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 }, | |||
2000 | { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 }, | |||
2001 | { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 10 }, | |||
2002 | { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 4 }, // PAND+3*PACKUSWB | |||
2003 | { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 2 }, // PSHUFD+PSHUFLW | |||
2004 | { ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1 }, // PSHUFD | |||
2005 | }; | |||
2006 | ||||
2007 | std::pair<int, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src); | |||
2008 | std::pair<int, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst); | |||
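// (Note, ours: each pair is <split factor after legalization, legal MVT>;
// the SSE2 lookup just below scales its per-piece cost by LTSrc.first.)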
2009 | ||||
2010 | if (ST->hasSSE2() && !ST->hasAVX()) { | |||
2011 | if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, | |||
2012 | LTDest.second, LTSrc.second)) | |||
2013 | return AdjustCost(LTSrc.first * Entry->Cost); | |||
2014 | } | |||
2015 | ||||
2016 | EVT SrcTy = TLI->getValueType(DL, Src); | |||
2017 | EVT DstTy = TLI->getValueType(DL, Dst); | |||
2018 | ||||
2019 | // The function getSimpleVT only handles simple value types. | |||
2020 | if (!SrcTy.isSimple() || !DstTy.isSimple()) | |||
2021 | return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind)); | |||
2022 | ||||
2023 | MVT SimpleSrcTy = SrcTy.getSimpleVT(); | |||
2024 | MVT SimpleDstTy = DstTy.getSimpleVT(); | |||
2025 | ||||
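// The lookups below walk the feature-specific tables from most to least
// specific subtarget; the first matching entry wins.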
2026 | if (ST->useAVX512Regs()) { | |||
2027 | if (ST->hasBWI()) | |||
2028 | if (const auto *Entry = ConvertCostTableLookup(AVX512BWConversionTbl, ISD, | |||
2029 | SimpleDstTy, SimpleSrcTy)) | |||
2030 | return AdjustCost(Entry->Cost); | |||
2031 | ||||
2032 | if (ST->hasDQI()) | |||
2033 | if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD, | |||
2034 | SimpleDstTy, SimpleSrcTy)) | |||
2035 | return AdjustCost(Entry->Cost); | |||
2036 | ||||
2037 | if (ST->hasAVX512()) | |||
2038 | if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD, | |||
2039 | SimpleDstTy, SimpleSrcTy)) | |||
2040 | return AdjustCost(Entry->Cost); | |||
2041 | } | |||
2042 | ||||
2043 | if (ST->hasBWI()) | |||
2044 | if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD, | |||
2045 | SimpleDstTy, SimpleSrcTy)) | |||
2046 | return AdjustCost(Entry->Cost); | |||
2047 | ||||
2048 | if (ST->hasDQI()) | |||
2049 | if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD, | |||
2050 | SimpleDstTy, SimpleSrcTy)) | |||
2051 | return AdjustCost(Entry->Cost); | |||
2052 | ||||
2053 | if (ST->hasAVX512()) | |||
2054 | if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD, | |||
2055 | SimpleDstTy, SimpleSrcTy)) | |||
2056 | return AdjustCost(Entry->Cost); | |||
2057 | ||||
2058 | if (ST->hasAVX2()) { | |||
2059 | if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD, | |||
2060 | SimpleDstTy, SimpleSrcTy)) | |||
2061 | return AdjustCost(Entry->Cost); | |||
2062 | } | |||
2063 | ||||
2064 | if (ST->hasAVX()) { | |||
2065 | if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD, | |||
2066 | SimpleDstTy, SimpleSrcTy)) | |||
2067 | return AdjustCost(Entry->Cost); | |||
2068 | } | |||
2069 | ||||
2070 | if (ST->hasSSE41()) { | |||
2071 | if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD, | |||
2072 | SimpleDstTy, SimpleSrcTy)) | |||
2073 | return AdjustCost(Entry->Cost); | |||
2074 | } | |||
2075 | ||||
2076 | if (ST->hasSSE2()) { | |||
2077 | if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, | |||
2078 | SimpleDstTy, SimpleSrcTy)) | |||
2079 | return AdjustCost(Entry->Cost); | |||
2080 | } | |||
2081 | ||||
2082 | return AdjustCost( | |||
2083 | BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); | |||
2084 | } | |||
2085 | ||||
2086 | int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, | |||
2087 | CmpInst::Predicate VecPred, | |||
2088 | TTI::TargetCostKind CostKind, | |||
2089 | const Instruction *I) { | |||
2090 | // TODO: Handle other cost kinds. | |||
2091 | if (CostKind != TTI::TCK_RecipThroughput) | |||
2092 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, | |||
2093 | I); | |||
2094 | ||||
2095 | // Legalize the type. | |||
2096 | std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); | |||
2097 | ||||
2098 | MVT MTy = LT.second; | |||
2099 | ||||
2100 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | |||
2101 | assert(ISD && "Invalid opcode"); | |||
2102 | ||||
2103 | unsigned ExtraCost = 0; | |||
2104 | if (I && (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp)) { | |||
2105 | // Some vector comparison predicates cost extra instructions. | |||
2106 | if (MTy.isVector() && | |||
2107 | !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) || | |||
2108 | (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) || | |||
2109 | ST->hasBWI())) { | |||
2110 | switch (cast<CmpInst>(I)->getPredicate()) { | |||
2111 | case CmpInst::Predicate::ICMP_NE: | |||
2112 | // xor(cmpeq(x,y),-1) | |||
2113 | ExtraCost = 1; | |||
2114 | break; | |||
2115 | case CmpInst::Predicate::ICMP_SGE: | |||
2116 | case CmpInst::Predicate::ICMP_SLE: | |||
2117 | // xor(cmpgt(x,y),-1) | |||
2118 | ExtraCost = 1; | |||
2119 | break; | |||
2120 | case CmpInst::Predicate::ICMP_ULT: | |||
2121 | case CmpInst::Predicate::ICMP_UGT: | |||
2122 | // cmpgt(xor(x,signbit),xor(y,signbit)) | |||
2123 | // xor(cmpeq(pmaxu(x,y),x),-1) | |||
2124 | ExtraCost = 2; | |||
2125 | break; | |||
2126 | case CmpInst::Predicate::ICMP_ULE: | |||
2127 | case CmpInst::Predicate::ICMP_UGE: | |||
2128 | if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) || | |||
2129 | (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) { | |||
2130 | // cmpeq(psubus(x,y),0) | |||
2131 | // cmpeq(pminu(x,y),x) | |||
2132 | ExtraCost = 1; | |||
2133 | } else { | |||
2134 | // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1) | |||
2135 | ExtraCost = 3; | |||
2136 | } | |||
2137 | break; | |||
2138 | default: | |||
2139 | break; | |||
2140 | } | |||
2141 | } | |||
2142 | } | |||
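| // Worked example (a sketch, not from the source): an ICMP_ULE compare of | |||
| // v8i32 with only SSE2 legalizes to 2 x v4i32 (LT.first == 2) and, given | |||
| // the IR instruction, takes the ExtraCost == 3 path above; combined with | |||
| // the SSE2 SETCC v4i32 table cost of 1 below, the total is 2 * (3 + 1) = 8. | |||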
2143 | ||||
2144 | static const CostTblEntry SLMCostTbl[] = { | |||
2145 | // SLM pcmpeq/pcmpgt throughput is 2 | |||
2146 | { ISD::SETCC, MVT::v2i64, 2 }, | |||
2147 | }; | |||
2148 | ||||
2149 | static const CostTblEntry AVX512BWCostTbl[] = { | |||
2150 | { ISD::SETCC, MVT::v32i16, 1 }, | |||
2151 | { ISD::SETCC, MVT::v64i8, 1 }, | |||
2152 | ||||
2153 | { ISD::SELECT, MVT::v32i16, 1 }, | |||
2154 | { ISD::SELECT, MVT::v64i8, 1 }, | |||
2155 | }; | |||
2156 | ||||
2157 | static const CostTblEntry AVX512CostTbl[] = { | |||
2158 | { ISD::SETCC, MVT::v8i64, 1 }, | |||
2159 | { ISD::SETCC, MVT::v16i32, 1 }, | |||
2160 | { ISD::SETCC, MVT::v8f64, 1 }, | |||
2161 | { ISD::SETCC, MVT::v16f32, 1 }, | |||
2162 | ||||
2163 | { ISD::SELECT, MVT::v8i64, 1 }, | |||
2164 | { ISD::SELECT, MVT::v16i32, 1 }, | |||
2165 | { ISD::SELECT, MVT::v8f64, 1 }, | |||
2166 | { ISD::SELECT, MVT::v16f32, 1 }, | |||
2167 | ||||
2168 | { ISD::SETCC, MVT::v32i16, 2 }, // FIXME: should probably be 4 | |||
2169 | { ISD::SETCC, MVT::v64i8, 2 }, // FIXME: should probably be 4 | |||
2170 | ||||
2171 | { ISD::SELECT, MVT::v32i16, 2 }, // FIXME: should be 3 | |||
2172 | { ISD::SELECT, MVT::v64i8, 2 }, // FIXME: should be 3 | |||
2173 | }; | |||
2174 | ||||
2175 | static const CostTblEntry AVX2CostTbl[] = { | |||
2176 | { ISD::SETCC, MVT::v4i64, 1 }, | |||
2177 | { ISD::SETCC, MVT::v8i32, 1 }, | |||
2178 | { ISD::SETCC, MVT::v16i16, 1 }, | |||
2179 | { ISD::SETCC, MVT::v32i8, 1 }, | |||
2180 | ||||
2181 | { ISD::SELECT, MVT::v4i64, 1 }, // pblendvb | |||
2182 | { ISD::SELECT, MVT::v8i32, 1 }, // pblendvb | |||
2183 | { ISD::SELECT, MVT::v16i16, 1 }, // pblendvb | |||
2184 | { ISD::SELECT, MVT::v32i8, 1 }, // pblendvb | |||
2185 | }; | |||
2186 | ||||
2187 | static const CostTblEntry AVX1CostTbl[] = { | |||
2188 | { ISD::SETCC, MVT::v4f64, 1 }, | |||
2189 | { ISD::SETCC, MVT::v8f32, 1 }, | |||
2190 | // AVX1 does not support 8-wide integer compare. | |||
2191 | { ISD::SETCC, MVT::v4i64, 4 }, | |||
2192 | { ISD::SETCC, MVT::v8i32, 4 }, | |||
2193 | { ISD::SETCC, MVT::v16i16, 4 }, | |||
2194 | { ISD::SETCC, MVT::v32i8, 4 }, | |||
2195 | ||||
2196 | { ISD::SELECT, MVT::v4f64, 1 }, // vblendvpd | |||
2197 | { ISD::SELECT, MVT::v8f32, 1 }, // vblendvps | |||
2198 | { ISD::SELECT, MVT::v4i64, 1 }, // vblendvpd | |||
2199 | { ISD::SELECT, MVT::v8i32, 1 }, // vblendvps | |||
2200 | { ISD::SELECT, MVT::v16i16, 3 }, // vandps + vandnps + vorps | |||
2201 | { ISD::SELECT, MVT::v32i8, 3 }, // vandps + vandnps + vorps | |||
2202 | }; | |||
2203 | ||||
2204 | static const CostTblEntry SSE42CostTbl[] = { | |||
2205 | { ISD::SETCC, MVT::v2f64, 1 }, | |||
2206 | { ISD::SETCC, MVT::v4f32, 1 }, | |||
2207 | { ISD::SETCC, MVT::v2i64, 1 }, | |||
2208 | }; | |||
2209 | ||||
2210 | static const CostTblEntry SSE41CostTbl[] = { | |||
2211 | { ISD::SELECT, MVT::v2f64, 1 }, // blendvpd | |||
2212 | { ISD::SELECT, MVT::v4f32, 1 }, // blendvps | |||
2213 | { ISD::SELECT, MVT::v2i64, 1 }, // pblendvb | |||
2214 | { ISD::SELECT, MVT::v4i32, 1 }, // pblendvb | |||
2215 | { ISD::SELECT, MVT::v8i16, 1 }, // pblendvb | |||
2216 | { ISD::SELECT, MVT::v16i8, 1 }, // pblendvb | |||
2217 | }; | |||
2218 | ||||
2219 | static const CostTblEntry SSE2CostTbl[] = { | |||
2220 | { ISD::SETCC, MVT::v2f64, 2 }, | |||
2221 | { ISD::SETCC, MVT::f64, 1 }, | |||
2222 | { ISD::SETCC, MVT::v2i64, 8 }, | |||
2223 | { ISD::SETCC, MVT::v4i32, 1 }, | |||
2224 | { ISD::SETCC, MVT::v8i16, 1 }, | |||
2225 | { ISD::SETCC, MVT::v16i8, 1 }, | |||
2226 | ||||
2227 | { ISD::SELECT, MVT::v2f64, 3 }, // andpd + andnpd + orpd | |||
2228 | { ISD::SELECT, MVT::v2i64, 3 }, // pand + pandn + por | |||
2229 | { ISD::SELECT, MVT::v4i32, 3 }, // pand + pandn + por | |||
2230 | { ISD::SELECT, MVT::v8i16, 3 }, // pand + pandn + por | |||
2231 | { ISD::SELECT, MVT::v16i8, 3 }, // pand + pandn + por | |||
2232 | }; | |||
2233 | ||||
2234 | static const CostTblEntry SSE1CostTbl[] = { | |||
2235 | { ISD::SETCC, MVT::v4f32, 2 }, | |||
2236 | { ISD::SETCC, MVT::f32, 1 }, | |||
2237 | ||||
2238 | { ISD::SELECT, MVT::v4f32, 3 }, // andps + andnps + orps | |||
2239 | }; | |||
2240 | ||||
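| // Consult the cost tables from the most feature-specific subtarget check | |||
| // down to the most generic one; the first matching entry wins, scaled by | |||
| // the number of legalized operations (LT.first). | |||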
2241 | if (ST->isSLM()) | |||
2242 | if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy)) | |||
2243 | return LT.first * (ExtraCost + Entry->Cost); | |||
2244 | ||||
2245 | if (ST->hasBWI()) | |||
2246 | if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy)) | |||
2247 | return LT.first * (ExtraCost + Entry->Cost); | |||
2248 | ||||
2249 | if (ST->hasAVX512()) | |||
2250 | if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) | |||
2251 | return LT.first * (ExtraCost + Entry->Cost); | |||
2252 | ||||
2253 | if (ST->hasAVX2()) | |||
2254 | if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy)) | |||
2255 | return LT.first * (ExtraCost + Entry->Cost); | |||
2256 | ||||
2257 | if (ST->hasAVX()) | |||
2258 | if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) | |||
2259 | return LT.first * (ExtraCost + Entry->Cost); | |||
2260 | ||||
2261 | if (ST->hasSSE42()) | |||
2262 | if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy)) | |||
2263 | return LT.first * (ExtraCost + Entry->Cost); | |||
2264 | ||||
2265 | if (ST->hasSSE41()) | |||
2266 | if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy)) | |||
2267 | return LT.first * (ExtraCost + Entry->Cost); | |||
2268 | ||||
2269 | if (ST->hasSSE2()) | |||
2270 | if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy)) | |||
2271 | return LT.first * (ExtraCost + Entry->Cost); | |||
2272 | ||||
2273 | if (ST->hasSSE1()) | |||
2274 | if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy)) | |||
2275 | return LT.first * (ExtraCost + Entry->Cost); | |||
2276 | ||||
2277 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I); | |||
2278 | } | |||
2279 | ||||
2280 | unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; } | |||
2281 | ||||
2282 | int X86TTIImpl::getTypeBasedIntrinsicInstrCost( | |||
2283 | const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) { | |||
2284 | ||||
2285 | // Costs should match the codegen from: | |||
2286 | // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll | |||
2287 | // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll | |||
2288 | // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll | |||
2289 | // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll | |||
2290 | // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll | |||
2291 | ||||
2292 | // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not | |||
2293 | // specialized in these tables yet. | |||
2294 | static const CostTblEntry AVX512CDCostTbl[] = { | |||
2295 | { ISD::CTLZ, MVT::v8i64, 1 }, | |||
2296 | { ISD::CTLZ, MVT::v16i32, 1 }, | |||
2297 | { ISD::CTLZ, MVT::v32i16, 8 }, | |||
2298 | { ISD::CTLZ, MVT::v64i8, 20 }, | |||
2299 | { ISD::CTLZ, MVT::v4i64, 1 }, | |||
2300 | { ISD::CTLZ, MVT::v8i32, 1 }, | |||
2301 | { ISD::CTLZ, MVT::v16i16, 4 }, | |||
2302 | { ISD::CTLZ, MVT::v32i8, 10 }, | |||
2303 | { ISD::CTLZ, MVT::v2i64, 1 }, | |||
2304 | { ISD::CTLZ, MVT::v4i32, 1 }, | |||
2305 | { ISD::CTLZ, MVT::v8i16, 4 }, | |||
2306 | { ISD::CTLZ, MVT::v16i8, 4 }, | |||
2307 | }; | |||
2308 | static const CostTblEntry AVX512BWCostTbl[] = { | |||
2309 | { ISD::ABS, MVT::v32i16, 1 }, | |||
2310 | { ISD::ABS, MVT::v64i8, 1 }, | |||
2311 | { ISD::BITREVERSE, MVT::v8i64, 5 }, | |||
2312 | { ISD::BITREVERSE, MVT::v16i32, 5 }, | |||
2313 | { ISD::BITREVERSE, MVT::v32i16, 5 }, | |||
2314 | { ISD::BITREVERSE, MVT::v64i8, 5 }, | |||
2315 | { ISD::CTLZ, MVT::v8i64, 23 }, | |||
2316 | { ISD::CTLZ, MVT::v16i32, 22 }, | |||
2317 | { ISD::CTLZ, MVT::v32i16, 18 }, | |||
2318 | { ISD::CTLZ, MVT::v64i8, 17 }, | |||
2319 | { ISD::CTPOP, MVT::v8i64, 7 }, | |||
2320 | { ISD::CTPOP, MVT::v16i32, 11 }, | |||
2321 | { ISD::CTPOP, MVT::v32i16, 9 }, | |||
2322 | { ISD::CTPOP, MVT::v64i8, 6 }, | |||
2323 | { ISD::CTTZ, MVT::v8i64, 10 }, | |||
2324 | { ISD::CTTZ, MVT::v16i32, 14 }, | |||
2325 | { ISD::CTTZ, MVT::v32i16, 12 }, | |||
2326 | { ISD::CTTZ, MVT::v64i8, 9 }, | |||
2327 | { ISD::SADDSAT, MVT::v32i16, 1 }, | |||
2328 | { ISD::SADDSAT, MVT::v64i8, 1 }, | |||
2329 | { ISD::SMAX, MVT::v32i16, 1 }, | |||
2330 | { ISD::SMAX, MVT::v64i8, 1 }, | |||
2331 | { ISD::SMIN, MVT::v32i16, 1 }, | |||
2332 | { ISD::SMIN, MVT::v64i8, 1 }, | |||
2333 | { ISD::SSUBSAT, MVT::v32i16, 1 }, | |||
2334 | { ISD::SSUBSAT, MVT::v64i8, 1 }, | |||
2335 | { ISD::UADDSAT, MVT::v32i16, 1 }, | |||
2336 | { ISD::UADDSAT, MVT::v64i8, 1 }, | |||
2337 | { ISD::UMAX, MVT::v32i16, 1 }, | |||
2338 | { ISD::UMAX, MVT::v64i8, 1 }, | |||
2339 | { ISD::UMIN, MVT::v32i16, 1 }, | |||
2340 | { ISD::UMIN, MVT::v64i8, 1 }, | |||
2341 | { ISD::USUBSAT, MVT::v32i16, 1 }, | |||
2342 | { ISD::USUBSAT, MVT::v64i8, 1 }, | |||
2343 | }; | |||
2344 | static const CostTblEntry AVX512CostTbl[] = { | |||
2345 | { ISD::ABS, MVT::v8i64, 1 }, | |||
2346 | { ISD::ABS, MVT::v16i32, 1 }, | |||
2347 | { ISD::ABS, MVT::v32i16, 2 }, // FIXME: include split | |||
2348 | { ISD::ABS, MVT::v64i8, 2 }, // FIXME: include split | |||
2349 | { ISD::ABS, MVT::v4i64, 1 }, | |||
2350 | { ISD::ABS, MVT::v2i64, 1 }, | |||
2351 | { ISD::BITREVERSE, MVT::v8i64, 36 }, | |||
2352 | { ISD::BITREVERSE, MVT::v16i32, 24 }, | |||
2353 | { ISD::BITREVERSE, MVT::v32i16, 10 }, | |||
2354 | { ISD::BITREVERSE, MVT::v64i8, 10 }, | |||
2355 | { ISD::CTLZ, MVT::v8i64, 29 }, | |||
2356 | { ISD::CTLZ, MVT::v16i32, 35 }, | |||
2357 | { ISD::CTLZ, MVT::v32i16, 28 }, | |||
2358 | { ISD::CTLZ, MVT::v64i8, 18 }, | |||
2359 | { ISD::CTPOP, MVT::v8i64, 16 }, | |||
2360 | { ISD::CTPOP, MVT::v16i32, 24 }, | |||
2361 | { ISD::CTPOP, MVT::v32i16, 18 }, | |||
2362 | { ISD::CTPOP, MVT::v64i8, 12 }, | |||
2363 | { ISD::CTTZ, MVT::v8i64, 20 }, | |||
2364 | { ISD::CTTZ, MVT::v16i32, 28 }, | |||
2365 | { ISD::CTTZ, MVT::v32i16, 24 }, | |||
2366 | { ISD::CTTZ, MVT::v64i8, 18 }, | |||
2367 | { ISD::SMAX, MVT::v8i64, 1 }, | |||
2368 | { ISD::SMAX, MVT::v16i32, 1 }, | |||
2369 | { ISD::SMAX, MVT::v32i16, 2 }, // FIXME: include split | |||
2370 | { ISD::SMAX, MVT::v64i8, 2 }, // FIXME: include split | |||
2371 | { ISD::SMAX, MVT::v4i64, 1 }, | |||
2372 | { ISD::SMAX, MVT::v2i64, 1 }, | |||
2373 | { ISD::SMIN, MVT::v8i64, 1 }, | |||
2374 | { ISD::SMIN, MVT::v16i32, 1 }, | |||
2375 | { ISD::SMIN, MVT::v32i16, 2 }, // FIXME: include split | |||
2376 | { ISD::SMIN, MVT::v64i8, 2 }, // FIXME: include split | |||
2377 | { ISD::SMIN, MVT::v4i64, 1 }, | |||
2378 | { ISD::SMIN, MVT::v2i64, 1 }, | |||
2379 | { ISD::UMAX, MVT::v8i64, 1 }, | |||
2380 | { ISD::UMAX, MVT::v16i32, 1 }, | |||
2381 | { ISD::UMAX, MVT::v32i16, 2 }, // FIXME: include split | |||
2382 | { ISD::UMAX, MVT::v64i8, 2 }, // FIXME: include split | |||
2383 | { ISD::UMAX, MVT::v4i64, 1 }, | |||
2384 | { ISD::UMAX, MVT::v2i64, 1 }, | |||
2385 | { ISD::UMIN, MVT::v8i64, 1 }, | |||
2386 | { ISD::UMIN, MVT::v16i32, 1 }, | |||
2387 | { ISD::UMIN, MVT::v32i16, 2 }, // FIXME: include split | |||
2388 | { ISD::UMIN, MVT::v64i8, 2 }, // FIXME: include split | |||
2389 | { ISD::UMIN, MVT::v4i64, 1 }, | |||
2390 | { ISD::UMIN, MVT::v2i64, 1 }, | |||
2391 | { ISD::USUBSAT, MVT::v16i32, 2 }, // pmaxud + psubd | |||
2392 | { ISD::USUBSAT, MVT::v2i64, 2 }, // pmaxuq + psubq | |||
2393 | { ISD::USUBSAT, MVT::v4i64, 2 }, // pmaxuq + psubq | |||
2394 | { ISD::USUBSAT, MVT::v8i64, 2 }, // pmaxuq + psubq | |||
2395 | { ISD::UADDSAT, MVT::v16i32, 3 }, // not + pminud + paddd | |||
2396 | { ISD::UADDSAT, MVT::v2i64, 3 }, // not + pminuq + paddq | |||
2397 | { ISD::UADDSAT, MVT::v4i64, 3 }, // not + pminuq + paddq | |||
2398 | { ISD::UADDSAT, MVT::v8i64, 3 }, // not + pminuq + paddq | |||
2399 | { ISD::SADDSAT, MVT::v32i16, 2 }, // FIXME: include split | |||
2400 | { ISD::SADDSAT, MVT::v64i8, 2 }, // FIXME: include split | |||
2401 | { ISD::SSUBSAT, MVT::v32i16, 2 }, // FIXME: include split | |||
2402 | { ISD::SSUBSAT, MVT::v64i8, 2 }, // FIXME: include split | |||
2403 | { ISD::UADDSAT, MVT::v32i16, 2 }, // FIXME: include split | |||
2404 | { ISD::UADDSAT, MVT::v64i8, 2 }, // FIXME: include split | |||
2405 | { ISD::USUBSAT, MVT::v32i16, 2 }, // FIXME: include split | |||
2406 | { ISD::USUBSAT, MVT::v64i8, 2 }, // FIXME: include split | |||
2407 | { ISD::FMAXNUM, MVT::f32, 2 }, | |||
2408 | { ISD::FMAXNUM, MVT::v4f32, 2 }, | |||
2409 | { ISD::FMAXNUM, MVT::v8f32, 2 }, | |||
2410 | { ISD::FMAXNUM, MVT::v16f32, 2 }, | |||
2411 | { ISD::FMAXNUM, MVT::f64, 2 }, | |||
2412 | { ISD::FMAXNUM, MVT::v2f64, 2 }, | |||
2413 | { ISD::FMAXNUM, MVT::v4f64, 2 }, | |||
2414 | { ISD::FMAXNUM, MVT::v8f64, 2 }, | |||
2415 | }; | |||
2416 | static const CostTblEntry XOPCostTbl[] = { | |||
2417 | { ISD::BITREVERSE, MVT::v4i64, 4 }, | |||
2418 | { ISD::BITREVERSE, MVT::v8i32, 4 }, | |||
2419 | { ISD::BITREVERSE, MVT::v16i16, 4 }, | |||
2420 | { ISD::BITREVERSE, MVT::v32i8, 4 }, | |||
2421 | { ISD::BITREVERSE, MVT::v2i64, 1 }, | |||
2422 | { ISD::BITREVERSE, MVT::v4i32, 1 }, | |||
2423 | { ISD::BITREVERSE, MVT::v8i16, 1 }, | |||
2424 | { ISD::BITREVERSE, MVT::v16i8, 1 }, | |||
2425 | { ISD::BITREVERSE, MVT::i64, 3 }, | |||
2426 | { ISD::BITREVERSE, MVT::i32, 3 }, | |||
2427 | { ISD::BITREVERSE, MVT::i16, 3 }, | |||
2428 | { ISD::BITREVERSE, MVT::i8, 3 } | |||
2429 | }; | |||
2430 | static const CostTblEntry AVX2CostTbl[] = { | |||
2431 | { ISD::ABS, MVT::v4i64, 2 }, // VBLENDVPD(X,VPSUBQ(0,X),X) | |||
2432 | { ISD::ABS, MVT::v8i32, 1 }, | |||
2433 | { ISD::ABS, MVT::v16i16, 1 }, | |||
2434 | { ISD::ABS, MVT::v32i8, 1 }, | |||
2435 | { ISD::BITREVERSE, MVT::v4i64, 5 }, | |||
2436 | { ISD::BITREVERSE, MVT::v8i32, 5 }, | |||
2437 | { ISD::BITREVERSE, MVT::v16i16, 5 }, | |||
2438 | { ISD::BITREVERSE, MVT::v32i8, 5 }, | |||
2439 | { ISD::BSWAP, MVT::v4i64, 1 }, | |||
2440 | { ISD::BSWAP, MVT::v8i32, 1 }, | |||
2441 | { ISD::BSWAP, MVT::v16i16, 1 }, | |||
2442 | { ISD::CTLZ, MVT::v4i64, 23 }, | |||
2443 | { ISD::CTLZ, MVT::v8i32, 18 }, | |||
2444 | { ISD::CTLZ, MVT::v16i16, 14 }, | |||
2445 | { ISD::CTLZ, MVT::v32i8, 9 }, | |||
2446 | { ISD::CTPOP, MVT::v4i64, 7 }, | |||
2447 | { ISD::CTPOP, MVT::v8i32, 11 }, | |||
2448 | { ISD::CTPOP, MVT::v16i16, 9 }, | |||
2449 | { ISD::CTPOP, MVT::v32i8, 6 }, | |||
2450 | { ISD::CTTZ, MVT::v4i64, 10 }, | |||
2451 | { ISD::CTTZ, MVT::v8i32, 14 }, | |||
2452 | { ISD::CTTZ, MVT::v16i16, 12 }, | |||
2453 | { ISD::CTTZ, MVT::v32i8, 9 }, | |||
2454 | { ISD::SADDSAT, MVT::v16i16, 1 }, | |||
2455 | { ISD::SADDSAT, MVT::v32i8, 1 }, | |||
2456 | { ISD::SMAX, MVT::v8i32, 1 }, | |||
2457 | { ISD::SMAX, MVT::v16i16, 1 }, | |||
2458 | { ISD::SMAX, MVT::v32i8, 1 }, | |||
2459 | { ISD::SMIN, MVT::v8i32, 1 }, | |||
2460 | { ISD::SMIN, MVT::v16i16, 1 }, | |||
2461 | { ISD::SMIN, MVT::v32i8, 1 }, | |||
2462 | { ISD::SSUBSAT, MVT::v16i16, 1 }, | |||
2463 | { ISD::SSUBSAT, MVT::v32i8, 1 }, | |||
2464 | { ISD::UADDSAT, MVT::v16i16, 1 }, | |||
2465 | { ISD::UADDSAT, MVT::v32i8, 1 }, | |||
2466 | { ISD::UADDSAT, MVT::v8i32, 3 }, // not + pminud + paddd | |||
2467 | { ISD::UMAX, MVT::v8i32, 1 }, | |||
2468 | { ISD::UMAX, MVT::v16i16, 1 }, | |||
2469 | { ISD::UMAX, MVT::v32i8, 1 }, | |||
2470 | { ISD::UMIN, MVT::v8i32, 1 }, | |||
2471 | { ISD::UMIN, MVT::v16i16, 1 }, | |||
2472 | { ISD::UMIN, MVT::v32i8, 1 }, | |||
2473 | { ISD::USUBSAT, MVT::v16i16, 1 }, | |||
2474 | { ISD::USUBSAT, MVT::v32i8, 1 }, | |||
2475 | { ISD::USUBSAT, MVT::v8i32, 2 }, // pmaxud + psubd | |||
2476 | { ISD::FMAXNUM, MVT::v8f32, 3 }, // MAXPS + CMPUNORDPS + BLENDVPS | |||
2477 | { ISD::FMAXNUM, MVT::v4f64, 3 }, // MAXPD + CMPUNORDPD + BLENDVPD | |||
2478 | { ISD::FSQRT, MVT::f32, 7 }, // Haswell from http://www.agner.org/ | |||
2479 | { ISD::FSQRT, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/ | |||
2480 | { ISD::FSQRT, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/ | |||
2481 | { ISD::FSQRT, MVT::f64, 14 }, // Haswell from http://www.agner.org/ | |||
2482 | { ISD::FSQRT, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/ | |||
2483 | { ISD::FSQRT, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/ | |||
2484 | }; | |||
2485 | static const CostTblEntry AVX1CostTbl[] = { | |||
2486 | { ISD::ABS, MVT::v4i64, 5 }, // VBLENDVPD(X,VPSUBQ(0,X),X) | |||
2487 | { ISD::ABS, MVT::v8i32, 3 }, | |||
2488 | { ISD::ABS, MVT::v16i16, 3 }, | |||
2489 | { ISD::ABS, MVT::v32i8, 3 }, | |||
2490 | { ISD::BITREVERSE, MVT::v4i64, 12 }, // 2 x 128-bit Op + extract/insert | |||
2491 | { ISD::BITREVERSE, MVT::v8i32, 12 }, // 2 x 128-bit Op + extract/insert | |||
2492 | { ISD::BITREVERSE, MVT::v16i16, 12 }, // 2 x 128-bit Op + extract/insert | |||
2493 | { ISD::BITREVERSE, MVT::v32i8, 12 }, // 2 x 128-bit Op + extract/insert | |||
2494 | { ISD::BSWAP, MVT::v4i64, 4 }, | |||
2495 | { ISD::BSWAP, MVT::v8i32, 4 }, | |||
2496 | { ISD::BSWAP, MVT::v16i16, 4 }, | |||
2497 | { ISD::CTLZ, MVT::v4i64, 48 }, // 2 x 128-bit Op + extract/insert | |||
2498 | { ISD::CTLZ, MVT::v8i32, 38 }, // 2 x 128-bit Op + extract/insert | |||
2499 | { ISD::CTLZ, MVT::v16i16, 30 }, // 2 x 128-bit Op + extract/insert | |||
2500 | { ISD::CTLZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert | |||
2501 | { ISD::CTPOP, MVT::v4i64, 16 }, // 2 x 128-bit Op + extract/insert | |||
2502 | { ISD::CTPOP, MVT::v8i32, 24 }, // 2 x 128-bit Op + extract/insert | |||
2503 | { ISD::CTPOP, MVT::v16i16, 20 }, // 2 x 128-bit Op + extract/insert | |||
2504 | { ISD::CTPOP, MVT::v32i8, 14 }, // 2 x 128-bit Op + extract/insert | |||
2505 | { ISD::CTTZ, MVT::v4i64, 22 }, // 2 x 128-bit Op + extract/insert | |||
2506 | { ISD::CTTZ, MVT::v8i32, 30 }, // 2 x 128-bit Op + extract/insert | |||
2507 | { ISD::CTTZ, MVT::v16i16, 26 }, // 2 x 128-bit Op + extract/insert | |||
2508 | { ISD::CTTZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert | |||
2509 | { ISD::SADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert | |||
2510 | { ISD::SADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert | |||
2511 | { ISD::SMAX, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert | |||
2512 | { ISD::SMAX, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert | |||
2513 | { ISD::SMAX, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert | |||
2514 | { ISD::SMIN, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert | |||
2515 | { ISD::SMIN, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert | |||
2516 | { ISD::SMIN, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert | |||
2517 | { ISD::SSUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert | |||
2518 | { ISD::SSUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert | |||
2519 | { ISD::UADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert | |||
2520 | { ISD::UADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert | |||
2521 | { ISD::UADDSAT, MVT::v8i32, 8 }, // 2 x 128-bit Op + extract/insert | |||
2522 | { ISD::UMAX, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert | |||
2523 | { ISD::UMAX, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert | |||
2524 | { ISD::UMAX, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert | |||
2525 | { ISD::UMIN, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert | |||
2526 | { ISD::UMIN, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert | |||
2527 | { ISD::UMIN, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert | |||
2528 | { ISD::USUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert | |||
2529 | { ISD::USUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert | |||
2530 | { ISD::USUBSAT, MVT::v8i32, 6 }, // 2 x 128-bit Op + extract/insert | |||
2531 | { ISD::FMAXNUM, MVT::f32, 3 }, // MAXSS + CMPUNORDSS + BLENDVPS | |||
2532 | { ISD::FMAXNUM, MVT::v4f32, 3 }, // MAXPS + CMPUNORDPS + BLENDVPS | |||
2533 | { ISD::FMAXNUM, MVT::v8f32, 5 }, // MAXPS + CMPUNORDPS + BLENDVPS + ? | |||
2534 | { ISD::FMAXNUM, MVT::f64, 3 }, // MAXSD + CMPUNORDSD + BLENDVPD | |||
2535 | { ISD::FMAXNUM, MVT::v2f64, 3 }, // MAXPD + CMPUNORDPD + BLENDVPD | |||
2536 | { ISD::FMAXNUM, MVT::v4f64, 5 }, // MAXPD + CMPUNORDPD + BLENDVPD + ? | |||
2537 | { ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/ | |||
2538 | { ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/ | |||
2539 | { ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/ | |||
2540 | { ISD::FSQRT, MVT::f64, 21 }, // SNB from http://www.agner.org/ | |||
2541 | { ISD::FSQRT, MVT::v2f64, 21 }, // SNB from http://www.agner.org/ | |||
2542 | { ISD::FSQRT, MVT::v4f64, 43 }, // SNB from http://www.agner.org/ | |||
2543 | }; | |||
2544 | static const CostTblEntry GLMCostTbl[] = { | |||
2545 | { ISD::FSQRT, MVT::f32, 19 }, // sqrtss | |||
2546 | { ISD::FSQRT, MVT::v4f32, 37 }, // sqrtps | |||
2547 | { ISD::FSQRT, MVT::f64, 34 }, // sqrtsd | |||
2548 | { ISD::FSQRT, MVT::v2f64, 67 }, // sqrtpd | |||
2549 | }; | |||
2550 | static const CostTblEntry SLMCostTbl[] = { | |||
2551 | { ISD::FSQRT, MVT::f32, 20 }, // sqrtss | |||
2552 | { ISD::FSQRT, MVT::v4f32, 40 }, // sqrtps | |||
2553 | { ISD::FSQRT, MVT::f64, 35 }, // sqrtsd | |||
2554 | { ISD::FSQRT, MVT::v2f64, 70 }, // sqrtpd | |||
2555 | }; | |||
2556 | static const CostTblEntry SSE42CostTbl[] = { | |||
2557 | { ISD::USUBSAT, MVT::v4i32, 2 }, // pmaxud + psubd | |||
2558 | { ISD::UADDSAT, MVT::v4i32, 3 }, // not + pminud + paddd | |||
2559 | { ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/ | |||
2560 | { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/ | |||
2561 | }; | |||
2562 | static const CostTblEntry SSE41CostTbl[] = { | |||
2563 | { ISD::ABS, MVT::v2i64, 2 }, // BLENDVPD(X,PSUBQ(0,X),X) | |||
2564 | { ISD::SMAX, MVT::v4i32, 1 }, | |||
2565 | { ISD::SMAX, MVT::v16i8, 1 }, | |||
2566 | { ISD::SMIN, MVT::v4i32, 1 }, | |||
2567 | { ISD::SMIN, MVT::v16i8, 1 }, | |||
2568 | { ISD::UMAX, MVT::v4i32, 1 }, | |||
2569 | { ISD::UMAX, MVT::v8i16, 1 }, | |||
2570 | { ISD::UMIN, MVT::v4i32, 1 }, | |||
2571 | { ISD::UMIN, MVT::v8i16, 1 }, | |||
2572 | }; | |||
2573 | static const CostTblEntry SSSE3CostTbl[] = { | |||
2574 | { ISD::ABS, MVT::v4i32, 1 }, | |||
2575 | { ISD::ABS, MVT::v8i16, 1 }, | |||
2576 | { ISD::ABS, MVT::v16i8, 1 }, | |||
2577 | { ISD::BITREVERSE, MVT::v2i64, 5 }, | |||
2578 | { ISD::BITREVERSE, MVT::v4i32, 5 }, | |||
2579 | { ISD::BITREVERSE, MVT::v8i16, 5 }, | |||
2580 | { ISD::BITREVERSE, MVT::v16i8, 5 }, | |||
2581 | { ISD::BSWAP, MVT::v2i64, 1 }, | |||
2582 | { ISD::BSWAP, MVT::v4i32, 1 }, | |||
2583 | { ISD::BSWAP, MVT::v8i16, 1 }, | |||
2584 | { ISD::CTLZ, MVT::v2i64, 23 }, | |||
2585 | { ISD::CTLZ, MVT::v4i32, 18 }, | |||
2586 | { ISD::CTLZ, MVT::v8i16, 14 }, | |||
2587 | { ISD::CTLZ, MVT::v16i8, 9 }, | |||
2588 | { ISD::CTPOP, MVT::v2i64, 7 }, | |||
2589 | { ISD::CTPOP, MVT::v4i32, 11 }, | |||
2590 | { ISD::CTPOP, MVT::v8i16, 9 }, | |||
2591 | { ISD::CTPOP, MVT::v16i8, 6 }, | |||
2592 | { ISD::CTTZ, MVT::v2i64, 10 }, | |||
2593 | { ISD::CTTZ, MVT::v4i32, 14 }, | |||
2594 | { ISD::CTTZ, MVT::v8i16, 12 }, | |||
2595 | { ISD::CTTZ, MVT::v16i8, 9 } | |||
2596 | }; | |||
2597 | static const CostTblEntry SSE2CostTbl[] = { | |||
2598 | { ISD::ABS, MVT::v2i64, 4 }, | |||
2599 | { ISD::ABS, MVT::v4i32, 3 }, | |||
2600 | { ISD::ABS, MVT::v8i16, 2 }, | |||
2601 | { ISD::ABS, MVT::v16i8, 2 }, | |||
2602 | { ISD::BITREVERSE, MVT::v2i64, 29 }, | |||
2603 | { ISD::BITREVERSE, MVT::v4i32, 27 }, | |||
2604 | { ISD::BITREVERSE, MVT::v8i16, 27 }, | |||
2605 | { ISD::BITREVERSE, MVT::v16i8, 20 }, | |||
2606 | { ISD::BSWAP, MVT::v2i64, 7 }, | |||
2607 | { ISD::BSWAP, MVT::v4i32, 7 }, | |||
2608 | { ISD::BSWAP, MVT::v8i16, 7 }, | |||
2609 | { ISD::CTLZ, MVT::v2i64, 25 }, | |||
2610 | { ISD::CTLZ, MVT::v4i32, 26 }, | |||
2611 | { ISD::CTLZ, MVT::v8i16, 20 }, | |||
2612 | { ISD::CTLZ, MVT::v16i8, 17 }, | |||
2613 | { ISD::CTPOP, MVT::v2i64, 12 }, | |||
2614 | { ISD::CTPOP, MVT::v4i32, 15 }, | |||
2615 | { ISD::CTPOP, MVT::v8i16, 13 }, | |||
2616 | { ISD::CTPOP, MVT::v16i8, 10 }, | |||
2617 | { ISD::CTTZ, MVT::v2i64, 14 }, | |||
2618 | { ISD::CTTZ, MVT::v4i32, 18 }, | |||
2619 | { ISD::CTTZ, MVT::v8i16, 16 }, | |||
2620 | { ISD::CTTZ, MVT::v16i8, 13 }, | |||
2621 | { ISD::SADDSAT, MVT::v8i16, 1 }, | |||
2622 | { ISD::SADDSAT, MVT::v16i8, 1 }, | |||
2623 | { ISD::SMAX, MVT::v8i16, 1 }, | |||
2624 | { ISD::SMIN, MVT::v8i16, 1 }, | |||
2625 | { ISD::SSUBSAT, MVT::v8i16, 1 }, | |||
2626 | { ISD::SSUBSAT, MVT::v16i8, 1 }, | |||
2627 | { ISD::UADDSAT, MVT::v8i16, 1 }, | |||
2628 | { ISD::UADDSAT, MVT::v16i8, 1 }, | |||
2629 | { ISD::UMAX, MVT::v8i16, 2 }, | |||
2630 | { ISD::UMAX, MVT::v16i8, 1 }, | |||
2631 | { ISD::UMIN, MVT::v8i16, 2 }, | |||
2632 | { ISD::UMIN, MVT::v16i8, 1 }, | |||
2633 | { ISD::USUBSAT, MVT::v8i16, 1 }, | |||
2634 | { ISD::USUBSAT, MVT::v16i8, 1 }, | |||
2635 | { ISD::FMAXNUM, MVT::f64, 4 }, | |||
2636 | { ISD::FMAXNUM, MVT::v2f64, 4 }, | |||
2637 | { ISD::FSQRT, MVT::f64, 32 }, // Nehalem from http://www.agner.org/ | |||
2638 | { ISD::FSQRT, MVT::v2f64, 32 }, // Nehalem from http://www.agner.org/ | |||
2639 | }; | |||
2640 | static const CostTblEntry SSE1CostTbl[] = { | |||
2641 | { ISD::FMAXNUM, MVT::f32, 4 }, | |||
2642 | { ISD::FMAXNUM, MVT::v4f32, 4 }, | |||
2643 | { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/ | |||
2644 | { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/ | |||
2645 | }; | |||
2646 | static const CostTblEntry BMI64CostTbl[] = { // 64-bit targets | |||
2647 | { ISD::CTTZ, MVT::i64, 1 }, | |||
2648 | }; | |||
2649 | static const CostTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets | |||
2650 | { ISD::CTTZ, MVT::i32, 1 }, | |||
2651 | { ISD::CTTZ, MVT::i16, 1 }, | |||
2652 | { ISD::CTTZ, MVT::i8, 1 }, | |||
2653 | }; | |||
2654 | static const CostTblEntry LZCNT64CostTbl[] = { // 64-bit targets | |||
2655 | { ISD::CTLZ, MVT::i64, 1 }, | |||
2656 | }; | |||
2657 | static const CostTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets | |||
2658 | { ISD::CTLZ, MVT::i32, 1 }, | |||
2659 | { ISD::CTLZ, MVT::i16, 1 }, | |||
2660 | { ISD::CTLZ, MVT::i8, 1 }, | |||
2661 | }; | |||
2662 | static const CostTblEntry POPCNT64CostTbl[] = { // 64-bit targets | |||
2663 | { ISD::CTPOP, MVT::i64, 1 }, | |||
2664 | }; | |||
2665 | static const CostTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets | |||
2666 | { ISD::CTPOP, MVT::i32, 1 }, | |||
2667 | { ISD::CTPOP, MVT::i16, 1 }, | |||
2668 | { ISD::CTPOP, MVT::i8, 1 }, | |||
2669 | }; | |||
2670 | static const CostTblEntry X64CostTbl[] = { // 64-bit targets | |||
2671 | { ISD::ABS, MVT::i64, 2 }, // SUB+CMOV | |||
2672 | { ISD::BITREVERSE, MVT::i64, 14 }, | |||
2673 | { ISD::CTLZ, MVT::i64, 4 }, // BSR+XOR or BSR+XOR+CMOV | |||
2674 | { ISD::CTTZ, MVT::i64, 3 }, // TEST+BSF+CMOV/BRANCH | |||
2675 | { ISD::CTPOP, MVT::i64, 10 }, | |||
2676 | { ISD::SADDO, MVT::i64, 1 }, | |||
2677 | { ISD::UADDO, MVT::i64, 1 }, | |||
2678 | { ISD::UMULO, MVT::i64, 2 }, // mulq + seto | |||
2679 | }; | |||
2680 | static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets | |||
2681 | { ISD::ABS, MVT::i32, 2 }, // SUB+CMOV | |||
2682 | { ISD::ABS, MVT::i16, 2 }, // SUB+CMOV | |||
2683 | { ISD::BITREVERSE, MVT::i32, 14 }, | |||
2684 | { ISD::BITREVERSE, MVT::i16, 14 }, | |||
2685 | { ISD::BITREVERSE, MVT::i8, 11 }, | |||
2686 | { ISD::CTLZ, MVT::i32, 4 }, // BSR+XOR or BSR+XOR+CMOV | |||
2687 | { ISD::CTLZ, MVT::i16, 4 }, // BSR+XOR or BSR+XOR+CMOV | |||
2688 | { ISD::CTLZ, MVT::i8, 4 }, // BSR+XOR or BSR+XOR+CMOV | |||
2689 | { ISD::CTTZ, MVT::i32, 3 }, // TEST+BSF+CMOV/BRANCH | |||
2690 | { ISD::CTTZ, MVT::i16, 3 }, // TEST+BSF+CMOV/BRANCH | |||
2691 | { ISD::CTTZ, MVT::i8, 3 }, // TEST+BSF+CMOV/BRANCH | |||
2692 | { ISD::CTPOP, MVT::i32, 8 }, | |||
2693 | { ISD::CTPOP, MVT::i16, 9 }, | |||
2694 | { ISD::CTPOP, MVT::i8, 7 }, | |||
2695 | { ISD::SADDO, MVT::i32, 1 }, | |||
2696 | { ISD::SADDO, MVT::i16, 1 }, | |||
2697 | { ISD::SADDO, MVT::i8, 1 }, | |||
2698 | { ISD::UADDO, MVT::i32, 1 }, | |||
2699 | { ISD::UADDO, MVT::i16, 1 }, | |||
2700 | { ISD::UADDO, MVT::i8, 1 }, | |||
2701 | { ISD::UMULO, MVT::i32, 2 }, // mul + seto | |||
2702 | { ISD::UMULO, MVT::i16, 2 }, | |||
2703 | { ISD::UMULO, MVT::i8, 2 }, | |||
2704 | }; | |||
2705 | ||||
2706 | Type *RetTy = ICA.getReturnType(); | |||
2707 | Type *OpTy = RetTy; | |||
2708 | Intrinsic::ID IID = ICA.getID(); | |||
2709 | unsigned ISD = ISD::DELETED_NODE; | |||
2710 | switch (IID) { | |||
2711 | default: | |||
2712 | break; | |||
2713 | case Intrinsic::abs: | |||
2714 | ISD = ISD::ABS; | |||
2715 | break; | |||
2716 | case Intrinsic::bitreverse: | |||
2717 | ISD = ISD::BITREVERSE; | |||
2718 | break; | |||
2719 | case Intrinsic::bswap: | |||
2720 | ISD = ISD::BSWAP; | |||
2721 | break; | |||
2722 | case Intrinsic::ctlz: | |||
2723 | ISD = ISD::CTLZ; | |||
2724 | break; | |||
2725 | case Intrinsic::ctpop: | |||
2726 | ISD = ISD::CTPOP; | |||
2727 | break; | |||
2728 | case Intrinsic::cttz: | |||
2729 | ISD = ISD::CTTZ; | |||
2730 | break; | |||
2731 | case Intrinsic::maxnum: | |||
2732 | case Intrinsic::minnum: | |||
2733 | // FMINNUM has same costs so don't duplicate. | |||
2734 | ISD = ISD::FMAXNUM; | |||
2735 | break; | |||
2736 | case Intrinsic::sadd_sat: | |||
2737 | ISD = ISD::SADDSAT; | |||
2738 | break; | |||
2739 | case Intrinsic::smax: | |||
2740 | ISD = ISD::SMAX; | |||
2741 | break; | |||
2742 | case Intrinsic::smin: | |||
2743 | ISD = ISD::SMIN; | |||
2744 | break; | |||
2745 | case Intrinsic::ssub_sat: | |||
2746 | ISD = ISD::SSUBSAT; | |||
2747 | break; | |||
2748 | case Intrinsic::uadd_sat: | |||
2749 | ISD = ISD::UADDSAT; | |||
2750 | break; | |||
2751 | case Intrinsic::umax: | |||
2752 | ISD = ISD::UMAX; | |||
2753 | break; | |||
2754 | case Intrinsic::umin: | |||
2755 | ISD = ISD::UMIN; | |||
2756 | break; | |||
2757 | case Intrinsic::usub_sat: | |||
2758 | ISD = ISD::USUBSAT; | |||
2759 | break; | |||
2760 | case Intrinsic::sqrt: | |||
2761 | ISD = ISD::FSQRT; | |||
2762 | break; | |||
2763 | case Intrinsic::sadd_with_overflow: | |||
2764 | case Intrinsic::ssub_with_overflow: | |||
2765 | // SSUBO has same costs so don't duplicate. | |||
2766 | ISD = ISD::SADDO; | |||
2767 | OpTy = RetTy->getContainedType(0); | |||
2768 | break; | |||
2769 | case Intrinsic::uadd_with_overflow: | |||
2770 | case Intrinsic::usub_with_overflow: | |||
2771 | // USUBO has same costs so don't duplicate. | |||
2772 | ISD = ISD::UADDO; | |||
2773 | OpTy = RetTy->getContainedType(0); | |||
2774 | break; | |||
2775 | case Intrinsic::umul_with_overflow: | |||
2776 | case Intrinsic::smul_with_overflow: | |||
2777 | // SMULO has same costs so don't duplicate. | |||
2778 | ISD = ISD::UMULO; | |||
2779 | OpTy = RetTy->getContainedType(0); | |||
2780 | break; | |||
2781 | } | |||
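| // Intrinsics not mapped above leave ISD == ISD::DELETED_NODE and fall | |||
| // through to the BaseT::getIntrinsicInstrCost call at the end. | |||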
2782 | ||||
2783 | if (ISD != ISD::DELETED_NODE) { | |||
2784 | // Legalize the type. | |||
2785 | std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, OpTy); | |||
2786 | MVT MTy = LT.second; | |||
2787 | ||||
2788 | // Attempt to lookup cost. | |||
2789 | if (ISD == ISD::BITREVERSE && ST->hasGFNI() && ST->hasSSSE3() && | |||
2790 | MTy.isVector()) { | |||
2791 | // With PSHUFB the code is very similar for all types. If we have integer | |||
2792 | // byte operations, we just need a GF2P8AFFINEQB for vXi8. For other types | |||
2793 | // we also need a PSHUFB. | |||
2794 | unsigned Cost = MTy.getVectorElementType() == MVT::i8 ? 1 : 2; | |||
2795 | ||||
2796 | // Without byte operations, we need twice as many GF2P8AFFINEQB and PSHUFB | |||
2797 | // instructions. We also need an extract and an insert. | |||
2798 | if (!(MTy.is128BitVector() || (ST->hasAVX2() && MTy.is256BitVector()) || | |||
2799 | (ST->hasBWI() && MTy.is512BitVector()))) | |||
2800 | Cost = Cost * 2 + 2; | |||
2801 | ||||
2802 | return LT.first * Cost; | |||
2803 | } | |||
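| // E.g. (a sketch): a bitreverse of v16i8 with GFNI+SSSE3 is a single | |||
| // GF2P8AFFINEQB, so Cost == 1 and LT.first == 1 here. | |||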
2804 | ||||
2805 | auto adjustTableCost = [](const CostTblEntry &Entry, int LegalizationCost, | |||
2806 | FastMathFlags FMF) { | |||
2807 | // If there are no NaNs to deal with, then these are reduced to a | |||
2808 | // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we | |||
2809 | // assume is used in the non-fast case. | |||
2810 | if (Entry.ISD == ISD::FMAXNUM || Entry.ISD == ISD::FMINNUM) { | |||
2811 | if (FMF.noNaNs()) | |||
2812 | return LegalizationCost * 1; | |||
2813 | } | |||
2814 | return LegalizationCost * (int)Entry.Cost; | |||
2815 | }; | |||
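| // E.g. (a sketch): llvm.maxnum.f32 with the nnan fast-math flag becomes a | |||
| // bare MAXSS, so the lambda returns LegalizationCost * 1 instead of | |||
| // multiplying by the table's MIN/CMP/SELECT cost. | |||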
2816 | ||||
2817 | if (ST->useGLMDivSqrtCosts()) | |||
2818 | if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy)) | |||
2819 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | |||
2820 | ||||
2821 | if (ST->isSLM()) | |||
2822 | if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy)) | |||
2823 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | |||
2824 | ||||
2825 | if (ST->hasCDI()) | |||
2826 | if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy)) | |||
2827 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | |||
2828 | ||||
2829 | if (ST->hasBWI()) | |||
2830 | if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy)) | |||
2831 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | |||
2832 | ||||
2833 | if (ST->hasAVX512()) | |||
2834 | if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) | |||
2835 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | |||
2836 | ||||
2837 | if (ST->hasXOP()) | |||
2838 | if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy)) | |||
2839 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | |||
2840 | ||||
2841 | if (ST->hasAVX2()) | |||
2842 | if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy)) | |||
2843 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | |||
2844 | ||||
2845 | if (ST->hasAVX()) | |||
2846 | if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) | |||
2847 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | |||
2848 | ||||
2849 | if (ST->hasSSE42()) | |||
2850 | if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy)) | |||
2851 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | |||
2852 | ||||
2853 | if (ST->hasSSE41()) | |||
2854 | if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy)) | |||
2855 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | |||
2856 | ||||
2857 | if (ST->hasSSSE3()) | |||
2858 | if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy)) | |||
2859 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | |||
2860 | ||||
2861 | if (ST->hasSSE2()) | |||
2862 | if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy)) | |||
2863 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | |||
2864 | ||||
2865 | if (ST->hasSSE1()) | |||
2866 | if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy)) | |||
2867 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | |||
2868 | ||||
2869 | if (ST->hasBMI()) { | |||
2870 | if (ST->is64Bit()) | |||
2871 | if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy)) | |||
2872 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | |||
2873 | ||||
2874 | if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy)) | |||
2875 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | |||
2876 | } | |||
2877 | ||||
2878 | if (ST->hasLZCNT()) { | |||
2879 | if (ST->is64Bit()) | |||
2880 | if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy)) | |||
2881 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | |||
2882 | ||||
2883 | if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy)) | |||
2884 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | |||
2885 | } | |||
2886 | ||||
2887 | if (ST->hasPOPCNT()) { | |||
2888 | if (ST->is64Bit()) | |||
2889 | if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy)) | |||
2890 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | |||
2891 | ||||
2892 | if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy)) | |||
2893 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | |||
2894 | } | |||
2895 | ||||
2896 | // TODO - add BMI (TZCNT) scalar handling | |||
2897 | ||||
2898 | if (ST->is64Bit()) | |||
2899 | if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy)) | |||
2900 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | |||
2901 | ||||
2902 | if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy)) | |||
2903 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | |||
2904 | } | |||
2905 | ||||
2906 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); | |||
2907 | } | |||
2908 | ||||
2909 | int X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, | |||
2910 | TTI::TargetCostKind CostKind) { | |||
2911 | if (ICA.isTypeBasedOnly()) | |||
2912 | return getTypeBasedIntrinsicInstrCost(ICA, CostKind); | |||
2913 | ||||
2914 | static const CostTblEntry AVX512CostTbl[] = { | |||
2915 | { ISD::ROTL, MVT::v8i64, 1 }, | |||
2916 | { ISD::ROTL, MVT::v4i64, 1 }, | |||
2917 | { ISD::ROTL, MVT::v2i64, 1 }, | |||
2918 | { ISD::ROTL, MVT::v16i32, 1 }, | |||
2919 | { ISD::ROTL, MVT::v8i32, 1 }, | |||
2920 | { ISD::ROTL, MVT::v4i32, 1 }, | |||
2921 | { ISD::ROTR, MVT::v8i64, 1 }, | |||
2922 | { ISD::ROTR, MVT::v4i64, 1 }, | |||
2923 | { ISD::ROTR, MVT::v2i64, 1 }, | |||
2924 | { ISD::ROTR, MVT::v16i32, 1 }, | |||
2925 | { ISD::ROTR, MVT::v8i32, 1 }, | |||
2926 | { ISD::ROTR, MVT::v4i32, 1 } | |||
2927 | }; | |||
2928 | // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y)) | |||
2929 | static const CostTblEntry XOPCostTbl[] = { | |||
2930 | { ISD::ROTL, MVT::v4i64, 4 }, | |||
2931 | { ISD::ROTL, MVT::v8i32, 4 }, | |||
2932 | { ISD::ROTL, MVT::v16i16, 4 }, | |||
2933 | { ISD::ROTL, MVT::v32i8, 4 }, | |||
2934 | { ISD::ROTL, MVT::v2i64, 1 }, | |||
2935 | { ISD::ROTL, MVT::v4i32, 1 }, | |||
2936 | { ISD::ROTL, MVT::v8i16, 1 }, | |||
2937 | { ISD::ROTL, MVT::v16i8, 1 }, | |||
2938 | { ISD::ROTR, MVT::v4i64, 6 }, | |||
2939 | { ISD::ROTR, MVT::v8i32, 6 }, | |||
2940 | { ISD::ROTR, MVT::v16i16, 6 }, | |||
2941 | { ISD::ROTR, MVT::v32i8, 6 }, | |||
2942 | { ISD::ROTR, MVT::v2i64, 2 }, | |||
2943 | { ISD::ROTR, MVT::v4i32, 2 }, | |||
2944 | { ISD::ROTR, MVT::v8i16, 2 }, | |||
2945 | { ISD::ROTR, MVT::v16i8, 2 } | |||
2946 | }; | |||
2947 | static const CostTblEntry X64CostTbl[] = { // 64-bit targets | |||
2948 | { ISD::ROTL, MVT::i64, 1 }, | |||
2949 | { ISD::ROTR, MVT::i64, 1 }, | |||
2950 | { ISD::FSHL, MVT::i64, 4 } | |||
2951 | }; | |||
2952 | static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets | |||
2953 | { ISD::ROTL, MVT::i32, 1 }, | |||
2954 | { ISD::ROTL, MVT::i16, 1 }, | |||
2955 | { ISD::ROTL, MVT::i8, 1 }, | |||
2956 | { ISD::ROTR, MVT::i32, 1 }, | |||
2957 | { ISD::ROTR, MVT::i16, 1 }, | |||
2958 | { ISD::ROTR, MVT::i8, 1 }, | |||
2959 | { ISD::FSHL, MVT::i32, 4 }, | |||
2960 | { ISD::FSHL, MVT::i16, 4 }, | |||
2961 | { ISD::FSHL, MVT::i8, 4 } | |||
2962 | }; | |||
2963 | ||||
2964 | Intrinsic::ID IID = ICA.getID(); | |||
2965 | Type *RetTy = ICA.getReturnType(); | |||
2966 | const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); | |||
2967 | unsigned ISD = ISD::DELETED_NODE; | |||
2968 | switch (IID) { | |||
2969 | default: | |||
2970 | break; | |||
2971 | case Intrinsic::fshl: | |||
2972 | ISD = ISD::FSHL; | |||
2973 | if (Args[0] == Args[1]) | |||
2974 | ISD = ISD::ROTL; | |||
2975 | break; | |||
2976 | case Intrinsic::fshr: | |||
2977 | // FSHR has same costs so don't duplicate. | |||
2978 | ISD = ISD::FSHL; | |||
2979 | if (Args[0] == Args[1]) | |||
2980 | ISD = ISD::ROTR; | |||
2981 | break; | |||
2982 | } | |||
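| // A funnel shift whose two value operands match is a rotate: | |||
| // fshl(x, x, s) == rotl(x, s) and fshr(x, x, s) == rotr(x, s), hence the | |||
| // ROTL/ROTR entries in the tables above. | |||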
2983 | ||||
2984 | if (ISD != ISD::DELETED_NODE) { | |||
2985 | // Legalize the type. | |||
2986 | std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy); | |||
2987 | MVT MTy = LT.second; | |||
2988 | ||||
2989 | // Attempt to lookup cost. | |||
2990 | if (ST->hasAVX512()) | |||
2991 | if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) | |||
2992 | return LT.first * Entry->Cost; | |||
2993 | ||||
2994 | if (ST->hasXOP()) | |||
2995 | if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy)) | |||
2996 | return LT.first * Entry->Cost; | |||
2997 | ||||
2998 | if (ST->is64Bit()) | |||
2999 | if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy)) | |||
3000 | return LT.first * Entry->Cost; | |||
3001 | ||||
3002 | if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy)) | |||
3003 | return LT.first * Entry->Cost; | |||
3004 | } | |||
3005 | ||||
3006 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); | |||
3007 | } | |||
3008 | ||||
3009 | int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { | |||
3010 | static const CostTblEntry SLMCostTbl[] = { | |||
3011 | { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 }, | |||
3012 | { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 }, | |||
3013 | { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 }, | |||
3014 | { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 7 } | |||
3015 | }; | |||
3016 | ||||
3017 | assert(Val->isVectorTy() && "This must be a vector type"); | |||
3018 | Type *ScalarType = Val->getScalarType(); | |||
3019 | int RegisterFileMoveCost = 0; | |||
3020 | ||||
3021 | if (Index != -1U && (Opcode == Instruction::ExtractElement || | |||
3022 | Opcode == Instruction::InsertElement)) { | |||
3023 | // Legalize the type. | |||
3024 | std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val); | |||
3025 | ||||
3026 | // This type is legalized to a scalar type. | |||
3027 | if (!LT.second.isVector()) | |||
3028 | return 0; | |||
3029 | ||||
3030 | // The type may be split. Normalize the index to the new type. | |||
3031 | unsigned NumElts = LT.second.getVectorNumElements(); | |||
3032 | unsigned SubNumElts = NumElts; | |||
3033 | Index = Index % NumElts; | |||
3034 | ||||
3035 | // For >128-bit vectors, we need to extract higher 128-bit subvectors. | |||
3036 | // For inserts, we also need to insert the subvector back. | |||
3037 | if (LT.second.getSizeInBits() > 128) { | |||
3038 | assert((LT.second.getSizeInBits() % 128) == 0 && "Illegal vector"); | |||
3039 | unsigned NumSubVecs = LT.second.getSizeInBits() / 128; | |||
3040 | SubNumElts = NumElts / NumSubVecs; | |||
3041 | if (SubNumElts <= Index) { | |||
3042 | RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1); | |||
3043 | Index %= SubNumElts; | |||
3044 | } | |||
3045 | } | |||
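| // E.g. (a sketch): extracting element 5 of a v8i32 gives NumSubVecs == 2 | |||
| // and SubNumElts == 4, so we pay one subvector-extract move and the index | |||
| // is rebased to 1 within the upper 128-bit half. | |||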
3046 | ||||
3047 | if (Index == 0) { | |||
3048 | // Floating point scalars are already located in index #0. | |||
3049 | // Many insertions to #0 can fold away for scalar fp-ops, so let's assume | |||
3050 | // this holds for all of them. | |||
3051 | if (ScalarType->isFloatingPointTy()) | |||
3052 | return RegisterFileMoveCost; | |||
3053 | ||||
3054 | // Assume movd/movq XMM -> GPR is relatively cheap on all targets. | |||
3055 | if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement) | |||
3056 | return 1 + RegisterFileMoveCost; | |||
3057 | } | |||
3058 | ||||
3059 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | |||
3060 | assert(ISD && "Unexpected vector opcode"); | |||
3061 | MVT MScalarTy = LT.second.getScalarType(); | |||
3062 | if (ST->isSLM()) | |||
3063 | if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy)) | |||
3064 | return Entry->Cost + RegisterFileMoveCost; | |||
3065 | ||||
3066 | // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets. | |||
3067 | if ((MScalarTy == MVT::i16 && ST->hasSSE2()) || | |||
3068 | (MScalarTy.isInteger() && ST->hasSSE41())) | |||
3069 | return 1 + RegisterFileMoveCost; | |||
3070 | ||||
3071 | // Assume insertps is relatively cheap on all targets. | |||
3072 | if (MScalarTy == MVT::f32 && ST->hasSSE41() && | |||
3073 | Opcode == Instruction::InsertElement) | |||
3074 | return 1 + RegisterFileMoveCost; | |||
3075 | ||||
3076 | // For extractions we just need to shuffle the element to index 0, which | |||
3077 | // should be very cheap (assume cost = 1). For insertions we need to shuffle | |||
3078 | // the element to its destination. In both cases we must handle the | |||
3079 | // subvector move(s). | |||
3080 | // If the vector type is already less than 128-bits then don't reduce it. | |||
3081 | // TODO: Under what circumstances should we shuffle using the full width? | |||
3082 | int ShuffleCost = 1; | |||
3083 | if (Opcode == Instruction::InsertElement) { | |||
3084 | auto *SubTy = cast<VectorType>(Val); | |||
3085 | EVT VT = TLI->getValueType(DL, Val); | |||
3086 | if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128) | |||
3087 | SubTy = FixedVectorType::get(ScalarType, SubNumElts); | |||
3088 | ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, 0, SubTy); | |||
3089 | } | |||
3090 | int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1; | |||
3091 | return ShuffleCost + IntOrFpCost + RegisterFileMoveCost; | |||
3092 | } | |||
3093 | ||||
3094 | // Add to the base cost if we know that the extracted element of a vector is | |||
3095 | // destined to be moved to and used in the integer register file. | |||
3096 | if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy()) | |||
3097 | RegisterFileMoveCost += 1; | |||
3098 | ||||
3099 | return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost; | |||
3100 | } | |||
3101 | ||||
3102 | unsigned X86TTIImpl::getScalarizationOverhead(VectorType *Ty, | |||
3103 | const APInt &DemandedElts, | |||
3104 | bool Insert, bool Extract) { | |||
3105 | unsigned Cost = 0; | |||
3106 | ||||
3107 | // For insertions, an ISD::BUILD_VECTOR style vector initialization can be | |||
3108 | // much cheaper than an accumulation of ISD::INSERT_VECTOR_ELT. | |||
3109 | if (Insert) { | |||
3110 | std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); | |||
3111 | MVT MScalarTy = LT.second.getScalarType(); | |||
3112 | ||||
3113 | if ((MScalarTy == MVT::i16 && ST->hasSSE2()) || | |||
3114 | (MScalarTy.isInteger() && ST->hasSSE41()) || | |||
3115 | (MScalarTy == MVT::f32 && ST->hasSSE41())) { | |||
3116 | // For types we can insert directly, insertion into 128-bit subvectors is | |||
3117 | // cheap, followed by a cheap chain of concatenations. | |||
3118 | if (LT.second.getSizeInBits() <= 128) { | |||
3119 | Cost += | |||
3120 | BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, false); | |||
3121 | } else { | |||
3122 | // In each 128-bit lane, if at least one index is demanded but not all | |||
3123 | // indices are demanded, and this lane is not the first 128-bit lane of | |||
3124 | // the legalized vector, then this lane needs an extracti128. If at | |||
3125 | // least one index in a 128-bit lane is demanded, that lane also needs | |||
3126 | // an inserti128. | |||
3127 | ||||
3128 | // The following cases will help build a better understanding. | |||
3129 | // Assume we insert several elements into a v8i32 vector with AVX2: | |||
3130 | // Case#1: inserting into the 1st index needs vpinsrd + inserti128. | |||
3131 | // Case#2: inserting into the 5th index needs extracti128 + vpinsrd + | |||
3132 | // inserti128. | |||
3133 | // Case#3: inserting into indices 4,5,6,7 needs 4*vpinsrd + inserti128. | |||
3134 | unsigned Num128Lanes = LT.second.getSizeInBits() / 128 * LT.first; | |||
3135 | unsigned NumElts = LT.second.getVectorNumElements() * LT.first; | |||
3136 | APInt WidenedDemandedElts = DemandedElts.zextOrSelf(NumElts); | |||
3137 | unsigned Scale = NumElts / Num128Lanes; | |||
3138 | // We iterate over each 128-bit lane and check whether it needs an | |||
3139 | // extracti128/inserti128. | |||
3140 | for (unsigned I = 0; I < NumElts; I += Scale) { | |||
3141 | APInt Mask = WidenedDemandedElts.getBitsSet(NumElts, I, I + Scale); | |||
3142 | APInt MaskedDE = Mask & WidenedDemandedElts; | |||
3143 | unsigned Population = MaskedDE.countPopulation(); | |||
3144 | Cost += (Population > 0 && Population != Scale && | |||
3145 | I % LT.second.getVectorNumElements() != 0); | |||
3146 | Cost += Population > 0; | |||
3147 | } | |||
3148 | Cost += DemandedElts.countPopulation(); | |||
3149 | ||||
3150 | // For vXf32 cases, insertion into the 0th index in each v4f32 | |||
3151 | // 128-bit vector is free. | |||
3152 | // NOTE: This assumes legalization widens vXf32 vectors. | |||
3153 | if (MScalarTy == MVT::f32) | |||
3154 | for (unsigned i = 0, e = cast<FixedVectorType>(Ty)->getNumElements(); | |||
3155 | i < e; i += 4) | |||
3156 | if (DemandedElts[i]) | |||
3157 | Cost--; | |||
3158 | } | |||
3159 | } else if (LT.second.isVector()) { | |||
3160 | // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded | |||
3161 | // integer element as a SCALAR_TO_VECTOR, then we build the vector as a | |||
3162 | // series of UNPCK followed by CONCAT_VECTORS - all of these can be | |||
3163 | // considered cheap. | |||
3164 | if (Ty->isIntOrIntVectorTy()) | |||
3165 | Cost += DemandedElts.countPopulation(); | |||
3166 | ||||
3167 | // Get the smaller of the legalized or original pow2-extended number of | |||
3168 | // vector elements, which represents the number of unpacks we'll end up | |||
3169 | // performing. | |||
3170 | unsigned NumElts = LT.second.getVectorNumElements(); | |||
3171 | unsigned Pow2Elts = | |||
3172 | PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements()); | |||
3173 | Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first; | |||
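| // Sketch: building all three elements of a v3i32 on plain SSE2 lands | |||
| // here: 3 MOVDs plus (min(4, PowerOf2Ceil(3)) - 1) == 3 unpacks, for a | |||
| // total Cost of 6. | |||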
3174 | } | |||
3175 | } | |||
3176 | ||||
3177 | // TODO: Use default extraction for now, but we should investigate | |||
3178 | // extending this to handle repeated subvector extraction. | |||
3179 | if (Extract) | |||
3180 | Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, false, Extract); | |||
3181 | ||||
3182 | return Cost; | |||
3183 | } | |||
3184 | ||||
3185 | int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, | |||
3186 | MaybeAlign Alignment, unsigned AddressSpace, | |||
3187 | TTI::TargetCostKind CostKind, | |||
3188 | const Instruction *I) { | |||
3189 | // TODO: Handle other cost kinds. | |||
3190 | if (CostKind != TTI::TCK_RecipThroughput) { | |||
3191 | if (isa_and_nonnull<StoreInst>(I)) { | |||
3192 | Value *Ptr = I->getOperand(1); | |||
| ||||
3193 | // A store instruction with index and scale addressing costs 2 uops. | |||
3194 | // Check the preceding GEP to identify non-const indices. | |||
3195 | if (auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) { | |||
3196 | if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); })) | |||
3197 | return TTI::TCC_Basic * 2; | |||
3198 | } | |||
3199 | } | |||
3200 | return TTI::TCC_Basic; | |||
3201 | } | |||
3202 | ||||
3203 | // Handle non-power-of-two vectors such as <3 x float> | |||
3204 | if (auto *VTy = dyn_cast<FixedVectorType>(Src)) { | |||
3205 | unsigned NumElem = VTy->getNumElements(); | |||
3206 | ||||
3207 | // Handle a few common cases: | |||
3208 | // <3 x float> | |||
3209 | if (NumElem == 3 && VTy->getScalarSizeInBits() == 32) | |||
3210 | // Cost = 64 bit store + extract + 32 bit store. | |||
3211 | return 3; | |||
3212 | ||||
3213 | // <3 x double> | |||
3214 | if (NumElem == 3 && VTy->getScalarSizeInBits() == 64) | |||
3215 | // Cost = 128 bit store + unpack + 64 bit store. | |||
3216 | return 3; | |||
3217 | ||||
3218 | // Assume that all other non-power-of-two numbers are scalarized. | |||
3219 | if (!isPowerOf2_32(NumElem)) { | |||
3220 | APInt DemandedElts = APInt::getAllOnesValue(NumElem); | |||
3221 | int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment, | |||
3222 | AddressSpace, CostKind); | |||
3223 | int SplitCost = getScalarizationOverhead(VTy, DemandedElts, | |||
3224 | Opcode == Instruction::Load, | |||
3225 | Opcode == Instruction::Store); | |||
3226 | return NumElem * Cost + SplitCost; | |||
3227 | } | |||
3228 | } | |||
3229 | ||||
3230 | // Type legalization can't handle structs | |||
3231 | if (TLI->getValueType(DL, Src, true) == MVT::Other) | |||
3232 | return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, | |||
3233 | CostKind); | |||
3234 | ||||
3235 | // Legalize the type. | |||
3236 | std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); | |||
3237 | assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && | |||
3238 |        "Invalid Opcode"); | |||
3239 | ||||
3240 | // Each load/store unit costs 1. | |||
3241 | int Cost = LT.first * 1; | |||
3242 | ||||
3243 | // This isn't exactly right. We're using slow unaligned 32-byte accesses as a | |||
3244 | // proxy for a double-pumped AVX memory interface such as on Sandybridge. | |||
3245 | if (LT.second.getStoreSize() == 32 && ST->isUnalignedMem32Slow()) | |||
3246 | Cost *= 2; | |||
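| // E.g. (a sketch): a single legal v8f32 load or store then costs | |||
| // 1 * 2 == 2 on such targets. | |||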
3247 | ||||
3248 | return Cost; | |||
3249 | } | |||
3250 | ||||
3251 | int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, | |||
3252 | Align Alignment, unsigned AddressSpace, | |||
3253 | TTI::TargetCostKind CostKind) { | |||
3254 | bool IsLoad = (Instruction::Load == Opcode); | |||
3255 | bool IsStore = (Instruction::Store == Opcode); | |||
3256 | ||||
3257 | auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy); | |||
3258 | if (!SrcVTy) | |||
3259 | // To calculate the scalar cost, take the regular cost without the mask. | |||
3260 | return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind); | |||
3261 | ||||
3262 | unsigned NumElem = SrcVTy->getNumElements(); | |||
3263 | auto *MaskTy = | |||
3264 | FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem); | |||
3265 | if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) || | |||
3266 | (IsStore && !isLegalMaskedStore(SrcVTy, Alignment)) || | |||
3267 | !isPowerOf2_32(NumElem)) { | |||
3268 | // Scalarization | |||
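| // (Taken, e.g., for any 3-element vector, since 3 is not a power of two, | |||
| // regardless of subtarget support for masked ops.) | |||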
3269 | APInt DemandedElts = APInt::getAllOnesValue(NumElem); | |||
3270 | int MaskSplitCost = | |||
3271 | getScalarizationOverhead(MaskTy, DemandedElts, false, true); | |||
3272 | int ScalarCompareCost = getCmpSelInstrCost( | |||
3273 | Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr, | |||
3274 | CmpInst::BAD_ICMP_PREDICATE, CostKind); | |||
3275 | int BranchCost = getCFInstrCost(Instruction::Br, CostKind); | |||
3276 | int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost); | |||
3277 | int ValueSplitCost = | |||
3278 | getScalarizationOverhead(SrcVTy, DemandedElts, IsLoad, IsStore); | |||
3279 | int MemopCost = | |||
3280 | NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(), | |||
3281 | Alignment, AddressSpace, CostKind); | |||
3282 | return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost; | |||
3283 | } | |||
3284 | ||||
3285 | // Legalize the type. | |||
3286 | std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy); | |||
3287 | auto VT = TLI->getValueType(DL, SrcVTy); | |||
3288 | int Cost = 0; | |||
3289 | if (VT.isSimple() && LT.second != VT.getSimpleVT() && | |||
3290 | LT.second.getVectorNumElements() == NumElem) | |||
3291 | // Promotion requires expand/truncate for data and a shuffle for mask. | |||
3292 | Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, 0, nullptr) + | |||
3293 | getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, 0, nullptr); | |||
3294 | ||||
3295 | else if (LT.second.getVectorNumElements() > NumElem) { | |||
3296 | auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(), | |||
3297 | LT.second.getVectorNumElements()); | |||
3298 | // Expanding requires filling the mask with zeroes. | |||
3299 | Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy); | |||
3300 | } | |||
3301 | ||||
3302 | // Pre-AVX512 - each maskmov load costs 2 and each store costs ~8. | |||
3303 | if (!ST->hasAVX512()) | |||
3304 | return Cost + LT.first * (IsLoad ? 2 : 8); | |||
3305 | ||||
3306 | // AVX-512 masked load/store is cheaper. | |||
3307 | return Cost + LT.first; | |||
3308 | } | |||
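| // Editor's illustration: an AVX2 masked store of v8f32 legalizes in one | |||
| // step (LT.first == 1) with no promotion or expansion shuffles, so the | |||
| // pre-AVX512 path above returns 8; with AVX-512 the same store costs 1. | |||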
3309 | ||||
3310 | int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, | |||
3311 | const SCEV *Ptr) { | |||
3312 | // Address computations in vectorized code with non-consecutive addresses will | |||
3313 | // likely result in more instructions compared to scalar code where the | |||
3314 | // computation can more often be merged into the index mode. The resulting | |||
3315 | // extra micro-ops can significantly decrease throughput. | |||
3316 | const unsigned NumVectorInstToHideOverhead = 10; | |||
3317 | ||||
3318 | // Cost modeling of strided access computation is hidden by the indexing | |||
3319 | // modes of X86 regardless of the stride value. We don't believe that there | |||
3320 | // is a difference between constant strided access in general and a constant | |||
3321 | // stride value which is less than or equal to 64. | |||
3322 | // Even in the case of (loop invariant) stride whose value is not known at | |||
3323 | // compile time, the address computation will not incur more than one extra | |||
3324 | // ADD instruction. | |||
3325 | if (Ty->isVectorTy() && SE) { | |||
3326 | if (!BaseT::isStridedAccess(Ptr)) | |||
3327 | return NumVectorInstToHideOverhead; | |||
3328 | if (!BaseT::getConstantStrideStep(SE, Ptr)) | |||
3329 | return 1; | |||
3330 | } | |||
3331 | ||||
3332 | return BaseT::getAddressComputationCost(Ty, SE, Ptr); | |||
3333 | } | |||
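| // Editor's note summarizing the logic above: non-strided vector addresses | |||
| // cost NumVectorInstToHideOverhead (10), a strided access with a | |||
| // non-constant but loop-invariant stride costs 1, and a constant stride | |||
| // falls through to the base implementation. | |||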
3334 | ||||
3335 | int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, | |||
3336 | bool IsPairwise, | |||
3337 | TTI::TargetCostKind CostKind) { | |||
3338 | // Just use the default implementation for pair reductions. | |||
3339 | if (IsPairwise) | |||
3340 | return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise, CostKind); | |||
3341 | ||||
3342 | // We use the Intel Architecture Code Analyzer (IACA) to measure the | |||
3343 | // throughput and use that as the cost. | |||
3344 | ||||
3345 | static const CostTblEntry SLMCostTblNoPairWise[] = { | |||
3346 | { ISD::FADD, MVT::v2f64, 3 }, | |||
3347 | { ISD::ADD, MVT::v2i64, 5 }, | |||
3348 | }; | |||
3349 | ||||
3350 | static const CostTblEntry SSE2CostTblNoPairWise[] = { | |||
3351 | { ISD::FADD, MVT::v2f64, 2 }, | |||
3352 | { ISD::FADD, MVT::v4f32, 4 }, | |||
3353 | { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6". | |||
3354 | { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32 | |||
3355 | { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3". | |||
3356 | { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3". | |||
3357 | { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3". | |||
3358 | { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3". | |||
3359 | { ISD::ADD, MVT::v2i8, 2 }, | |||
3360 | { ISD::ADD, MVT::v4i8, 2 }, | |||
3361 | { ISD::ADD, MVT::v8i8, 2 }, | |||
3362 | { ISD::ADD, MVT::v16i8, 3 }, | |||
3363 | }; | |||
3364 | ||||
3365 | static const CostTblEntry AVX1CostTblNoPairWise[] = { | |||
3366 | { ISD::FADD, MVT::v4f64, 3 }, | |||
3367 | { ISD::FADD, MVT::v4f32, 3 }, | |||
3368 | { ISD::FADD, MVT::v8f32, 4 }, | |||
3369 | { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5". | |||
3370 | { ISD::ADD, MVT::v4i64, 3 }, | |||
3371 | { ISD::ADD, MVT::v8i32, 5 }, | |||
3372 | { ISD::ADD, MVT::v16i16, 5 }, | |||
3373 | { ISD::ADD, MVT::v32i8, 4 }, | |||
3374 | }; | |||
3375 | ||||
3376 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | |||
3377 | assert(ISD && "Invalid opcode"); | |||
3378 | ||||
3379 | // Before legalizing the type, give a chance to look up illegal narrow types | |||
3380 | // in the table. | |||
3381 | // FIXME: Is there a better way to do this? | |||
3382 | EVT VT = TLI->getValueType(DL, ValTy); | |||
3383 | if (VT.isSimple()) { | |||
3384 | MVT MTy = VT.getSimpleVT(); | |||
3385 | if (ST->isSLM()) | |||
3386 | if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy)) | |||
3387 | return Entry->Cost; | |||
3388 | ||||
3389 | if (ST->hasAVX()) | |||
3390 | if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) | |||
3391 | return Entry->Cost; | |||
3392 | ||||
3393 | if (ST->hasSSE2()) | |||
3394 | if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy)) | |||
3395 | return Entry->Cost; | |||
3396 | } | |||
3397 | ||||
3398 | std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); | |||
3399 | ||||
3400 | MVT MTy = LT.second; | |||
3401 | ||||
3402 | auto *ValVTy = cast<FixedVectorType>(ValTy); | |||
3403 | ||||
3404 | unsigned ArithmeticCost = 0; | |||
3405 | if (LT.first != 1 && MTy.isVector() && | |||
3406 | MTy.getVectorNumElements() < ValVTy->getNumElements()) { | |||
3407 | // Type needs to be split. We need LT.first - 1 arithmetic ops. | |||
3408 | auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(), | |||
3409 | MTy.getVectorNumElements()); | |||
3410 | ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind); | |||
3411 | ArithmeticCost *= LT.first - 1; | |||
3412 | } | |||
3413 | ||||
3414 | if (ST->isSLM()) | |||
3415 | if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy)) | |||
3416 | return ArithmeticCost + Entry->Cost; | |||
3417 | ||||
3418 | if (ST->hasAVX()) | |||
3419 | if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) | |||
3420 | return ArithmeticCost + Entry->Cost; | |||
3421 | ||||
3422 | if (ST->hasSSE2()) | |||
3423 | if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy)) | |||
3424 | return ArithmeticCost + Entry->Cost; | |||
3425 | ||||
3426 | // FIXME: These assume a naive kshift+binop lowering, which is probably | |||
3427 | // conservative in most cases. | |||
3428 | static const CostTblEntry AVX512BoolReduction[] = { | |||
3429 | { ISD::AND, MVT::v2i1, 3 }, | |||
3430 | { ISD::AND, MVT::v4i1, 5 }, | |||
3431 | { ISD::AND, MVT::v8i1, 7 }, | |||
3432 | { ISD::AND, MVT::v16i1, 9 }, | |||
3433 | { ISD::AND, MVT::v32i1, 11 }, | |||
3434 | { ISD::AND, MVT::v64i1, 13 }, | |||
3435 | { ISD::OR, MVT::v2i1, 3 }, | |||
3436 | { ISD::OR, MVT::v4i1, 5 }, | |||
3437 | { ISD::OR, MVT::v8i1, 7 }, | |||
3438 | { ISD::OR, MVT::v16i1, 9 }, | |||
3439 | { ISD::OR, MVT::v32i1, 11 }, | |||
3440 | { ISD::OR, MVT::v64i1, 13 }, | |||
3441 | }; | |||
3442 | ||||
3443 | static const CostTblEntry AVX2BoolReduction[] = { | |||
3444 | { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp | |||
3445 | { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp | |||
3446 | { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp | |||
3447 | { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp | |||
3448 | }; | |||
3449 | ||||
3450 | static const CostTblEntry AVX1BoolReduction[] = { | |||
3451 | { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp | |||
3452 | { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp | |||
3453 | { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp | |||
3454 | { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp | |||
3455 | { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp | |||
3456 | { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp | |||
3457 | { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp | |||
3458 | { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp | |||
3459 | }; | |||
3460 | ||||
3461 | static const CostTblEntry SSE2BoolReduction[] = { | |||
3462 | { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp | |||
3463 | { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp | |||
3464 | { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp | |||
3465 | { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp | |||
3466 | { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp | |||
3467 | { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp | |||
3468 | { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp | |||
3469 | { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp | |||
3470 | }; | |||
3471 | ||||
3472 | // Handle bool allof/anyof patterns. | |||
3473 | if (ValVTy->getElementType()->isIntegerTy(1)) { | |||
3474 | unsigned ArithmeticCost = 0; | |||
3475 | if (LT.first != 1 && MTy.isVector() && | |||
3476 | MTy.getVectorNumElements() < ValVTy->getNumElements()) { | |||
3477 | // Type needs to be split. We need LT.first - 1 arithmetic ops. | |||
3478 | auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(), | |||
3479 | MTy.getVectorNumElements()); | |||
3480 | ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind); | |||
3481 | ArithmeticCost *= LT.first - 1; | |||
3482 | } | |||
3483 | ||||
3484 | if (ST->hasAVX512()) | |||
3485 | if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy)) | |||
3486 | return ArithmeticCost + Entry->Cost; | |||
3487 | if (ST->hasAVX2()) | |||
3488 | if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy)) | |||
3489 | return ArithmeticCost + Entry->Cost; | |||
3490 | if (ST->hasAVX()) | |||
3491 | if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy)) | |||
3492 | return ArithmeticCost + Entry->Cost; | |||
3493 | if (ST->hasSSE2()) | |||
3494 | if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy)) | |||
3495 | return ArithmeticCost + Entry->Cost; | |||
3496 | ||||
3497 | return BaseT::getArithmeticReductionCost(Opcode, ValVTy, IsPairwise, | |||
3498 | CostKind); | |||
3499 | } | |||
3500 | ||||
3501 | unsigned NumVecElts = ValVTy->getNumElements(); | |||
3502 | unsigned ScalarSize = ValVTy->getScalarSizeInBits(); | |||
3503 | ||||
3504 | // Special case power of 2 reductions where the scalar type isn't changed | |||
3505 | // by type legalization. | |||
3506 | if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits()) | |||
3507 | return BaseT::getArithmeticReductionCost(Opcode, ValVTy, IsPairwise, | |||
3508 | CostKind); | |||
3509 | ||||
3510 | unsigned ReductionCost = 0; | |||
3511 | ||||
3512 | auto *Ty = ValVTy; | |||
3513 | if (LT.first != 1 && MTy.isVector() && | |||
3514 | MTy.getVectorNumElements() < ValVTy->getNumElements()) { | |||
3515 | // Type needs to be split. We need LT.first - 1 arithmetic ops. | |||
3516 | Ty = FixedVectorType::get(ValVTy->getElementType(), | |||
3517 | MTy.getVectorNumElements()); | |||
3518 | ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind); | |||
3519 | ReductionCost *= LT.first - 1; | |||
3520 | NumVecElts = MTy.getVectorNumElements(); | |||
3521 | } | |||
3522 | ||||
3523 | // Now handle reduction with the legal type, taking into account size changes | |||
3524 | // at each level. | |||
3525 | while (NumVecElts > 1) { | |||
3526 | // Determine the size of the remaining vector we need to reduce. | |||
3527 | unsigned Size = NumVecElts * ScalarSize; | |||
3528 | NumVecElts /= 2; | |||
3529 | // If we're reducing from 256/512 bits, use an extract_subvector. | |||
3530 | if (Size > 128) { | |||
3531 | auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts); | |||
3532 | ReductionCost += | |||
3533 | getShuffleCost(TTI::SK_ExtractSubvector, Ty, NumVecElts, SubTy); | |||
3534 | Ty = SubTy; | |||
3535 | } else if (Size == 128) { | |||
3536 | // Reducing from 128 bits is a permute of v2f64/v2i64. | |||
3537 | FixedVectorType *ShufTy; | |||
3538 | if (ValVTy->getElementType()->isFloatingPointTy()) | |||
3539 | ShufTy = | |||
3540 | FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2); | |||
3541 | else | |||
3542 | ShufTy = | |||
3543 | FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2); | |||
3544 | ReductionCost += | |||
3545 | getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr); | |||
3546 | } else if (Size == 64) { | |||
3547 | // Reducing from 64 bits is a shuffle of v4f32/v4i32. | |||
3548 | FixedVectorType *ShufTy; | |||
3549 | if (ValVTy->getElementType()->isFloatingPointTy()) | |||
3550 | ShufTy = | |||
3551 | FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4); | |||
3552 | else | |||
3553 | ShufTy = | |||
3554 | FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4); | |||
3555 | ReductionCost += | |||
3556 | getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr); | |||
3557 | } else { | |||
3558 | // Reducing from a smaller size is a shift by immediate. | |||
3559 | auto *ShiftTy = FixedVectorType::get( | |||
3560 | Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size); | |||
3561 | ReductionCost += getArithmeticInstrCost( | |||
3562 | Instruction::LShr, ShiftTy, CostKind, | |||
3563 | TargetTransformInfo::OK_AnyValue, | |||
3564 | TargetTransformInfo::OK_UniformConstantValue, | |||
3565 | TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); | |||
3566 | } | |||
3567 | ||||
3568 | // Add the arithmetic op for this level. | |||
3569 | ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind); | |||
3570 | } | |||
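| // Editor's illustration of the loop above: a v8i32 mul reduction (which has | |||
| // no table entry above) starts at Size == 256 (extract_subvector to v4i32), | |||
| // then Size == 128 (permute) and Size == 64 (shuffle), with one arithmetic | |||
| // op after each of the three steps, plus the final extract added below. | |||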
3571 | ||||
3572 | // Add the final extract element to the cost. | |||
3573 | return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0); | |||
3574 | } | |||
3575 | ||||
3576 | int X86TTIImpl::getMinMaxCost(Type *Ty, Type *CondTy, bool IsUnsigned) { | |||
3577 | std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); | |||
3578 | ||||
3579 | MVT MTy = LT.second; | |||
3580 | ||||
3581 | int ISD; | |||
3582 | if (Ty->isIntOrIntVectorTy()) { | |||
3583 | ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN; | |||
3584 | } else { | |||
3585 | assert(Ty->isFPOrFPVectorTy() && | |||
3586 | "Expected floating point or integer vector type."); | |||
3587 | ISD = ISD::FMINNUM; | |||
3588 | } | |||
3589 | ||||
3590 | static const CostTblEntry SSE1CostTbl[] = { | |||
3591 | {ISD::FMINNUM, MVT::v4f32, 1}, | |||
3592 | }; | |||
3593 | ||||
3594 | static const CostTblEntry SSE2CostTbl[] = { | |||
3595 | {ISD::FMINNUM, MVT::v2f64, 1}, | |||
3596 | {ISD::SMIN, MVT::v8i16, 1}, | |||
3597 | {ISD::UMIN, MVT::v16i8, 1}, | |||
3598 | }; | |||
3599 | ||||
3600 | static const CostTblEntry SSE41CostTbl[] = { | |||
3601 | {ISD::SMIN, MVT::v4i32, 1}, | |||
3602 | {ISD::UMIN, MVT::v4i32, 1}, | |||
3603 | {ISD::UMIN, MVT::v8i16, 1}, | |||
3604 | {ISD::SMIN, MVT::v16i8, 1}, | |||
3605 | }; | |||
3606 | ||||
3607 | static const CostTblEntry SSE42CostTbl[] = { | |||
3608 | {ISD::UMIN, MVT::v2i64, 3}, // xor+pcmpgtq+blendvpd | |||
3609 | }; | |||
3610 | ||||
3611 | static const CostTblEntry AVX1CostTbl[] = { | |||
3612 | {ISD::FMINNUM, MVT::v8f32, 1}, | |||
3613 | {ISD::FMINNUM, MVT::v4f64, 1}, | |||
3614 | {ISD::SMIN, MVT::v8i32, 3}, | |||
3615 | {ISD::UMIN, MVT::v8i32, 3}, | |||
3616 | {ISD::SMIN, MVT::v16i16, 3}, | |||
3617 | {ISD::UMIN, MVT::v16i16, 3}, | |||
3618 | {ISD::SMIN, MVT::v32i8, 3}, | |||
3619 | {ISD::UMIN, MVT::v32i8, 3}, | |||
3620 | }; | |||
3621 | ||||
3622 | static const CostTblEntry AVX2CostTbl[] = { | |||
3623 | {ISD::SMIN, MVT::v8i32, 1}, | |||
3624 | {ISD::UMIN, MVT::v8i32, 1}, | |||
3625 | {ISD::SMIN, MVT::v16i16, 1}, | |||
3626 | {ISD::UMIN, MVT::v16i16, 1}, | |||
3627 | {ISD::SMIN, MVT::v32i8, 1}, | |||
3628 | {ISD::UMIN, MVT::v32i8, 1}, | |||
3629 | }; | |||
3630 | ||||
3631 | static const CostTblEntry AVX512CostTbl[] = { | |||
3632 | {ISD::FMINNUM, MVT::v16f32, 1}, | |||
3633 | {ISD::FMINNUM, MVT::v8f64, 1}, | |||
3634 | {ISD::SMIN, MVT::v2i64, 1}, | |||
3635 | {ISD::UMIN, MVT::v2i64, 1}, | |||
3636 | {ISD::SMIN, MVT::v4i64, 1}, | |||
3637 | {ISD::UMIN, MVT::v4i64, 1}, | |||
3638 | {ISD::SMIN, MVT::v8i64, 1}, | |||
3639 | {ISD::UMIN, MVT::v8i64, 1}, | |||
3640 | {ISD::SMIN, MVT::v16i32, 1}, | |||
3641 | {ISD::UMIN, MVT::v16i32, 1}, | |||
3642 | }; | |||
3643 | ||||
3644 | static const CostTblEntry AVX512BWCostTbl[] = { | |||
3645 | {ISD::SMIN, MVT::v32i16, 1}, | |||
3646 | {ISD::UMIN, MVT::v32i16, 1}, | |||
3647 | {ISD::SMIN, MVT::v64i8, 1}, | |||
3648 | {ISD::UMIN, MVT::v64i8, 1}, | |||
3649 | }; | |||
3650 | ||||
3651 | // If we have a native MIN/MAX instruction for this type, use it. | |||
3652 | if (ST->hasBWI()) | |||
3653 | if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy)) | |||
3654 | return LT.first * Entry->Cost; | |||
3655 | ||||
3656 | if (ST->hasAVX512()) | |||
3657 | if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) | |||
3658 | return LT.first * Entry->Cost; | |||
3659 | ||||
3660 | if (ST->hasAVX2()) | |||
3661 | if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy)) | |||
3662 | return LT.first * Entry->Cost; | |||
3663 | ||||
3664 | if (ST->hasAVX()) | |||
3665 | if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) | |||
3666 | return LT.first * Entry->Cost; | |||
3667 | ||||
3668 | if (ST->hasSSE42()) | |||
3669 | if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy)) | |||
3670 | return LT.first * Entry->Cost; | |||
3671 | ||||
3672 | if (ST->hasSSE41()) | |||
3673 | if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy)) | |||
3674 | return LT.first * Entry->Cost; | |||
3675 | ||||
3676 | if (ST->hasSSE2()) | |||
3677 | if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy)) | |||
3678 | return LT.first * Entry->Cost; | |||
3679 | ||||
3680 | if (ST->hasSSE1()) | |||
3681 | if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy)) | |||
3682 | return LT.first * Entry->Cost; | |||
3683 | ||||
3684 | unsigned CmpOpcode; | |||
3685 | if (Ty->isFPOrFPVectorTy()) { | |||
3686 | CmpOpcode = Instruction::FCmp; | |||
3687 | } else { | |||
3688 | assert(Ty->isIntOrIntVectorTy() && | |||
3689 | "expecting floating point or integer type for min/max reduction"); | |||
3690 | CmpOpcode = Instruction::ICmp; | |||
3691 | } | |||
3692 | ||||
3693 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; | |||
3694 | // Otherwise fall back to cmp+select. | |||
3695 | return getCmpSelInstrCost(CmpOpcode, Ty, CondTy, CmpInst::BAD_ICMP_PREDICATE, | |||
3696 | CostKind) + | |||
3697 | getCmpSelInstrCost(Instruction::Select, Ty, CondTy, | |||
3698 | CmpInst::BAD_ICMP_PREDICATE, CostKind); | |||
3699 | } | |||
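| // Editor's illustration: UMIN of v2i64 on plain SSE2 misses every table | |||
| // above, so it costs an icmp plus a select as computed here; on SSE4.2 the | |||
| // SSE42CostTbl entry (xor+pcmpgtq+blendvpd, cost 3) is used instead. | |||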
3700 | ||||
3701 | int X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy, | |||
3702 | bool IsPairwise, bool IsUnsigned, | |||
3703 | TTI::TargetCostKind CostKind) { | |||
3704 | // Just use the default implementation for pair reductions. | |||
3705 | if (IsPairwise) | |||
3706 | return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned, | |||
3707 | CostKind); | |||
3708 | ||||
3709 | std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); | |||
3710 | ||||
3711 | MVT MTy = LT.second; | |||
3712 | ||||
3713 | int ISD; | |||
3714 | if (ValTy->isIntOrIntVectorTy()) { | |||
3715 | ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN; | |||
3716 | } else { | |||
3717 | assert(ValTy->isFPOrFPVectorTy() && | |||
3718 | "Expected floating point or integer vector type."); | |||
3719 | ISD = ISD::FMINNUM; | |||
3720 | } | |||
3721 | ||||
3722 | // We use the Intel Architecture Code Analyzer (IACA) to measure the | |||
3723 | // throughput and use that as the cost. | |||
3724 | ||||
3725 | static const CostTblEntry SSE2CostTblNoPairWise[] = { | |||
3726 | {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw | |||
3727 | {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw | |||
3728 | {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw | |||
3729 | }; | |||
3730 | ||||
3731 | static const CostTblEntry SSE41CostTblNoPairWise[] = { | |||
3732 | {ISD::SMIN, MVT::v2i16, 3}, // same as sse2 | |||
3733 | {ISD::SMIN, MVT::v4i16, 5}, // same as sse2 | |||
3734 | {ISD::UMIN, MVT::v2i16, 5}, // same as sse2 | |||
3735 | {ISD::UMIN, MVT::v4i16, 7}, // same as sse2 | |||
3736 | {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor | |||
3737 | {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax | |||
3738 | {ISD::SMIN, MVT::v2i8, 3}, // pminsb | |||
3739 | {ISD::SMIN, MVT::v4i8, 5}, // pminsb | |||
3740 | {ISD::SMIN, MVT::v8i8, 7}, // pminsb | |||
3741 | {ISD::SMIN, MVT::v16i8, 6}, | |||
3742 | {ISD::UMIN, MVT::v2i8, 3}, // same as sse2 | |||
3743 | {ISD::UMIN, MVT::v4i8, 5}, // same as sse2 | |||
3744 | {ISD::UMIN, MVT::v8i8, 7}, // same as sse2 | |||
3745 | {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax | |||
3746 | }; | |||
3747 | ||||
3748 | static const CostTblEntry AVX1CostTblNoPairWise[] = { | |||
3749 | {ISD::SMIN, MVT::v16i16, 6}, | |||
3750 | {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax | |||
3751 | {ISD::SMIN, MVT::v32i8, 8}, | |||
3752 | {ISD::UMIN, MVT::v32i8, 8}, | |||
3753 | }; | |||
3754 | ||||
3755 | static const CostTblEntry AVX512BWCostTblNoPairWise[] = { | |||
3756 | {ISD::SMIN, MVT::v32i16, 8}, | |||
3757 | {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax | |||
3758 | {ISD::SMIN, MVT::v64i8, 10}, | |||
3759 | {ISD::UMIN, MVT::v64i8, 10}, | |||
3760 | }; | |||
3761 | ||||
3762 | // Before legalizing the type, give a chance to look up illegal narrow types | |||
3763 | // in the table. | |||
3764 | // FIXME: Is there a better way to do this? | |||
3765 | EVT VT = TLI->getValueType(DL, ValTy); | |||
3766 | if (VT.isSimple()) { | |||
3767 | MVT MTy = VT.getSimpleVT(); | |||
3768 | if (ST->hasBWI()) | |||
3769 | if (const auto *Entry = CostTableLookup(AVX512BWCostTblNoPairWise, ISD, MTy)) | |||
3770 | return Entry->Cost; | |||
3771 | ||||
3772 | if (ST->hasAVX()) | |||
3773 | if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) | |||
3774 | return Entry->Cost; | |||
3775 | ||||
3776 | if (ST->hasSSE41()) | |||
3777 | if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy)) | |||
3778 | return Entry->Cost; | |||
3779 | ||||
3780 | if (ST->hasSSE2()) | |||
3781 | if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy)) | |||
3782 | return Entry->Cost; | |||
3783 | } | |||
3784 | ||||
3785 | auto *ValVTy = cast<FixedVectorType>(ValTy); | |||
3786 | unsigned NumVecElts = ValVTy->getNumElements(); | |||
3787 | ||||
3788 | auto *Ty = ValVTy; | |||
3789 | unsigned MinMaxCost = 0; | |||
3790 | if (LT.first != 1 && MTy.isVector() && | |||
3791 | MTy.getVectorNumElements() < ValVTy->getNumElements()) { | |||
3792 | // Type needs to be split. We need LT.first - 1 operations. | |||
3793 | Ty = FixedVectorType::get(ValVTy->getElementType(), | |||
3794 | MTy.getVectorNumElements()); | |||
3795 | auto *SubCondTy = FixedVectorType::get(CondTy->getElementType(), | |||
3796 | MTy.getVectorNumElements()); | |||
3797 | MinMaxCost = getMinMaxCost(Ty, SubCondTy, IsUnsigned); | |||
3798 | MinMaxCost *= LT.first - 1; | |||
3799 | NumVecElts = MTy.getVectorNumElements(); | |||
3800 | } | |||
3801 | ||||
3802 | if (ST->hasBWI()) | |||
3803 | if (const auto *Entry = CostTableLookup(AVX512BWCostTblNoPairWise, ISD, MTy)) | |||
3804 | return MinMaxCost + Entry->Cost; | |||
3805 | ||||
3806 | if (ST->hasAVX()) | |||
3807 | if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) | |||
3808 | return MinMaxCost + Entry->Cost; | |||
3809 | ||||
3810 | if (ST->hasSSE41()) | |||
3811 | if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy)) | |||
3812 | return MinMaxCost + Entry->Cost; | |||
3813 | ||||
3814 | if (ST->hasSSE2()) | |||
3815 | if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy)) | |||
3816 | return MinMaxCost + Entry->Cost; | |||
3817 | ||||
3818 | unsigned ScalarSize = ValTy->getScalarSizeInBits(); | |||
3819 | ||||
3820 | // Special case power of 2 reductions where the scalar type isn't changed | |||
3821 | // by type legalization. | |||
3822 | if (!isPowerOf2_32(ValVTy->getNumElements()) || | |||
3823 | ScalarSize != MTy.getScalarSizeInBits()) | |||
3824 | return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned, | |||
3825 | CostKind); | |||
3826 | ||||
3827 | // Now handle reduction with the legal type, taking into account size changes | |||
3828 | // at each level. | |||
3829 | while (NumVecElts > 1) { | |||
3830 | // Determine the size of the remaining vector we need to reduce. | |||
3831 | unsigned Size = NumVecElts * ScalarSize; | |||
3832 | NumVecElts /= 2; | |||
3833 | // If we're reducing from 256/512 bits, use an extract_subvector. | |||
3834 | if (Size > 128) { | |||
3835 | auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts); | |||
3836 | MinMaxCost += | |||
3837 | getShuffleCost(TTI::SK_ExtractSubvector, Ty, NumVecElts, SubTy); | |||
3838 | Ty = SubTy; | |||
3839 | } else if (Size == 128) { | |||
3840 | // Reducing from 128 bits is a permute of v2f64/v2i64. | |||
3841 | VectorType *ShufTy; | |||
3842 | if (ValTy->getElementType()->isFloatingPointTy()) | |||
3843 | ShufTy = | |||
3844 | FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2); | |||
3845 | else | |||
3846 | ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2); | |||
3847 | MinMaxCost += | |||
3848 | getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr); | |||
3849 | } else if (Size == 64) { | |||
3850 | // Reducing from 64 bits is a shuffle of v4f32/v4i32. | |||
3851 | FixedVectorType *ShufTy; | |||
3852 | if (ValTy->getElementType()->isFloatingPointTy()) | |||
3853 | ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4); | |||
3854 | else | |||
3855 | ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4); | |||
3856 | MinMaxCost += | |||
3857 | getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr); | |||
3858 | } else { | |||
3859 | // Reducing from a smaller size is a shift by immediate. | |||
3860 | auto *ShiftTy = FixedVectorType::get( | |||
3861 | Type::getIntNTy(ValTy->getContext(), Size), 128 / Size); | |||
3862 | MinMaxCost += getArithmeticInstrCost( | |||
3863 | Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput, | |||
3864 | TargetTransformInfo::OK_AnyValue, | |||
3865 | TargetTransformInfo::OK_UniformConstantValue, | |||
3866 | TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); | |||
3867 | } | |||
3868 | ||||
3869 | // Add the min/max op for this level. | |||
3870 | auto *SubCondTy = | |||
3871 | FixedVectorType::get(CondTy->getElementType(), Ty->getNumElements()); | |||
3872 | MinMaxCost += getMinMaxCost(Ty, SubCondTy, IsUnsigned); | |||
3873 | } | |||
3874 | ||||
3875 | // Add the final extract element to the cost. | |||
3876 | return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0); | |||
3877 | } | |||
3878 | ||||
3879 | /// Calculate the cost of materializing a 64-bit value. This helper | |||
3880 | /// method might only calculate a fraction of a larger immediate. Therefore it | |||
3881 | /// is valid to return a cost of ZERO. | |||
3882 | int X86TTIImpl::getIntImmCost(int64_t Val) { | |||
3883 | if (Val == 0) | |||
3884 | return TTI::TCC_Free; | |||
3885 | ||||
3886 | if (isInt<32>(Val)) | |||
3887 | return TTI::TCC_Basic; | |||
3888 | ||||
3889 | return 2 * TTI::TCC_Basic; | |||
3890 | } | |||
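| // Editor's illustration: getIntImmCost(0) == TCC_Free, getIntImmCost(42) == | |||
| // TCC_Basic (fits in a sign-extended imm32), and a value like 0x123456789 | |||
| // costs 2 * TCC_Basic (it needs a full 64-bit materialization). | |||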
3891 | ||||
3892 | int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, | |||
3893 | TTI::TargetCostKind CostKind) { | |||
3894 | assert(Ty->isIntegerTy()); | |||
3895 | ||||
3896 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); | |||
3897 | if (BitSize == 0) | |||
3898 | return ~0U; | |||
3899 | ||||
3900 | // Never hoist constants larger than 128 bits, because this might lead to | |||
3901 | // incorrect code generation or assertions in codegen. | |||
3902 | // FIXME: Create a cost model for types larger than i128 once the codegen | |||
3903 | // issues have been fixed. | |||
3904 | if (BitSize > 128) | |||
3905 | return TTI::TCC_Free; | |||
3906 | ||||
3907 | if (Imm == 0) | |||
3908 | return TTI::TCC_Free; | |||
3909 | ||||
3910 | // Sign-extend all constants to a multiple of 64-bit. | |||
3911 | APInt ImmVal = Imm; | |||
3912 | if (BitSize % 64 != 0) | |||
3913 | ImmVal = Imm.sext(alignTo(BitSize, 64)); | |||
3914 | ||||
3915 | // Split the constant into 64-bit chunks and calculate the cost for each | |||
3916 | // chunk. | |||
3917 | int Cost = 0; | |||
3918 | for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) { | |||
3919 | APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64); | |||
3920 | int64_t Val = Tmp.getSExtValue(); | |||
3921 | Cost += getIntImmCost(Val); | |||
3922 | } | |||
3923 | // We need at least one instruction to materialize the constant. | |||
3924 | return std::max(1, Cost); | |||
3925 | } | |||
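| // Editor's illustration: an i128 immediate is split into two 64-bit chunks; | |||
| // if the low chunk needs a full 64-bit materialization (2 * TCC_Basic) and | |||
| // the high chunk is zero (TCC_Free), the total cost is 2. | |||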
3926 | ||||
3927 | int X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, | |||
3928 | const APInt &Imm, Type *Ty, | |||
3929 | TTI::TargetCostKind CostKind, | |||
3930 | Instruction *Inst) { | |||
3931 | assert(Ty->isIntegerTy()); | |||
3932 | ||||
3933 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); | |||
3934 | // There is no cost model for constants with a bit size of 0. Return TCC_Free | |||
3935 | // here, so that constant hoisting will ignore this constant. | |||
3936 | if (BitSize == 0) | |||
3937 | return TTI::TCC_Free; | |||
3938 | ||||
3939 | unsigned ImmIdx = ~0U; | |||
3940 | switch (Opcode) { | |||
3941 | default: | |||
3942 | return TTI::TCC_Free; | |||
3943 | case Instruction::GetElementPtr: | |||
3944 | // Always hoist the base address of a GetElementPtr. This prevents the | |||
3945 | // creation of new constants for every base constant that gets constant | |||
3946 | // folded with the offset. | |||
3947 | if (Idx == 0) | |||
3948 | return 2 * TTI::TCC_Basic; | |||
3949 | return TTI::TCC_Free; | |||
3950 | case Instruction::Store: | |||
3951 | ImmIdx = 0; | |||
3952 | break; | |||
3953 | case Instruction::ICmp: | |||
3954 | // This is an imperfect hack to prevent constant hoisting of | |||
3955 | // compares that might be trying to check if a 64-bit value fits in | |||
3956 | // 32-bits. The backend can optimize these cases using a right shift by 32. | |||
3957 | // Ideally we would check the compare predicate here. There are also other | |||
3958 | // similar immediates the backend can use shifts for. | |||
3959 | if (Idx == 1 && Imm.getBitWidth() == 64) { | |||
3960 | uint64_t ImmVal = Imm.getZExtValue(); | |||
3961 | if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff) | |||
3962 | return TTI::TCC_Free; | |||
3963 | } | |||
3964 | ImmIdx = 1; | |||
3965 | break; | |||
3966 | case Instruction::And: | |||
3967 | // We support 64-bit ANDs with immediates with 32-bits of leading zeroes | |||
3968 | // by using a 32-bit operation with implicit zero extension. Detect such | |||
3969 | // immediates here as the normal path expects bit 31 to be sign extended. | |||
3970 | if (Idx == 1 && Imm.getBitWidth() == 64 && isUInt<32>(Imm.getZExtValue())) | |||
3971 | return TTI::TCC_Free; | |||
3972 | ImmIdx = 1; | |||
3973 | break; | |||
3974 | case Instruction::Add: | |||
3975 | case Instruction::Sub: | |||
3976 | // For add/sub, we can use the opposite instruction for INT32_MIN. | |||
3977 | if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.getZExtValue() == 0x80000000) | |||
3978 | return TTI::TCC_Free; | |||
3979 | ImmIdx = 1; | |||
3980 | break; | |||
3981 | case Instruction::UDiv: | |||
3982 | case Instruction::SDiv: | |||
3983 | case Instruction::URem: | |||
3984 | case Instruction::SRem: | |||
3985 | // Division by constant is typically expanded later into a different | |||
3986 | // instruction sequence. This completely changes the constants. | |||
3987 | // Report them as "free" to stop ConstantHoist from marking them as opaque. | |||
3988 | return TTI::TCC_Free; | |||
3989 | case Instruction::Mul: | |||
3990 | case Instruction::Or: | |||
3991 | case Instruction::Xor: | |||
3992 | ImmIdx = 1; | |||
3993 | break; | |||
3994 | // Always return TCC_Free for the shift value of a shift instruction. | |||
3995 | case Instruction::Shl: | |||
3996 | case Instruction::LShr: | |||
3997 | case Instruction::AShr: | |||
3998 | if (Idx == 1) | |||
3999 | return TTI::TCC_Free; | |||
4000 | break; | |||
4001 | case Instruction::Trunc: | |||
4002 | case Instruction::ZExt: | |||
4003 | case Instruction::SExt: | |||
4004 | case Instruction::IntToPtr: | |||
4005 | case Instruction::PtrToInt: | |||
4006 | case Instruction::BitCast: | |||
4007 | case Instruction::PHI: | |||
4008 | case Instruction::Call: | |||
4009 | case Instruction::Select: | |||
4010 | case Instruction::Ret: | |||
4011 | case Instruction::Load: | |||
4012 | break; | |||
4013 | } | |||
4014 | ||||
4015 | if (Idx == ImmIdx) { | |||
4016 | int NumConstants = divideCeil(BitSize, 64); | |||
4017 | int Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind); | |||
4018 | return (Cost <= NumConstants * TTI::TCC_Basic) | |||
4019 | ? static_cast<int>(TTI::TCC_Free) | |||
4020 | : Cost; | |||
4021 | } | |||
4022 | ||||
4023 | return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind); | |||
4024 | } | |||
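| // Editor's illustration: for "and i64 %x, 4294967295" the immediate is | |||
| // operand 1 and has 32 bits of leading zeroes, so the And case above | |||
| // returns TCC_Free: a 32-bit AND with implicit zero extension suffices. | |||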
4025 | ||||
4026 | int X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, | |||
4027 | const APInt &Imm, Type *Ty, | |||
4028 | TTI::TargetCostKind CostKind) { | |||
4029 | assert(Ty->isIntegerTy()); | |||
4030 | ||||
4031 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); | |||
4032 | // There is no cost model for constants with a bit size of 0. Return TCC_Free | |||
4033 | // here, so that constant hoisting will ignore this constant. | |||
4034 | if (BitSize == 0) | |||
4035 | return TTI::TCC_Free; | |||
4036 | ||||
4037 | switch (IID) { | |||
4038 | default: | |||
4039 | return TTI::TCC_Free; | |||
4040 | case Intrinsic::sadd_with_overflow: | |||
4041 | case Intrinsic::uadd_with_overflow: | |||
4042 | case Intrinsic::ssub_with_overflow: | |||
4043 | case Intrinsic::usub_with_overflow: | |||
4044 | case Intrinsic::smul_with_overflow: | |||
4045 | case Intrinsic::umul_with_overflow: | |||
4046 | if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue())) | |||
4047 | return TTI::TCC_Free; | |||
4048 | break; | |||
4049 | case Intrinsic::experimental_stackmap: | |||
4050 | if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) | |||
4051 | return TTI::TCC_Free; | |||
4052 | break; | |||
4053 | case Intrinsic::experimental_patchpoint_void: | |||
4054 | case Intrinsic::experimental_patchpoint_i64: | |||
4055 | if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) | |||
4056 | return TTI::TCC_Free; | |||
4057 | break; | |||
4058 | } | |||
4059 | return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind); | |||
4060 | } | |||
4061 | ||||
4062 | unsigned | |||
4063 | X86TTIImpl::getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) { | |||
4064 | if (CostKind != TTI::TCK_RecipThroughput) | |||
4065 | return Opcode == Instruction::PHI ? 0 : 1; | |||
4066 | // Branches are assumed to be predicted. | |||
4067 | return 0; | |||
4068 | } | |||
4069 | ||||
4070 | int X86TTIImpl::getGatherOverhead() const { | |||
4071 | // Some CPUs have more overhead for gather. The specified overhead is relative | |||
4072 | // to the Load operation. "2" is the number provided by Intel architects. This | |||
4073 | // parameter is used for cost estimation of the Gather Op and for comparison | |||
4074 | // with other alternatives. | |||
4075 | // TODO: Remove the explicit hasAVX512()? That would mean we would only | |||
4076 | // enable gather with a -march. | |||
4077 | if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather())) | |||
4078 | return 2; | |||
4079 | ||||
4080 | return 1024; | |||
4081 | } | |||
4082 | ||||
4083 | int X86TTIImpl::getScatterOverhead() const { | |||
4084 | if (ST->hasAVX512()) | |||
4085 | return 2; | |||
4086 | ||||
4087 | return 1024; | |||
4088 | } | |||
4089 | ||||
4090 | // Return an average cost of a Gather / Scatter instruction; may be improved later. | |||
4091 | // FIXME: Add TargetCostKind support. | |||
4092 | int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, const Value *Ptr, | |||
4093 | Align Alignment, unsigned AddressSpace) { | |||
4094 | ||||
4095 | assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost"); | |||
4096 | unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements(); | |||
4097 | ||||
4098 | // Try to reduce the index size from 64 bit (the default for GEP) to 32. | |||
4099 | // This is essential for VF 16. If the index can't be reduced to 32 bits, | |||
4100 | // the operation will use 16 x 64-bit indices, which do not fit in a zmm | |||
4101 | // register and need to be split. Also check that the base pointer is the | |||
4102 | // same for all lanes, and that there's at most one variable index. | |||
4103 | auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) { | |||
4104 | unsigned IndexSize = DL.getPointerSizeInBits(); | |||
4105 | const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr); | |||
4106 | if (IndexSize < 64 || !GEP) | |||
4107 | return IndexSize; | |||
4108 | ||||
4109 | unsigned NumOfVarIndices = 0; | |||
4110 | const Value *Ptrs = GEP->getPointerOperand(); | |||
4111 | if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs)) | |||
4112 | return IndexSize; | |||
4113 | for (unsigned i = 1; i < GEP->getNumOperands(); ++i) { | |||
4114 | if (isa<Constant>(GEP->getOperand(i))) | |||
4115 | continue; | |||
4116 | Type *IndxTy = GEP->getOperand(i)->getType(); | |||
4117 | if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy)) | |||
4118 | IndxTy = IndexVTy->getElementType(); | |||
4119 | if ((IndxTy->getPrimitiveSizeInBits() == 64 && | |||
4120 | !isa<SExtInst>(GEP->getOperand(i))) || | |||
4121 | ++NumOfVarIndices > 1) | |||
4122 | return IndexSize; // 64 | |||
4123 | } | |||
4124 | return (unsigned)32; | |||
4125 | }; | |||
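| // Editor's illustration (hypothetical IR): for | |||
| //   getelementptr float, float* %base, <16 x i64> %idx | |||
| // where %base is a uniform scalar pointer and %idx is produced by a sext | |||
| // from i32, the lambda above returns 32, so a VF-16 gather can use dword | |||
| // indices that fit in a single zmm register. | |||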
4126 | ||||
4127 | // Try to reduce IndexSize to 32 bits for a 16-element vector. | |||
4128 | // By default the IndexSize is equal to the pointer size. | |||
4129 | unsigned IndexSize = (ST->hasAVX512() && VF >= 16) | |||
4130 | ? getIndexSizeInBits(Ptr, DL) | |||
4131 | : DL.getPointerSizeInBits(); | |||
4132 | ||||
4133 | auto *IndexVTy = FixedVectorType::get( | |||
4134 | IntegerType::get(SrcVTy->getContext(), IndexSize), VF); | |||
4135 | std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy); | |||
4136 | std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy); | |||
4137 | int SplitFactor = std::max(IdxsLT.first, SrcLT.first); | |||
4138 | if (SplitFactor > 1) { | |||
4139 | // Handle splitting of vector of pointers | |||
4140 | auto *SplitSrcTy = | |||
4141 | FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor); | |||
4142 | return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment, | |||
4143 | AddressSpace); | |||
4144 | } | |||
4145 | ||||
4146 | // The gather / scatter cost is given by Intel architects. It is a rough | |||
4147 | // number since we are looking at one instruction at a time. | |||
4148 | const int GSOverhead = (Opcode == Instruction::Load) | |||
4149 | ? getGatherOverhead() | |||
4150 | : getScatterOverhead(); | |||
4151 | return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(), | |||
4152 | MaybeAlign(Alignment), AddressSpace, | |||
4153 | TTI::TCK_RecipThroughput); | |||
4154 | } | |||
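| // Editor's illustration: a v8f32 gather on an AVX-512 target needs no | |||
| // splitting, so it costs getGatherOverhead() (2) plus 8 times the scalar | |||
| // f32 load cost, per the formula above. | |||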
4155 | ||||
4156 | /// Return the cost of full scalarization of gather / scatter operation. | |||
4157 | /// | |||
4158 | /// Opcode - Load or Store instruction. | |||
4159 | /// SrcVTy - The type of the data vector that should be gathered or scattered. | |||
4160 | /// VariableMask - The mask is non-constant at compile time. | |||
4161 | /// Alignment - Alignment for one element. | |||
4162 | /// AddressSpace - pointer[s] address space. | |||
4163 | /// | |||
4164 | /// FIXME: Add TargetCostKind support. | |||
4165 | int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy, | |||
4166 | bool VariableMask, Align Alignment, | |||
4167 | unsigned AddressSpace) { | |||
4168 | unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements(); | |||
4169 | APInt DemandedElts = APInt::getAllOnesValue(VF); | |||
4170 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; | |||
4171 | ||||
4172 | int MaskUnpackCost = 0; | |||
4173 | if (VariableMask) { | |||
4174 | auto *MaskTy = | |||
4175 | FixedVectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF); | |||
4176 | MaskUnpackCost = | |||
4177 | getScalarizationOverhead(MaskTy, DemandedElts, false, true); | |||
4178 | int ScalarCompareCost = getCmpSelInstrCost( | |||
4179 | Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()), nullptr, | |||
4180 | CmpInst::BAD_ICMP_PREDICATE, CostKind); | |||
4181 | int BranchCost = getCFInstrCost(Instruction::Br, CostKind); | |||
4182 | MaskUnpackCost += VF * (BranchCost + ScalarCompareCost); | |||
4183 | } | |||
4184 | ||||
4185 | // The cost of the scalar loads/stores. | |||
4186 | int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(), | |||
4187 | MaybeAlign(Alignment), AddressSpace, | |||
4188 | CostKind); | |||
4189 | ||||
4190 | int InsertExtractCost = 0; | |||
4191 | if (Opcode == Instruction::Load) | |||
4192 | for (unsigned i = 0; i < VF; ++i) | |||
4193 | // Add the cost of inserting each scalar load into the vector | |||
4194 | InsertExtractCost += | |||
4195 | getVectorInstrCost(Instruction::InsertElement, SrcVTy, i); | |||
4196 | else | |||
4197 | for (unsigned i = 0; i < VF; ++i) | |||
4198 | // Add the cost of extracting each element out of the data vector | |||
4199 | InsertExtractCost += | |||
4200 | getVectorInstrCost(Instruction::ExtractElement, SrcVTy, i); | |||
4201 | ||||
4202 | return MemoryOpCost + MaskUnpackCost + InsertExtractCost; | |||
4203 | } | |||
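| // Editor's illustration: scalarizing a variable-mask gather of v4f32 sums | |||
| // the i1 mask extraction, 4 icmp+branch pairs, 4 scalar loads, and 4 | |||
| // insertelement ops, per the composition above. | |||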
4204 | ||||
4205 | /// Calculate the cost of Gather / Scatter operation | |||
4206 | int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy, | |||
4207 | const Value *Ptr, bool VariableMask, | |||
4208 | Align Alignment, | |||
4209 | TTI::TargetCostKind CostKind, | |||
4210 | const Instruction *I = nullptr) { | |||
4211 | if (CostKind != TTI::TCK_RecipThroughput) { | |||
4212 | if ((Opcode == Instruction::Load && | |||
4213 | isLegalMaskedGather(SrcVTy, Align(Alignment))) || | |||
4214 | (Opcode == Instruction::Store && | |||
4215 | isLegalMaskedScatter(SrcVTy, Align(Alignment)))) | |||
4216 | return 1; | |||
4217 | return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask, | |||
4218 | Alignment, CostKind, I); | |||
4219 | } | |||
4220 | ||||
4221 | assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter"); | |||
4222 | unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements(); | |||
4223 | PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType()); | |||
4224 | if (!PtrTy && Ptr->getType()->isVectorTy()) | |||
4225 | PtrTy = dyn_cast<PointerType>( | |||
4226 | cast<VectorType>(Ptr->getType())->getElementType()); | |||
4227 | assert(PtrTy && "Unexpected type for Ptr argument"); | |||
4228 | unsigned AddressSpace = PtrTy->getAddressSpace(); | |||
4229 | ||||
4230 | bool Scalarize = false; | |||
4231 | if ((Opcode == Instruction::Load && | |||
4232 | !isLegalMaskedGather(SrcVTy, Align(Alignment))) || | |||
4233 | (Opcode == Instruction::Store && | |||
4234 | !isLegalMaskedScatter(SrcVTy, Align(Alignment)))) | |||
4235 | Scalarize = true; | |||
4236 | // Gather / Scatter for vector-2 is not profitable on KNL / SKX. | |||
4237 | // A vector-4 gather/scatter instruction does not exist on KNL. | |||
4238 | // We could extend it to 8 elements, but zeroing the upper bits of | |||
4239 | // the mask vector will add more instructions. Right now we give the scalar | |||
4240 | // cost for vector-4 on KNL. TODO: Check whether the gather/scatter | |||
4241 | // instruction is better in the VariableMask case. | |||
4242 | if (ST->hasAVX512() && (VF == 2 || (VF == 4 && !ST->hasVLX()))) | |||
4243 | Scalarize = true; | |||
4244 | ||||
4245 | if (Scalarize) | |||
4246 | return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment, | |||
4247 | AddressSpace); | |||
4248 | ||||
4249 | return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace); | |||
4250 | } | |||
4251 | ||||
4252 | bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1, | |||
4253 | TargetTransformInfo::LSRCost &C2) { | |||
4254 | // X86-specific here: instruction count has first priority. | |||
4255 | return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost, | |||
4256 | C1.NumIVMuls, C1.NumBaseAdds, | |||
4257 | C1.ScaleCost, C1.ImmCost, C1.SetupCost) < | |||
4258 | std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost, | |||
4259 | C2.NumIVMuls, C2.NumBaseAdds, | |||
4260 | C2.ScaleCost, C2.ImmCost, C2.SetupCost); | |||
4261 | } | |||
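| // Editor's note: the lexicographic std::tie comparison above means a | |||
| // solution with fewer instructions always wins; register count only breaks | |||
| // ties in instruction count, and so on down the list of criteria. | |||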
4262 | ||||
4263 | bool X86TTIImpl::canMacroFuseCmp() { | |||
4264 | return ST->hasMacroFusion() || ST->hasBranchFusion(); | |||
4265 | } | |||
4266 | ||||
4267 | bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) { | |||
4268 | if (!ST->hasAVX()) | |||
4269 | return false; | |||
4270 | ||||
4271 | // The backend can't handle a single element vector. | |||
4272 | if (isa<VectorType>(DataTy) && | |||
4273 | cast<FixedVectorType>(DataTy)->getNumElements() == 1) | |||
4274 | return false; | |||
4275 | Type *ScalarTy = DataTy->getScalarType(); | |||
4276 | ||||
4277 | if (ScalarTy->isPointerTy()) | |||
4278 | return true; | |||
4279 | ||||
4280 | if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) | |||
4281 | return true; | |||
4282 | ||||
4283 | if (!ScalarTy->isIntegerTy()) | |||
4284 | return false; | |||
4285 | ||||
4286 | unsigned IntWidth = ScalarTy->getIntegerBitWidth(); | |||
4287 | return IntWidth == 32 || IntWidth == 64 || | |||
4288 | ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI()); | |||
4289 | } | |||
4290 | ||||
4291 | bool X86TTIImpl::isLegalMaskedStore(Type *DataType, Align Alignment) { | |||
4292 | return isLegalMaskedLoad(DataType, Alignment); | |||
4293 | } | |||
4294 | ||||
4295 | bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) { | |||
4296 | unsigned DataSize = DL.getTypeStoreSize(DataType); | |||
4297 | // The only supported nontemporal loads are for aligned vectors of 16 or 32 | |||
4298 | // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2 | |||
4299 | // (the equivalent stores only require AVX). | |||
4300 | if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32)) | |||
4301 | return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2(); | |||
4302 | ||||
4303 | return false; | |||
4304 | } | |||
4305 | ||||
4306 | bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) { | |||
4307 | unsigned DataSize = DL.getTypeStoreSize(DataType); | |||
4308 | ||||
4309 | // SSE4A supports nontemporal stores of float and double at arbitrary | |||
4310 | // alignment. | |||
4311 | if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy())) | |||
4312 | return true; | |||
4313 | ||||
4314 | // Besides the SSE4A subtarget exception above, only aligned stores are | |||
4315 | // available nontemporally on any other subtarget. And only stores with a | |||
4316 | // size of 4..32 bytes (powers of 2 only) are permitted. | |||
4317 | if (Alignment < DataSize || DataSize < 4 || DataSize > 32 || | |||
4318 | !isPowerOf2_32(DataSize)) | |||
4319 | return false; | |||
4320 | ||||
4321 | // 32-byte vector nontemporal stores are supported by AVX (the equivalent | |||
4322 | // loads require AVX2). | |||
4323 | if (DataSize == 32) | |||
4324 | return ST->hasAVX(); | |||
4325 | else if (DataSize == 16) | |||
4326 | return ST->hasSSE1(); | |||
4327 | return true; | |||
4328 | } | |||
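| // Editor's illustration: a 32-byte aligned nontemporal store of v8f32 is | |||
| // legal with AVX (vmovntps), a 16-byte v4f32 store needs only SSE1 | |||
| // (movntps), and an under-aligned store of either is rejected above. | |||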
4329 | ||||
4330 | bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) { | |||
4331 | if (!isa<VectorType>(DataTy)) | |||
4332 | return false; | |||
4333 | ||||
4334 | if (!ST->hasAVX512()) | |||
4335 | return false; | |||
4336 | ||||
4337 | // The backend can't handle a single element vector. | |||
4338 | if (cast<FixedVectorType>(DataTy)->getNumElements() == 1) | |||
4339 | return false; | |||
4340 | ||||
4341 | Type *ScalarTy = cast<VectorType>(DataTy)->getElementType(); | |||
4342 | ||||
4343 | if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) | |||
4344 | return true; | |||
4345 | ||||
4346 | if (!ScalarTy->isIntegerTy()) | |||
4347 | return false; | |||
4348 | ||||
4349 | unsigned IntWidth = ScalarTy->getIntegerBitWidth(); | |||
4350 | return IntWidth == 32 || IntWidth == 64 || | |||
4351 | ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2()); | |||
4352 | } | |||
4353 | ||||
4354 | bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy) { | |||
4355 | return isLegalMaskedExpandLoad(DataTy); | |||
4356 | } | |||
4357 | ||||
4358 | bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) { | |||
4359 | // Some CPUs have better gather performance than others. | |||
4360 | // TODO: Remove the explicit ST->hasAVX512()? That would mean we would only | |||
4361 | // enable gather with a -march. | |||
4362 | if (!(ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2()))) | |||
4363 | return false; | |||
4364 | ||||
4365 | // This function is called now in two cases: from the Loop Vectorizer | |||
4366 | // and from the Scalarizer. | |||
4367 | // When the Loop Vectorizer asks about legality of the feature, | |||
4368 | // the vectorization factor is not calculated yet. The Loop Vectorizer | |||
4369 | // sends a scalar type and the decision is based on the width of the | |||
4370 | // scalar element. | |||
4371 | // Later on, the cost model will estimate usage of this intrinsic based on | |||
4372 | // the vector type. | |||
4373 | // The Scalarizer asks again about legality. It sends a vector type. | |||
4374 | // In this case we can reject non-power-of-2 vectors. | |||
4375 | // We also reject single element vectors as the type legalizer can't | |||
4376 | // scalarize it. | |||
4377 | if (auto *DataVTy = dyn_cast<FixedVectorType>(DataTy)) { | |||
4378 | unsigned NumElts = DataVTy->getNumElements(); | |||
4379 | if (NumElts == 1) | |||
4380 | return false; | |||
4381 | } | |||
4382 | Type *ScalarTy = DataTy->getScalarType(); | |||
4383 | if (ScalarTy->isPointerTy()) | |||
4384 | return true; | |||
4385 | ||||
4386 | if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) | |||
4387 | return true; | |||
4388 | ||||
4389 | if (!ScalarTy->isIntegerTy()) | |||
4390 | return false; | |||
4391 | ||||
4392 | unsigned IntWidth = ScalarTy->getIntegerBitWidth(); | |||
4393 | return IntWidth == 32 || IntWidth == 64; | |||
4394 | } | |||
4395 | ||||
4396 | bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) { | |||
4397 | // AVX2 doesn't support scatter | |||
4398 | if (!ST->hasAVX512()) | |||
4399 | return false; | |||
4400 | return isLegalMaskedGather(DataType, Alignment); | |||
4401 | } | |||
4402 | ||||
4403 | bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) { | |||
4404 | EVT VT = TLI->getValueType(DL, DataType); | |||
4405 | return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT); | |||
4406 | } | |||
4407 | ||||
4408 | bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) { | |||
4409 | return false; | |||
4410 | } | |||
4411 | ||||
4412 | bool X86TTIImpl::areInlineCompatible(const Function *Caller, | |||
4413 | const Function *Callee) const { | |||
4414 | const TargetMachine &TM = getTLI()->getTargetMachine(); | |||
4415 | ||||
4416 | // Work this as a subsetting of subtarget features. | |||
4417 | const FeatureBitset &CallerBits = | |||
4418 | TM.getSubtargetImpl(*Caller)->getFeatureBits(); | |||
4419 | const FeatureBitset &CalleeBits = | |||
4420 | TM.getSubtargetImpl(*Callee)->getFeatureBits(); | |||
4421 | ||||
4422 | FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList; | |||
4423 | FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList; | |||
4424 | return (RealCallerBits & RealCalleeBits) == RealCalleeBits; | |||
4425 | } | |||
4426 | ||||
4427 | bool X86TTIImpl::areFunctionArgsABICompatible( | |||
4428 | const Function *Caller, const Function *Callee, | |||
4429 | SmallPtrSetImpl<Argument *> &Args) const { | |||
4430 | if (!BaseT::areFunctionArgsABICompatible(Caller, Callee, Args)) | |||
4431 | return false; | |||
4432 | ||||
4433 | // If we get here, we know the target features match. If one function | |||
4434 | // considers 512-bit vectors legal and the other does not, consider them | |||
4435 | // incompatible. | |||
4436 | const TargetMachine &TM = getTLI()->getTargetMachine(); | |||
4437 | ||||
4438 | if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() == | |||
4439 | TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs()) | |||
4440 | return true; | |||
4441 | ||||
4442 | // Consider the arguments compatible if they aren't vectors or aggregates. | |||
4443 | // FIXME: Look at the size of vectors. | |||
4444 | // FIXME: Look at the element types of aggregates to see if there are vectors. | |||
4445 | // FIXME: The API of this function seems intended to allow arguments | |||
4446 | // to be removed from the set, but the caller doesn't check if the set | |||
4447 | // becomes empty so that may not work in practice. | |||
4448 | return llvm::none_of(Args, [](Argument *A) { | |||
4449 | auto *EltTy = cast<PointerType>(A->getType())->getElementType(); | |||
4450 | return EltTy->isVectorTy() || EltTy->isAggregateType(); | |||
4451 | }); | |||
4452 | } | |||
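// Illustrative scenario: if one function enables 512-bit registers (e.g. via
// a "min-legal-vector-width"="512" attribute on an AVX-512 target) and the
// other does not, a <16 x float> argument could be passed in different
// register classes, so mismatched functions are only treated as compatible
// when no vector or aggregate arguments are involved.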
4453 | ||||
4454 | X86TTIImpl::TTI::MemCmpExpansionOptions | |||
4455 | X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { | |||
4456 | TTI::MemCmpExpansionOptions Options; | |||
4457 | Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); | |||
4458 | Options.NumLoadsPerBlock = 2; | |||
4459 | // All GPR and vector loads can be unaligned. | |||
4460 | Options.AllowOverlappingLoads = true; | |||
4461 | if (IsZeroCmp) { | |||
4462 |     // Only enable vector loads for equality comparison. Right now the vector | |||
4463 |     // version is not as fast for three-way compare (see #33329). | |||
4464 | const unsigned PreferredWidth = ST->getPreferVectorWidth(); | |||
4465 | if (PreferredWidth >= 512 && ST->hasAVX512()) Options.LoadSizes.push_back(64); | |||
4466 | if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32); | |||
4467 | if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16); | |||
4468 | } | |||
4469 | if (ST->is64Bit()) { | |||
4470 | Options.LoadSizes.push_back(8); | |||
4471 | } | |||
4472 | Options.LoadSizes.push_back(4); | |||
4473 | Options.LoadSizes.push_back(2); | |||
4474 | Options.LoadSizes.push_back(1); | |||
4475 | return Options; | |||
4476 | } | |||
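// Example of the resulting configuration: on a 64-bit AVX2 target with a
// preferred vector width >= 256, an equality compare gets
// LoadSizes = {32, 16, 8, 4, 2, 1}; since overlapping loads are allowed, a
// 31-byte memcmp(a, b, 31) == 0 can be expanded as two overlapping 16-byte
// vector loads (at offsets 0 and 15) instead of a libcall.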
4477 | ||||
4478 | bool X86TTIImpl::enableInterleavedAccessVectorization() { | |||
4479 | // TODO: We expect this to be beneficial regardless of arch, | |||
4480 | // but there are currently some unexplained performance artifacts on Atom. | |||
4481 | // As a temporary solution, disable on Atom. | |||
4482 | return !(ST->isAtom()); | |||
4483 | } | |||
4484 | ||||
4485 | // Get a cost estimate for interleaved load/store operations for AVX2. | |||
4486 | // \p Factor is the interleaved-access factor (stride) - the number of | |||
4487 | // (interleaved) elements in the group. | |||
4488 | // \p Indices contains the indices for a strided load: when the | |||
4489 | // interleaved load has gaps, they indicate which elements are used. | |||
4490 | // If Indices is empty (or if the number of indices equals the size of the | |||
4491 | // interleaved access as given in \p Factor), the access has no gaps. | |||
4492 | // | |||
4493 | // As opposed to AVX-512, AVX2 does not have generic shuffles that would allow | |||
4494 | // computing the cost with a generic formula as a function of the number of | |||
4495 | // shuffles. We therefore use a lookup table instead, filled according to | |||
4496 | // the instruction sequences that codegen currently generates. | |||
4497 | int X86TTIImpl::getInterleavedMemoryOpCostAVX2( | |||
4498 | unsigned Opcode, FixedVectorType *VecTy, unsigned Factor, | |||
4499 | ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace, | |||
4500 | TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) { | |||
4501 | ||||
4502 | if (UseMaskForCond || UseMaskForGaps) | |||
4503 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, | |||
4504 | Alignment, AddressSpace, CostKind, | |||
4505 | UseMaskForCond, UseMaskForGaps); | |||
4506 | ||||
4507 |   // We currently support only fully-interleaved groups, with no gaps. | |||
4508 | // TODO: Support also strided loads (interleaved-groups with gaps). | |||
4509 | if (Indices.size() && Indices.size() != Factor) | |||
4510 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, | |||
4511 | Alignment, AddressSpace, | |||
4512 | CostKind); | |||
4513 | ||||
4514 | // VecTy for interleave memop is <VF*Factor x Elt>. | |||
4515 | // So, for VF=4, Interleave Factor = 3, Element type = i32 we have | |||
4516 | // VecTy = <12 x i32>. | |||
4517 | MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second; | |||
4518 | ||||
4519 |   // This function can be called with VecTy = <6 x i128>, Factor = 3, in which | |||
4520 |   // case VF = 2, while v2i128 is an unsupported MVT vector type | |||
4521 |   // (see MachineValueType.h::getVectorVT()). | |||
4522 | if (!LegalVT.isVector()) | |||
4523 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, | |||
4524 | Alignment, AddressSpace, | |||
4525 | CostKind); | |||
4526 | ||||
4527 | unsigned VF = VecTy->getNumElements() / Factor; | |||
4528 | Type *ScalarTy = VecTy->getElementType(); | |||
4529 | ||||
4530 |   // Calculate the number of memory operations (NumOfMemOps) required | |||
4531 |   // to load/store the VecTy. | |||
4532 | unsigned VecTySize = DL.getTypeStoreSize(VecTy); | |||
4533 | unsigned LegalVTSize = LegalVT.getStoreSize(); | |||
4534 | unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize; | |||
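// E.g. VecTy = <16 x i32> occupies 64 bytes; with a 256-bit legal type
// (v8i32, 32 bytes) this rounds up to NumOfMemOps = (64 + 32 - 1) / 32 = 2.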
4535 | ||||
4536 | // Get the cost of one memory operation. | |||
4537 | auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(), | |||
4538 | LegalVT.getVectorNumElements()); | |||
4539 | unsigned MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, | |||
4540 | MaybeAlign(Alignment), AddressSpace, | |||
4541 | CostKind); | |||
4542 | ||||
4543 | auto *VT = FixedVectorType::get(ScalarTy, VF); | |||
4544 | EVT ETy = TLI->getValueType(DL, VT); | |||
4545 | if (!ETy.isSimple()) | |||
4546 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, | |||
4547 | Alignment, AddressSpace, | |||
4548 | CostKind); | |||
4549 | ||||
4550 |   // TODO: Complete for other data types and strides. | |||
4551 |   // Each combination of Stride, ElementTy and VF results in a different | |||
4552 |   // sequence; the cost tables are therefore indexed by | |||
4553 |   // Factor (stride) and VectorType = VF x ElemType. | |||
4554 |   // The cost accounts only for the shuffle sequence; | |||
4555 |   // the cost of the loads/stores is accounted for separately. | |||
4556 | // | |||
4557 | static const CostTblEntry AVX2InterleavedLoadTbl[] = { | |||
4558 | { 2, MVT::v4i64, 6 }, //(load 8i64 and) deinterleave into 2 x 4i64 | |||
4559 | { 2, MVT::v4f64, 6 }, //(load 8f64 and) deinterleave into 2 x 4f64 | |||
4560 | ||||
4561 | { 3, MVT::v2i8, 10 }, //(load 6i8 and) deinterleave into 3 x 2i8 | |||
4562 | { 3, MVT::v4i8, 4 }, //(load 12i8 and) deinterleave into 3 x 4i8 | |||
4563 | { 3, MVT::v8i8, 9 }, //(load 24i8 and) deinterleave into 3 x 8i8 | |||
4564 | { 3, MVT::v16i8, 11}, //(load 48i8 and) deinterleave into 3 x 16i8 | |||
4565 | { 3, MVT::v32i8, 13}, //(load 96i8 and) deinterleave into 3 x 32i8 | |||
4566 |     { 3, MVT::v8f32, 17 }, //(load 24f32 and) deinterleave into 3 x 8f32 | |||
4567 | ||||
4568 | { 4, MVT::v2i8, 12 }, //(load 8i8 and) deinterleave into 4 x 2i8 | |||
4569 | { 4, MVT::v4i8, 4 }, //(load 16i8 and) deinterleave into 4 x 4i8 | |||
4570 | { 4, MVT::v8i8, 20 }, //(load 32i8 and) deinterleave into 4 x 8i8 | |||
4571 | { 4, MVT::v16i8, 39 }, //(load 64i8 and) deinterleave into 4 x 16i8 | |||
4572 | { 4, MVT::v32i8, 80 }, //(load 128i8 and) deinterleave into 4 x 32i8 | |||
4573 | ||||
4574 |     { 8, MVT::v8f32, 40 } //(load 64f32 and) deinterleave into 8 x 8f32 | |||
4575 | }; | |||
4576 | ||||
4577 | static const CostTblEntry AVX2InterleavedStoreTbl[] = { | |||
4578 | { 2, MVT::v4i64, 6 }, //interleave into 2 x 4i64 into 8i64 (and store) | |||
4579 | { 2, MVT::v4f64, 6 }, //interleave into 2 x 4f64 into 8f64 (and store) | |||
4580 | ||||
4581 | { 3, MVT::v2i8, 7 }, //interleave 3 x 2i8 into 6i8 (and store) | |||
4582 | { 3, MVT::v4i8, 8 }, //interleave 3 x 4i8 into 12i8 (and store) | |||
4583 | { 3, MVT::v8i8, 11 }, //interleave 3 x 8i8 into 24i8 (and store) | |||
4584 | { 3, MVT::v16i8, 11 }, //interleave 3 x 16i8 into 48i8 (and store) | |||
4585 | { 3, MVT::v32i8, 13 }, //interleave 3 x 32i8 into 96i8 (and store) | |||
4586 | ||||
4587 | { 4, MVT::v2i8, 12 }, //interleave 4 x 2i8 into 8i8 (and store) | |||
4588 | { 4, MVT::v4i8, 9 }, //interleave 4 x 4i8 into 16i8 (and store) | |||
4589 | { 4, MVT::v8i8, 10 }, //interleave 4 x 8i8 into 32i8 (and store) | |||
4590 | { 4, MVT::v16i8, 10 }, //interleave 4 x 16i8 into 64i8 (and store) | |||
4591 | { 4, MVT::v32i8, 12 } //interleave 4 x 32i8 into 128i8 (and store) | |||
4592 | }; | |||
4593 | ||||
4594 | if (Opcode == Instruction::Load) { | |||
4595 | if (const auto *Entry = | |||
4596 | CostTableLookup(AVX2InterleavedLoadTbl, Factor, ETy.getSimpleVT())) | |||
4597 | return NumOfMemOps * MemOpCost + Entry->Cost; | |||
4598 | } else { | |||
4599 |     assert(Opcode == Instruction::Store && | |||
4600 |            "Expected Store Instruction at this point"); | |||
4601 | if (const auto *Entry = | |||
4602 | CostTableLookup(AVX2InterleavedStoreTbl, Factor, ETy.getSimpleVT())) | |||
4603 | return NumOfMemOps * MemOpCost + Entry->Cost; | |||
4604 | } | |||
4605 | ||||
4606 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, | |||
4607 | Alignment, AddressSpace, CostKind); | |||
4608 | } | |||
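// Putting the pieces together, roughly: a stride-3 load of <24 x i8> on AVX2
// legalizes to a single 32-byte memory op (v24i8 widens to v32i8), and the
// per-result type v8i8 matches the {3, MVT::v8i8, 9} table entry above, so
// the returned cost is 1 * MemOpCost + 9.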
4609 | ||||
4610 | // Get a cost estimate for interleaved load/store operations and strided loads. | |||
4611 | // \p Indices contains the indices for a strided load. | |||
4612 | // \p Factor is the factor of interleaving. | |||
4613 | // AVX-512 provides 3-src shuffles that significantly reduce the cost. | |||
4614 | int X86TTIImpl::getInterleavedMemoryOpCostAVX512( | |||
4615 | unsigned Opcode, FixedVectorType *VecTy, unsigned Factor, | |||
4616 | ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace, | |||
4617 | TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) { | |||
4618 | ||||
4619 | if (UseMaskForCond || UseMaskForGaps) | |||
4620 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, | |||
4621 | Alignment, AddressSpace, CostKind, | |||
4622 | UseMaskForCond, UseMaskForGaps); | |||
4623 | ||||
4624 | // VecTy for interleave memop is <VF*Factor x Elt>. | |||
4625 | // So, for VF=4, Interleave Factor = 3, Element type = i32 we have | |||
4626 | // VecTy = <12 x i32>. | |||
4627 | ||||
4628 |   // Calculate the number of memory operations (NumOfMemOps) required | |||
4629 |   // to load/store the VecTy. | |||
4630 | MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second; | |||
4631 | unsigned VecTySize = DL.getTypeStoreSize(VecTy); | |||
4632 | unsigned LegalVTSize = LegalVT.getStoreSize(); | |||
4633 | unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize; | |||
4634 | ||||
4635 | // Get the cost of one memory operation. | |||
4636 | auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(), | |||
4637 | LegalVT.getVectorNumElements()); | |||
4638 | unsigned MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, | |||
4639 | MaybeAlign(Alignment), AddressSpace, | |||
4640 | CostKind); | |||
4641 | ||||
4642 | unsigned VF = VecTy->getNumElements() / Factor; | |||
4643 | MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF); | |||
4644 | ||||
4645 | if (Opcode == Instruction::Load) { | |||
4646 | // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl) | |||
4647 | // contain the cost of the optimized shuffle sequence that the | |||
4648 | // X86InterleavedAccess pass will generate. | |||
4649 |     // The cost of loads and stores is computed separately from the table. | |||
4650 | ||||
4651 |     // X86InterleavedAccess supports only the following interleaved-access groups. | |||
4652 | static const CostTblEntry AVX512InterleavedLoadTbl[] = { | |||
4653 | {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8 | |||
4654 | {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8 | |||
4655 |         {3, MVT::v64i8, 22}, //(load 192i8 and) deinterleave into 3 x 64i8 | |||
4656 | }; | |||
4657 | ||||
4658 | if (const auto *Entry = | |||
4659 | CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT)) | |||
4660 | return NumOfMemOps * MemOpCost + Entry->Cost; | |||
4661 |     // If an entry does not exist, fall back to the default implementation. | |||
4662 | ||||
4663 |     // The kind of shuffle depends on the number of loaded values. | |||
4664 | // If we load the entire data in one register, we can use a 1-src shuffle. | |||
4665 | // Otherwise, we'll merge 2 sources in each operation. | |||
4666 | TTI::ShuffleKind ShuffleKind = | |||
4667 | (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc; | |||
4668 | ||||
4669 | unsigned ShuffleCost = | |||
4670 | getShuffleCost(ShuffleKind, SingleMemOpTy, 0, nullptr); | |||
4671 | ||||
4672 | unsigned NumOfLoadsInInterleaveGrp = | |||
4673 | Indices.size() ? Indices.size() : Factor; | |||
4674 | auto *ResultTy = FixedVectorType::get(VecTy->getElementType(), | |||
4675 | VecTy->getNumElements() / Factor); | |||
4676 | unsigned NumOfResults = | |||
4677 | getTLI()->getTypeLegalizationCost(DL, ResultTy).first * | |||
4678 | NumOfLoadsInInterleaveGrp; | |||
4679 | ||||
4680 |     // About half of the loads may be folded into shuffles when we have only | |||
4681 | // one result. If we have more than one result, we do not fold loads at all. | |||
4682 | unsigned NumOfUnfoldedLoads = | |||
4683 | NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2; | |||
4684 | ||||
4685 |     // Get the number of shuffle operations per result. | |||
4686 | unsigned NumOfShufflesPerResult = | |||
4687 | std::max((unsigned)1, (unsigned)(NumOfMemOps - 1)); | |||
4688 | ||||
4689 |     // The SK_PermuteTwoSrc shuffle clobbers one of its source operands. | |||
4690 | // When we have more than one destination, we need additional instructions | |||
4691 | // to keep sources. | |||
4692 | unsigned NumOfMoves = 0; | |||
4693 | if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc) | |||
4694 | NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2; | |||
4695 | ||||
4696 | int Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost + | |||
4697 | NumOfUnfoldedLoads * MemOpCost + NumOfMoves; | |||
4698 | ||||
4699 | return Cost; | |||
4700 | } | |||
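// Rough example of the load-path formula: Factor = 2 with VecTy = <8 x i64>
// on a 512-bit AVX-512 target gives NumOfMemOps = 1 (one zmm load), a
// single-source shuffle kind, NumOfResults = 2, NumOfShufflesPerResult = 1
// and NumOfMoves = 0, so Cost = 2 * ShuffleCost + 1 * MemOpCost.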
4701 | ||||
4702 | // Store. | |||
4703 |   assert(Opcode == Instruction::Store && | |||
4704 |          "Expected Store Instruction at this point"); | |||
4705 |   // X86InterleavedAccess supports only the following interleaved-access groups. | |||
4706 | static const CostTblEntry AVX512InterleavedStoreTbl[] = { | |||
4707 | {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store) | |||
4708 | {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store) | |||
4709 |       {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 192i8 (and store) | |||
4710 | ||||
4711 | {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store) | |||
4712 | {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store) | |||
4713 | {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store) | |||
4714 |       {4, MVT::v64i8, 24}  // interleave 4 x 64i8 into 256i8 (and store) | |||
4715 | }; | |||
4716 | ||||
4717 | if (const auto *Entry = | |||
4718 | CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT)) | |||
4719 | return NumOfMemOps * MemOpCost + Entry->Cost; | |||
4720 |   // If an entry does not exist, fall back to the default implementation. | |||
4721 | ||||
4722 |   // There are no strided stores at the moment, and a store can't be folded | |||
4723 |   // into a shuffle. | |||
4724 | unsigned NumOfSources = Factor; // The number of values to be merged. | |||
4725 | unsigned ShuffleCost = | |||
4726 | getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, 0, nullptr); | |||
4727 | unsigned NumOfShufflesPerStore = NumOfSources - 1; | |||
4728 | ||||
4729 |   // The SK_PermuteTwoSrc shuffle clobbers one of its source operands. | |||
4730 | // We need additional instructions to keep sources. | |||
4731 | unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2; | |||
4732 | int Cost = NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) + | |||
4733 | NumOfMoves; | |||
4734 | return Cost; | |||
4735 | } | |||
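// Rough example of the store-path formula: interleaving 3 x <8 x i32> into a
// <24 x i32> store on a 512-bit AVX-512 target gives NumOfMemOps = 2,
// NumOfShufflesPerStore = 2 and NumOfMoves = 2 * 2 / 2 = 2, so
// Cost = 2 * (MemOpCost + 2 * ShuffleCost) + 2.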
4736 | ||||
4737 | int X86TTIImpl::getInterleavedMemoryOpCost( | |||
4738 | unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, | |||
4739 | Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, | |||
4740 | bool UseMaskForCond, bool UseMaskForGaps) { | |||
4741 | auto isSupportedOnAVX512 = [](Type *VecTy, bool HasBW) { | |||
4742 | Type *EltTy = cast<VectorType>(VecTy)->getElementType(); | |||
4743 | if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) || | |||
4744 | EltTy->isIntegerTy(32) || EltTy->isPointerTy()) | |||
4745 | return true; | |||
4746 | if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8)) | |||
4747 | return HasBW; | |||
4748 | return false; | |||
4749 | }; | |||
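// E.g. the lambda above admits f32/f64/i32/i64/pointer elements on any
// AVX-512 CPU, while i8/i16 elements additionally require BWI; anything else
// falls through to the AVX2 or base-class cost paths below.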
4750 | if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI())) | |||
| ||||
4751 | return getInterleavedMemoryOpCostAVX512( | |||
4752 | Opcode, cast<FixedVectorType>(VecTy), Factor, Indices, Alignment, | |||
4753 | AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps); | |||
4754 | if (ST->hasAVX2()) | |||
4755 | return getInterleavedMemoryOpCostAVX2( | |||
4756 | Opcode, cast<FixedVectorType>(VecTy), Factor, Indices, Alignment, | |||
4757 | AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps); | |||
4758 | ||||
4759 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, | |||
4760 | Alignment, AddressSpace, CostKind, | |||
4761 | UseMaskForCond, UseMaskForGaps); | |||
4762 | } |
1 | //===-- X86Subtarget.h - Define Subtarget for the X86 ----------*- C++ -*--===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file declares the X86 specific subclass of TargetSubtargetInfo. |
10 | // |
11 | //===----------------------------------------------------------------------===// |
12 | |
13 | #ifndef LLVM_LIB_TARGET_X86_X86SUBTARGET_H |
14 | #define LLVM_LIB_TARGET_X86_X86SUBTARGET_H |
15 | |
16 | #include "X86FrameLowering.h" |
17 | #include "X86ISelLowering.h" |
18 | #include "X86InstrInfo.h" |
19 | #include "X86SelectionDAGInfo.h" |
20 | #include "llvm/ADT/Triple.h" |
21 | #include "llvm/CodeGen/TargetSubtargetInfo.h" |
22 | #include "llvm/IR/CallingConv.h" |
23 | #include <climits> |
24 | #include <memory> |
25 | |
26 | #define GET_SUBTARGETINFO_HEADER |
27 | #include "X86GenSubtargetInfo.inc" |
28 | |
29 | namespace llvm { |
30 | |
31 | class CallLowering; |
32 | class GlobalValue; |
33 | class InstructionSelector; |
34 | class LegalizerInfo; |
35 | class RegisterBankInfo; |
36 | class StringRef; |
37 | class TargetMachine; |
38 | |
39 | /// The X86 backend supports a number of different styles of PIC. |
40 | /// |
41 | namespace PICStyles { |
42 | |
43 | enum class Style { |
44 |   StubPIC, // Used on i386-darwin in PIC mode.
45 |   GOT,     // Used on 32-bit ELF when in PIC mode.
46 |   RIPRel,  // Used on X86-64 when in PIC mode.
47 |   None     // Set when not in PIC mode.
48 | }; |
49 | |
50 | } // end namespace PICStyles |
51 | |
52 | class X86Subtarget final : public X86GenSubtargetInfo { |
53 | // NOTE: Do not add anything new to this list. Coarse, CPU name based flags |
54 | // are not a good idea. We should be migrating away from these. |
55 | enum X86ProcFamilyEnum { |
56 | Others, |
57 | IntelAtom, |
58 | IntelSLM |
59 | }; |
60 | |
61 | enum X86SSEEnum { |
62 | NoSSE, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F |
63 | }; |
64 | |
65 | enum X863DNowEnum { |
66 | NoThreeDNow, MMX, ThreeDNow, ThreeDNowA |
67 | }; |
68 | |
69 | /// X86 processor family: Intel Atom, and others |
70 | X86ProcFamilyEnum X86ProcFamily = Others; |
71 | |
72 | /// Which PIC style to use |
73 | PICStyles::Style PICStyle; |
74 | |
75 | const TargetMachine &TM; |
76 | |
77 |   /// SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F, or none supported.
78 | X86SSEEnum X86SSELevel = NoSSE; |
79 | |
80 | /// MMX, 3DNow, 3DNow Athlon, or none supported. |
81 | X863DNowEnum X863DNowLevel = NoThreeDNow; |
82 | |
83 | /// True if the processor supports X87 instructions. |
84 | bool HasX87 = false; |
85 | |
86 | /// True if the processor supports CMPXCHG8B. |
87 | bool HasCmpxchg8b = false; |
88 | |
89 | /// True if this processor has NOPL instruction |
90 | /// (generally pentium pro+). |
91 | bool HasNOPL = false; |
92 | |
93 | /// True if this processor has conditional move instructions |
94 | /// (generally pentium pro+). |
95 | bool HasCMov = false; |
96 | |
97 | /// True if the processor supports X86-64 instructions. |
98 | bool HasX86_64 = false; |
99 | |
100 | /// True if the processor supports POPCNT. |
101 | bool HasPOPCNT = false; |
102 | |
103 | /// True if the processor supports SSE4A instructions. |
104 | bool HasSSE4A = false; |
105 | |
106 | /// Target has AES instructions |
107 | bool HasAES = false; |
108 | bool HasVAES = false; |
109 | |
110 |   /// Target has FXSAVE/FXRSTOR instructions
111 | bool HasFXSR = false; |
112 | |
113 | /// Target has XSAVE instructions |
114 | bool HasXSAVE = false; |
115 | |
116 | /// Target has XSAVEOPT instructions |
117 | bool HasXSAVEOPT = false; |
118 | |
119 | /// Target has XSAVEC instructions |
120 | bool HasXSAVEC = false; |
121 | |
122 | /// Target has XSAVES instructions |
123 | bool HasXSAVES = false; |
124 | |
125 | /// Target has carry-less multiplication |
126 | bool HasPCLMUL = false; |
127 | bool HasVPCLMULQDQ = false; |
128 | |
129 | /// Target has Galois Field Arithmetic instructions |
130 | bool HasGFNI = false; |
131 | |
132 | /// Target has 3-operand fused multiply-add |
133 | bool HasFMA = false; |
134 | |
135 | /// Target has 4-operand fused multiply-add |
136 | bool HasFMA4 = false; |
137 | |
138 | /// Target has XOP instructions |
139 | bool HasXOP = false; |
140 | |
141 | /// Target has TBM instructions. |
142 | bool HasTBM = false; |
143 | |
144 | /// Target has LWP instructions |
145 | bool HasLWP = false; |
146 | |
147 | /// True if the processor has the MOVBE instruction. |
148 | bool HasMOVBE = false; |
149 | |
150 | /// True if the processor has the RDRAND instruction. |
151 | bool HasRDRAND = false; |
152 | |
153 | /// Processor has 16-bit floating point conversion instructions. |
154 | bool HasF16C = false; |
155 | |
156 |   /// Processor has FS/GS base instructions.
157 | bool HasFSGSBase = false; |
158 | |
159 | /// Processor has LZCNT instruction. |
160 | bool HasLZCNT = false; |
161 | |
162 | /// Processor has BMI1 instructions. |
163 | bool HasBMI = false; |
164 | |
165 | /// Processor has BMI2 instructions. |
166 | bool HasBMI2 = false; |
167 | |
168 | /// Processor has VBMI instructions. |
169 | bool HasVBMI = false; |
170 | |
171 | /// Processor has VBMI2 instructions. |
172 | bool HasVBMI2 = false; |
173 | |
174 | /// Processor has Integer Fused Multiply Add |
175 | bool HasIFMA = false; |
176 | |
177 | /// Processor has RTM instructions. |
178 | bool HasRTM = false; |
179 | |
180 | /// Processor has ADX instructions. |
181 | bool HasADX = false; |
182 | |
183 | /// Processor has SHA instructions. |
184 | bool HasSHA = false; |
185 | |
186 | /// Processor has PRFCHW instructions. |
187 | bool HasPRFCHW = false; |
188 | |
189 | /// Processor has RDSEED instructions. |
190 | bool HasRDSEED = false; |
191 | |
192 | /// Processor has LAHF/SAHF instructions in 64-bit mode. |
193 | bool HasLAHFSAHF64 = false; |
194 | |
195 | /// Processor has MONITORX/MWAITX instructions. |
196 | bool HasMWAITX = false; |
197 | |
198 | /// Processor has Cache Line Zero instruction |
199 | bool HasCLZERO = false; |
200 | |
201 | /// Processor has Cache Line Demote instruction |
202 | bool HasCLDEMOTE = false; |
203 | |
204 | /// Processor has MOVDIRI instruction (direct store integer). |
205 | bool HasMOVDIRI = false; |
206 | |
207 | /// Processor has MOVDIR64B instruction (direct store 64 bytes). |
208 | bool HasMOVDIR64B = false; |
209 | |
210 | /// Processor has ptwrite instruction. |
211 | bool HasPTWRITE = false; |
212 | |
213 | /// Processor has Prefetch with intent to Write instruction |
214 | bool HasPREFETCHWT1 = false; |
215 | |
216 | /// True if SHLD instructions are slow. |
217 | bool IsSHLDSlow = false; |
218 | |
219 |   /// True if the PMULLD instruction is slow compared to PMULLW/PMULHW and
220 |   /// PMULUDQ.
221 | bool IsPMULLDSlow = false; |
222 | |
223 | /// True if the PMADDWD instruction is slow compared to PMULLD. |
224 | bool IsPMADDWDSlow = false; |
225 | |
226 | /// True if unaligned memory accesses of 16-bytes are slow. |
227 | bool IsUAMem16Slow = false; |
228 | |
229 | /// True if unaligned memory accesses of 32-bytes are slow. |
230 | bool IsUAMem32Slow = false; |
231 | |
232 | /// True if SSE operations can have unaligned memory operands. |
233 | /// This may require setting a configuration bit in the processor. |
234 | bool HasSSEUnalignedMem = false; |
235 | |
236 | /// True if this processor has the CMPXCHG16B instruction; |
237 | /// this is true for most x86-64 chips, but not the first AMD chips. |
238 | bool HasCmpxchg16b = false; |
239 | |
240 | /// True if the LEA instruction should be used for adjusting |
241 | /// the stack pointer. This is an optimization for Intel Atom processors. |
242 | bool UseLeaForSP = false; |
243 | |
244 | /// True if POPCNT instruction has a false dependency on the destination register. |
245 | bool HasPOPCNTFalseDeps = false; |
246 | |
247 | /// True if LZCNT/TZCNT instructions have a false dependency on the destination register. |
248 | bool HasLZCNTFalseDeps = false; |
249 | |
250 |   /// True if it's preferable to combine to a single shuffle using a variable
251 | /// mask over multiple fixed shuffles. |
252 | bool HasFastVariableShuffle = false; |
253 | |
254 | /// True if vzeroupper instructions should be inserted after code that uses |
255 | /// ymm or zmm registers. |
256 | bool InsertVZEROUPPER = false; |
257 | |
258 | /// True if there is no performance penalty for writing NOPs with up to |
259 | /// 7 bytes. |
260 | bool HasFast7ByteNOP = false; |
261 | |
262 | /// True if there is no performance penalty for writing NOPs with up to |
263 | /// 11 bytes. |
264 | bool HasFast11ByteNOP = false; |
265 | |
266 | /// True if there is no performance penalty for writing NOPs with up to |
267 | /// 15 bytes. |
268 | bool HasFast15ByteNOP = false; |
269 | |
270 | /// True if gather is reasonably fast. This is true for Skylake client and |
271 | /// all AVX-512 CPUs. |
272 | bool HasFastGather = false; |
273 | |
274 | /// True if hardware SQRTSS instruction is at least as fast (latency) as |
275 | /// RSQRTSS followed by a Newton-Raphson iteration. |
276 | bool HasFastScalarFSQRT = false; |
277 | |
278 | /// True if hardware SQRTPS/VSQRTPS instructions are at least as fast |
279 | /// (throughput) as RSQRTPS/VRSQRTPS followed by a Newton-Raphson iteration. |
280 | bool HasFastVectorFSQRT = false; |
281 | |
282 | /// True if 8-bit divisions are significantly faster than |
283 | /// 32-bit divisions and should be used when possible. |
284 | bool HasSlowDivide32 = false; |
285 | |
286 | /// True if 32-bit divides are significantly faster than |
287 | /// 64-bit divisions and should be used when possible. |
288 | bool HasSlowDivide64 = false; |
289 | |
290 | /// True if LZCNT instruction is fast. |
291 | bool HasFastLZCNT = false; |
292 | |
293 | /// True if SHLD based rotate is fast. |
294 | bool HasFastSHLDRotate = false; |
295 | |
296 | /// True if the processor supports macrofusion. |
297 | bool HasMacroFusion = false; |
298 | |
299 | /// True if the processor supports branch fusion. |
300 | bool HasBranchFusion = false; |
301 | |
302 | /// True if the processor has enhanced REP MOVSB/STOSB. |
303 | bool HasERMSB = false; |
304 | |
305 | /// True if the processor has fast short REP MOV. |
306 | bool HasFSRM = false; |
307 | |
308 |   /// True if short functions should be padded to prevent
309 |   /// a stall when returning too early.
310 | bool PadShortFunctions = false; |
311 | |
312 | /// True if two memory operand instructions should use a temporary register |
313 | /// instead. |
314 | bool SlowTwoMemOps = false; |
315 | |
316 | /// True if the LEA instruction inputs have to be ready at address generation |
317 | /// (AG) time. |
318 | bool LEAUsesAG = false; |
319 | |
320 | /// True if the LEA instruction with certain arguments is slow |
321 | bool SlowLEA = false; |
322 | |
323 |   /// True if the LEA instruction has all three source operands (base, index,
324 |   /// and offset), or if the LEA instruction uses base and index registers
325 |   /// where the base is EBP, RBP, or R13.
326 | bool Slow3OpsLEA = false; |
327 | |
328 | /// True if INC and DEC instructions are slow when writing to flags |
329 | bool SlowIncDec = false; |
330 | |
331 | /// Processor has AVX-512 PreFetch Instructions |
332 | bool HasPFI = false; |
333 | |
334 | /// Processor has AVX-512 Exponential and Reciprocal Instructions |
335 | bool HasERI = false; |
336 | |
337 | /// Processor has AVX-512 Conflict Detection Instructions |
338 | bool HasCDI = false; |
339 | |
340 | /// Processor has AVX-512 population count Instructions |
341 | bool HasVPOPCNTDQ = false; |
342 | |
343 | /// Processor has AVX-512 Doubleword and Quadword instructions |
344 | bool HasDQI = false; |
345 | |
346 | /// Processor has AVX-512 Byte and Word instructions |
347 | bool HasBWI = false; |
348 | |
349 |   /// Processor has AVX-512 Vector Length eXtensions
350 | bool HasVLX = false; |
351 | |
352 |   /// Processor has PKU extensions
353 | bool HasPKU = false; |
354 | |
355 | /// Processor has AVX-512 Vector Neural Network Instructions |
356 | bool HasVNNI = false; |
357 | |
358 | /// Processor has AVX Vector Neural Network Instructions |
359 | bool HasAVXVNNI = false; |
360 | |
361 | /// Processor has AVX-512 bfloat16 floating-point extensions |
362 | bool HasBF16 = false; |
363 | |
364 | /// Processor supports ENQCMD instructions |
365 | bool HasENQCMD = false; |
366 | |
367 | /// Processor has AVX-512 Bit Algorithms instructions |
368 | bool HasBITALG = false; |
369 | |
370 | /// Processor has AVX-512 vp2intersect instructions |
371 | bool HasVP2INTERSECT = false; |
372 | |
373 | /// Processor supports CET SHSTK - Control-Flow Enforcement Technology |
374 | /// using Shadow Stack |
375 | bool HasSHSTK = false; |
376 | |
377 | /// Processor supports Invalidate Process-Context Identifier |
378 | bool HasINVPCID = false; |
379 | |
380 | /// Processor has Software Guard Extensions |
381 | bool HasSGX = false; |
382 | |
383 | /// Processor supports Flush Cache Line instruction |
384 | bool HasCLFLUSHOPT = false; |
385 | |
386 | /// Processor supports Cache Line Write Back instruction |
387 | bool HasCLWB = false; |
388 | |
389 | /// Processor supports Write Back No Invalidate instruction |
390 | bool HasWBNOINVD = false; |
391 | |
392 |   /// Processor supports RDPID instruction
393 | bool HasRDPID = false; |
394 | |
395 | /// Processor supports WaitPKG instructions |
396 | bool HasWAITPKG = false; |
397 | |
398 | /// Processor supports PCONFIG instruction |
399 | bool HasPCONFIG = false; |
400 | |
401 |   /// Processor supports Key Locker instructions
402 | bool HasKL = false; |
403 | |
404 |   /// Processor supports wide Key Locker instructions
405 | bool HasWIDEKL = false; |
406 | |
407 | /// Processor supports HRESET instruction |
408 | bool HasHRESET = false; |
409 | |
410 | /// Processor supports SERIALIZE instruction |
411 | bool HasSERIALIZE = false; |
412 | |
413 | /// Processor supports TSXLDTRK instruction |
414 | bool HasTSXLDTRK = false; |
415 | |
416 | /// Processor has AMX support |
417 | bool HasAMXTILE = false; |
418 | bool HasAMXBF16 = false; |
419 | bool HasAMXINT8 = false; |
420 | |
421 | /// Processor supports User Level Interrupt instructions |
422 | bool HasUINTR = false; |
423 | |
424 | /// Processor has a single uop BEXTR implementation. |
425 | bool HasFastBEXTR = false; |
426 | |
427 | /// Try harder to combine to horizontal vector ops if they are fast. |
428 | bool HasFastHorizontalOps = false; |
429 | |
430 | /// Prefer a left/right scalar logical shifts pair over a shift+and pair. |
431 | bool HasFastScalarShiftMasks = false; |
432 | |
433 | /// Prefer a left/right vector logical shifts pair over a shift+and pair. |
434 | bool HasFastVectorShiftMasks = false; |
435 | |
436 | /// Use a retpoline thunk rather than indirect calls to block speculative |
437 | /// execution. |
438 | bool UseRetpolineIndirectCalls = false; |
439 | |
440 | /// Use a retpoline thunk or remove any indirect branch to block speculative |
441 | /// execution. |
442 | bool UseRetpolineIndirectBranches = false; |
443 | |
444 | /// Deprecated flag, query `UseRetpolineIndirectCalls` and |
445 | /// `UseRetpolineIndirectBranches` instead. |
446 | bool DeprecatedUseRetpoline = false; |
447 | |
448 | /// When using a retpoline thunk, call an externally provided thunk rather |
449 | /// than emitting one inside the compiler. |
450 | bool UseRetpolineExternalThunk = false; |
451 | |
452 | /// Prevent generation of indirect call/branch instructions from memory, |
453 | /// and force all indirect call/branch instructions from a register to be |
454 | /// preceded by an LFENCE. Also decompose RET instructions into a |
455 | /// POP+LFENCE+JMP sequence. |
456 | bool UseLVIControlFlowIntegrity = false; |
457 | |
458 | /// Enable Speculative Execution Side Effect Suppression |
459 | bool UseSpeculativeExecutionSideEffectSuppression = false; |
460 | |
461 | /// Insert LFENCE instructions to prevent data speculatively injected into |
462 | /// loads from being used maliciously. |
463 | bool UseLVILoadHardening = false; |
464 | |
465 | /// Use software floating point for code generation. |
466 | bool UseSoftFloat = false; |
467 | |
468 | /// Use alias analysis during code generation. |
469 | bool UseAA = false; |
470 | |
471 |   /// The minimum alignment known to hold for the stack frame on
472 |   /// entry to the function, and which must be maintained by every function.
473 | Align stackAlignment = Align(4); |
474 | |
475 | Align TileConfigAlignment = Align(4); |
476 | |
477 | /// Max. memset / memcpy size that is turned into rep/movs, rep/stos ops. |
478 | /// |
479 | // FIXME: this is a known good value for Yonah. How about others? |
480 | unsigned MaxInlineSizeThreshold = 128; |
481 | |
482 | /// Indicates target prefers 128 bit instructions. |
483 | bool Prefer128Bit = false; |
484 | |
485 | /// Indicates target prefers 256 bit instructions. |
486 | bool Prefer256Bit = false; |
487 | |
488 | /// Indicates target prefers AVX512 mask registers. |
489 | bool PreferMaskRegisters = false; |
490 | |
491 | /// Use Goldmont specific floating point div/sqrt costs. |
492 | bool UseGLMDivSqrtCosts = false; |
493 | |
494 | /// What processor and OS we're targeting. |
495 | Triple TargetTriple; |
496 | |
497 | /// GlobalISel related APIs. |
498 | std::unique_ptr<CallLowering> CallLoweringInfo; |
499 | std::unique_ptr<LegalizerInfo> Legalizer; |
500 | std::unique_ptr<RegisterBankInfo> RegBankInfo; |
501 | std::unique_ptr<InstructionSelector> InstSelector; |
502 | |
503 | private: |
504 | /// Override the stack alignment. |
505 | MaybeAlign StackAlignOverride; |
506 | |
507 | /// Preferred vector width from function attribute. |
508 | unsigned PreferVectorWidthOverride; |
509 | |
510 | /// Resolved preferred vector width from function attribute and subtarget |
511 | /// features. |
512 |   unsigned PreferVectorWidth = UINT32_MAX;
513 | |
514 | /// Required vector width from function attribute. |
515 | unsigned RequiredVectorWidth; |
516 | |
517 | /// True if compiling for 64-bit, false for 16-bit or 32-bit. |
518 | bool In64BitMode = false; |
519 | |
520 | /// True if compiling for 32-bit, false for 16-bit or 64-bit. |
521 | bool In32BitMode = false; |
522 | |
523 | /// True if compiling for 16-bit, false for 32-bit or 64-bit. |
524 | bool In16BitMode = false; |
525 | |
526 | X86SelectionDAGInfo TSInfo; |
527 | // Ordering here is important. X86InstrInfo initializes X86RegisterInfo which |
528 | // X86TargetLowering needs. |
529 | X86InstrInfo InstrInfo; |
530 | X86TargetLowering TLInfo; |
531 | X86FrameLowering FrameLowering; |
532 | |
533 | public: |
534 | /// This constructor initializes the data members to match that |
535 | /// of the specified triple. |
536 | /// |
537 | X86Subtarget(const Triple &TT, StringRef CPU, StringRef TuneCPU, StringRef FS, |
538 | const X86TargetMachine &TM, MaybeAlign StackAlignOverride, |
539 | unsigned PreferVectorWidthOverride, |
540 | unsigned RequiredVectorWidth); |
541 | |
542 | const X86TargetLowering *getTargetLowering() const override { |
543 | return &TLInfo; |
544 | } |
545 | |
546 | const X86InstrInfo *getInstrInfo() const override { return &InstrInfo; } |
547 | |
548 | const X86FrameLowering *getFrameLowering() const override { |
549 | return &FrameLowering; |
550 | } |
551 | |
552 | const X86SelectionDAGInfo *getSelectionDAGInfo() const override { |
553 | return &TSInfo; |
554 | } |
555 | |
556 | const X86RegisterInfo *getRegisterInfo() const override { |
557 | return &getInstrInfo()->getRegisterInfo(); |
558 | } |
559 | |
560 | unsigned getTileConfigSize() const { return 64; } |
561 | Align getTileConfigAlignment() const { return TileConfigAlignment; } |
562 | |
563 |   /// Returns the minimum alignment known to hold for the
564 |   /// stack frame on entry to the function, and which must be maintained by
565 |   /// every function for this subtarget.
566 | Align getStackAlignment() const { return stackAlignment; } |
567 | |
568 | /// Returns the maximum memset / memcpy size |
569 | /// that still makes it profitable to inline the call. |
570 | unsigned getMaxInlineSizeThreshold() const { return MaxInlineSizeThreshold; } |
571 | |
572 |   /// ParseSubtargetFeatures - Parses the features string, setting the specified
573 |   /// subtarget options. The definition of this function is auto-generated by tblgen.
574 | void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); |
575 | |
576 | /// Methods used by Global ISel |
577 | const CallLowering *getCallLowering() const override; |
578 | InstructionSelector *getInstructionSelector() const override; |
579 | const LegalizerInfo *getLegalizerInfo() const override; |
580 | const RegisterBankInfo *getRegBankInfo() const override; |
581 | |
582 | private: |
583 | /// Initialize the full set of dependencies so we can use an initializer |
584 | /// list for X86Subtarget. |
585 | X86Subtarget &initializeSubtargetDependencies(StringRef CPU, |
586 | StringRef TuneCPU, |
587 | StringRef FS); |
588 | void initSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); |
589 | |
590 | public: |
591 | /// Is this x86_64? (disregarding specific ABI / programming model) |
592 | bool is64Bit() const { |
593 | return In64BitMode; |
594 | } |
595 | |
596 | bool is32Bit() const { |
597 | return In32BitMode; |
598 | } |
599 | |
600 | bool is16Bit() const { |
601 | return In16BitMode; |
602 | } |
603 | |
604 | /// Is this x86_64 with the ILP32 programming model (x32 ABI)? |
605 | bool isTarget64BitILP32() const { |
606 | return In64BitMode && (TargetTriple.getEnvironment() == Triple::GNUX32 || |
607 | TargetTriple.isOSNaCl()); |
608 | } |
609 | |
610 | /// Is this x86_64 with the LP64 programming model (standard AMD64, no x32)? |
611 | bool isTarget64BitLP64() const { |
612 | return In64BitMode && (TargetTriple.getEnvironment() != Triple::GNUX32 && |
613 | !TargetTriple.isOSNaCl()); |
614 | } |
615 | |
616 | PICStyles::Style getPICStyle() const { return PICStyle; } |
617 | void setPICStyle(PICStyles::Style Style) { PICStyle = Style; } |
618 | |
619 | bool hasX87() const { return HasX87; } |
620 | bool hasCmpxchg8b() const { return HasCmpxchg8b; } |
621 | bool hasNOPL() const { return HasNOPL; } |
622 | // SSE codegen depends on cmovs, and all SSE1+ processors support them. |
623 | // All 64-bit processors support cmov. |
624 | bool hasCMov() const { return HasCMov || X86SSELevel >= SSE1 || is64Bit(); } |
625 | bool hasSSE1() const { return X86SSELevel >= SSE1; } |
626 | bool hasSSE2() const { return X86SSELevel >= SSE2; } |
627 | bool hasSSE3() const { return X86SSELevel >= SSE3; } |
628 | bool hasSSSE3() const { return X86SSELevel >= SSSE3; } |
629 | bool hasSSE41() const { return X86SSELevel >= SSE41; } |
630 | bool hasSSE42() const { return X86SSELevel >= SSE42; } |
631 | bool hasAVX() const { return X86SSELevel >= AVX; } |
632 | bool hasAVX2() const { return X86SSELevel >= AVX2; } |
633 | bool hasAVX512() const { return X86SSELevel >= AVX512F; } |
634 | bool hasInt256() const { return hasAVX2(); } |
635 | bool hasSSE4A() const { return HasSSE4A; } |
636 | bool hasMMX() const { return X863DNowLevel >= MMX; } |
637 | bool has3DNow() const { return X863DNowLevel >= ThreeDNow; } |
638 | bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; } |
639 | bool hasPOPCNT() const { return HasPOPCNT; } |
640 | bool hasAES() const { return HasAES; } |
641 | bool hasVAES() const { return HasVAES; } |
642 | bool hasFXSR() const { return HasFXSR; } |
643 | bool hasXSAVE() const { return HasXSAVE; } |
644 | bool hasXSAVEOPT() const { return HasXSAVEOPT; } |
645 | bool hasXSAVEC() const { return HasXSAVEC; } |
646 | bool hasXSAVES() const { return HasXSAVES; } |
647 | bool hasPCLMUL() const { return HasPCLMUL; } |
648 | bool hasVPCLMULQDQ() const { return HasVPCLMULQDQ; } |
649 | bool hasGFNI() const { return HasGFNI; } |
650 |   // Prefer FMA4 to FMA - it's better for commutation/memory folding and
651 | // has equal or better performance on all supported targets. |
652 | bool hasFMA() const { return HasFMA; } |
653 | bool hasFMA4() const { return HasFMA4; } |
654 | bool hasAnyFMA() const { return hasFMA() || hasFMA4(); } |
655 | bool hasXOP() const { return HasXOP; } |
656 | bool hasTBM() const { return HasTBM; } |
657 | bool hasLWP() const { return HasLWP; } |
658 | bool hasMOVBE() const { return HasMOVBE; } |
659 | bool hasRDRAND() const { return HasRDRAND; } |
660 | bool hasF16C() const { return HasF16C; } |
661 | bool hasFSGSBase() const { return HasFSGSBase; } |
662 | bool hasLZCNT() const { return HasLZCNT; } |
663 | bool hasBMI() const { return HasBMI; } |
664 | bool hasBMI2() const { return HasBMI2; } |
665 | bool hasVBMI() const { return HasVBMI; } |
666 | bool hasVBMI2() const { return HasVBMI2; } |
667 | bool hasIFMA() const { return HasIFMA; } |
668 | bool hasRTM() const { return HasRTM; } |
669 | bool hasADX() const { return HasADX; } |
670 | bool hasSHA() const { return HasSHA; } |
671 | bool hasPRFCHW() const { return HasPRFCHW; } |
672 | bool hasPREFETCHWT1() const { return HasPREFETCHWT1; } |
673 | bool hasPrefetchW() const { |
674 |     // The PREFETCHW instruction was added with 3DNow, but later CPUs gave it
675 |     // its own CPUID bit as part of deprecating 3DNow. Intel eventually added
676 |     // it too, and KNL has another variant that prefetches to L2 cache. We
677 |     // assume the L1 version exists if the L2 version does.
678 | return has3DNow() || hasPRFCHW() || hasPREFETCHWT1(); |
679 | } |
680 | bool hasSSEPrefetch() const { |
681 |     // We implicitly enable these when we have a write prefetch supporting cache
682 |     // level, OR if we have PRFCHW but don't already have a read prefetch from
683 |     // 3DNow.
684 | return hasSSE1() || (hasPRFCHW() && !has3DNow()) || hasPREFETCHWT1(); |
685 | } |
686 | bool hasRDSEED() const { return HasRDSEED; } |
687 | bool hasLAHFSAHF() const { return HasLAHFSAHF64 || !is64Bit(); } |
688 | bool hasMWAITX() const { return HasMWAITX; } |
689 | bool hasCLZERO() const { return HasCLZERO; } |
690 | bool hasCLDEMOTE() const { return HasCLDEMOTE; } |
691 | bool hasMOVDIRI() const { return HasMOVDIRI; } |
692 | bool hasMOVDIR64B() const { return HasMOVDIR64B; } |
693 | bool hasPTWRITE() const { return HasPTWRITE; } |
694 | bool isSHLDSlow() const { return IsSHLDSlow; } |
695 | bool isPMULLDSlow() const { return IsPMULLDSlow; } |
696 | bool isPMADDWDSlow() const { return IsPMADDWDSlow; } |
697 | bool isUnalignedMem16Slow() const { return IsUAMem16Slow; } |
698 | bool isUnalignedMem32Slow() const { return IsUAMem32Slow; } |
699 | bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; } |
700 | bool hasCmpxchg16b() const { return HasCmpxchg16b && is64Bit(); } |
701 | bool useLeaForSP() const { return UseLeaForSP; } |
702 | bool hasPOPCNTFalseDeps() const { return HasPOPCNTFalseDeps; } |
703 | bool hasLZCNTFalseDeps() const { return HasLZCNTFalseDeps; } |
704 | bool hasFastVariableShuffle() const { |
705 | return HasFastVariableShuffle; |
706 | } |
707 | bool insertVZEROUPPER() const { return InsertVZEROUPPER; } |
708 | bool hasFastGather() const { return HasFastGather; } |
709 | bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; } |
710 | bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; } |
711 | bool hasFastLZCNT() const { return HasFastLZCNT; } |
712 | bool hasFastSHLDRotate() const { return HasFastSHLDRotate; } |
713 | bool hasFastBEXTR() const { return HasFastBEXTR; } |
714 | bool hasFastHorizontalOps() const { return HasFastHorizontalOps; } |
715 | bool hasFastScalarShiftMasks() const { return HasFastScalarShiftMasks; } |
716 | bool hasFastVectorShiftMasks() const { return HasFastVectorShiftMasks; } |
717 | bool hasMacroFusion() const { return HasMacroFusion; } |
718 | bool hasBranchFusion() const { return HasBranchFusion; } |
719 | bool hasERMSB() const { return HasERMSB; } |
720 | bool hasFSRM() const { return HasFSRM; } |
721 | bool hasSlowDivide32() const { return HasSlowDivide32; } |
722 | bool hasSlowDivide64() const { return HasSlowDivide64; } |
723 | bool padShortFunctions() const { return PadShortFunctions; } |
724 | bool slowTwoMemOps() const { return SlowTwoMemOps; } |
725 | bool LEAusesAG() const { return LEAUsesAG; } |
726 | bool slowLEA() const { return SlowLEA; } |
727 | bool slow3OpsLEA() const { return Slow3OpsLEA; } |
728 | bool slowIncDec() const { return SlowIncDec; } |
729 | bool hasCDI() const { return HasCDI; } |
730 | bool hasVPOPCNTDQ() const { return HasVPOPCNTDQ; } |
731 | bool hasPFI() const { return HasPFI; } |
732 | bool hasERI() const { return HasERI; } |
733 | bool hasDQI() const { return HasDQI; } |
734 | bool hasBWI() const { return HasBWI; } |
735 | bool hasVLX() const { return HasVLX; } |
736 | bool hasPKU() const { return HasPKU; } |
737 | bool hasVNNI() const { return HasVNNI; } |
738 | bool hasBF16() const { return HasBF16; } |
739 | bool hasVP2INTERSECT() const { return HasVP2INTERSECT; } |
740 | bool hasBITALG() const { return HasBITALG; } |
741 | bool hasSHSTK() const { return HasSHSTK; } |
742 | bool hasCLFLUSHOPT() const { return HasCLFLUSHOPT; } |
743 | bool hasCLWB() const { return HasCLWB; } |
744 | bool hasWBNOINVD() const { return HasWBNOINVD; } |
745 | bool hasRDPID() const { return HasRDPID; } |
746 | bool hasWAITPKG() const { return HasWAITPKG; } |
747 | bool hasPCONFIG() const { return HasPCONFIG; } |
748 | bool hasSGX() const { return HasSGX; } |
749 | bool hasINVPCID() const { return HasINVPCID; } |
750 | bool hasENQCMD() const { return HasENQCMD; } |
751 | bool hasKL() const { return HasKL; } |
752 | bool hasWIDEKL() const { return HasWIDEKL; } |
753 | bool hasHRESET() const { return HasHRESET; } |
754 | bool hasSERIALIZE() const { return HasSERIALIZE; } |
755 | bool hasTSXLDTRK() const { return HasTSXLDTRK; } |
756 | bool hasUINTR() const { return HasUINTR; } |
757 | bool useRetpolineIndirectCalls() const { return UseRetpolineIndirectCalls; } |
758 | bool useRetpolineIndirectBranches() const { |
759 | return UseRetpolineIndirectBranches; |
760 | } |
761 | bool hasAVXVNNI() const { return HasAVXVNNI; } |
762 | bool hasAMXTILE() const { return HasAMXTILE; } |
763 | bool hasAMXBF16() const { return HasAMXBF16; } |
764 | bool hasAMXINT8() const { return HasAMXINT8; } |
765 | bool useRetpolineExternalThunk() const { return UseRetpolineExternalThunk; } |
766 | |
767 | // These are generic getters that OR together all of the thunk types |
768 | // supported by the subtarget. Therefore useIndirectThunk*() will return true |
769 | // if any respective thunk feature is enabled. |
770 | bool useIndirectThunkCalls() const { |
771 | return useRetpolineIndirectCalls() || useLVIControlFlowIntegrity(); |
772 | } |
773 | bool useIndirectThunkBranches() const { |
774 | return useRetpolineIndirectBranches() || useLVIControlFlowIntegrity(); |
775 | } |
776 | |
777 | bool preferMaskRegisters() const { return PreferMaskRegisters; } |
778 | bool useGLMDivSqrtCosts() const { return UseGLMDivSqrtCosts; } |
779 | bool useLVIControlFlowIntegrity() const { return UseLVIControlFlowIntegrity; } |
780 | bool useLVILoadHardening() const { return UseLVILoadHardening; } |
781 | bool useSpeculativeExecutionSideEffectSuppression() const { |
782 | return UseSpeculativeExecutionSideEffectSuppression; |
783 | } |
784 | |
785 | unsigned getPreferVectorWidth() const { return PreferVectorWidth; } |
786 | unsigned getRequiredVectorWidth() const { return RequiredVectorWidth; } |
787 | |
788 | // Helper functions to determine when we should allow widening to 512-bit |
789 | // during codegen. |
790 | // TODO: Currently we're always allowing widening on CPUs without VLX, |
791 | // because for many cases we don't have a better option. |
792 | bool canExtendTo512DQ() const { |
793 | return hasAVX512() && (!hasVLX() || getPreferVectorWidth() >= 512); |
794 | } |
795 | bool canExtendTo512BW() const { |
796 | return hasBWI() && canExtendTo512DQ(); |
797 | } |
798 | |
799 | // If there are no 512-bit vectors and we prefer not to use 512-bit registers, |
800 | // disable them in the legalizer. |
801 | bool useAVX512Regs() const { |
802 | return hasAVX512() && (canExtendTo512DQ() || RequiredVectorWidth > 256); |
803 | } |
804 | |
805 | bool useBWIRegs() const { |
806 | return hasBWI() && useAVX512Regs(); |
807 | } |
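  // Illustrative example: on a CPU with AVX-512 and VLX whose preferred
  // vector width is limited to 256 (e.g. prefer-256-bit tuning),
  // useAVX512Regs() returns false unless the function's required vector
  // width exceeds 256 bits, in which case 512-bit registers are used anyway.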
808 | |
809 | bool isXRaySupported() const override { return is64Bit(); } |
810 | |
811 | /// TODO: to be removed later and replaced with suitable properties |
812 | bool isAtom() const { return X86ProcFamily == IntelAtom; } |
813 | bool isSLM() const { return X86ProcFamily == IntelSLM; } |
814 | bool useSoftFloat() const { return UseSoftFloat; } |
815 | bool useAA() const override { return UseAA; } |
816 | |
817 | /// Use mfence if we have SSE2 or we're on x86-64 (even if we asked for |
818 | /// no-sse2). There isn't any reason to disable it if the target processor |
819 | /// supports it. |
820 | bool hasMFence() const { return hasSSE2() || is64Bit(); } |
821 | |
822 | const Triple &getTargetTriple() const { return TargetTriple; } |
823 | |
824 | bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); } |
825 | bool isTargetFreeBSD() const { return TargetTriple.isOSFreeBSD(); } |
826 | bool isTargetDragonFly() const { return TargetTriple.isOSDragonFly(); } |
827 | bool isTargetSolaris() const { return TargetTriple.isOSSolaris(); } |
828 | bool isTargetPS4() const { return TargetTriple.isPS4CPU(); } |
829 | |
830 | bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } |
831 | bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); } |
832 | bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); } |
833 | |
834 | bool isTargetLinux() const { return TargetTriple.isOSLinux(); } |
835 | bool isTargetKFreeBSD() const { return TargetTriple.isOSKFreeBSD(); } |
836 | bool isTargetGlibc() const { return TargetTriple.isOSGlibc(); } |
837 | bool isTargetAndroid() const { return TargetTriple.isAndroid(); } |
838 | bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); } |
839 | bool isTargetNaCl32() const { return isTargetNaCl() && !is64Bit(); } |
840 | bool isTargetNaCl64() const { return isTargetNaCl() && is64Bit(); } |
841 | bool isTargetMCU() const { return TargetTriple.isOSIAMCU(); } |
842 | bool isTargetFuchsia() const { return TargetTriple.isOSFuchsia(); } |
843 | |
844 | bool isTargetWindowsMSVC() const { |
845 | return TargetTriple.isWindowsMSVCEnvironment(); |
846 | } |
847 | |
848 | bool isTargetWindowsCoreCLR() const { |
849 | return TargetTriple.isWindowsCoreCLREnvironment(); |
850 | } |
851 | |
852 | bool isTargetWindowsCygwin() const { |
853 | return TargetTriple.isWindowsCygwinEnvironment(); |
854 | } |
855 | |
856 | bool isTargetWindowsGNU() const { |
857 | return TargetTriple.isWindowsGNUEnvironment(); |
858 | } |
859 | |
860 | bool isTargetWindowsItanium() const { |
861 | return TargetTriple.isWindowsItaniumEnvironment(); |
862 | } |
863 | |
864 | bool isTargetCygMing() const { return TargetTriple.isOSCygMing(); } |
865 | |
866 | bool isOSWindows() const { return TargetTriple.isOSWindows(); } |
867 | |
868 | bool isTargetWin64() const { return In64BitMode && isOSWindows(); } |
869 | |
870 | bool isTargetWin32() const { return !In64BitMode && isOSWindows(); } |
871 | |
872 | bool isPICStyleGOT() const { return PICStyle == PICStyles::Style::GOT; } |
873 | bool isPICStyleRIPRel() const { return PICStyle == PICStyles::Style::RIPRel; } |
874 | |
875 | bool isPICStyleStubPIC() const { |
876 | return PICStyle == PICStyles::Style::StubPIC; |
877 | } |
878 | |
879 | bool isPositionIndependent() const; |
880 | |
881 | bool isCallingConvWin64(CallingConv::ID CC) const { |
882 | switch (CC) { |
883 | // On Win64, all these conventions just use the default convention. |
884 | case CallingConv::C: |
885 | case CallingConv::Fast: |
886 | case CallingConv::Tail: |
887 | case CallingConv::Swift: |
888 | case CallingConv::X86_FastCall: |
889 | case CallingConv::X86_StdCall: |
890 | case CallingConv::X86_ThisCall: |
891 | case CallingConv::X86_VectorCall: |
892 | case CallingConv::Intel_OCL_BI: |
893 | return isTargetWin64(); |
894 | // This convention allows using the Win64 convention on other targets. |
895 | case CallingConv::Win64: |
896 | return true; |
897 | // This convention allows using the SysV convention on Windows targets. |
898 | case CallingConv::X86_64_SysV: |
899 | return false; |
900 | // Otherwise, who knows what this is. |
901 | default: |
902 | return false; |
903 | } |
904 | } |
905 | |
906 | /// Classify a global variable reference for the current subtarget according |
907 | /// to how we should reference it in a non-pcrel context. |
908 | unsigned char classifyLocalReference(const GlobalValue *GV) const; |
909 | |
910 | unsigned char classifyGlobalReference(const GlobalValue *GV, |
911 | const Module &M) const; |
912 | unsigned char classifyGlobalReference(const GlobalValue *GV) const; |
913 | |
914 | /// Classify a global function reference for the current subtarget. |
915 | unsigned char classifyGlobalFunctionReference(const GlobalValue *GV, |
916 | const Module &M) const; |
917 | unsigned char classifyGlobalFunctionReference(const GlobalValue *GV) const; |
918 | |
919 | /// Classify a blockaddress reference for the current subtarget according to |
920 | /// how we should reference it in a non-pcrel context. |
921 | unsigned char classifyBlockAddressReference() const; |
922 | |
923 | /// Return true if the subtarget allows calls to immediate address. |
924 | bool isLegalToCallImmediateAddr() const; |
925 | |
926 | /// If we are using indirect thunks, we need to expand indirectbr to avoid it |
927 | /// lowering to an actual indirect jump. |
928 | bool enableIndirectBrExpand() const override { |
929 | return useIndirectThunkBranches(); |
930 | } |
931 | |
932 | /// Enable the MachineScheduler pass for all X86 subtargets. |
933 | bool enableMachineScheduler() const override { return true; } |
934 | |
935 | bool enableEarlyIfConversion() const override; |
936 | |
937 | void getPostRAMutations(std::vector<std::unique_ptr<ScheduleDAGMutation>> |
938 | &Mutations) const override; |
939 | |
940 | AntiDepBreakMode getAntiDepBreakMode() const override { |
941 | return TargetSubtargetInfo::ANTIDEP_CRITICAL; |
942 | } |
943 | |
944 | bool enableAdvancedRASplitCost() const override { return true; } |
945 | }; |
946 | |
947 | } // end namespace llvm |
948 | |
949 | #endif // LLVM_LIB_TARGET_X86_X86SUBTARGET_H |
1 | //===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | /// \file |
10 | /// This file provides a helper that implements much of the TTI interface in |
11 | /// terms of the target-independent code generator and TargetLowering |
12 | /// interfaces. |
13 | // |
14 | //===----------------------------------------------------------------------===// |
15 | |
16 | #ifndef LLVM_CODEGEN_BASICTTIIMPL_H |
17 | #define LLVM_CODEGEN_BASICTTIIMPL_H |
18 | |
19 | #include "llvm/ADT/APInt.h" |
20 | #include "llvm/ADT/ArrayRef.h" |
21 | #include "llvm/ADT/BitVector.h" |
22 | #include "llvm/ADT/SmallPtrSet.h" |
23 | #include "llvm/ADT/SmallVector.h" |
24 | #include "llvm/Analysis/LoopInfo.h" |
25 | #include "llvm/Analysis/TargetTransformInfo.h" |
26 | #include "llvm/Analysis/TargetTransformInfoImpl.h" |
27 | #include "llvm/CodeGen/ISDOpcodes.h" |
28 | #include "llvm/CodeGen/TargetLowering.h" |
29 | #include "llvm/CodeGen/TargetSubtargetInfo.h" |
30 | #include "llvm/CodeGen/ValueTypes.h" |
31 | #include "llvm/IR/BasicBlock.h" |
32 | #include "llvm/IR/Constant.h" |
33 | #include "llvm/IR/Constants.h" |
34 | #include "llvm/IR/DataLayout.h" |
35 | #include "llvm/IR/DerivedTypes.h" |
36 | #include "llvm/IR/InstrTypes.h" |
37 | #include "llvm/IR/Instruction.h" |
38 | #include "llvm/IR/Instructions.h" |
39 | #include "llvm/IR/Intrinsics.h" |
40 | #include "llvm/IR/Operator.h" |
41 | #include "llvm/IR/Type.h" |
42 | #include "llvm/IR/Value.h" |
43 | #include "llvm/Support/Casting.h" |
44 | #include "llvm/Support/CommandLine.h" |
45 | #include "llvm/Support/ErrorHandling.h" |
46 | #include "llvm/Support/MachineValueType.h" |
47 | #include "llvm/Support/MathExtras.h" |
48 | #include <algorithm> |
49 | #include <cassert> |
50 | #include <cstdint> |
51 | #include <limits> |
52 | #include <utility> |
53 | |
54 | namespace llvm { |
55 | |
56 | class Function; |
57 | class GlobalValue; |
58 | class LLVMContext; |
59 | class ScalarEvolution; |
60 | class SCEV; |
61 | class TargetMachine; |
62 | |
63 | extern cl::opt<unsigned> PartialUnrollingThreshold; |
64 | |
65 | /// Base class which can be used to help build a TTI implementation. |
66 | /// |
67 | /// This class provides as much implementation of the TTI interface as is |
68 | /// possible using the target independent parts of the code generator. |
69 | /// |
70 | /// In order to subclass it, your class must implement a getST() method to |
71 | /// return the subtarget, and a getTLI() method to return the target lowering. |
72 | /// We need these methods implemented in the derived class so that this class |
73 | /// doesn't have to duplicate storage for them. |
74 | template <typename T> |
75 | class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { |
76 | private: |
77 | using BaseT = TargetTransformInfoImplCRTPBase<T>; |
78 | using TTI = TargetTransformInfo; |
79 | |
80 | /// Helper function to access this as a T. |
81 | T *thisT() { return static_cast<T *>(this); } |
82 | |
83 | /// Estimate the cost of a broadcast shuffle as an extract plus a sequence |
84 | /// of insert operations. |
85 | unsigned getBroadcastShuffleOverhead(FixedVectorType *VTy) { |
86 | unsigned Cost = 0; |
87 | // Broadcast cost is equal to the cost of extracting the zero'th element |
88 | // plus the cost of inserting it into every element of the result vector. |
89 | Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, 0); |
90 | |
91 | for (int i = 0, e = VTy->getNumElements(); i < e; ++i) { |
92 | Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i); |
93 | } |
94 | return Cost; |
95 | } |
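// A closed-form sketch of the loop above, assuming fixed per-element costs
// (the real getVectorInstrCost hook may refine these per target): one
// extract of lane 0 plus one insert per result lane. The permute and
// subvector helpers that follow use the same extract/insert accounting.
static unsigned broadcastOverheadSketch(unsigned NumElts, unsigned ExtractCost,
                                        unsigned InsertCost) {
  // e.g. broadcastOverheadSketch(4, 1, 1) == 5 for a <4 x float> broadcast.
  return ExtractCost + NumElts * InsertCost;
}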
96 | |
97 | /// Estimate the cost of a shuffle as a sequence of extract and insert |
98 | /// operations. |
99 | unsigned getPermuteShuffleOverhead(FixedVectorType *VTy) { |
100 | unsigned Cost = 0; |
101 | // Shuffle cost is equal to the cost of extracting each element from its |
102 | // source argument plus the cost of inserting it into the result vector. |
103 | |
104 | // e.g. <4 x float> has a mask of <0,5,2,7>, i.e. we need to extract from |
105 | // index 0 of the first vector, index 1 of the second vector, index 2 of |
106 | // the first vector and finally index 3 of the second vector, and insert |
107 | // them at indices <0,1,2,3> of the result vector. |
108 | for (int i = 0, e = VTy->getNumElements(); i < e; ++i) { |
109 | Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i); |
110 | Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, i); |
111 | } |
112 | return Cost; |
113 | } |
114 | |
115 | /// Estimate the cost of subvector extraction as a sequence of extract and |
116 | /// insert operations. |
117 | unsigned getExtractSubvectorOverhead(VectorType *VTy, int Index, |
118 | FixedVectorType *SubVTy) { |
119 | assert(VTy && SubVTy && |
120 |        "Can only extract subvectors from vectors"); |
121 | int NumSubElts = SubVTy->getNumElements(); |
122 | assert((!isa<FixedVectorType>(VTy) || |
123 |         (Index + NumSubElts) <= |
124 |             (int)cast<FixedVectorType>(VTy)->getNumElements()) && |
125 |        "SK_ExtractSubvector index out of range"); |
126 | |
127 | unsigned Cost = 0; |
128 | // Subvector extraction cost is equal to the cost of extracting element from |
129 | // the source type plus the cost of inserting them into the result vector |
130 | // type. |
131 | for (int i = 0; i != NumSubElts; ++i) { |
132 | Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, |
133 | i + Index); |
134 | Cost += |
135 | thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy, i); |
136 | } |
137 | return Cost; |
138 | } |
139 | |
140 | /// Estimate the cost of subvector insertion as a sequence of extract and |
141 | /// insert operations. |
142 | unsigned getInsertSubvectorOverhead(VectorType *VTy, int Index, |
143 | FixedVectorType *SubVTy) { |
144 | assert(VTy && SubVTy && |
145 |        "Can only insert subvectors into vectors"); |
146 | int NumSubElts = SubVTy->getNumElements(); |
147 | assert((!isa<FixedVectorType>(VTy) || |
148 |         (Index + NumSubElts) <= |
149 |             (int)cast<FixedVectorType>(VTy)->getNumElements()) && |
150 |        "SK_InsertSubvector index out of range"); |
151 | |
152 | unsigned Cost = 0; |
153 | // Subvector insertion cost is equal to the cost of extracting element from |
154 | // the source type plus the cost of inserting them into the result vector |
155 | // type. |
156 | for (int i = 0; i != NumSubElts; ++i) { |
157 | Cost += |
158 | thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy, i); |
159 | Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, |
160 | i + Index); |
161 | } |
162 | return Cost; |
163 | } |
164 | |
165 | /// Local query method delegates up to T which *must* implement this! |
166 | const TargetSubtargetInfo *getST() const { |
167 | return static_cast<const T *>(this)->getST(); |
168 | } |
169 | |
170 | /// Local query method delegates up to T which *must* implement this! |
171 | const TargetLoweringBase *getTLI() const { |
172 | return static_cast<const T *>(this)->getTLI(); |
173 | } |
174 | |
175 | static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) { |
176 | switch (M) { |
177 | case TTI::MIM_Unindexed: |
178 | return ISD::UNINDEXED; |
179 | case TTI::MIM_PreInc: |
180 | return ISD::PRE_INC; |
181 | case TTI::MIM_PreDec: |
182 | return ISD::PRE_DEC; |
183 | case TTI::MIM_PostInc: |
184 | return ISD::POST_INC; |
185 | case TTI::MIM_PostDec: |
186 | return ISD::POST_DEC; |
187 | } |
188 | llvm_unreachable("Unexpected MemIndexedMode"); |
189 | } |
190 | |
191 | protected: |
192 | explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL) |
193 | : BaseT(DL) {} |
194 | virtual ~BasicTTIImplBase() = default; |
195 | |
196 | using TargetTransformInfoImplBase::DL; |
197 | |
198 | public: |
199 | /// \name Scalar TTI Implementations |
200 | /// @{ |
201 | bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth, |
202 | unsigned AddressSpace, unsigned Alignment, |
203 | bool *Fast) const { |
204 | EVT E = EVT::getIntegerVT(Context, BitWidth); |
205 | return getTLI()->allowsMisalignedMemoryAccesses( |
206 | E, AddressSpace, Alignment, MachineMemOperand::MONone, Fast); |
207 | } |
208 | |
209 | bool hasBranchDivergence() { return false; } |
210 | |
211 | bool useGPUDivergenceAnalysis() { return false; } |
212 | |
213 | bool isSourceOfDivergence(const Value *V) { return false; } |
214 | |
215 | bool isAlwaysUniform(const Value *V) { return false; } |
216 | |
217 | unsigned getFlatAddressSpace() { |
218 | // Return an invalid address space. |
219 | return -1; |
220 | } |
221 | |
222 | bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes, |
223 | Intrinsic::ID IID) const { |
224 | return false; |
225 | } |
226 | |
227 | bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const { |
228 | return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS); |
229 | } |
230 | |
231 | unsigned getAssumedAddrSpace(const Value *V) const { |
232 | return getTLI()->getTargetMachine().getAssumedAddrSpace(V); |
233 | } |
234 | |
235 | Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, |
236 | Value *NewV) const { |
237 | return nullptr; |
238 | } |
239 | |
240 | bool isLegalAddImmediate(int64_t imm) { |
241 | return getTLI()->isLegalAddImmediate(imm); |
242 | } |
243 | |
244 | bool isLegalICmpImmediate(int64_t imm) { |
245 | return getTLI()->isLegalICmpImmediate(imm); |
246 | } |
247 | |
248 | bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, |
249 | bool HasBaseReg, int64_t Scale, |
250 | unsigned AddrSpace, Instruction *I = nullptr) { |
251 | TargetLoweringBase::AddrMode AM; |
252 | AM.BaseGV = BaseGV; |
253 | AM.BaseOffs = BaseOffset; |
254 | AM.HasBaseReg = HasBaseReg; |
255 | AM.Scale = Scale; |
256 | return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I); |
257 | } |
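// A usage sketch, assuming an in-scope X86TTIImpl (or any other
// BasicTTIImplBase-derived) object: ask whether the x86-style address
// base + 4*index + 16 is legal for an i32 access in address space 0.
static bool isBasePlusScaledIndexLegal(X86TTIImpl &TTI, LLVMContext &Ctx) {
  return TTI.isLegalAddressingMode(Type::getInt32Ty(Ctx),
                                   /*BaseGV=*/nullptr,
                                   /*BaseOffset=*/16,
                                   /*HasBaseReg=*/true,
                                   /*Scale=*/4,
                                   /*AddrSpace=*/0);
}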
258 | |
259 | bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty, |
260 | const DataLayout &DL) const { |
261 | EVT VT = getTLI()->getValueType(DL, Ty); |
262 | return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT); |
263 | } |
264 | |
265 | bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty, |
266 | const DataLayout &DL) const { |
267 | EVT VT = getTLI()->getValueType(DL, Ty); |
268 | return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT); |
269 | } |
270 | |
271 | bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2) { |
272 | return TargetTransformInfoImplBase::isLSRCostLess(C1, C2); |
273 | } |
274 | |
275 | bool isNumRegsMajorCostOfLSR() { |
276 | return TargetTransformInfoImplBase::isNumRegsMajorCostOfLSR(); |
277 | } |
278 | |
279 | bool isProfitableLSRChainElement(Instruction *I) { |
280 | return TargetTransformInfoImplBase::isProfitableLSRChainElement(I); |
281 | } |
282 | |
283 | int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, |
284 | bool HasBaseReg, int64_t Scale, unsigned AddrSpace) { |
285 | TargetLoweringBase::AddrMode AM; |
286 | AM.BaseGV = BaseGV; |
287 | AM.BaseOffs = BaseOffset; |
288 | AM.HasBaseReg = HasBaseReg; |
289 | AM.Scale = Scale; |
290 | return getTLI()->getScalingFactorCost(DL, AM, Ty, AddrSpace); |
291 | } |
292 | |
293 | bool isTruncateFree(Type *Ty1, Type *Ty2) { |
294 | return getTLI()->isTruncateFree(Ty1, Ty2); |
295 | } |
296 | |
297 | bool isProfitableToHoist(Instruction *I) { |
298 | return getTLI()->isProfitableToHoist(I); |
299 | } |
300 | |
301 | bool useAA() const { return getST()->useAA(); } |
302 | |
303 | bool isTypeLegal(Type *Ty) { |
304 | EVT VT = getTLI()->getValueType(DL, Ty); |
305 | return getTLI()->isTypeLegal(VT); |
306 | } |
307 | |
308 | unsigned getRegUsageForType(Type *Ty) { |
309 | return getTLI()->getTypeLegalizationCost(DL, Ty).first; |
310 | } |
311 | |
312 | int getGEPCost(Type *PointeeType, const Value *Ptr, |
313 | ArrayRef<const Value *> Operands) { |
314 | return BaseT::getGEPCost(PointeeType, Ptr, Operands); |
315 | } |
316 | |
317 | unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI, |
318 | unsigned &JumpTableSize, |
319 | ProfileSummaryInfo *PSI, |
320 | BlockFrequencyInfo *BFI) { |
321 | // Try to find the estimated number of clusters. Note that the number of |
322 | // clusters identified in this function could be different from the actual |
323 | // numbers found in lowering. This function ignores switches that are |
324 | // lowered with a mix of jump table / bit test / BTree. This function was |
325 | // initially intended for estimating the cost of a switch in the inline |
326 | // cost heuristic, but it is a generic cost model to be used in other |
327 | // places (e.g., in loop unrolling). |
328 | unsigned N = SI.getNumCases(); |
329 | const TargetLoweringBase *TLI = getTLI(); |
330 | const DataLayout &DL = this->getDataLayout(); |
331 | |
332 | JumpTableSize = 0; |
333 | bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent()); |
334 | |
335 | // Early exit if neither a jump table nor a bit test is allowed. |
336 | if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N)) |
337 | return N; |
338 | |
339 | APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue(); |
340 | APInt MinCaseVal = MaxCaseVal; |
341 | for (auto CI : SI.cases()) { |
342 | const APInt &CaseVal = CI.getCaseValue()->getValue(); |
343 | if (CaseVal.sgt(MaxCaseVal)) |
344 | MaxCaseVal = CaseVal; |
345 | if (CaseVal.slt(MinCaseVal)) |
346 | MinCaseVal = CaseVal; |
347 | } |
348 | |
349 | // Check if suitable for a bit test |
350 | if (N <= DL.getIndexSizeInBits(0u)) { |
351 | SmallPtrSet<const BasicBlock *, 4> Dests; |
352 | for (auto I : SI.cases()) |
353 | Dests.insert(I.getCaseSuccessor()); |
354 | |
355 | if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal, |
356 | DL)) |
357 | return 1; |
358 | } |
359 | |
360 | // Check if suitable for a jump table. |
361 | if (IsJTAllowed) { |
362 | if (N < 2 || N < TLI->getMinimumJumpTableEntries()) |
363 | return N; |
364 | uint64_t Range = |
365 | (MaxCaseVal - MinCaseVal) |
366 | .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1; |
367 | // Check whether a range of clusters is dense enough for a jump table. |
368 | if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) { |
369 | JumpTableSize = Range; |
370 | return 1; |
371 | } |
372 | } |
373 | return N; |
374 | } |
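// An illustrative density check, simplified from what
// TLI->isSuitableForJumpTable ultimately performs; the 40% threshold is an
// assumption for illustration (real targets use tunable densities).
static bool denseEnoughForJumpTableSketch(uint64_t Range, unsigned NumCases,
                                          unsigned MinDensityPct = 40) {
  // e.g. cases {0, 2, 3, 9}: Range = 10, NumCases = 4 -> 40% dense -> true.
  return Range != 0 && (uint64_t)NumCases * 100 >= Range * MinDensityPct;
}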
375 | |
376 | bool shouldBuildLookupTables() { |
377 | const TargetLoweringBase *TLI = getTLI(); |
378 | return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) || |
379 | TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other); |
380 | } |
381 | |
382 | bool haveFastSqrt(Type *Ty) { |
383 | const TargetLoweringBase *TLI = getTLI(); |
384 | EVT VT = TLI->getValueType(DL, Ty); |
385 | return TLI->isTypeLegal(VT) && |
386 | TLI->isOperationLegalOrCustom(ISD::FSQRT, VT); |
387 | } |
388 | |
389 | bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) { |
390 | return true; |
391 | } |
392 | |
393 | unsigned getFPOpCost(Type *Ty) { |
394 | // Check whether FADD is available, as a proxy for floating-point in |
395 | // general. |
396 | const TargetLoweringBase *TLI = getTLI(); |
397 | EVT VT = TLI->getValueType(DL, Ty); |
398 | if (TLI->isOperationLegalOrCustomOrPromote(ISD::FADD, VT)) |
399 | return TargetTransformInfo::TCC_Basic; |
400 | return TargetTransformInfo::TCC_Expensive; |
401 | } |
402 | |
403 | unsigned getInliningThresholdMultiplier() { return 1; } |
404 | |
405 | int getInlinerVectorBonusPercent() { return 150; } |
406 | |
407 | void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, |
408 | TTI::UnrollingPreferences &UP) { |
409 | // This unrolling functionality is target independent, but to provide some |
410 | // motivation for its intended use, consider x86: |
411 | |
412 | // According to the Intel 64 and IA-32 Architectures Optimization Reference |
413 | // Manual, Intel Core models and later have a loop stream detector (and |
414 | // associated uop queue) that can benefit from partial unrolling. |
415 | // The relevant requirements are: |
416 | // - The loop must have no more than 4 (8 for Nehalem and later) branches |
417 | // taken, and none of them may be calls. |
418 | // - The loop can have no more than 18 (28 for Nehalem and later) uops. |
419 | |
420 | // According to the Software Optimization Guide for AMD Family 15h |
421 | // Processors, models 30h-4fh (Steamroller and later) have a loop predictor |
422 | // and loop buffer which can benefit from partial unrolling. |
423 | // The relevant requirements are: |
424 | // - The loop must have fewer than 16 branches |
425 | // - The loop must have less than 40 uops in all executed loop branches |
426 | |
427 | // The number of taken branches in a loop is hard to estimate here, and |
428 | // benchmarking has revealed that it is better not to be conservative when |
429 | // estimating the branch count. As a result, we'll ignore the branch limits |
430 | // until someone finds a case where it matters in practice. |
431 | |
432 | unsigned MaxOps; |
433 | const TargetSubtargetInfo *ST = getST(); |
434 | if (PartialUnrollingThreshold.getNumOccurrences() > 0) |
435 | MaxOps = PartialUnrollingThreshold; |
436 | else if (ST->getSchedModel().LoopMicroOpBufferSize > 0) |
437 | MaxOps = ST->getSchedModel().LoopMicroOpBufferSize; |
438 | else |
439 | return; |
440 | |
441 | // Scan the loop: don't unroll loops with calls. |
442 | for (BasicBlock *BB : L->blocks()) { |
443 | for (Instruction &I : *BB) { |
444 | if (isa<CallInst>(I) || isa<InvokeInst>(I)) { |
445 | if (const Function *F = cast<CallBase>(I).getCalledFunction()) { |
446 | if (!thisT()->isLoweredToCall(F)) |
447 | continue; |
448 | } |
449 | |
450 | return; |
451 | } |
452 | } |
453 | } |
454 | |
455 | // Enable runtime and partial unrolling up to the specified size. |
456 | // Enable using trip count upper bound to unroll loops. |
457 | UP.Partial = UP.Runtime = UP.UpperBound = true; |
458 | UP.PartialThreshold = MaxOps; |
459 | |
460 | // Avoid unrolling when optimizing for size. |
461 | UP.OptSizeThreshold = 0; |
462 | UP.PartialOptSizeThreshold = 0; |
463 | |
464 | // Set the number of instructions optimized away when a "back edge" |
465 | // becomes a "fall through" to its default value of 2. |
466 | UP.BEInsns = 2; |
467 | } |
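// A sketch of how a derived target TTI can piggyback on the logic above
// (MyTargetTTIImpl is hypothetical): run the base implementation, then cap
// the partial-unrolling budget at, say, a 28-uop loop buffer.
void MyTargetTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                              TTI::UnrollingPreferences &UP) {
  BaseT::getUnrollingPreferences(L, SE, UP);
  UP.PartialThreshold = std::min(UP.PartialThreshold, 28u);
}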
468 | |
469 | void getPeelingPreferences(Loop *L, ScalarEvolution &SE, |
470 | TTI::PeelingPreferences &PP) { |
471 | PP.PeelCount = 0; |
472 | PP.AllowPeeling = true; |
473 | PP.AllowLoopNestsPeeling = false; |
474 | PP.PeelProfiledIterations = true; |
475 | } |
476 | |
477 | bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, |
478 | AssumptionCache &AC, |
479 | TargetLibraryInfo *LibInfo, |
480 | HardwareLoopInfo &HWLoopInfo) { |
481 | return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo); |
482 | } |
483 | |
484 | bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE, |
485 | AssumptionCache &AC, TargetLibraryInfo *TLI, |
486 | DominatorTree *DT, |
487 | const LoopAccessInfo *LAI) { |
488 | return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI); |
489 | } |
490 | |
491 | bool emitGetActiveLaneMask() { |
492 | return BaseT::emitGetActiveLaneMask(); |
493 | } |
494 | |
495 | Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC, |
496 | IntrinsicInst &II) { |
497 | return BaseT::instCombineIntrinsic(IC, II); |
498 | } |
499 | |
500 | Optional<Value *> simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, |
501 | IntrinsicInst &II, |
502 | APInt DemandedMask, |
503 | KnownBits &Known, |
504 | bool &KnownBitsComputed) { |
505 | return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known, |
506 | KnownBitsComputed); |
507 | } |
508 | |
509 | Optional<Value *> simplifyDemandedVectorEltsIntrinsic( |
510 | InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, |
511 | APInt &UndefElts2, APInt &UndefElts3, |
512 | std::function<void(Instruction *, unsigned, APInt, APInt &)> |
513 | SimplifyAndSetOp) { |
514 | return BaseT::simplifyDemandedVectorEltsIntrinsic( |
515 | IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3, |
516 | SimplifyAndSetOp); |
517 | } |
518 | |
519 | int getInstructionLatency(const Instruction *I) { |
520 | if (isa<LoadInst>(I)) |
521 | return getST()->getSchedModel().DefaultLoadLatency; |
522 | |
523 | return BaseT::getInstructionLatency(I); |
524 | } |
525 | |
526 | virtual Optional<unsigned> |
527 | getCacheSize(TargetTransformInfo::CacheLevel Level) const { |
528 | return Optional<unsigned>( |
529 | getST()->getCacheSize(static_cast<unsigned>(Level))); |
530 | } |
531 | |
532 | virtual Optional<unsigned> |
533 | getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const { |
534 | Optional<unsigned> TargetResult = |
535 | getST()->getCacheAssociativity(static_cast<unsigned>(Level)); |
536 | |
537 | if (TargetResult) |
538 | return TargetResult; |
539 | |
540 | return BaseT::getCacheAssociativity(Level); |
541 | } |
542 | |
543 | virtual unsigned getCacheLineSize() const { |
544 | return getST()->getCacheLineSize(); |
545 | } |
546 | |
547 | virtual unsigned getPrefetchDistance() const { |
548 | return getST()->getPrefetchDistance(); |
549 | } |
550 | |
551 | virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses, |
552 | unsigned NumStridedMemAccesses, |
553 | unsigned NumPrefetches, |
554 | bool HasCall) const { |
555 | return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses, |
556 | NumPrefetches, HasCall); |
557 | } |
558 | |
559 | virtual unsigned getMaxPrefetchIterationsAhead() const { |
560 | return getST()->getMaxPrefetchIterationsAhead(); |
561 | } |
562 | |
563 | virtual bool enableWritePrefetching() const { |
564 | return getST()->enableWritePrefetching(); |
565 | } |
566 | |
567 | /// @} |
568 | |
569 | /// \name Vector TTI Implementations |
570 | /// @{ |
571 | |
572 | unsigned getRegisterBitWidth(bool Vector) const { return 32; } |
573 | |
574 | Optional<unsigned> getMaxVScale() const { return None; } |
575 | |
576 | /// Estimate the overhead of scalarizing an instruction. Insert and Extract |
577 | /// are set if the demanded result elements need to be inserted and/or |
578 | /// extracted from vectors. |
579 | unsigned getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, |
580 | bool Insert, bool Extract) { |
581 | // FIXME: a bitfield is not a reasonable abstraction for talking about |
582 | // which elements are needed from a scalable vector |
583 | auto *Ty = cast<FixedVectorType>(InTy); |
584 | |
585 | assert(DemandedElts.getBitWidth() == Ty->getNumElements() && |
586 |        "Vector size mismatch"); |
587 | |
588 | unsigned Cost = 0; |
589 | |
590 | for (int i = 0, e = Ty->getNumElements(); i < e; ++i) { |
591 | if (!DemandedElts[i]) |
592 | continue; |
593 | if (Insert) |
594 | Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty, i); |
595 | if (Extract) |
596 | Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, i); |
597 | } |
598 | |
599 | return Cost; |
600 | } |
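// A usage sketch (the TTI object and LLVMContext are assumed in scope): the
// insert-only cost of materializing just lanes 0 and 2 of a <4 x i32>.
// Bit i of DemandedElts corresponds to lane i.
static unsigned demandedLanesCostSketch(X86TTIImpl &TTI, LLVMContext &Ctx) {
  auto *VTy = FixedVectorType::get(Type::getInt32Ty(Ctx), 4);
  APInt Demanded(4, 0b0101); // lanes 0 and 2
  return TTI.getScalarizationOverhead(VTy, Demanded, /*Insert=*/true,
                                      /*Extract=*/false);
}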
601 | |
602 | /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead. |
603 | unsigned getScalarizationOverhead(VectorType *InTy, bool Insert, |
604 | bool Extract) { |
605 | auto *Ty = cast<FixedVectorType>(InTy); |
606 | |
607 | APInt DemandedElts = APInt::getAllOnesValue(Ty->getNumElements()); |
608 | return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract); |
609 | } |
610 | |
611 | /// Estimate the overhead of scalarizing an instruction's unique |
612 | /// non-constant operands. The types of the arguments are ordinarily |
613 | /// scalar, in which case the costs are multiplied by VF. |
614 | unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args, |
615 | unsigned VF) { |
616 | unsigned Cost = 0; |
617 | SmallPtrSet<const Value*, 4> UniqueOperands; |
618 | for (const Value *A : Args) { |
619 | // Disregard things like metadata arguments. |
620 | Type *Ty = A->getType(); |
621 | if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() && |
622 | !Ty->isPtrOrPtrVectorTy()) |
623 | continue; |
624 | |
625 | if (!isa<Constant>(A) && UniqueOperands.insert(A).second) { |
626 | auto *VecTy = dyn_cast<VectorType>(Ty); |
627 | if (VecTy) { |
628 | // If A is a vector operand, VF should be 1 or correspond to A. |
629 | assert((VF == 1 || |
630 |         VF == cast<FixedVectorType>(VecTy)->getNumElements()) && |
631 |        "Vector argument does not match VF"); |
632 | } else { |
633 |   VecTy = FixedVectorType::get(Ty, VF); |
634 | } |
635 | |
636 | Cost += getScalarizationOverhead(VecTy, false, true); |
637 | } |
638 | } |
639 | |
640 | return Cost; |
641 | } |
642 | |
643 | unsigned getScalarizationOverhead(VectorType *InTy, |
644 | ArrayRef<const Value *> Args) { |
645 | auto *Ty = cast<FixedVectorType>(InTy); |
646 | |
647 | unsigned Cost = 0; |
648 | |
649 | Cost += getScalarizationOverhead(Ty, true, false); |
650 | if (!Args.empty()) |
651 | Cost += getOperandsScalarizationOverhead(Args, Ty->getNumElements()); |
652 | else |
653 | // When no information on arguments is provided, we add the cost |
654 | // associated with one argument as a heuristic. |
655 | Cost += getScalarizationOverhead(Ty, false, true); |
656 | |
657 | return Cost; |
658 | } |
659 | |
660 | unsigned getMaxInterleaveFactor(unsigned VF) { return 1; } |
661 | |
662 | unsigned getArithmeticInstrCost( |
663 | unsigned Opcode, Type *Ty, |
664 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, |
665 | TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, |
666 | TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, |
667 | TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, |
668 | TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, |
669 | ArrayRef<const Value *> Args = ArrayRef<const Value *>(), |
670 | const Instruction *CxtI = nullptr) { |
671 | // Check if any of the operands are vector operands. |
672 | const TargetLoweringBase *TLI = getTLI(); |
673 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
674 | assert(ISD && "Invalid opcode"); |
675 | |
676 | // TODO: Handle more cost kinds. |
677 | if (CostKind != TTI::TCK_RecipThroughput) |
678 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, |
679 | Opd1Info, Opd2Info, |
680 | Opd1PropInfo, Opd2PropInfo, |
681 | Args, CxtI); |
682 | |
683 | std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); |
684 | |
685 | bool IsFloat = Ty->isFPOrFPVectorTy(); |
686 | // Assume that floating point arithmetic operations cost twice as much as |
687 | // integer operations. |
688 | unsigned OpCost = (IsFloat ? 2 : 1); |
689 | |
690 | if (TLI->isOperationLegalOrPromote(ISD, LT.second)) { |
691 | // The operation is legal. Assume it costs 1. |
692 | // TODO: Once we have extract/insert subvector cost we need to use them. |
693 | return LT.first * OpCost; |
694 | } |
695 | |
696 | if (!TLI->isOperationExpand(ISD, LT.second)) { |
697 | // If the operation is custom lowered, then assume that the code is twice |
698 | // as expensive. |
699 | return LT.first * 2 * OpCost; |
700 | } |
701 | |
702 | // Else, assume that we need to scalarize this op. |
703 | // TODO: If one of the types get legalized by splitting, handle this |
704 | // similarly to what getCastInstrCost() does. |
705 | if (auto *VTy = dyn_cast<VectorType>(Ty)) { |
706 | unsigned Num = cast<FixedVectorType>(VTy)->getNumElements(); |
707 | unsigned Cost = thisT()->getArithmeticInstrCost( |
708 | Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info, |
709 | Opd1PropInfo, Opd2PropInfo, Args, CxtI); |
710 | // Return the cost of multiple scalar invocations plus the cost of |
711 | // inserting and extracting the values. |
712 | return getScalarizationOverhead(VTy, Args) + Num * Cost; |
713 | } |
714 | |
715 | // We don't know anything about this scalar instruction. |
716 | return OpCost; |
717 | } |
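// A worked example of the three paths above, assuming a target whose widest
// legal integer vector is <4 x i32> (all numbers illustrative):
//   add <8 x i32>: LT = {2, v4i32}, ISD::ADD legal  -> 2 * 1 = 2
//   fadd <4 x float>, custom-lowered                -> 1 * 2 * 2 = 4
//   udiv <4 x i32>, expanded -> scalarized: 4 scalar udivs plus the
//                               insert/extract scalarization overhead.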
718 | |
719 | unsigned getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index, |
720 | VectorType *SubTp) { |
721 | |
722 | switch (Kind) { |
723 | case TTI::SK_Broadcast: |
724 | return getBroadcastShuffleOverhead(cast<FixedVectorType>(Tp)); |
725 | case TTI::SK_Select: |
726 | case TTI::SK_Reverse: |
727 | case TTI::SK_Transpose: |
728 | case TTI::SK_PermuteSingleSrc: |
729 | case TTI::SK_PermuteTwoSrc: |
730 | return getPermuteShuffleOverhead(cast<FixedVectorType>(Tp)); |
731 | case TTI::SK_ExtractSubvector: |
732 | return getExtractSubvectorOverhead(Tp, Index, |
733 | cast<FixedVectorType>(SubTp)); |
734 | case TTI::SK_InsertSubvector: |
735 | return getInsertSubvectorOverhead(Tp, Index, |
736 | cast<FixedVectorType>(SubTp)); |
737 | } |
738 | llvm_unreachable("Unknown TTI::ShuffleKind"); |
739 | } |
740 | |
741 | unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, |
742 | TTI::CastContextHint CCH, |
743 | TTI::TargetCostKind CostKind, |
744 | const Instruction *I = nullptr) { |
745 | if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0) |
746 | return 0; |
747 | |
748 | const TargetLoweringBase *TLI = getTLI(); |
749 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
750 | assert(ISD && "Invalid opcode"); |
751 | std::pair<unsigned, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, Src); |
752 | std::pair<unsigned, MVT> DstLT = TLI->getTypeLegalizationCost(DL, Dst); |
753 | |
754 | TypeSize SrcSize = SrcLT.second.getSizeInBits(); |
755 | TypeSize DstSize = DstLT.second.getSizeInBits(); |
756 | bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy(); |
757 | bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy(); |
758 | |
759 | switch (Opcode) { |
760 | default: |
761 | break; |
762 | case Instruction::Trunc: |
763 | // Check for NOOP conversions. |
764 | if (TLI->isTruncateFree(SrcLT.second, DstLT.second)) |
765 | return 0; |
766 | LLVM_FALLTHROUGH; |
767 | case Instruction::BitCast: |
768 | // Bitcasts between types that are legalized to the same type are free, and |
769 | // we assume int to/from ptr casts of the same size are also free. |
770 | if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst && |
771 | SrcSize == DstSize) |
772 | return 0; |
773 | break; |
774 | case Instruction::FPExt: |
775 | if (I && getTLI()->isExtFree(I)) |
776 | return 0; |
777 | break; |
778 | case Instruction::ZExt: |
779 | if (TLI->isZExtFree(SrcLT.second, DstLT.second)) |
780 | return 0; |
781 | LLVM_FALLTHROUGH; |
782 | case Instruction::SExt: |
783 | if (I && getTLI()->isExtFree(I)) |
784 | return 0; |
785 | |
786 | // If this is a zext/sext of a load, return 0 if the corresponding |
787 | // extending load exists on the target. |
788 | if (CCH == TTI::CastContextHint::Normal) { |
789 | EVT ExtVT = EVT::getEVT(Dst); |
790 | EVT LoadVT = EVT::getEVT(Src); |
791 | unsigned LType = |
792 | ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD); |
793 | if (TLI->isLoadExtLegal(LType, ExtVT, LoadVT)) |
794 | return 0; |
795 | } |
796 | break; |
797 | case Instruction::AddrSpaceCast: |
798 | if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(), |
799 | Dst->getPointerAddressSpace())) |
800 | return 0; |
801 | break; |
802 | } |
803 | |
804 | auto *SrcVTy = dyn_cast<VectorType>(Src); |
805 | auto *DstVTy = dyn_cast<VectorType>(Dst); |
806 | |
807 | // If the cast is marked as legal (or promote) then assume low cost. |
808 | if (SrcLT.first == DstLT.first && |
809 | TLI->isOperationLegalOrPromote(ISD, DstLT.second)) |
810 | return SrcLT.first; |
811 | |
812 | // Handle scalar conversions. |
813 | if (!SrcVTy && !DstVTy) { |
814 | // Just check the op cost. If the operation is legal then assume it |
815 | // costs 1. |
816 | if (!TLI->isOperationExpand(ISD, DstLT.second)) |
817 | return 1; |
818 | |
819 | // Assume that illegal scalar instructions are expensive. |
820 | return 4; |
821 | } |
822 | |
823 | // Check vector-to-vector casts. |
824 | if (DstVTy && SrcVTy) { |
825 | // If the cast is between same-sized registers, then the check is simple. |
826 | if (SrcLT.first == DstLT.first && SrcSize == DstSize) { |
827 | |
828 | // Assume that Zext is done using AND. |
829 | if (Opcode == Instruction::ZExt) |
830 | return SrcLT.first; |
831 | |
832 | // Assume that sext is done using SHL and SRA. |
833 | if (Opcode == Instruction::SExt) |
834 | return SrcLT.first * 2; |
835 | |
836 | // Just check the op cost. If the operation is legal then assume it |
837 | // costs 1, and multiply by the type-legalization overhead. |
838 | |
839 | if (!TLI->isOperationExpand(ISD, DstLT.second)) |
840 | return SrcLT.first * 1; |
841 | } |
842 | |
843 | // If we are legalizing by splitting, query the concrete TTI for the cost |
844 | // of casting the original vector twice. We also need to factor in the |
845 | // cost of the split itself. Count that as 1, to be consistent with |
846 | // TLI->getTypeLegalizationCost(). |
847 | bool SplitSrc = |
848 | TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) == |
849 | TargetLowering::TypeSplitVector; |
850 | bool SplitDst = |
851 | TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) == |
852 | TargetLowering::TypeSplitVector; |
853 | if ((SplitSrc || SplitDst) && |
854 | cast<FixedVectorType>(SrcVTy)->getNumElements() > 1 && |
855 | cast<FixedVectorType>(DstVTy)->getNumElements() > 1) { |
856 | Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy); |
857 | Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy); |
858 | T *TTI = static_cast<T *>(this); |
859 | // If both types need to be split then the split is free. |
860 | unsigned SplitCost = |
861 | (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0; |
862 | return SplitCost + |
863 | (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH, |
864 | CostKind, I)); |
865 | } |
866 | |
867 | // In other cases where the source or destination are illegal, assume |
868 | // the operation will get scalarized. |
869 | unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements(); |
870 | unsigned Cost = thisT()->getCastInstrCost( |
871 | Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I); |
872 | |
873 | // Return the cost of multiple scalar invocations plus the cost of |
874 | // inserting and extracting the values. |
875 | return getScalarizationOverhead(DstVTy, true, true) + Num * Cost; |
876 | } |
877 | |
878 | // We already handled vector-to-vector and scalar-to-scalar conversions. |
879 | // This is where we handle bitcasts between vectors and scalars. We need |
880 | // to assume that the conversion is scalarized in one way or another. |
881 | |
882 | if (Opcode == Instruction::BitCast) { |
883 | // Illegal bitcasts are done by storing and loading from a stack slot. |
884 | return (SrcVTy ? getScalarizationOverhead(SrcVTy, false, true) : 0) + |
885 | (DstVTy ? getScalarizationOverhead(DstVTy, true, false) : 0); |
886 | } |
887 | |
888 | llvm_unreachable("Unhandled cast"); |
889 | } |
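// A worked example of the splitting path above, assuming <8 x i16> is legal
// while <8 x i32> splits into two <4 x i32> halves: for sext <8 x i16> to
// <8 x i32>, SplitDst is true and SplitSrc is false, so
//   cost = getVectorSplitCost()
//        + 2 * getCastInstrCost(SExt, <4 x i32>, <4 x i16>, CCH, CostKind)
// with the halves obtained via VectorType::getHalfElementsVectorType.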
890 | |
891 | unsigned getExtractWithExtendCost(unsigned Opcode, Type *Dst, |
892 | VectorType *VecTy, unsigned Index) { |
893 | return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy, |
894 | Index) + |
895 | thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(), |
896 | TTI::CastContextHint::None, TTI::TCK_RecipThroughput); |
897 | } |
898 | |
899 | unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) { |
900 | return BaseT::getCFInstrCost(Opcode, CostKind); |
901 | } |
902 | |
903 | unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, |
904 | CmpInst::Predicate VecPred, |
905 | TTI::TargetCostKind CostKind, |
906 | const Instruction *I = nullptr) { |
907 | const TargetLoweringBase *TLI = getTLI(); |
908 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
909 | assert(ISD && "Invalid opcode"); |
910 | |
911 | // TODO: Handle other cost kinds. |
912 | if (CostKind != TTI::TCK_RecipThroughput) |
913 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, |
914 | I); |
915 | |
916 | // Selects on vectors are actually vector selects. |
917 | if (ISD == ISD::SELECT) { |
918 | assert(CondTy && "CondTy must exist"); |
919 | if (CondTy->isVectorTy()) |
920 | ISD = ISD::VSELECT; |
921 | } |
922 | std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); |
923 | |
924 | if (!(ValTy->isVectorTy() && !LT.second.isVector()) && |
925 | !TLI->isOperationExpand(ISD, LT.second)) { |
926 | // The operation is legal. Assume it costs 1. Multiply |
927 | // by the type-legalization overhead. |
928 | return LT.first * 1; |
929 | } |
930 | |
931 | // Otherwise, assume that the cmp/select is scalarized. |
932 | // TODO: If one of the types get legalized by splitting, handle this |
933 | // similarly to what getCastInstrCost() does. |
934 | if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) { |
935 | unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements(); |
936 | if (CondTy) |
937 | CondTy = CondTy->getScalarType(); |
938 | unsigned Cost = thisT()->getCmpSelInstrCost( |
939 | Opcode, ValVTy->getScalarType(), CondTy, VecPred, CostKind, I); |
940 | |
941 | // Return the cost of multiple scalar invocations plus the cost of |
942 | // inserting and extracting the values. |
943 | return getScalarizationOverhead(ValVTy, true, false) + Num * Cost; |
944 | } |
945 | |
946 | // Unknown scalar opcode. |
947 | return 1; |
948 | } |
949 | |
950 | unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { |
951 | std::pair<unsigned, MVT> LT = |
952 | getTLI()->getTypeLegalizationCost(DL, Val->getScalarType()); |
953 | |
954 | return LT.first; |
955 | } |
956 | |
957 | unsigned getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, |
958 | unsigned AddressSpace, |
959 | TTI::TargetCostKind CostKind, |
960 | const Instruction *I = nullptr) { |
961 | assert(!Src->isVoidTy() && "Invalid type"); |
962 | // Assume types that do not map to a simple MVT, e.g. structs, are expensive. |
963 | if (getTLI()->getValueType(DL, Src, true) == MVT::Other) |
964 | return 4; |
965 | std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(DL, Src); |
966 | |
967 | // Assume that all loads of legal types cost 1. |
968 | unsigned Cost = LT.first; |
969 | if (CostKind != TTI::TCK_RecipThroughput) |
970 | return Cost; |
971 | |
972 | if (Src->isVectorTy() && |
973 | // In practice it's not currently possible to have a change in lane |
974 | // length for extending loads or truncating stores so both types should |
975 | // have the same scalable property. |
976 | TypeSize::isKnownLT(Src->getPrimitiveSizeInBits(), |
977 | LT.second.getSizeInBits())) { |
978 | // This is a vector load that legalizes to a larger type than the vector |
979 | // itself. If the corresponding extending load or truncating store is not |
980 | // legal, this will scalarize. |
981 | TargetLowering::LegalizeAction LA = TargetLowering::Expand; |
982 | EVT MemVT = getTLI()->getValueType(DL, Src); |
983 | if (Opcode == Instruction::Store) |
984 | LA = getTLI()->getTruncStoreAction(LT.second, MemVT); |
985 | else |
986 | LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT); |
987 | |
988 | if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) { |
989 | // This is a vector load/store for some illegal type that is scalarized. |
990 | // We must account for the cost of building or decomposing the vector. |
991 | Cost += getScalarizationOverhead(cast<VectorType>(Src), |
992 | Opcode != Instruction::Store, |
993 | Opcode == Instruction::Store); |
994 | } |
995 | } |
996 | |
997 | return Cost; |
998 | } |
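// An example of the scalarization case above, assuming a target that
// legalizes <4 x i8> (32 bits) to v4i32 (128 bits): unless the v4i8 -> v4i32
// EXTLOAD is Legal or Custom, the load is treated as scalarized and the cost
// of rebuilding the vector (four inserts) is added on top of LT.first.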
999 | |
1000 | unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy, |
1001 | const Value *Ptr, bool VariableMask, |
1002 | Align Alignment, TTI::TargetCostKind CostKind, |
1003 | const Instruction *I = nullptr) { |
1004 | auto *VT = cast<FixedVectorType>(DataTy); |
1005 | // Assume the target does not have support for gather/scatter operations |
1006 | // and provide a rough estimate. |
1007 | // |
1008 | // First, compute the cost of extracting the individual addresses and the |
1009 | // individual memory operations. |
1010 | int LoadCost = |
1011 | VT->getNumElements() * |
1012 | (getVectorInstrCost( |
1013 | Instruction::ExtractElement, |
1014 | FixedVectorType::get(PointerType::get(VT->getElementType(), 0), |
1015 | VT->getNumElements()), |
1016 | -1) + |
1017 | getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind)); |
1018 | |
1019 | // Next, compute the cost of packing the result in a vector. |
1020 | int PackingCost = getScalarizationOverhead(VT, Opcode != Instruction::Store, |
1021 | Opcode == Instruction::Store); |
1022 | |
1023 | int ConditionalCost = 0; |
1024 | if (VariableMask) { |
1025 | // Compute the cost of conditionally executing the memory operations with |
1026 | // variable masks. This includes extracting the individual conditions, |
1027 | // branches, and PHIs to combine the results. |
1028 | // NOTE: Estimating the cost of conditionally executing the memory |
1029 | // operations accurately is quite difficult and the current solution |
1030 | // provides a very rough estimate only. |
1031 | ConditionalCost = |
1032 | VT->getNumElements() * |
1033 | (getVectorInstrCost( |
1034 | Instruction::ExtractElement, |
1035 | FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()), |
1036 | VT->getNumElements()), |
1037 | -1) + |
1038 | getCFInstrCost(Instruction::Br, CostKind) + |
1039 | getCFInstrCost(Instruction::PHI, CostKind)); |
1040 | } |
1041 | |
1042 | return LoadCost + PackingCost + ConditionalCost; |
1043 | } |
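// A worked example, assuming a gather of <4 x i32> with unit extract/insert
// costs and no variable mask:
//   LoadCost        = 4 * (extract of one pointer lane + one scalar i32 load)
//   PackingCost     = 4 inserts to rebuild the <4 x i32> result
//   ConditionalCost = 0
// With a variable mask, each lane additionally pays an i1 extract, a branch,
// and a PHI, as computed above.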
1044 | |
1045 | unsigned getInterleavedMemoryOpCost( |
1046 | unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, |
1047 | Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, |
1048 | bool UseMaskForCond = false, bool UseMaskForGaps = false) { |
1049 | auto *VT = cast<FixedVectorType>(VecTy); |
1050 | |
1051 | unsigned NumElts = VT->getNumElements(); |
1052 | assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor"); |
1053 | |
1054 | unsigned NumSubElts = NumElts / Factor; |
1055 | auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts); |
1056 | |
1057 | // First, the cost of the load/store operation. |
1058 | unsigned Cost; |
1059 | if (UseMaskForCond || UseMaskForGaps) |
1060 | Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment, |
1061 | AddressSpace, CostKind); |
1062 | else |
1063 | Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, |
1064 | CostKind); |
1065 | |
1066 | // Legalize the vector type, and get the legalized and unlegalized type |
1067 | // sizes. |
1068 | MVT VecTyLT = getTLI()->getTypeLegalizationCost(DL, VecTy).second; |
1069 | unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy); |
1070 | unsigned VecTyLTSize = VecTyLT.getStoreSize(); |
1071 | |
1072 | // Return the ceiling of dividing A by B. |
1073 | auto ceil = [](unsigned A, unsigned B) { return (A + B - 1) / B; }; |
1074 | |
1075 | // Scale the cost of the memory operation by the fraction of legalized |
1076 | // instructions that will actually be used. We shouldn't account for the |
1077 | // cost of dead instructions since they will be removed. |
1078 | // |
1079 | // E.g., An interleaved load of factor 8: |
1080 | // %vec = load <16 x i64>, <16 x i64>* %ptr |
1081 | // %v0 = shufflevector %vec, undef, <0, 8> |
1082 | // |
1083 | // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be |
1084 | // used (those corresponding to elements [0:1] and [8:9] of the unlegalized |
1085 | // type). The other loads are unused. |
1086 | // |
1087 | // We only scale the cost of loads since interleaved store groups aren't |
1088 | // allowed to have gaps. |
1089 | if (Opcode == Instruction::Load && VecTySize > VecTyLTSize) { |
1090 | // The number of loads of a legal type it will take to represent a load |
1091 | // of the unlegalized vector type. |
1092 | unsigned NumLegalInsts = ceil(VecTySize, VecTyLTSize); |
1093 | |
1094 | // The number of elements of the unlegalized type that correspond to a |
1095 | // single legal instruction. |
1096 | unsigned NumEltsPerLegalInst = ceil(NumElts, NumLegalInsts); |
1097 | |
1098 | // Determine which legal instructions will be used. |
1099 | BitVector UsedInsts(NumLegalInsts, false); |
1100 | for (unsigned Index : Indices) |
1101 | for (unsigned Elt = 0; Elt < NumSubElts; ++Elt) |
1102 | UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst); |
1103 | |
1104 | // Scale the cost of the load by the fraction of legal instructions that |
1105 | // will be used. |
1106 | Cost *= UsedInsts.count() / NumLegalInsts; |
1107 | } |
1108 | |
1109 | // Then add the cost of the interleave operation. |
1110 | if (Opcode == Instruction::Load) { |
1111 | // The interleave cost is similar to extracting the sub-vectors' elements |
1112 | // from the wide vector and inserting them into the sub-vectors. |
1113 | // |
1114 | // E.g. An interleaved load of factor 2 (with one member of index 0): |
1115 | // %vec = load <8 x i32>, <8 x i32>* %ptr |
1116 | // %v0 = shuffle %vec, undef, <0, 2, 4, 6> ; Index 0 |
1117 | // The cost is estimated as extracting elements at 0, 2, 4, 6 from the |
1118 | // <8 x i32> vector and inserting them into a <4 x i32> vector. |
1119 | |
1120 | assert(Indices.size() <= Factor && |
1121 |        "Interleaved memory op has too many members"); |
1122 | |
1123 | for (unsigned Index : Indices) { |
1124 | assert(Index < Factor && "Invalid index for interleaved memory op"); |
1125 | |
1126 | // Extract elements from loaded vector for each sub vector. |
1127 | for (unsigned i = 0; i < NumSubElts; i++) |
1128 | Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VT, |
1129 | Index + i * Factor); |
1130 | } |
1131 | |
1132 | unsigned InsSubCost = 0; |
1133 | for (unsigned i = 0; i < NumSubElts; i++) |
1134 | InsSubCost += |
1135 | thisT()->getVectorInstrCost(Instruction::InsertElement, SubVT, i); |
1136 | |
1137 | Cost += Indices.size() * InsSubCost; |
1138 | } else { |
1139 | // The interleave cost is extracting all elements from the sub-vectors |
1140 | // and inserting them into the wide vector. |
1141 | // |
1142 | // E.g. An interleaved store of factor 2: |
1143 | // %v0_v1 = shuffle %v0, %v1, <0, 4, 1, 5, 2, 6, 3, 7> |
1144 | // store <8 x i32> %interleaved.vec, <8 x i32>* %ptr |
1145 | // The cost is estimated as extracting all elements from both <4 x i32> |
1146 | // vectors and inserting them into the <8 x i32> vector. |
1147 | |
1148 | unsigned ExtSubCost = 0; |
1149 | for (unsigned i = 0; i < NumSubElts; i++) |
1150 | ExtSubCost += |
1151 | thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVT, i); |
1152 | Cost += ExtSubCost * Factor; |
1153 | |
1154 | for (unsigned i = 0; i < NumElts; i++) |
1155 | Cost += |
1156 | thisT()->getVectorInstrCost(Instruction::InsertElement, VT, i); |
1157 | } |
1158 | |
1159 | if (!UseMaskForCond) |
1160 | return Cost; |
1161 | |
1162 | Type *I8Type = Type::getInt8Ty(VT->getContext()); |
1163 | auto *MaskVT = FixedVectorType::get(I8Type, NumElts); |
1164 | SubVT = FixedVectorType::get(I8Type, NumSubElts); |
1165 | |
1166 | // The mask shuffling cost is extracting all the elements of the mask |
1167 | // and inserting each of them Factor times into the wide vector: |
1168 | // |
1169 | // E.g. an interleaved group with factor 3: |
1170 | // %mask = icmp ult <8 x i32> %vec1, %vec2 |
1171 | // %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef, |
1172 | // <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7> |
1173 | // The cost is estimated as extracting all mask elements from the |
1174 | // <8 x i1> mask vector and inserting them Factor times into the |
1175 | // <24 x i1> shuffled mask vector. |
1176 | for (unsigned i = 0; i < NumSubElts; i++) |
1177 | Cost += |
1178 | thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVT, i); |
1179 | |
1180 | for (unsigned i = 0; i < NumElts; i++) |
1181 | Cost += |
1182 | thisT()->getVectorInstrCost(Instruction::InsertElement, MaskVT, i); |
1183 | |
1184 | // The Gaps mask is invariant and created outside the loop, therefore the |
1185 | // cost of creating it is not accounted for here. However if we have both |
1186 | // a MaskForGaps and some other mask that guards the execution of the |
1187 | // memory access, we need to account for the cost of And-ing the two masks |
1188 | // inside the loop. |
1189 | if (UseMaskForGaps) |
1190 | Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT, |
1191 | CostKind); |
1192 | |
1193 | return Cost; |
1194 | } |
1195 | |
1196 | /// Get intrinsic cost based on arguments. |
1197 | unsigned getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, |
1198 | TTI::TargetCostKind CostKind) { |
1199 | // Check for generically free intrinsics. |
1200 | if (BaseT::getIntrinsicInstrCost(ICA, CostKind) == 0) |
1201 | return 0; |
1202 | |
1203 | // Assume that target intrinsics are cheap. |
1204 | Intrinsic::ID IID = ICA.getID(); |
1205 | if (Function::isTargetIntrinsic(IID)) |
1206 | return TargetTransformInfo::TCC_Basic; |
1207 | |
1208 | if (ICA.isTypeBasedOnly()) |
1209 | return getTypeBasedIntrinsicInstrCost(ICA, CostKind); |
1210 | |
1211 | Type *RetTy = ICA.getReturnType(); |
1212 | |
1213 | ElementCount VF = ICA.getVectorFactor(); |
1214 | ElementCount RetVF = |
1215 | (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount() |
1216 | : ElementCount::getFixed(1)); |
1217 | assert((RetVF.isScalar() || VF.isScalar()) && |
1218 | "VF > 1 and RetVF is a vector type"); |
1219 | const IntrinsicInst *I = ICA.getInst(); |
1220 | const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); |
1221 | FastMathFlags FMF = ICA.getFlags(); |
1222 | switch (IID) { |
1223 | default: |
1224 | break; |
1225 | |
1226 | case Intrinsic::cttz: |
1227 | // FIXME: If necessary, this should go in target-specific overrides. |
1228 | if (VF.isScalar() && RetVF.isScalar() && |
1229 | getTLI()->isCheapToSpeculateCttz()) |
1230 | return TargetTransformInfo::TCC_Basic; |
1231 | break; |
1232 | |
1233 | case Intrinsic::ctlz: |
1234 | // FIXME: If necessary, this should go in target-specific overrides. |
1235 | if (VF.isScalar() && RetVF.isScalar() && |
1236 | getTLI()->isCheapToSpeculateCtlz()) |
1237 | return TargetTransformInfo::TCC_Basic; |
1238 | break; |
1239 | |
1240 | case Intrinsic::memcpy: |
1241 | return thisT()->getMemcpyCost(ICA.getInst()); |
1242 | |
1243 | case Intrinsic::masked_scatter: { |
1244 | assert(VF.isScalar() && "Can't vectorize types here."); |
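| // llvm.masked.scatter(value, ptrs, align, mask): Args[2] holds the |
| // alignment immediate and Args[3] the mask. |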
1245 | const Value *Mask = Args[3]; |
1246 | bool VarMask = !isa<Constant>(Mask); |
1247 | Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue(); |
1248 | return thisT()->getGatherScatterOpCost(Instruction::Store, |
1249 | Args[0]->getType(), Args[1], |
1250 | VarMask, Alignment, CostKind, I); |
1251 | } |
1252 | case Intrinsic::masked_gather: { |
1253 | assert(VF.isScalar() && "Can't vectorize types here."); |
1254 | const Value *Mask = Args[2]; |
1255 | bool VarMask = !isa<Constant>(Mask); |
1256 | Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue(); |
1257 | return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0], |
1258 | VarMask, Alignment, CostKind, I); |
1259 | } |
1260 | case Intrinsic::experimental_vector_extract: { |
1261 | // FIXME: Handle case where a scalable vector is extracted from a scalable |
1262 | // vector |
1263 | if (isa<ScalableVectorType>(RetTy)) |
1264 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); |
1265 | unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue(); |
1266 | return thisT()->getShuffleCost(TTI::SK_ExtractSubvector, |
1267 | cast<VectorType>(Args[0]->getType()), |
1268 | Index, cast<VectorType>(RetTy)); |
1269 | } |
1270 | case Intrinsic::experimental_vector_insert: { |
1271 | // FIXME: Handle case where a scalable vector is inserted into a scalable |
1272 | // vector |
1273 | if (isa<ScalableVectorType>(Args[1]->getType())) |
1274 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); |
1275 | unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue(); |
1276 | return thisT()->getShuffleCost( |
1277 | TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()), Index, |
1278 | cast<VectorType>(Args[1]->getType())); |
1279 | } |
1280 | case Intrinsic::vector_reduce_add: |
1281 | case Intrinsic::vector_reduce_mul: |
1282 | case Intrinsic::vector_reduce_and: |
1283 | case Intrinsic::vector_reduce_or: |
1284 | case Intrinsic::vector_reduce_xor: |
1285 | case Intrinsic::vector_reduce_smax: |
1286 | case Intrinsic::vector_reduce_smin: |
1287 | case Intrinsic::vector_reduce_fmax: |
1288 | case Intrinsic::vector_reduce_fmin: |
1289 | case Intrinsic::vector_reduce_umax: |
1290 | case Intrinsic::vector_reduce_umin: { |
1291 | if (isa<ScalableVectorType>(RetTy)) |
1292 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); |
1293 | IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, 1, I); |
1294 | return getTypeBasedIntrinsicInstrCost(Attrs, CostKind); |
1295 | } |
1296 | case Intrinsic::vector_reduce_fadd: |
1297 | case Intrinsic::vector_reduce_fmul: { |
1298 | if (isa<ScalableVectorType>(RetTy)) |
1299 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); |
1300 | IntrinsicCostAttributes Attrs( |
1301 | IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, 1, I); |
1302 | return getTypeBasedIntrinsicInstrCost(Attrs, CostKind); |
1303 | } |
1304 | case Intrinsic::fshl: |
1305 | case Intrinsic::fshr: { |
1306 | if (isa<ScalableVectorType>(RetTy)) |
1307 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); |
1308 | const Value *X = Args[0]; |
1309 | const Value *Y = Args[1]; |
1310 | const Value *Z = Args[2]; |
1311 | TTI::OperandValueProperties OpPropsX, OpPropsY, OpPropsZ, OpPropsBW; |
1312 | TTI::OperandValueKind OpKindX = TTI::getOperandInfo(X, OpPropsX); |
1313 | TTI::OperandValueKind OpKindY = TTI::getOperandInfo(Y, OpPropsY); |
1314 | TTI::OperandValueKind OpKindZ = TTI::getOperandInfo(Z, OpPropsZ); |
1315 | TTI::OperandValueKind OpKindBW = TTI::OK_UniformConstantValue; |
1316 | OpPropsBW = isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2 |
1317 | : TTI::OP_None; |
1318 | // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW))) |
1319 | // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW)) |
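| // E.g. for a rotate (X == Y) with a uniform constant shift amount the |
| // cost below reduces to Or + Sub + Shl + LShr: the URem and the |
| // shift-by-zero cmp/select handling both drop out. |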
1320 | unsigned Cost = 0; |
1321 | Cost += |
1322 | thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind); |
1323 | Cost += |
1324 | thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind); |
1325 | Cost += thisT()->getArithmeticInstrCost( |
1326 | BinaryOperator::Shl, RetTy, CostKind, OpKindX, OpKindZ, OpPropsX); |
1327 | Cost += thisT()->getArithmeticInstrCost( |
1328 | BinaryOperator::LShr, RetTy, CostKind, OpKindY, OpKindZ, OpPropsY); |
1329 | // Non-constant shift amounts require a modulo. |
1330 | if (OpKindZ != TTI::OK_UniformConstantValue && |
1331 | OpKindZ != TTI::OK_NonUniformConstantValue) |
1332 | Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy, |
1333 | CostKind, OpKindZ, OpKindBW, |
1334 | OpPropsZ, OpPropsBW); |
1335 | // For non-rotates (X != Y) we must add shift-by-zero handling costs. |
1336 | if (X != Y) { |
1337 | Type *CondTy = RetTy->getWithNewBitWidth(1); |
1338 | Cost += |
1339 | thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, |
1340 | CmpInst::BAD_ICMP_PREDICATE, CostKind); |
1341 | Cost += |
1342 | thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, |
1343 | CmpInst::BAD_ICMP_PREDICATE, CostKind); |
1344 | } |
1345 | return Cost; |
1346 | } |
1347 | } |
1348 | // TODO: Handle the remaining intrinsics with scalable vector types |
1349 | if (isa<ScalableVectorType>(RetTy)) |
1350 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); |
1351 | |
1352 | // Assume that we need to scalarize this intrinsic. |
1353 | SmallVector<Type *, 4> Types; |
1354 | for (const Value *Op : Args) { |
1355 | Type *OpTy = Op->getType(); |
1356 | assert(VF.isScalar() || !OpTy->isVectorTy()); |
1357 | Types.push_back(VF.isScalar() |
1358 | ? OpTy |
1359 | : FixedVectorType::get(OpTy, VF.getKnownMinValue())); |
1360 | } |
1361 | |
1362 | if (VF.isVector() && !RetTy->isVoidTy()) |
1363 | RetTy = FixedVectorType::get(RetTy, VF.getKnownMinValue()); |
1364 | |
1365 | // Compute the scalarization overhead based on Args for a vector |
1366 | // intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while |
1367 | // the cost model will pass a vector RetTy and a VF of 1. |
1368 | unsigned ScalarizationCost = std::numeric_limits<unsigned>::max(); |
1369 | if (RetVF.isVector() || VF.isVector()) { |
1370 | ScalarizationCost = 0; |
1371 | if (!RetTy->isVoidTy()) |
1372 | ScalarizationCost += |
1373 | getScalarizationOverhead(cast<VectorType>(RetTy), true, false); |
1374 | ScalarizationCost += |
1375 | getOperandsScalarizationOverhead(Args, VF.getKnownMinValue()); |
1376 | } |
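| // E.g. when the vectorizer costs llvm.pow.f32 at VF = 4, the overhead |
| // above covers inserting the four scalar results into the <4 x float> |
| // return value and extracting the scalarized operands. |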
1377 | |
1378 | IntrinsicCostAttributes Attrs(IID, RetTy, Types, FMF, ScalarizationCost, I); |
1379 | return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind); |
1380 | } |
1381 | |
1382 | /// Get intrinsic cost based on argument types. |
1383 | /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the |
1384 | /// cost of scalarizing the arguments and the return value will be computed |
1385 | /// based on types. |
1386 | unsigned getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, |
1387 | TTI::TargetCostKind CostKind) { |
1388 | Intrinsic::ID IID = ICA.getID(); |
1389 | Type *RetTy = ICA.getReturnType(); |
1390 | const SmallVectorImpl<Type *> &Tys = ICA.getArgTypes(); |
1391 | FastMathFlags FMF = ICA.getFlags(); |
1392 | unsigned ScalarizationCostPassed = ICA.getScalarizationCost(); |
1393 | bool SkipScalarizationCost = ICA.skipScalarizationCost(); |
1394 | |
1395 | VectorType *VecOpTy = nullptr; |
1396 | if (!Tys.empty()) { |
1397 | // The vector reduction operand is operand 0 except for fadd/fmul. |
1398 | // Their operand 0 is a scalar start value, so the vector op is operand 1. |
1399 | unsigned VecTyIndex = 0; |
1400 | if (IID == Intrinsic::vector_reduce_fadd || |
1401 | IID == Intrinsic::vector_reduce_fmul) |
1402 | VecTyIndex = 1; |
1403 | assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes"); |
1404 | VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]); |
1405 | } |
1406 | |
1407 | // Library call cost - other than for code size, make it expensive. |
1408 | unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10; |
1409 | SmallVector<unsigned, 2> ISDs; |
1410 | switch (IID) { |
1411 | default: { |
1412 | // Assume that we need to scalarize this intrinsic. |
1413 | unsigned ScalarizationCost = ScalarizationCostPassed; |
1414 | unsigned ScalarCalls = 1; |
1415 | Type *ScalarRetTy = RetTy; |
1416 | if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) { |
1417 | if (!SkipScalarizationCost) |
1418 | ScalarizationCost = getScalarizationOverhead(RetVTy, true, false); |
1419 | ScalarCalls = std::max(ScalarCalls, |
1420 | cast<FixedVectorType>(RetVTy)->getNumElements()); |
1421 | ScalarRetTy = RetTy->getScalarType(); |
1422 | } |
1423 | SmallVector<Type *, 4> ScalarTys; |
1424 | for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { |
1425 | Type *Ty = Tys[i]; |
1426 | if (auto *VTy = dyn_cast<VectorType>(Ty)) { |
1427 | if (!SkipScalarizationCost) |
1428 | ScalarizationCost += getScalarizationOverhead(VTy, false, true); |
1429 | ScalarCalls = std::max(ScalarCalls, |
1430 | cast<FixedVectorType>(VTy)->getNumElements()); |
1431 | Ty = Ty->getScalarType(); |
1432 | } |
1433 | ScalarTys.push_back(Ty); |
1434 | } |
1435 | if (ScalarCalls == 1) |
1436 | return 1; // Return cost of a scalar intrinsic. Assume it to be cheap. |
1437 | |
1438 | IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF); |
1439 | unsigned ScalarCost = |
1440 | thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind); |
1441 | |
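| // E.g. an unhandled intrinsic on <4 x float> with no vector lowering |
| // costs four scalar calls plus the insert/extract scalarization |
| // overhead accumulated above. |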
1442 | return ScalarCalls * ScalarCost + ScalarizationCost; |
1443 | } |
1444 | // Look for intrinsics that can be lowered directly or turned into a scalar |
1445 | // intrinsic call. |
1446 | case Intrinsic::sqrt: |
1447 | ISDs.push_back(ISD::FSQRT); |
1448 | break; |
1449 | case Intrinsic::sin: |
1450 | ISDs.push_back(ISD::FSIN); |
1451 | break; |
1452 | case Intrinsic::cos: |
1453 | ISDs.push_back(ISD::FCOS); |
1454 | break; |
1455 | case Intrinsic::exp: |
1456 | ISDs.push_back(ISD::FEXP); |
1457 | break; |
1458 | case Intrinsic::exp2: |
1459 | ISDs.push_back(ISD::FEXP2); |
1460 | break; |
1461 | case Intrinsic::log: |
1462 | ISDs.push_back(ISD::FLOG); |
1463 | break; |
1464 | case Intrinsic::log10: |
1465 | ISDs.push_back(ISD::FLOG10); |
1466 | break; |
1467 | case Intrinsic::log2: |
1468 | ISDs.push_back(ISD::FLOG2); |
1469 | break; |
1470 | case Intrinsic::fabs: |
1471 | ISDs.push_back(ISD::FABS); |
1472 | break; |
1473 | case Intrinsic::canonicalize: |
1474 | ISDs.push_back(ISD::FCANONICALIZE); |
1475 | break; |
1476 | case Intrinsic::minnum: |
1477 | ISDs.push_back(ISD::FMINNUM); |
1478 | break; |
1479 | case Intrinsic::maxnum: |
1480 | ISDs.push_back(ISD::FMAXNUM); |
1481 | break; |
1482 | case Intrinsic::minimum: |
1483 | ISDs.push_back(ISD::FMINIMUM); |
1484 | break; |
1485 | case Intrinsic::maximum: |
1486 | ISDs.push_back(ISD::FMAXIMUM); |
1487 | break; |
1488 | case Intrinsic::copysign: |
1489 | ISDs.push_back(ISD::FCOPYSIGN); |
1490 | break; |
1491 | case Intrinsic::floor: |
1492 | ISDs.push_back(ISD::FFLOOR); |
1493 | break; |
1494 | case Intrinsic::ceil: |
1495 | ISDs.push_back(ISD::FCEIL); |
1496 | break; |
1497 | case Intrinsic::trunc: |
1498 | ISDs.push_back(ISD::FTRUNC); |
1499 | break; |
1500 | case Intrinsic::nearbyint: |
1501 | ISDs.push_back(ISD::FNEARBYINT); |
1502 | break; |
1503 | case Intrinsic::rint: |
1504 | ISDs.push_back(ISD::FRINT); |
1505 | break; |
1506 | case Intrinsic::round: |
1507 | ISDs.push_back(ISD::FROUND); |
1508 | break; |
1509 | case Intrinsic::roundeven: |
1510 | ISDs.push_back(ISD::FROUNDEVEN); |
1511 | break; |
1512 | case Intrinsic::pow: |
1513 | ISDs.push_back(ISD::FPOW); |
1514 | break; |
1515 | case Intrinsic::fma: |
1516 | ISDs.push_back(ISD::FMA); |
1517 | break; |
1518 | case Intrinsic::fmuladd: |
1519 | ISDs.push_back(ISD::FMA); |
1520 | break; |
1521 | case Intrinsic::experimental_constrained_fmuladd: |
1522 | ISDs.push_back(ISD::STRICT_FMA); |
1523 | break; |
1524 | // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free. |
1525 | case Intrinsic::lifetime_start: |
1526 | case Intrinsic::lifetime_end: |
1527 | case Intrinsic::sideeffect: |
1528 | case Intrinsic::pseudoprobe: |
1529 | return 0; |
1530 | case Intrinsic::masked_store: { |
1531 | Type *Ty = Tys[0]; |
1532 | Align TyAlign = thisT()->DL.getABITypeAlign(Ty); |
1533 | return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0, |
1534 | CostKind); |
1535 | } |
1536 | case Intrinsic::masked_load: { |
1537 | Type *Ty = RetTy; |
1538 | Align TyAlign = thisT()->DL.getABITypeAlign(Ty); |
1539 | return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0, |
1540 | CostKind); |
1541 | } |
1542 | case Intrinsic::vector_reduce_add: |
1543 | return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy, |
1544 | /*IsPairwiseForm=*/false, |
1545 | CostKind); |
1546 | case Intrinsic::vector_reduce_mul: |
1547 | return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy, |
1548 | /*IsPairwiseForm=*/false, |
1549 | CostKind); |
1550 | case Intrinsic::vector_reduce_and: |
1551 | return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy, |
1552 | /*IsPairwiseForm=*/false, |
1553 | CostKind); |
1554 | case Intrinsic::vector_reduce_or: |
1555 | return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy, |
1556 | /*IsPairwiseForm=*/false, |
1557 | CostKind); |
1558 | case Intrinsic::vector_reduce_xor: |
1559 | return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy, |
1560 | /*IsPairwiseForm=*/false, |
1561 | CostKind); |
1562 | case Intrinsic::vector_reduce_fadd: |
1563 | // FIXME: Add new flag for cost of strict reductions. |
1564 | return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy, |
1565 | /*IsPairwiseForm=*/false, |
1566 | CostKind); |
1567 | case Intrinsic::vector_reduce_fmul: |
1568 | // FIXME: Add new flag for cost of strict reductions. |
1569 | return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy, |
1570 | /*IsPairwiseForm=*/false, |
1571 | CostKind); |
1572 | case Intrinsic::vector_reduce_smax: |
1573 | case Intrinsic::vector_reduce_smin: |
1574 | case Intrinsic::vector_reduce_fmax: |
1575 | case Intrinsic::vector_reduce_fmin: |
1576 | return thisT()->getMinMaxReductionCost( |
1577 | VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)), |
1578 | /*IsPairwiseForm=*/false, |
1579 | /*IsUnsigned=*/false, CostKind); |
1580 | case Intrinsic::vector_reduce_umax: |
1581 | case Intrinsic::vector_reduce_umin: |
1582 | return thisT()->getMinMaxReductionCost( |
1583 | VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)), |
1584 | /*IsPairwiseForm=*/false, |
1585 | /*IsUnsigned=*/true, CostKind); |
1586 | case Intrinsic::abs: |
1587 | case Intrinsic::smax: |
1588 | case Intrinsic::smin: |
1589 | case Intrinsic::umax: |
1590 | case Intrinsic::umin: { |
1591 | // abs(X) = select(icmp(X,0),X,sub(0,X)) |
1592 | // minmax(X,Y) = select(icmp(X,Y),X,Y) |
1593 | Type *CondTy = RetTy->getWithNewBitWidth(1); |
1594 | unsigned Cost = 0; |
1595 | // TODO: Ideally getCmpSelInstrCost would accept an icmp condition code. |
1596 | Cost += |
1597 | thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, |
1598 | CmpInst::BAD_ICMP_PREDICATE, CostKind); |
1599 | Cost += |
1600 | thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, |
1601 | CmpInst::BAD_ICMP_PREDICATE, CostKind); |
1602 | // TODO: Should we add an OperandValueProperties::OP_Zero property? |
1603 | if (IID == Intrinsic::abs) |
1604 | Cost += thisT()->getArithmeticInstrCost( |
1605 | BinaryOperator::Sub, RetTy, CostKind, TTI::OK_UniformConstantValue); |
1606 | return Cost; |
1607 | } |
1608 | case Intrinsic::sadd_sat: |
1609 | case Intrinsic::ssub_sat: { |
1610 | Type *CondTy = RetTy->getWithNewBitWidth(1); |
1611 | |
1612 | Type *OpTy = StructType::create({RetTy, CondTy}); |
1613 | Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat |
1614 | ? Intrinsic::sadd_with_overflow |
1615 | : Intrinsic::ssub_with_overflow; |
1616 | |
1617 | // SatMax -> Overflow && SumDiff < 0 |
1618 | // SatMin -> Overflow && SumDiff >= 0 |
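| // E.g. for i8: sadd.sat(100, 50) overflows with SumDiff == -106 < 0, |
| // so the result saturates to SatMax == 127. |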
1619 | unsigned Cost = 0; |
1620 | IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF, |
1621 | ScalarizationCostPassed); |
1622 | Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind); |
1623 | Cost += |
1624 | thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, |
1625 | CmpInst::BAD_ICMP_PREDICATE, CostKind); |
1626 | Cost += 2 * thisT()->getCmpSelInstrCost( |
1627 | BinaryOperator::Select, RetTy, CondTy, |
1628 | CmpInst::BAD_ICMP_PREDICATE, CostKind); |
1629 | return Cost; |
1630 | } |
1631 | case Intrinsic::uadd_sat: |
1632 | case Intrinsic::usub_sat: { |
1633 | Type *CondTy = RetTy->getWithNewBitWidth(1); |
1634 | |
1635 | Type *OpTy = StructType::create({RetTy, CondTy}); |
1636 | Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat |
1637 | ? Intrinsic::uadd_with_overflow |
1638 | : Intrinsic::usub_with_overflow; |
1639 | |
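| // uadd.sat(X, Y) -> select(Overflow, UINT_MAX, Sum) and |
| // usub.sat(X, Y) -> select(Overflow, 0, Diff), hence the single |
| // overflow intrinsic plus one select costed below. |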
1640 | unsigned Cost = 0; |
1641 | IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF, |
1642 | ScalarizationCostPassed); |
1643 | Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind); |
1644 | Cost += |
1645 | thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, |
1646 | CmpInst::BAD_ICMP_PREDICATE, CostKind); |
1647 | return Cost; |
1648 | } |
1649 | case Intrinsic::smul_fix: |
1650 | case Intrinsic::umul_fix: { |
1651 | unsigned ExtSize = RetTy->getScalarSizeInBits() * 2; |
1652 | Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize); |
1653 | |
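| // smul.fix/umul.fix extend both operands to twice the bit width, |
| // multiply, recombine the scaled halves with lshr/shl/or, and |
| // truncate; the costs below mirror that sequence. |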
1654 | unsigned ExtOp = |
1655 | IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt; |
1656 | TTI::CastContextHint CCH = TTI::CastContextHint::None; |
1657 | |
1658 | unsigned Cost = 0; |
1659 | Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind); |
1660 | Cost += |
1661 | thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); |
1662 | Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy, |
1663 | CCH, CostKind); |
1664 | Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, RetTy, |
1665 | CostKind, TTI::OK_AnyValue, |
1666 | TTI::OK_UniformConstantValue); |
1667 | Cost += thisT()->getArithmeticInstrCost(Instruction::Shl, RetTy, CostKind, |
1668 | TTI::OK_AnyValue, |
1669 | TTI::OK_UniformConstantValue); |
1670 | Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind); |
1671 | return Cost; |
1672 | } |
1673 | case Intrinsic::sadd_with_overflow: |
1674 | case Intrinsic::ssub_with_overflow: { |
1675 | Type *SumTy = RetTy->getContainedType(0); |
1676 | Type *OverflowTy = RetTy->getContainedType(1); |
1677 | unsigned Opcode = IID == Intrinsic::sadd_with_overflow |
1678 | ? BinaryOperator::Add |
1679 | : BinaryOperator::Sub; |
1680 | |
1681 | // LHSSign -> LHS >= 0 |
1682 | // RHSSign -> RHS >= 0 |
1683 | // SumSign -> Sum >= 0 |
1684 | // |
1685 | // Add: |
1686 | // Overflow -> (LHSSign == RHSSign) && (LHSSign != SumSign) |
1687 | // Sub: |
1688 | // Overflow -> (LHSSign != RHSSign) && (LHSSign != SumSign) |
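| // E.g. for i8 sadd.with.overflow(100, 50): both operands are |
| // non-negative (LHSSign == RHSSign) while the wrapped sum -106 is |
| // negative (LHSSign != SumSign), so Overflow is set. |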
1689 | unsigned Cost = 0; |
1690 | Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind); |
1691 | Cost += 3 * thisT()->getCmpSelInstrCost( |
1692 | Instruction::ICmp, SumTy, OverflowTy, |
1693 | CmpInst::BAD_ICMP_PREDICATE, CostKind); |
1694 | Cost += 2 * thisT()->getCmpSelInstrCost( |
1695 | Instruction::Select, OverflowTy, OverflowTy, |
1696 | CmpInst::BAD_ICMP_PREDICATE, CostKind); |
1697 | Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, OverflowTy, |
1698 | CostKind); |
1699 | return Cost; |
1700 | } |
1701 | case Intrinsic::uadd_with_overflow: |
1702 | case Intrinsic::usub_with_overflow: { |
1703 | Type *SumTy = RetTy->getContainedType(0); |
1704 | Type *OverflowTy = RetTy->getContainedType(1); |
1705 | unsigned Opcode = IID == Intrinsic::uadd_with_overflow |
1706 | ? BinaryOperator::Add |
1707 | : BinaryOperator::Sub; |
1708 | |
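| // uadd: Overflow -> Sum < LHS; usub: Overflow -> LHS < RHS. Either |
| // way a single compare suffices, which is what is costed below. |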
1709 | unsigned Cost = 0; |
1710 | Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind); |
1711 | Cost += |
1712 | thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, OverflowTy, |
1713 | CmpInst::BAD_ICMP_PREDICATE, CostKind); |
1714 | return Cost; |
1715 | } |
1716 | case Intrinsic::smul_with_overflow: |
1717 | case Intrinsic::umul_with_overflow: { |
1718 | Type *MulTy = RetTy->getContainedType(0); |
1719 | Type *OverflowTy = RetTy->getContainedType(1); |
1720 | unsigned ExtSize = MulTy->getScalarSizeInBits() * 2; |
1721 | Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize); |
1722 | |
1723 | unsigned ExtOp = IID == Intrinsic::smul_with_overflow |
1724 | ? Instruction::SExt : Instruction::ZExt; |
1725 | TTI::CastContextHint CCH = TTI::CastContextHint::None; |
1726 | |
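| // The product is formed in the 2x-wide type; overflow is detected by |
| // extracting the high half with an lshr (the signed case also needs |
| // an ashr to reproduce the low half's sign bits) and comparing it. |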
1727 | unsigned Cost = 0; |
1728 | Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind); |
1729 | Cost += |
1730 | thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); |
1731 | Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy, |
1732 | CCH, CostKind); |
1733 | Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, MulTy, |
1734 | CostKind, TTI::OK_AnyValue, |
1735 | TTI::OK_UniformConstantValue); |
1736 | |
1737 | if (IID == Intrinsic::smul_with_overflow) |
1738 | Cost += thisT()->getArithmeticInstrCost(Instruction::AShr, MulTy, |
1739 | CostKind, TTI::OK_AnyValue, |
1740 | TTI::OK_UniformConstantValue); |
1741 | |
1742 | Cost += |
1743 | thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, MulTy, OverflowTy, |
1744 | CmpInst::BAD_ICMP_PREDICATE, CostKind); |
1745 | return Cost; |
1746 | } |
1747 | case Intrinsic::ctpop: |
1748 | ISDs.push_back(ISD::CTPOP); |
1749 | // In case of legalization use TCC_Expensive. This is cheaper than a |
1750 | // library call but still not a cheap instruction. |
1751 | SingleCallCost = TargetTransformInfo::TCC_Expensive; |
1752 | break; |
1753 | // FIXME: ctlz, cttz, ... |
1754 | case Intrinsic::bswap: |
1755 | ISDs.push_back(ISD::BSWAP); |
1756 | break; |
1757 | case Intrinsic::bitreverse: |
1758 | ISDs.push_back(ISD::BITREVERSE); |
1759 | break; |
1760 | } |
1761 | |
1762 | const TargetLoweringBase *TLI = getTLI(); |
1763 | std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy); |
1764 | |
1765 | SmallVector<unsigned, 2> LegalCost; |
1766 | SmallVector<unsigned, 2> CustomCost; |
1767 | for (unsigned ISD : ISDs) { |
1768 | if (TLI->isOperationLegalOrPromote(ISD, LT.second)) { |
1769 | if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() && |
1770 | TLI->isFAbsFree(LT.second)) { |
1771 | return 0; |
1772 | } |
1773 | |
1774 | // The operation is legal. Assume it costs 1. |
1775 | // If the type is split to multiple registers, assume that there is some |
1776 | // overhead to this. |
1777 | // TODO: Once we have extract/insert subvector cost we need to use them. |
1778 | if (LT.first > 1) |
1779 | LegalCost.push_back(LT.first * 2); |
1780 | else |
1781 | LegalCost.push_back(LT.first * 1); |
1782 | } else if (!TLI->isOperationExpand(ISD, LT.second)) { |
1783 | // If the operation is custom lowered then assume |
1784 | // that the code is twice as expensive. |
1785 | CustomCost.push_back(LT.first * 2); |
1786 | } |
1787 | } |
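| // E.g. an FSQRT on v8f32 that is legal but split into two v4f32 |
| // registers has LT.first == 2, yielding a LegalCost entry of 4. |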
1788 | |
1789 | auto *MinLegalCostI = std::min_element(LegalCost.begin(), LegalCost.end()); |
1790 | if (MinLegalCostI != LegalCost.end()) |
1791 | return *MinLegalCostI; |
1792 | |
1793 | auto MinCustomCostI = |
1794 | std::min_element(CustomCost.begin(), CustomCost.end()); |
1795 | if (MinCustomCostI != CustomCost.end()) |
1796 | return *MinCustomCostI; |
1797 | |
1798 | // If we can't lower fmuladd into an FMA, estimate the cost as a |
1799 | // floating-point mul followed by an add. |
1800 | if (IID == Intrinsic::fmuladd) |
1801 | return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy, |
1802 | CostKind) + |
1803 | thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy, |
1804 | CostKind); |
1805 | if (IID == Intrinsic::experimental_constrained_fmuladd) { |
1806 | IntrinsicCostAttributes FMulAttrs( |
1807 | Intrinsic::experimental_constrained_fmul, RetTy, Tys); |
1808 | IntrinsicCostAttributes FAddAttrs( |
1809 | Intrinsic::experimental_constrained_fadd, RetTy, Tys); |
1810 | return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) + |
1811 | thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind); |
1812 | } |
1813 | |
1814 | // Else, assume that we need to scalarize this intrinsic. For math builtins |
1815 | // this will emit a costly libcall, adding call overhead and spills. Make it |
1816 | // very expensive. |
1817 | if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) { |
1818 | unsigned ScalarizationCost = SkipScalarizationCost ? |
1819 | ScalarizationCostPassed : getScalarizationOverhead(RetVTy, true, false); |
1820 | |
1821 | unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements(); |
1822 | SmallVector<Type *, 4> ScalarTys; |
1823 | for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { |
1824 | Type *Ty = Tys[i]; |
1825 | if (Ty->isVectorTy()) |
1826 | Ty = Ty->getScalarType(); |
1827 | ScalarTys.push_back(Ty); |
1828 | } |
1829 | IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF); |
1830 | unsigned ScalarCost = thisT()->getIntrinsicInstrCost(Attrs, CostKind); |
1831 | for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { |
1832 | if (auto *VTy = dyn_cast<VectorType>(Tys[i])) { |
1833 | if (!ICA.skipScalarizationCost()) |
1834 | ScalarizationCost += getScalarizationOverhead(VTy, false, true); |
1835 | ScalarCalls = std::max(ScalarCalls, |
1836 | cast<FixedVectorType>(VTy)->getNumElements()); |
1837 | } |
1838 | } |
1839 | return ScalarCalls * ScalarCost + ScalarizationCost; |
1840 | } |
1841 | |
1842 | // This is going to be turned into a library call; make it expensive. |
1843 | return SingleCallCost; |
1844 | } |
1845 | |
1846 | /// Compute the cost of the given call instruction. |
1847 | /// |
1848 | /// Compute the cost of calling function F with return type RetTy and |
1849 | /// argument types Tys. F might be nullptr; in that case the cost of an |
1850 | /// arbitrary call with the specified signature will be returned. |
1851 | /// This is used, for instance, when we estimate the call of a vector |
1852 | /// counterpart of the given function. |
1853 | /// \param F Called function, might be nullptr. |
1854 | /// \param RetTy Return value type. |
1855 | /// \param Tys Argument types. |
1856 | /// \returns The cost of the call instruction. |
1857 | unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys, |
1858 | TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency) { |
1859 | return 10; |
1860 | } |
1861 | |
1862 | unsigned getNumberOfParts(Type *Tp) { |
1863 | std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(DL, Tp); |
1864 | return LT.first; |
1865 | } |
1866 | |
1867 | unsigned getAddressComputationCost(Type *Ty, ScalarEvolution *, |
1868 | const SCEV *) { |
1869 | return 0; |
1870 | } |
1871 | |
1872 | /// Try to calculate arithmetic and shuffle op costs for reduction operations. |
1873 | /// We're assuming that reduction operations are performed the following way: |
1874 | /// 1. Non-pairwise reduction |
1875 | /// %val1 = shufflevector<n x t> %val, <n x t> %undef, |
1876 | /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n-1, i32 undef, ..., i32 undef> |
1877 | /// \----------------v-------------/ \----------v------------/ |
1878 | /// n/2 elements n/2 elements |
1879 | /// %red1 = op <n x t> %val, <n x t> %val1 |
1880 | /// After this operation we have a vector %red1 where only the first n/2 |
1881 | /// elements are meaningful, the second n/2 elements are undefined and can be |
1882 | /// dropped. All other operations are actually working with the vector of |
1883 | /// length n/2, not n, though the real vector length is still n. |
1884 | /// %val2 = shufflevector<n x t> %red1, <n x t> %undef, |
1885 | /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2-1, i32 undef, ..., i32 undef> |
1886 | /// \----------------v-------------/ \----------v------------/ |
1887 | /// n/4 elements 3*n/4 elements |
1888 | /// %red2 = op <n x t> %red1, <n x t> %val2 - working with the vector of |
1889 | /// length n/2, the resulting vector has length n/4 etc. |
1890 | /// 2. Pairwise reduction: |
1891 | /// Everything is the same except for an additional shuffle operation which |
1892 | /// is used to produce operands for pairwise kind of reductions. |
1893 | /// %val1 = shufflevector<n x t> %val, <n x t> %undef, |
1894 | /// <n x i32> <i32 0, i32 2, ..., i32 n-2, i32 undef, ..., i32 undef> |
1895 | /// \-------------v----------/ \----------v------------/ |
1896 | /// n/2 elements n/2 elements |
1897 | /// %val2 = shufflevector<n x t> %val, <n x t> %undef, |
1898 | /// <n x i32> <i32 1, i32 3, ..., i32 n-1, i32 undef, ..., i32 undef> |
1899 | /// \-------------v----------/ \----------v------------/ |
1900 | /// n/2 elements n/2 elements |
1901 | /// %red1 = op <n x t> %val1, <n x t> %val2 |
1902 | /// Again, the operation is performed on <n x t> vector, but the resulting |
1903 | /// vector %red1 is <n/2 x t> vector. |
1904 | /// |
1905 | /// The cost model should take into account that the actual length of the |
1906 | /// vector is reduced on each iteration. |
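| /// E.g. reducing <8 x i32> on a target whose widest legal vector holds |
| /// four elements takes one split step (extract-subvector shuffle + op |
| /// on <4 x i32>), then Log2(4) = 2 in-register shuffle + op levels, and |
| /// a final extractelement of lane 0. |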
1907 | unsigned getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, |
1908 | bool IsPairwise, |
1909 | TTI::TargetCostKind CostKind) { |
1910 | Type *ScalarTy = Ty->getElementType(); |
1911 | unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements(); |
1912 | unsigned NumReduxLevels = Log2_32(NumVecElts); |
1913 | unsigned ArithCost = 0; |
1914 | unsigned ShuffleCost = 0; |
1915 | std::pair<unsigned, MVT> LT = |
1916 | thisT()->getTLI()->getTypeLegalizationCost(DL, Ty); |
1917 | unsigned LongVectorCount = 0; |
1918 | unsigned MVTLen = |
1919 | LT.second.isVector() ? LT.second.getVectorNumElements() : 1; |
1920 | while (NumVecElts > MVTLen) { |
1921 | NumVecElts /= 2; |
1922 | VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts); |
1923 | // Assume the pairwise shuffles add a cost. |
1924 | ShuffleCost += |
1925 | (IsPairwise + 1) * thisT()->getShuffleCost(TTI::SK_ExtractSubvector, |
1926 | Ty, NumVecElts, SubTy); |
1927 | ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind); |
1928 | Ty = SubTy; |
1929 | ++LongVectorCount; |
1930 | } |
1931 | |
1932 | NumReduxLevels -= LongVectorCount; |
1933 | |
1934 | // The minimal length of the vector is limited by the real length of vector |
1935 | // operations performed on the current platform. That's why several final |
1936 | // reduction operations are performed on the vectors with the same |
1937 | // architecture-dependent length. |
1938 | |
1939 | // Non-pairwise reductions need one shuffle per reduction level. Pairwise |
1940 | // reductions need two shuffles on every level but the last; on that |
1941 | // level one of the shuffles is <0, u, u, ...>, which is the identity. |
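| // E.g. with NumReduxLevels == 3 a non-pairwise reduction issues 3 |
| // shuffles while a pairwise one issues 3 + 2 == 5. |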
1942 | unsigned NumShuffles = NumReduxLevels; |
1943 | if (IsPairwise && NumReduxLevels >= 1) |
1944 | NumShuffles += NumReduxLevels - 1; |
1945 | ShuffleCost += NumShuffles * |
1946 | thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, 0, Ty); |
1947 | ArithCost += NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty); |
1948 | return ShuffleCost + ArithCost + |
1949 | thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0); |
1950 | } |
1951 | |
1952 | /// Try to calculate op costs for min/max reduction operations. |
1953 | /// \param CondTy Conditional type for the Select instruction. |
1954 | unsigned getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, |
1955 | bool IsPairwise, bool IsUnsigned, |
1956 | TTI::TargetCostKind CostKind) { |
1957 | Type *ScalarTy = Ty->getElementType(); |
1958 | Type *ScalarCondTy = CondTy->getElementType(); |
1959 | unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements(); |
1960 | unsigned NumReduxLevels = Log2_32(NumVecElts); |
1961 | unsigned CmpOpcode; |
1962 | if (Ty->isFPOrFPVectorTy()) { |
1963 | CmpOpcode = Instruction::FCmp; |
1964 | } else { |
1965 | assert(Ty->isIntOrIntVectorTy() && |
1966 | "expecting floating point or integer type for min/max reduction"); |
1967 | CmpOpcode = Instruction::ICmp; |
1968 | } |
1969 | unsigned MinMaxCost = 0; |
1970 | unsigned ShuffleCost = 0; |
1971 | std::pair<unsigned, MVT> LT = |
1972 | thisT()->getTLI()->getTypeLegalizationCost(DL, Ty); |
1973 | unsigned LongVectorCount = 0; |
1974 | unsigned MVTLen = |
1975 | LT.second.isVector() ? LT.second.getVectorNumElements() : 1; |
1976 | while (NumVecElts > MVTLen) { |
1977 | NumVecElts /= 2; |
1978 | auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts); |
1979 | CondTy = FixedVectorType::get(ScalarCondTy, NumVecElts); |
1980 | |
1981 | // Assume the pairwise shuffles add a cost. |
1982 | ShuffleCost += |
1983 | (IsPairwise + 1) * thisT()->getShuffleCost(TTI::SK_ExtractSubvector, |
1984 | Ty, NumVecElts, SubTy); |
1985 | MinMaxCost += |
1986 | thisT()->getCmpSelInstrCost(CmpOpcode, SubTy, CondTy, |
1987 | CmpInst::BAD_ICMP_PREDICATE, CostKind) + |
1988 | thisT()->getCmpSelInstrCost(Instruction::Select, SubTy, CondTy, |
1989 | CmpInst::BAD_ICMP_PREDICATE, CostKind); |
1990 | Ty = SubTy; |
1991 | ++LongVectorCount; |
1992 | } |
1993 | |
1994 | NumReduxLevels -= LongVectorCount; |
1995 | |
1996 | // The minimal length of the vector is limited by the real length of vector |
1997 | // operations performed on the current platform. That's why several final |
1998 | // reduction operations are performed on the vectors with the same |
1999 | // architecture-dependent length. |
2000 | |
2001 | // Non-pairwise reductions need one shuffle per reduction level. Pairwise |
2002 | // reductions need two shuffles on every level but the last; on that |
2003 | // level one of the shuffles is <0, u, u, ...>, which is the identity. |
2004 | unsigned NumShuffles = NumReduxLevels; |
2005 | if (IsPairwise && NumReduxLevels >= 1) |
2006 | NumShuffles += NumReduxLevels - 1; |
2007 | ShuffleCost += NumShuffles * |
2008 | thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, 0, Ty); |
2009 | MinMaxCost += |
2010 | NumReduxLevels * |
2011 | (thisT()->getCmpSelInstrCost(CmpOpcode, Ty, CondTy, |
2012 | CmpInst::BAD_ICMP_PREDICATE, CostKind) + |
2013 | thisT()->getCmpSelInstrCost(Instruction::Select, Ty, CondTy, |
2014 | CmpInst::BAD_ICMP_PREDICATE, CostKind)); |
2015 | // The last min/max should be in vector registers and we counted it above. |
2016 | // So just need a single extractelement. |
2017 | return ShuffleCost + MinMaxCost + |
2018 | thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0); |
2019 | } |
2020 | |
2021 | unsigned getVectorSplitCost() { return 1; } |
2022 | |
2023 | /// @} |
2024 | }; |
2025 | |
2026 | /// Concrete BasicTTIImpl that can be used if no further customization |
2027 | /// is needed. |
2028 | class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> { |
2029 | using BaseT = BasicTTIImplBase<BasicTTIImpl>; |
2030 | |
2031 | friend class BasicTTIImplBase<BasicTTIImpl>; |
2032 | |
2033 | const TargetSubtargetInfo *ST; |
2034 | const TargetLoweringBase *TLI; |
2035 | |
2036 | const TargetSubtargetInfo *getST() const { return ST; } |
2037 | const TargetLoweringBase *getTLI() const { return TLI; } |
2038 | |
2039 | public: |
2040 | explicit BasicTTIImpl(const TargetMachine *TM, const Function &F); |
2041 | }; |
2042 | |
2043 | } // end namespace llvm |
2044 | |
2045 | #endif // LLVM_CODEGEN_BASICTTIIMPL_H |